解决此问题的最佳方法是什么:
我有一组数组,每个数组中包含3-4个字符,如下所示:
{p, q, r, s}, {a, b, c}, {t, u, v}, {m, n, o}
我也有一系列字典单词。
判断这些字符数组能否(按数组顺序、从每个数组中各取一个字母)拼出词典中的某个单词,最好/最快的方法是什么?例如,上述数组可以组成以下单词:
“pat”、“rat”、“at”、“to”、“bum”(笑),但不能组成“nub”或“mat”。
我应该遍历字典,逐个检查每个单词能否由这些数组拼出;还是应该先从字母生成所有组合,再将它们与字典进行比较?
我手头正好有一些Scrabble代码,所以很快就拼凑出了下面这个方案。我使用的字典是sowpods(267751个单词)。下面的代码将字典作为文本文件读取,文件中每行一个大写单词。
代码是C#:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Diagnostics;

namespace SO_6022848
{
    /// <summary>
    /// A letter identified by its position in <see cref="Chars"/>.
    /// A character that does not occur in <see cref="Chars"/> converts to
    /// <c>Index == -1</c>; calling <see cref="ToChar"/> or <see cref="ToString"/>
    /// on such a value throws because -1 is not a valid string index.
    /// </summary>
    public struct Letter : IEquatable<Letter>
    {
        /// <summary>The alphabet; a letter's <see cref="Index"/> is its offset in this string.</summary>
        public const string Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

        /// <summary>Converts a character to the letter holding its index in <see cref="Chars"/>.</summary>
        public static implicit operator Letter(char c)
        {
            return new Letter() { Index = Chars.IndexOf(c) };
        }

        /// <summary>Zero-based position of this letter in <see cref="Chars"/>.</summary>
        public int Index;

        /// <summary>Returns the character at <see cref="Index"/> in <see cref="Chars"/>.</summary>
        public char ToChar()
        {
            return Chars[Index];
        }

        /// <summary>Returns the letter as a one-character string.</summary>
        public override string ToString()
        {
            return Chars[Index].ToString();
        }

        /// <summary>Two letters are equal when their indices are equal.</summary>
        public bool Equals(Letter other)
        {
            return Index == other.Index;
        }

        /// <summary>Boxed equality; delegates to <see cref="Equals(Letter)"/>.</summary>
        public override bool Equals(object obj)
        {
            return obj is Letter && Equals((Letter)obj);
        }

        /// <summary>Hash code consistent with <see cref="Equals(Letter)"/>.</summary>
        public override int GetHashCode()
        {
            return Index;
        }
    }

    /// <summary>
    /// A prefix tree over a word list. Each node that ends a word stores that
    /// word, so a search never has to rebuild the string from the path.
    /// </summary>
    public class Trie
    {
        /// <summary>A trie node: outgoing edges keyed by letter, plus the word ending here (if any).</summary>
        public class Node
        {
            /// <summary>The complete word that ends at this node, or null when no word ends here.</summary>
            public string Word;

            /// <summary>True when a word ends at this node.</summary>
            public bool IsTerminal { get { return Word != null; } }

            /// <summary>Child nodes keyed by the letter that leads to them.</summary>
            public Dictionary<Letter, Node> Edges = new Dictionary<Letter, Node>();
        }

        /// <summary>The root node; it corresponds to the empty prefix.</summary>
        public Node Root = new Node();

        /// <summary>
        /// Builds the trie from <paramref name="words"/>. Null or empty entries
        /// (for example blank lines in a word file) are skipped.
        /// </summary>
        /// <param name="words">The words to insert, in any order.</param>
        /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
        public Trie(string[] words)
        {
            if (words == null)
            {
                throw new ArgumentNullException("words");
            }

            foreach (var word in words)
            {
                if (string.IsNullOrEmpty(word))
                {
                    continue;
                }

                var node = Root;
                foreach (char c in word)
                {
                    // Convert once; the implicit conversion performs an IndexOf scan.
                    Letter letter = c;
                    Node next;
                    if (!node.Edges.TryGetValue(letter, out next))
                    {
                        next = new Node();
                        node.Edges.Add(letter, next);
                    }
                    node = next;
                }

                // Mark the final node unconditionally. Doing this only when the node is
                // newly created loses any word whose path already exists because a longer
                // word with the same prefix was inserted first (e.g. "PATH" before "PAT").
                node.Word = word;
            }
        }
    }

    class Program
    {
        /// <summary>
        /// Walks the trie and the letter sets in lock-step: the letter taken for an edge
        /// leaving <paramref name="n"/> must be contained in
        /// <c>sets[currentArrayIndex]</c>, the next letter in the following set, and so on.
        /// Every terminal node reached is appended to <paramref name="wordsFound"/>.
        /// </summary>
        /// <param name="n">The trie node to continue from.</param>
        /// <param name="sets">The letter sets, one per word position.</param>
        /// <param name="currentArrayIndex">Index of the set that must supply the next letter.</param>
        /// <param name="wordsFound">Receives the words found; words are appended, never removed.</param>
        static void GenWords(Trie.Node n, HashSet<Letter>[] sets, int currentArrayIndex, List<string> wordsFound)
        {
            if (currentArrayIndex >= sets.Length)
            {
                // No set is left to supply another letter.
                return;
            }

            var allowed = sets[currentArrayIndex];
            foreach (var edge in n.Edges)
            {
                if (!allowed.Contains(edge.Key))
                {
                    continue;
                }

                if (edge.Value.IsTerminal)
                {
                    wordsFound.Add(edge.Value.Word);
                }

                GenWords(edge.Value, sets, currentArrayIndex + 1, wordsFound);
            }
        }

        /// <summary>
        /// Loads the dictionary from "sowpods.txt", runs the search for a number of trials
        /// (on random sets or on the fixed sample sets) and prints the average time and
        /// average word count per trial.
        /// </summary>
        static void Main(string[] args)
        {
            const int minArraySize = 3;
            const int maxArraySize = 4;
            const int setCount = 10;
            const bool generateRandomInput = true;

            var trie = new Trie(File.ReadAllLines("sowpods.txt"));
            var watch = new Stopwatch();
            var trials = 10000;
            var wordCountSum = 0;
            var rand = new Random(37); // fixed seed: the random sets are the same on every run

            for (int t = 0; t < trials; t++)
            {
                HashSet<Letter>[] sets;
                if (generateRandomInput)
                {
                    sets = new HashSet<Letter>[setCount];
                    for (int i = 0; i < setCount; i++)
                    {
                        sets[i] = new HashSet<Letter>();
                        var size = minArraySize + rand.Next(maxArraySize - minArraySize + 1);

                        // Adding to a set ignores duplicates, so keep drawing until
                        // the set holds `size` distinct letters.
                        while (sets[i].Count < size)
                        {
                            sets[i].Add(Letter.Chars[rand.Next(Letter.Chars.Length)]);
                        }
                    }
                }
                else
                {
                    sets = new HashSet<Letter>[]
                    {
                        new HashSet<Letter>(new Letter[] { 'P', 'Q', 'R', 'S' }),
                        new HashSet<Letter>(new Letter[] { 'A', 'B', 'C' }),
                        new HashSet<Letter>(new Letter[] { 'T', 'U', 'V' }),
                        new HashSet<Letter>(new Letter[] { 'M', 'N', 'O' })
                    };
                }

                // Only the search itself is timed; input generation above is excluded.
                watch.Start();
                var wordsFound = new List<string>();

                // A word may start at any set. The last set is not used as a starting
                // point: a word beginning there could only be one letter long.
                for (int i = 0; i < sets.Length - 1; i++)
                {
                    GenWords(trie.Root, sets, i, wordsFound);
                }
                watch.Stop();

                wordCountSum += wordsFound.Count;

                if (!generateRandomInput && t == 0)
                {
                    foreach (var word in wordsFound)
                    {
                        Console.WriteLine(word);
                    }
                }
            }

            Console.WriteLine("Elapsed per trial = {0}", new TimeSpan(watch.Elapsed.Ticks / trials));
            Console.WriteLine("Average word count per trial = {0:0.0}", (float)wordCountSum / trials);
        }
    }
}
这是使用测试数据时的输出:
PA PAT PAV QAT RAT RATO RAUN SAT SAU SAV SCUM AT AVO BUM BUN CUM TO UM UN Elapsed per trial = 00:00:00.0000725 Average word count per trial = 19.0
使用随机数据时的输出(不打印每个单词):
Elapsed per trial = 00:00:00.0002910 Average word count per trial = 62.2
编辑: 我通过以下两项更改使其速度更快:一是将完整单词存储在trie的每个终端节点上,这样就不必在搜索时重新拼出单词;二是将输入的字母存储为散列集(HashSet)数组而不是数组的数组,这样Contains()调用很快。