Like some others, I have used a lookup table but with a twist. A
Dictionary
with a specialized grouping key.
The key is a
<Tuple<int, long, long>
made up of length of words, and a pair of base 26
Long
s. As there are 26 characters to english words, I've split the lowercase alphabet into halfs - a to l (0 to 12) to the lower & m to z (13 to 26) to the upper.
The
Find
method run in milliseconds, almost instantly, with a list of over 350,000 words. Word length does not affect performance.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using ZeroFormatter;
namespace BuildLookupFile
{
static class Program
{
static void Main(string[] args)
{
Console.WriteLine("Anagram Lookup File Builder");
Console.WriteLine("===========================");
var helper = new AnagramsHelper();
var sw = new Stopwatch();
Console.WriteLine("\r\nLoading & building Lookup Table...");
sw.Start();
helper.LoadWords("words.txt");
var elapsed = sw.Elapsed;
Console.WriteLine("* Loaded & processed in {0:N6} seconds",
elapsed.TotalMilliseconds / 1000);
Console.WriteLine("\r\nStatistics");
Console.WriteLine("----------");
Console.WriteLine("* {0:N0} words", helper.count);
Console.WriteLine("\r\nSaving to file...");
sw.Restart();
helper.SaveTable("words.bin");
elapsed = sw.Elapsed;
Console.WriteLine("* Write to disk took in {0:N6} seconds",
elapsed.TotalMilliseconds / 1000);
}
}
public class AnagramsHelper
{
public int count { get; private set; }
private Dictionary<string, List<string>> LookupTable
= new Dictionary<string, List<string>>();
public void LoadWords(string filename)
{
var wordKeys = new Dictionary<string, int>();
string word, key;
List<string> wordList = null;
using (StreamReader sr = File.OpenText(filename))
{
while ((word = sr.ReadLine()) != null)
{
if (word.Length > 0 && !word.Contains("'") && !word.Contains("\""))
{
key = word.GetKey();
if (!LookupTable.ContainsKey(key))
{
wordList = new List<string>();
LookupTable.Add(key, wordList);
}
else
{
wordList = LookupTable[key];
}
count++;
wordList.Add(word);
}
}
}
}
internal void SaveTable(string filename)
{
using (var fs = File.OpenWrite(filename))
LookupTable.Serialize(fs);
}
}
public static class HelperExtensions
{
public static string GetKey(this string word)
{
var chars = word.ToCharArray();
Array.Sort(chars);
return new string(chars);
}
public static void Serialize(this Dictionary<string, List<string>> data, Stream stream)
{
var writer = new BinaryWriter(stream);
writer.Write(ZeroFormatterSerializer.Serialize(data));
writer.Flush();
}
}
}
Imports System.IO
Imports System.Runtime.CompilerServices
Imports ZeroFormatter
Module Module1
Sub Main()
Console.WriteLine("Anagram Lookup File Builder")
Console.WriteLine("===========================")
Dim helper = New AnagramsHelper()
Dim sw = New Stopwatch()
Console.WriteLine(vbCr & vbLf & "Loading & building Lookup Table...")
sw.Start()
helper.LoadWords("words.txt")
Dim elapsed = sw.Elapsed
Console.WriteLine("* Loaded & processed in {0:N6} seconds", elapsed.TotalMilliseconds / 1000)
Console.WriteLine(vbCr & vbLf & "Statistics")
Console.WriteLine("----------")
Console.WriteLine("* {0:N0} words", helper.count)
Console.WriteLine(vbCr & vbLf & "Saving to file...")
sw.Restart()
helper.SaveTable("words.bin")
elapsed = sw.Elapsed
Console.WriteLine("* Write to disk took in {0:N6} seconds", elapsed.TotalMilliseconds / 1000)
End Sub
End Module
Public Class AnagramsHelper
Public Property count() As Integer
Private LookupTable As New Dictionary(Of String, List(Of String))()
Public Sub LoadWords(filename As String)
Dim word As String
Dim key As String
Dim wordList As List(Of String)
Using sr As StreamReader = File.OpenText(filename)
Do
word = sr.ReadLine()
If word?.Length > 0 AndAlso
Not word.Contains("'") AndAlso
Not word.Contains("""") Then
key = word.GetKey()
If Not LookupTable.ContainsKey(key) Then
wordList = New List(Of String)()
LookupTable.Add(key, wordList)
Else
wordList = LookupTable(key)
End If
count += 1
wordList.Add(word)
End If
Loop While word IsNot Nothing
End Using
End Sub
Friend Sub SaveTable(filename As String)
Using fs = File.OpenWrite(filename)
LookupTable.Serialize(fs)
End Using
End Sub
End Class
Public Module HelperExtensions
<Extension>
Public Function GetKey(word As String) As String
Dim chars = word.ToCharArray()
Array.Sort(chars)
Return New String(chars)
End Function
<Extension>
Public Sub Serialize(data As Dictionary(Of String, List(Of String)), stream As Stream)
Dim writer = New BinaryWriter(stream)
writer.Write(ZeroFormatterSerializer.Serialize(data))
writer.Flush()
End Sub
End Module
And here is th output:
Ultra-Fast Anagram Generator
============================
Word list supplied by: https://github.com/dwyl/english-words
Loading & building Lookup Table...
Statistics
----------
* 351,096 words
* Loaded & processed in 7.4770 seconds
* Largest group of Anagrams: 15
* Longest word in Lookup Table:
- dichlorodiphenyltrichloroethane
Search Word: teaser
FOUND 9 anagrams for "teaser"
aretes
asteer
easter
eaters
reseat
saeter
seater
staree
teaser
Time: 0.000072 seconds
Search Word: alerts
FOUND 15 anagrams for "alerts"
alerts
alters
artels
estral
laster
lastre
rastle
ratels
relast
resalt
salter
slater
staler
stelar
talers
Time: 0.000072 seconds
UPDATE: I've found an improvement in performance through simplification. The upside is that load time is improved by 300%, and response time improved by over 200%.
Here are the revised results:
Ultra-Fast Anagram Generator
============================
Word list supplied by: https://github.com/dwyl/english-words
Loading & building Lookup Table...
Statistics
----------
* 351,096 words
* Loaded & processed in 2.5293 seconds
* Largest group of Anagrams: 15
* Longest word in Lookup Table:
- dichlorodiphenyltrichloroethane
Search Word: teaser
FOUND 9 anagrams for "teaser"
aretes
asteer
easter
eaters
reseat
saeter
seater
staree
teaser
Time: 0.000039 seconds
Search Word: alerts
FOUND 15 anagrams for "alerts"
alerts
alters
artels
estral
laster
lastre
rastle
ratels
relast
resalt
salter
slater
staler
stelar
talers
Time: 0.000034 seconds
UPDATE #2.1: Now that the lookup is lightning fast, the next performance bottleneck to resolve is the loading of the lookup table. PIEBOLDconsult is using a SQL DB, so I decided to pre-build the Lookup Table.
The biggest bottleneck encountered in saving and loading the binary version of the lookup table was the .Net
BinaryFormatter
whick took 61+ seconds (best time) to deserialize the binary stream. So, with a little searching I found
ZeroFormatter[
^].
Load time is now reduced from 2.5293 seconds to a
lightning fast at 0.534043 seconds - over 500% quicker and over
1400% quicker than the orignal!
Lastly, also squeezed
beter performance out of the lookup. Now reduced from 0.034ms
to 0.013ms or 261% improvement, and
over 550% faster than the original!
Here is the code for the building of the binary file:
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using ZeroFormatter;
namespace BuildLookupFile
{
static class Program
{
static void Main(string[] args)
{
Console.WriteLine("Anagram Lookup File Builder");
Console.WriteLine("===========================");
var helper = new AnagramsHelper();
var sw = new Stopwatch();
Console.WriteLine("\r\nLoading & building Lookup Table...");
sw.Start();
helper.LoadWords("words.txt");
var elapsed = sw.Elapsed;
Console.WriteLine("* Loaded & processed in {0:N6} seconds",
elapsed.TotalMilliseconds / 1000);
Console.WriteLine("\r\nStatistics");
Console.WriteLine("----------");
Console.WriteLine("* {0:N0} words", helper.count);
Console.WriteLine("\r\nSaving to file...");
sw.Restart();
helper.SaveTable("words.bin");
elapsed = sw.Elapsed;
Console.WriteLine("* Write to disk took in {0:N6} seconds",
elapsed.TotalMilliseconds / 1000);
}
}
public class AnagramsHelper
{
public int count { get; private set; }
private Dictionary<string, List<string>> LookupTable
= new Dictionary<string, List<string>>();
public void LoadWords(string filename)
{
var wordKeys = new Dictionary<string, int>();
string word, key;
List<string> wordList = null;
using (StreamReader sr = File.OpenText(filename))
{
while ((word = sr.ReadLine()) != null)
{
if (word.Length > 0 && !word.Contains("'") && !word.Contains("\""))
{
key = word.GetKey();
if (!LookupTable.ContainsKey(key))
{
wordList = new List<string>();
LookupTable.Add(key, wordList);
}
else
{
wordList = LookupTable[key];
}
count++;
wordList.Add(word);
}
}
}
}
internal void SaveTable(string filename)
{
using (var fs = File.OpenWrite(filename))
LookupTable.Serialize(fs);
}
}
public static class HelperExtensions
{
public static string GetKey(this string word)
{
var chars = word.ToCharArray();
Array.Sort(chars);
return new string(chars);
}
public static void Serialize(this Dictionary<string, List<string>> data, Stream stream)
{
var writer = new BinaryWriter(stream);
writer.Write(ZeroFormatterSerializer.Serialize(data));
writer.Flush();
}
}
}
Imports System.IO
Imports System.Runtime.CompilerServices
Imports ZeroFormatter
Module Module1
Sub Main()
Console.WriteLine("Anagram Lookup File Builder")
Console.WriteLine("===========================")
Dim helper = New AnagramsHelper()
Dim sw = New Stopwatch()
Console.WriteLine(vbCr & vbLf & "Loading & building Lookup Table...")
sw.Start()
helper.LoadWords("words.txt")
Dim elapsed = sw.Elapsed
Console.WriteLine("* Loaded & processed in {0:N6} seconds", elapsed.TotalMilliseconds / 1000)
Console.WriteLine(vbCr & vbLf & "Statistics")
Console.WriteLine("----------")
Console.WriteLine("* {0:N0} words", helper.count)
Console.WriteLine(vbCr & vbLf & "Saving to file...")
sw.Restart()
helper.SaveTable("words.bin")
elapsed = sw.Elapsed
Console.WriteLine("* Write to disk took in {0:N6} seconds", elapsed.TotalMilliseconds / 1000)
End Sub
End Module
Public Class AnagramsHelper
Public Property count() As Integer
Private LookupTable As New Dictionary(Of String, List(Of String))()
Public Sub LoadWords(filename As String)
Dim word As String
Dim key As String
Dim wordList As List(Of String)
Using sr As StreamReader = File.OpenText(filename)
Do
word = sr.ReadLine()
If word?.Length > 0 AndAlso
Not word.Contains("'") AndAlso
Not word.Contains("""") Then
key = word.GetKey()
If Not LookupTable.ContainsKey(key) Then
wordList = New List(Of String)()
LookupTable.Add(key, wordList)
Else
wordList = LookupTable(key)
End If
count += 1
wordList.Add(word)
End If
Loop While word IsNot Nothing
End Using
End Sub
Friend Sub SaveTable(filename As String)
Using fs = File.OpenWrite(filename)
LookupTable.Serialize(fs)
End Using
End Sub
End Class
Public Module HelperExtensions
<Extension>
Public Function GetKey(word As String) As String
Dim chars = word.ToCharArray()
Array.Sort(chars)
Return New String(chars)
End Function
<Extension>
Public Sub Serialize(data As Dictionary(Of String, List(Of String)), stream As Stream)
Dim writer = New BinaryWriter(stream)
writer.Write(ZeroFormatterSerializer.Serialize(data))
writer.Flush()
End Sub
End Module
Here is the output:
Anagram Lookup File Builder
===========================
Loading & building Lookup Table...
* Loaded & processed in 1.243783 seconds
Statistics
----------
* 351,096 words
Saving to file...
* Serialization & writing to disk took 0.300447 seconds
Here are the revised results:
Ultra-Fast Anagram Generator
============================
Word list supplied by: https://github.com/dwyl/english-words
Loading & building Lookup Table...
Statistics
----------
* Loaded & processed in 0.534043 seconds
* 351,096 words
* 311,497 anagram groups
* 283,469 unique anagrams
* Largest group of Anagrams: 15
* Longest word in Lookup Table:
- dichlorodiphenyltrichloroethane
Search Word: teaser
FOUND 9 anagrams for "teaser"
aretes
asteer
easter
eaters
reseat
saeter
seater
staree
teaser
Time: 0.000014 seconds
Search Word: alerts
FOUND 15 anagrams for "alerts"
alerts
alters
artels
estral
laster
lastre
rastle
ratels
relast
resalt
salter
slater
staler
stelar
talers
Time: 0.000013 seconds
UPDATE #3: After a conversation with jörgen Andersson on timings, I realied that there was another lookup performance gain overlooked. I was returning the whole collection from the dictionary after I did the key lookup. I did not need to do that, only referencing the lookup list of results as IEnumerable was required as the dictionary's KVP already contained the results. This gave over 900% improvement gain on the last version and
over 3600% improvement over the original -
now only takes 0.0011 ms!
UPDATE #4: Refactored code for performance. Anagram seach now only takes 0.0007 ms to return vaild words from the dictionary!
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using ZeroFormatter;
namespace Anagram
{
static class Program
{
static Stopwatch sw = new Stopwatch();
static TimeSpan elapsed;
static Dictionary<string, List<string>> LookupTable
= new Dictionary<string, List<string>>();
static void LoadWords()
{
using (var fs = File.OpenRead("words.bin"))
{
LookupTable = ZeroFormatterSerializer
.Deserialize<Dictionary<string, List<string>>>
((new BinaryReader(fs)).ReadBytes((int)fs.Length));
}
var prime = LookupTable.ContainsKey(GetKey("ready"));
}
static string GetKey(string w)
{
var chars = w.ToCharArray();
Array.Sort(chars);
return new string(chars);
}
static IEnumerable<string> Find(string lookupWord)
{
var chars = lookupWord.ToCharArray();
Array.Sort(chars);
var key = new string(chars);
if (LookupTable.ContainsKey(key))
foreach (var item in LookupTable[key])
yield return item;
else
yield break;
}
static void Main()
{
Console.WriteLine("Ultra-Fast Anagram Generator");
Console.WriteLine("============================");
Console.WriteLine("\r\nWord list supplied by: https://github.com/dwyl/english-words");
Console.WriteLine("\r\nLoading & building Lookup Table...");
sw.Start();
LoadWords();
elapsed = sw.Elapsed;
ReportStatisics(elapsed);
HandleUserQueries();
}
private static void ReportStatisics(TimeSpan elapsed)
{
Console.WriteLine("\r\nStatistics");
Console.WriteLine("----------");
var sec = elapsed.TotalMilliseconds / 1000;
var ms = elapsed.TotalMilliseconds;
var tw = LookupTable.Sum(x => x.Value.Count);
var tg = LookupTable.Count;
var ua = LookupTable.Where(x => x.Value.Count == 1).Count();
var lg = LookupTable.OrderByDescending(x => x.Value.Count)
.Select(x => x.Value).FirstOrDefault();
var lw = LookupTable.OrderByDescending(x => x.Value[0].Length)
.Select(x => x.Value).FirstOrDefault();
Console.WriteLine($"\r\n* Loaded in {sec:N8} seconds or {ms} ms\r\n");
Console.WriteLine($"* {tw:N0} words");
Console.WriteLine($"* {tg:N0} anagram groups");
Console.WriteLine($"* {ua:N0} unique anagrams");
Console.WriteLine($"* Largest group of Anagrams: {lg.Count}");
Console.WriteLine($"* Longest word{(lw.Count == 1 ? "" : "s")} in Lookup Table:");
foreach (var word in lw) Console.WriteLine($" - {word}");
}
private static void HandleUserQueries()
{
while (true)
{
Console.Write("\r\nSearch Word: ");
var lookupWord = Console.ReadLine().Trim().ToLower();
if (lookupWord.Length == 0) break;
sw.Restart();
var results = Find(lookupWord);
elapsed = sw.Elapsed;
if (results.Any())
{
int count = 0;
Console.WriteLine("\r\nFOUND:\r\n");
foreach (var item in results)
Console.WriteLine($" {++count,2}: {item}");
Console.WriteLine($"\r\nTotal: {count:N0} anagram{(count == 1 ? "" : "s")} for \"{lookupWord}\"");
}
else
{
Console.WriteLine($"\r\n{lookupWord} has no anagrams.");
}
var sec = elapsed.TotalMilliseconds / 1000;
var ms = elapsed.TotalMilliseconds;
Console.WriteLine($"Time: {sec:N8} seconds or {ms} ms");
}
}
}
}
Imports System.IO
Imports ZeroFormatter
Module Module1
Dim sw As New Stopwatch()
Dim elapsed As TimeSpan
Dim LookupTable As New Dictionary(Of String, List(Of String))()
Private Sub LoadWords()
Using fs = File.OpenRead("words.bin")
LookupTable = ZeroFormatterSerializer _
.Deserialize(Of Dictionary(Of String, List(Of String))) _
((New BinaryReader(fs)).ReadBytes(fs.Length))
End Using
Dim prime = LookupTable.ContainsKey(GetKey("ready"))
End Sub
Private Function GetKey(w As String) As String
Dim chars = w.ToCharArray()
Array.Sort(chars)
Return New String(chars)
End Function
Private Iterator Function Find(lookupWord As String) As IEnumerable(Of String)
Dim chars = lookupWord.ToCharArray()
Array.Sort(chars)
Dim key = New String(chars)
If LookupTable.ContainsKey(key) Then
For Each item In LookupTable(key)
Yield item
Next
End If
End Function
Sub Main()
Console.WriteLine("Ultra-Fast Anagram Generator")
Console.WriteLine("============================")
Console.WriteLine("{0}Word list supplied by: https://github.com/dwyl/english-words", vbCrLf)
Console.WriteLine("{0}Loading & building Lookup Table...", vbCrLf)
sw.Start()
LoadWords()
elapsed = sw.Elapsed
ReportStatisics(elapsed)
HandleUserQueries()
End Sub
Private Sub ReportStatisics(elapsed As TimeSpan)
Console.WriteLine("{0}Statistics", vbCrLf)
Console.WriteLine("----------")
Dim sec = elapsed.TotalMilliseconds / 1000
Dim ms = elapsed.TotalMilliseconds
Dim tw = LookupTable.Sum(Function(x) x.Value.Count)
Dim tg = LookupTable.Count
Dim ua = LookupTable.Where(Function(x) x.Value.Count = 1).Count()
Dim lg = LookupTable.OrderByDescending(Function(x) x.Value.Count) _
.Select(Function(x) x.Value).FirstOrDefault()
Dim lw = LookupTable.OrderByDescending(Function(x) x.Value(0).Length) _
.[Select](Function(x) x.Value).FirstOrDefault()
Console.WriteLine("{0}* Loaded in {1:N8} seconds or {2} ms", vbCrLf, sec, ms)
Console.WriteLine("* {0:N0} words", tw)
Console.WriteLine("* {0:N0} anagram groups", tg)
Console.WriteLine("* {0:N0} unique anagrams", ua)
Console.WriteLine("* Largest group of Anagrams: {0}", lg.Count)
Console.WriteLine("* Longest word{0} in Lookup Table:", If(lw.Count = 1, "", "s"))
For Each word In lw
Console.WriteLine(" - {0}", word)
Next
End Sub
Private Sub HandleUserQueries()
While True
Console.Write("{0}Search Word: ", vbCrLf)
Dim lookupWord = Console.ReadLine().Trim().ToLower()
If lookupWord.Length = 0 Then
Exit While
End If
sw.Restart()
Dim results = Find(lookupWord)
elapsed = sw.Elapsed
If results.Any() Then
Dim count As Integer = 0
Console.WriteLine("{0}FOUND:{0}", vbCrLf)
For Each item In results
count += 1
Console.WriteLine(" {0,2}: {1}", count, item)
Next
Console.WriteLine("{0}Total: {1:N0} anagram{2} for ""{3}""",
vbCrLf, count, If(count = 1, "", "s"), lookupWord)
Else
Console.WriteLine("{0}{1} has no anagrams.", vbCrLf, lookupWord)
End If
Dim sec = elapsed.TotalMilliseconds / 1000
Dim ms = elapsed.TotalMilliseconds
Console.WriteLine("Time: {0:N8} seconds or {1} ms", sec, ms)
End While
End Sub
End Module
Here are the revised results:
Ultra-Fast Anagram Generator
============================
Word list supplied by: https://github.com/dwyl/english-words
Loading & building Lookup Table...
Statistics
----------
* Loaded & processed in 0.53279680 seconds or 532.7968 ms
* 351,096 words
* 311,497 anagram groups
* 283,469 unique anagrams
* Largest group of Anagrams: 15
* Longest word in Lookup Table:
- dichlorodiphenyltrichloroethane
Search Word: teaser
FOUND:
1: aretes
2: asteer
3: easter
4: eaters
5: reseat
6: saeter
7: seater
8: staree
9: teaser
Total: 9 anagrams for "teaser"
Time: 0.00000070 seconds or 0.0007 ms
Search Word: alerts
FOUND:
1: alerts
2: alters
3: artels
4: estral
5: laster
6: lastre
7: rastle
8: ratels
9: relast
10: resalt
11: salter
12: slater
13: staler
14: stelar
15: talers
Total: 15 anagrams for "alerts"
Time: 0.00000090 seconds or 0.0009 ms