Posted on 26/05/2021 11:42:54
They do not want computer logic.
They want a list of input and suggestions they can apply themselves, because it takes a human brain with a human decision process to give them what they find logical.
Below is the code in question - enjoy :-). The project is attached, so you can also add your own synonyms.
BR Nicolai
/// <summary>
/// Suggests alternative search strings for a user query by combining prefix matches
/// from the terms of an index field, spell-checker suggestions, and terms that
/// co-occur in documents matching the already-suggested words.
/// </summary>
public class LuceneSpellChecker
{
    /// <summary>Fallback used when the configured number of suggestions is missing or not positive.</summary>
    private const int DefaultNumberOfSuggestions = 10;

    /// <summary>Number of candidates requested for the non-last words of a multi-word query.</summary>
    private const int CombinedWordSuggestionsToRequest = 10;

    /// <summary>Maximum number of matching documents whose term vectors are inspected.</summary>
    private const int MaxDocumentsToInspect = 25;

    private readonly SpellChecker.Net.Search.Spell.SpellChecker checker;
    private readonly IndexReader indexReader;
    private readonly string indexField;
    private readonly int numberOfSuggestions;
    private bool isIndexed;

    /// <summary>
    /// Constructs a new spell checker instance.
    /// </summary>
    /// <param name="reader">Index reader that supplies the terms and documents to suggest from.</param>
    /// <param name="field">Name of the index field whose terms are used for suggestions.</param>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> or <paramref name="field"/> is null.</exception>
    public LuceneSpellChecker(IndexReader reader, string field)
    {
        if (reader == null)
        {
            throw new ArgumentNullException(nameof(reader));
        }
        if (field == null)
        {
            throw new ArgumentNullException(nameof(field));
        }

        indexReader = reader;
        indexField = field;

        // The spell-check dictionary lives in memory and uses the checker's default string distance.
        checker = new SpellChecker.Net.Search.Spell.SpellChecker(new RAMDirectory());

        int configured = Configuration.SystemConfiguration.Instance.GetInt32("/GlobalSettings/System/Repository/LuceneSpellChecker/NumberOfSuggestions");
        numberOfSuggestions = configured > 0 ? configured : DefaultNumberOfSuggestions;
    }

    /// <summary>
    /// Builds the spell-check dictionary from the terms of the index field the first time it is needed.
    /// </summary>
    private void EnsureIndexed()
    {
        if (isIndexed)
        {
            return;
        }

        checker.IndexDictionary(new LuceneDictionary(indexReader, indexField));
        isIndexed = true;
    }

    /// <summary>
    /// Suggest similar words, treating the field as not analyzed.
    /// </summary>
    /// <param name="searchString">Word to find alternative suggestions for</param>
    /// <returns>Suggested search strings; empty when nothing was found.</returns>
    public IEnumerable<string> SuggestSimilar(string searchString)
    {
        return SuggestSimilar(searchString, false);
    }

    /// <summary>
    /// Suggest similar words.
    /// </summary>
    /// <param name="searchString">Word to find alternative suggestions for</param>
    /// <param name="analyzed">If the field that is looked at for suggestions is analyzed</param>
    /// <returns>
    /// At most the configured number of suggested search strings. The sequence is empty when
    /// <paramref name="searchString"/> is null or blank, or when no word produced a suggestion.
    /// </returns>
    public IEnumerable<string> SuggestSimilar(string searchString, bool analyzed)
    {
        // Nothing to suggest for; also avoids a NullReferenceException further down.
        if (string.IsNullOrWhiteSpace(searchString))
        {
            return Enumerable.Empty<string>();
        }

        EnsureIndexed();

        if (analyzed)
        {
            searchString = searchString.ToLowerInvariant();
        }

        List<string> searchTerms = Tokenize(searchString);

        int depth = 1; // 1-based position of the word currently being processed
        bool foundSuggestions = false;
        string combinedWordTermMatch = string.Empty;

        // Final result: slot i holds the i-th suggested phrase, built up one word per iteration.
        SortedDictionary<int, string> suggestionsCombined = new SortedDictionary<int, string>();

        foreach (string searchTerm in searchTerms)
        {
            bool isLastOfSeveral = depth > 1 && depth == searchTerms.Count;
            List<string> singlewordSuggestions;

            if (isLastOfSeveral)
            {
                // Last word of a multi-word query: prefer terms that occur in documents matching
                // the best phrase built so far, and fall back to plain term suggestions.
                singlewordSuggestions = GetTermsSuggestionsFromSearch(suggestionsCombined[0], searchTerm);
                if (singlewordSuggestions.Count == 0)
                {
                    string unusedMatch;
                    singlewordSuggestions = GetTermSuggestions(searchTerm, numberOfSuggestions, string.Empty, out unusedMatch);
                }
            }
            else
            {
                // Single-word query, or a non-last word of a multi-word query. For multi-word
                // queries the first two words are also tried as one concatenated term.
                int suggestionsToRequest = numberOfSuggestions;
                string combinedWord = string.Empty;
                if (searchTerms.Count > 1)
                {
                    combinedWord = searchTerms[0] + searchTerms[1];
                    suggestionsToRequest = CombinedWordSuggestionsToRequest;
                }

                string candidateMatch;
                singlewordSuggestions = GetTermSuggestions(searchTerm, suggestionsToRequest, combinedWord, out candidateMatch);
                if (string.IsNullOrEmpty(combinedWordTermMatch))
                {
                    combinedWordTermMatch = candidateMatch;
                }
            }

            if (singlewordSuggestions == null)
            {
                singlewordSuggestions = new List<string>();
            }
            if (singlewordSuggestions.Count > 0)
            {
                foundSuggestions = true;
            }

            // Starts as the original word. Once a suggestion has been used, slots without a
            // suggestion of their own repeat the last word used (the resulting duplicate
            // phrases are removed when the final list is built).
            string wordToAdd = searchTerm;
            bool useOnlyBestSuggestion = searchTerms.Count > 1 && depth == 1;

            for (int slot = 0; slot < numberOfSuggestions; slot++)
            {
                if (slot < singlewordSuggestions.Count)
                {
                    wordToAdd = useOnlyBestSuggestion ? singlewordSuggestions[0] : singlewordSuggestions[slot];
                }

                // Append the word to the phrase in this slot, or start the phrase with it.
                string phrase;
                if (suggestionsCombined.TryGetValue(slot, out phrase))
                {
                    suggestionsCombined[slot] = phrase + ' ' + wordToAdd;
                }
                else
                {
                    suggestionsCombined.Add(slot, wordToAdd);
                }
            }

            depth++;
        }

        if (!foundSuggestions || suggestionsCombined.Count == 0)
        {
            return Enumerable.Empty<string>();
        }

        // The concatenated-word match (if any) goes first, followed by the distinct phrases.
        List<string> result = new List<string>();
        HashSet<string> seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        if (!string.IsNullOrEmpty(combinedWordTermMatch))
        {
            seen.Add(combinedWordTermMatch);
            result.Add(combinedWordTermMatch);
        }
        foreach (KeyValuePair<int, string> suggestion in suggestionsCombined)
        {
            if (seen.Add(suggestion.Value))
            {
                result.Add(suggestion.Value);
            }
        }

        return result.Take(numberOfSuggestions);
    }

    /// <summary>
    /// Splits the text into terms using the standard analyzer.
    /// </summary>
    private static List<string> Tokenize(string text)
    {
        List<string> tokens = new List<string>();
        using (Analyzer analyzer = new mylucene.Analysis.Standard.StandardAnalyzer(mylucene.Util.Version.LUCENE_30))
        using (var tokenStream = analyzer.TokenStream("inputquery", new System.IO.StringReader(text)))
        {
            var termAttr = tokenStream.GetAttribute<ITermAttribute>();
            tokenStream.Reset();
            while (tokenStream.IncrementToken())
            {
                tokens.Add(termAttr.Term);
            }
        }
        return tokens;
    }

    /// <summary>
    /// Adds the term to the list unless the list already contains it (ignoring case).
    /// </summary>
    private static void AddDistinct(List<string> target, string term)
    {
        if (!target.Contains(term, StringComparer.OrdinalIgnoreCase))
        {
            target.Add(term);
        }
    }

    /// <summary>
    /// Finds suggestions for a single word: first existing terms of the field that start with
    /// the word, then spell-checker suggestions to fill up to the requested amount.
    /// </summary>
    /// <param name="word">Word to find suggestions for.</param>
    /// <param name="neededSuggestions">Number of suggestions wanted.</param>
    /// <param name="combinedTwoWordTerm">Optional concatenation of two query words to look for while scanning; may be empty.</param>
    /// <param name="combinedWordTermMatch">
    /// Receives the last scanned term that starts with <paramref name="combinedTwoWordTerm"/>,
    /// or an empty string when none was seen.
    /// </param>
    /// <returns>The suggestions found; never null.</returns>
    internal List<string> GetTermSuggestions(string word, int neededSuggestions, string combinedTwoWordTerm, out string combinedWordTermMatch)
    {
        combinedWordTermMatch = string.Empty;

        // Terms - find existing terms in the field that start with the passed word.
        List<string> termSuggestions = new List<string>(numberOfSuggestions);
        using (TermEnum terms = indexReader.Terms(new Term(indexField, word)))
        {
            do
            {
                // No current term means the enumeration is exhausted; a term of another
                // field means we have left the field we are suggesting from.
                Term current = terms.Term;
                if (current == null || !string.Equals(current.Field, indexField, StringComparison.Ordinal))
                {
                    break;
                }

                string term = current.Text;
                if (!string.IsNullOrEmpty(combinedTwoWordTerm) && term.StartsWith(combinedTwoWordTerm, StringComparison.OrdinalIgnoreCase))
                {
                    combinedWordTermMatch = term;
                }
                if (term.StartsWith(word, StringComparison.OrdinalIgnoreCase))
                {
                    AddDistinct(termSuggestions, term);
                }

                // Stop when enough terms were collected. Also stop when the very first term
                // did not match: then no term starts with the word and scanning on is pointless.
                if (termSuggestions.Count >= neededSuggestions || termSuggestions.Count == 0)
                {
                    break;
                }
            }
            while (terms.Next());
        }

        // Enough suggestions already, or the word is too short (one letter) to spell-check.
        if (termSuggestions.Count >= neededSuggestions || word.Length < 2)
        {
            return termSuggestions;
        }

        // Fill the remaining places with spell-checker suggestions.
        int missingSuggestions = neededSuggestions - termSuggestions.Count;
        termSuggestions.AddRange(GetSimilarSuggestions(word, missingSuggestions));
        return termSuggestions.Distinct().ToList();
    }

    /// <summary>
    /// Asks the spell checker for words similar to <paramref name="word"/> and orders them by a
    /// combined score of document frequency and three string-distance measures.
    /// </summary>
    /// <param name="word">Word to find similar words for.</param>
    /// <param name="numberOfSuggestions">Maximum number of suggestions to request from the spell checker.</param>
    /// <returns>The suggestions, highest score first; never null.</returns>
    internal List<string> GetSimilarSuggestions(string word, int numberOfSuggestions)
    {
        // Callers inside the assembly may reach this method before any public entry point ran.
        EnsureIndexed();

        var suggestions = checker.SuggestSimilar(word, numberOfSuggestions, indexReader, indexField, true);
        var jaro = new JaroWinklerDistance();
        var leven = new LevenshteinDistance();
        var ngram = new NGramDistance();

        // NOTE(review): summing the measures assumes each GetDistance returns a similarity
        // where larger means closer - confirm against the SpellChecker.Net documentation.
        return suggestions
            .Select(s => new
            {
                suggestion = s,
                freq = indexReader.DocFreq(new Term(indexField, s)),
                jaro = jaro.GetDistance(word, s),
                leven = leven.GetDistance(word, s),
                ngram = ngram.GetDistance(word, s)
            })
            .OrderByDescending(metric => ((metric.freq / 10f) + metric.jaro + metric.leven + metric.ngram) / 4f)
            .Select(metric => metric.suggestion)
            .ToList();
    }

    /// <summary>
    /// Suggests completions for <paramref name="word"/> from the terms of documents that match
    /// <paramref name="termToSearch"/>.
    /// </summary>
    /// <param name="termToSearch">Word(s) that a document must contain for its terms to be considered.</param>
    /// <param name="word">Word to find suggestions for.</param>
    /// <returns>
    /// Terms starting with <paramref name="word"/> first; when those are fewer than the configured
    /// number of suggestions, terms starting with the spell-checked word and then other terms of
    /// the matching documents are appended. Never null.
    /// </returns>
    internal List<string> GetTermsSuggestionsFromSearch(string termToSearch, string word)
    {
        List<string> termSuggestions = new List<string>();
        List<string> spellCheckedSuggestions = new List<string>(numberOfSuggestions);
        List<string> fallbacTermSuggestions = new List<string>(numberOfSuggestions);

        var booleanQuery = new BooleanQuery();
        using (Analyzer analyzer = new mylucene.Analysis.Standard.StandardAnalyzer(mylucene.Util.Version.LUCENE_30))
        {
            var parser = new MultiFieldQueryParser(mylucene.Util.Version.LUCENE_30, new[] { indexField }, analyzer);
            parser.DefaultOperator = QueryParser.Operator.AND;

            // The text is made of plain words, not query syntax: escape it so characters such
            // as ':' or '*' inside a term cannot break or alter the parsed query.
            booleanQuery.Add(parser.Parse(QueryParser.Escape(termToSearch)), Occur.MUST);
        }
        var filter = new QueryWrapperFilter(booleanQuery);

        string spellCheckedWord = GetSimilarSuggestions(word, 1)?.FirstOrDefault() ?? string.Empty;

        using (Searcher searcher = new IndexSearcher(indexReader))
        {
            TopScoreDocCollector collector = TopScoreDocCollector.Create(MaxDocumentsToInspect, true);
            searcher.Search(booleanQuery, filter, collector);

            foreach (var hit in collector.TopDocs().ScoreDocs)
            {
                ITermFreqVector vector = indexReader.GetTermFreqVector(hit.Doc, indexField);
                if (vector == null)
                {
                    // No term vector available for this document and field - nothing to read.
                    continue;
                }

                // Get all terms of this document together with their frequency.
                var docTerms = vector.GetTerms();
                var termCounts = vector.GetTermFrequencies();
                List<TermFrequency> termFrequencies = new List<TermFrequency>(docTerms.Length);
                for (int t = 0; t < docTerms.Length; t++)
                {
                    termFrequencies.Add(new TermFrequency(termCounts[t], docTerms[t]));
                }

                // Most frequent terms first. Each term is added once even when it occurs in
                // several documents, so duplicates do not crowd out other suggestions.
                foreach (TermFrequency candidate in termFrequencies.OrderByDescending(o => o.Frequency))
                {
                    if (candidate.Term.StartsWith(word, StringComparison.OrdinalIgnoreCase))
                    {
                        AddDistinct(termSuggestions, candidate.Term);
                    }
                    else if (!string.IsNullOrEmpty(spellCheckedWord) && candidate.Term.StartsWith(spellCheckedWord, StringComparison.OrdinalIgnoreCase))
                    {
                        AddDistinct(spellCheckedSuggestions, candidate.Term);
                    }
                    else if (fallbacTermSuggestions.Count < numberOfSuggestions)
                    {
                        AddDistinct(fallbacTermSuggestions, candidate.Term);
                    }
                }

                if (termSuggestions.Count >= numberOfSuggestions)
                {
                    break;
                }
            }
        }

        if (termSuggestions.Count < numberOfSuggestions)
        {
            // Suggestions are missing. Add fallback suggestions: spell-checked matches first,
            // then any other terms of the matching documents.
            termSuggestions.AddRange(spellCheckedSuggestions.Take(numberOfSuggestions - termSuggestions.Count));
            termSuggestions.AddRange(fallbacTermSuggestions.Take(numberOfSuggestions - termSuggestions.Count));
        }

        return termSuggestions;
    }

    /// <summary>
    /// A term together with the number of times it occurs in one document.
    /// </summary>
    internal class TermFrequency
    {
        /// <summary>Number of occurrences of the term.</summary>
        public int Frequency;

        /// <summary>Text of the term.</summary>
        public string Term;

        /// <summary>
        /// Creates a term/frequency pair.
        /// </summary>
        /// <param name="frequency">Number of occurrences of the term.</param>
        /// <param name="term">Text of the term.</param>
        public TermFrequency(int frequency, string term)
        {
            Frequency = frequency;
            Term = term;
        }
    }
}