i creating program checks if word simplified word(txt, msg, etc.) , if simplified finds correct spelling txt=text, msg=message. iam using nhunspell suggest method in c# suggest possible results.
the problem if inputted "txt" result text,tat, tot, etc. dont know how select correct word. used levenshtein distance (c# - compare string similarity) results still results 1.
input: txt result: text = 1, ext = 1 tit = 1
can me how meaning or correct spelling of simplified words? example: msg
i have tested input sample data , text has distance of 25 whereas other have distance of 33. here's code:
string input = "txt"; string[] words = new[]{"text","tat","tot"}; var levenshtein = new levenshtein(); const int maxdistance = 30; var distancegroups = words .select(w => new { word = w, distance = levenshtein.ild(w.toupperinvariant(), input) }) .where(x => x.distance <= maxdistance) .groupby(x => x.distance) .orderby(g => g.key) .tolist(); foreach (var topcandidate in distancegroups.first()) console.writeline("word:{0} distance:{1}", topcandidate.word, topcandidate.distance); and here levenshtein class:
public class levenshtein { ///***************************** /// compute levenshtein distance /// memory efficient version ///***************************** public int ild(string srow, string scol) { int rowlen = srow.length; // length of srow int collen = scol.length; // length of scol int rowidx; // iterates through srow int colidx; // iterates through scol char row_i; // ith character of srow char col_j; // jth character of scol int cost; // cost /// test string length if (math.max(srow.length, scol.length) > math.pow(2, 31)) throw (new exception("\nmaximum string length in levenshtein.ild " + math.pow(2, 31) + ".\nyours " + math.max(srow.length, scol.length) + ".")); // step 1 if (rowlen == 0) { return collen; } if (collen == 0) { return rowlen; } /// create 2 vectors int[] v0 = new int[rowlen + 1]; int[] v1 = new int[rowlen + 1]; int[] vtmp; /// step 2 /// initialize first vector (rowidx = 1; rowidx <= rowlen; rowidx++) { v0[rowidx] = rowidx; } // step 3 /// fore each column (colidx = 1; colidx <= collen; colidx++) { /// set 0'th element column number v1[0] = colidx; col_j = scol[colidx - 1]; // step 4 /// fore each row (rowidx = 1; rowidx <= rowlen; rowidx++) { row_i = srow[rowidx - 1]; // step 5 if (row_i == col_j) { cost = 0; } else { cost = 1; } // step 6 /// find minimum int m_min = v0[rowidx] + 1; int b = v1[rowidx - 1] + 1; int c = v0[rowidx - 1] + cost; if (b < m_min) { m_min = b; } if (c < m_min) { m_min = c; } v1[rowidx] = m_min; } /// swap vectors vtmp = v0; v0 = v1; v1 = vtmp; } // step 7 /// value between 0 - 100 /// 0==perfect match 100==totaly different /// /// vectors swaped 1 last time @ end of last loop, /// why result in v0 rather in v1 //system.console.writeline("idist=" + v0[rowlen]); int max = system.math.max(rowlen, collen); return ((100 * v0[rowlen]) / max); } ///***************************** /// compute min ///***************************** private int minimum(int a, int b, int c) { int mi = a; if (b < mi) { mi = b; } if (c < mi) { mi = c; } return mi; } ///***************************** /// compute levenshtein distance ///***************************** public int ld(string snew, string sold) { int[,] matrix; // matrix int snewlen = snew.length; // length of snew int soldlen = sold.length; // length of sold int snewidx; // iterates through snew int soldidx; // iterates through sold char snew_i; // ith character of snew char sold_j; // jth character of sold int cost; // cost /// test string length if (math.max(snew.length, sold.length) > math.pow(2, 31)) throw (new exception("\nmaximum string length in levenshtein.ld " + math.pow(2, 31) + ".\nyours " + math.max(snew.length, sold.length) + ".")); // step 1 if (snewlen == 0) { return soldlen; } if (soldlen == 0) { return snewlen; } matrix = new int[snewlen + 1, soldlen + 1]; // step 2 (snewidx = 0; snewidx <= snewlen; snewidx++) { matrix[snewidx, 0] = snewidx; } (soldidx = 0; soldidx <= soldlen; soldidx++) { matrix[0, soldidx] = soldidx; } // step 3 (snewidx = 1; snewidx <= snewlen; snewidx++) { snew_i = snew[snewidx - 1]; // step 4 (soldidx = 1; soldidx <= soldlen; soldidx++) { sold_j = sold[soldidx - 1]; // step 5 if (snew_i == sold_j) { cost = 0; } else { cost = 1; } // step 6 matrix[snewidx, soldidx] = minimum(matrix[snewidx - 1, soldidx] + 1, matrix[snewidx, soldidx - 1] + 1, matrix[snewidx - 1, soldidx - 1] + cost); } } // step 7 /// value between 0 - 100 /// 0==perfect match 100==totaly different //system.console.writeline("dist=" + matrix[snewlen, soldlen]); int max = system.math.max(snewlen, soldlen); return (100 * matrix[snewlen, soldlen]) / max; } }
Comments
Post a Comment