Skip to content

Commit 12ac3d7

Browse files
fix(v1/algo): invalid q-gram similarity calculation (#22)
1 parent 37da2e1 commit 12ac3d7

File tree

2 files changed

+13
-5
lines changed

2 files changed

+13
-5
lines changed

qgram.go

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -52,5 +52,12 @@ func QgramSimilarity(str1, str2 string, splitLength int) float32 {
5252
splittedStr1 := Shingle(str1, splitLength)
5353
splittedStr2 := Shingle(str2, splitLength)
5454
res := float32(QgramDistanceCustomNgram(splittedStr1, splittedStr2))
55-
return 1 - (res / float32(len(splittedStr1)+len(splittedStr2)))
55+
totalShingles := 0
56+
for _, i := range splittedStr1 {
57+
totalShingles += i
58+
}
59+
for _, i := range splittedStr2 {
60+
totalShingles += i
61+
}
62+
return 1 - (res / float32(totalShingles))
5663
}

string-analysis_test.go

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -175,16 +175,17 @@ func TestStringsSimilarity(t *testing.T) {
175175
{"Qgram : Second arg empty", args{"abcde", "", Qgram}, 0.0, false},
176176
{"Qgram : Same args", args{"abcde", "abcde", Qgram}, 1.0, false},
177177
{"Qgram : No characters match", args{"abcd", "effgghh", Qgram}, 0.0, false},
178+
{"Qgram : Repeated shingles", args{"aaaaaa", "aaa", Qgram}, 0.57142854, false},
178179
{"Qgram : CRATE/TRACE", args{"CRATE", "TRACE", Qgram}, 0.25, false},
179180
{"Qgram : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Qgram}, 0.39999998, false},
180181
{"Qgram : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Qgram}, 0.36363637, false},
181182
{"Qgram Sentence 1", args{"Radiohead", "Radiohead", Qgram}, 1.0, false},
182183
{"Qgram Sentence 2", args{"ABCD", "ABCE", Qgram}, 0.6666666, false},
183-
{"Qgram Sentence 3", args{"Radiohead", "Carly Rae Jepsen", Qgram}, 0.04545456, false},
184-
{"Qgram Sentence 4", args{"I love horror movies", "Lights out is a horror movie", Qgram}, 0.47619045, false},
185-
{"Qgram Sentence 5", args{"love horror movies", "Lights out horror movie", Qgram}, 0.5833334, false},
184+
{"Qgram Sentence 3", args{"Radiohead", "Carly Rae Jepsen", Qgram}, 0.0869565, false},
185+
{"Qgram Sentence 4", args{"I love horror movies", "Lights out is a horror movie", Qgram}, 0.5217391, false},
186+
{"Qgram Sentence 5", args{"love horror movies", "Lights out horror movie", Qgram}, 0.6153846, false},
186187
{"Qgram Sentence 6", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", Qgram}, 0.7619048, false},
187-
{"Qgram Sentence 7", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Qgram}, 0.5555556, false},
188+
{"Qgram Sentence 7", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Qgram}, 0.7777778, false},
188189

189190
// TODO: Must refactor compare method to handle NaN values
190191
// {"Qgram Sentence 8", args{"", "", Qgram}, float32(math.NaN()), false},

0 commit comments

Comments
 (0)