Alter similarity calculation

Use a slightly more sophisticated method to determine similarity than
just trying to find duplicated lines, which falls apart fairly quickly.
Instead, increment a line's histogram entry while scanning the first file,
and decrement it while scanning the second. After this, any entries with a
value of 0 indicate matching lines. The magnitudes of all nonzero entries
are summed and used to calculate a similarity fraction.
This commit is contained in:
Ian Molee 2024-04-05 04:54:56 -07:00
parent 03c0840041
commit c8c2d9a9e0
1 changed files with 8 additions and 11 deletions

19
main.go
View File

@ -285,23 +285,20 @@ func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error)
}
histogram := make(map[string]int)
for _, lines := range [][]string{f1, f2} {
for _, line := range lines {
// Skip blank lines, which can throw off the count.
if line == "" {
continue
}
for _, line := range f1 {
histogram[line]++
}
for _, line := range f2 {
histogram[line]--
}
var overlap int
var differences float64
for _, v := range histogram {
if v == 2 {
overlap++
}
differences += math.Abs(float64(v))
}
return float64(overlap) / float64(len(histogram)), nil
similarity := 1 - (differences / float64(len(f1)+len(f2)))
return similarity, nil
}
// Document stores a document ID and a list of associated files.