Alter similarity calculation
Use a slightly more sophisticated method to determine similarity than just trying to find duplicated lines, which falls apart fairly quickly. Instead, add to the histogram while scanning the first file, and subtract while scanning the second. After this, any entries with a value of 0 indicate matching lines. The magnitudes of all other entries (their distances from zero) are summed and used to calculate a similarity fraction.
This commit is contained in:
parent
03c0840041
commit
c8c2d9a9e0
19
main.go
19
main.go
|
|
@ -285,23 +285,20 @@ func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error)
|
|||
}
|
||||
|
||||
histogram := make(map[string]int)
|
||||
for _, lines := range [][]string{f1, f2} {
|
||||
for _, line := range lines {
|
||||
// Skip blank lines, which can throw off the count.
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
for _, line := range f1 {
|
||||
histogram[line]++
|
||||
}
|
||||
for _, line := range f2 {
|
||||
histogram[line]--
|
||||
}
|
||||
|
||||
var overlap int
|
||||
var differences float64
|
||||
for _, v := range histogram {
|
||||
if v == 2 {
|
||||
overlap++
|
||||
}
|
||||
differences += math.Abs(float64(v))
|
||||
}
|
||||
return float64(overlap) / float64(len(histogram)), nil
|
||||
|
||||
similarity := 1 - (differences / float64(len(f1)+len(f2)))
|
||||
return similarity, nil
|
||||
}
|
||||
|
||||
// Document stores a document ID and a list of associated files.
|
||||
|
|
|
|||
Loading…
Reference in New Issue