From c8c2d9a9e09209c37191333cea4ed0c48822e9de Mon Sep 17 00:00:00 2001 From: Ian Molee Date: Fri, 5 Apr 2024 04:54:56 -0700 Subject: [PATCH] Alter similarity calculation Use a slightly more sophisticated method to determine similarity than just trying to find duplicated lines, which falls apart fairly quickly. Instead, add a value to the histogram while scanning the first file, and subtract while scanning the second. After this, any entries with a value of 0 indicate matching lines. The magnitudes of all other entries (their distance from zero) are summed and used to calculate a similarity fraction. --- main.go | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/main.go b/main.go index 01e6413..287307c 100644 --- a/main.go +++ b/main.go @@ -285,23 +285,20 @@ func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) } histogram := make(map[string]int) - for _, lines := range [][]string{f1, f2} { - for _, line := range lines { - // Skip blank lines, which can throw off the count. - if line == "" { - continue - } + for _, line := range f1 { histogram[line]++ } + for _, line := range f2 { + histogram[line]-- } - var overlap int + var differences float64 for _, v := range histogram { - if v == 2 { - overlap++ - } + differences += math.Abs(float64(v)) } - return float64(overlap) / float64(len(histogram)), nil + + similarity := 1 - (differences / float64(len(f1)+len(f2))) + return similarity, nil } // Document stores a document ID and a list of associated files.