From 42d297263b42180b28adb7e8f0f9fe367742ac13 Mon Sep 17 00:00:00 2001
From: Ian Molee <imolee@gmail.com>
Date: Fri, 24 May 2024 02:13:06 -0700
Subject: [PATCH] Fix comparison algorithm

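Fix the file line ID comparison algorithm by replacing the in-order
matching with a Jaccard index: the number of line IDs the two files
share divided by the number of line IDs in their union. With this
similarity measure the number of documents in the corpus is identified
correctly.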

Also add a little status output when files are being ordered by
timestamp.
---
 main.go | 42 ++++++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/main.go b/main.go
index 1f231db..22357fd 100644
--- a/main.go
+++ b/main.go
@@ -106,7 +106,7 @@ func run(args []string) ([]*Document, *os.File, error) {
 
 	// The files need to be processed in order of time, so determine the
 	// timestamp of each file and sort them by time.
-	fileTimes, times, err := orderFiles(dataFilePath)
+	fileTimes, times, err := orderFiles(dataFilePath, shouldReportStatus)
 	if err != nil {
 		return nil, nil, err
 	}
@@ -319,11 +319,11 @@ func (dm *DocumentManager) maybeAssociateFileWithDocument(workItem WorkItem, wor
 // on the number of line-centric differences between the contents of the two
 // files.
 func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) {
-	f1, err := dm.fcc.GetLineIDsForFile(f1Number)
+	f1, err := dm.fcc.GetFileLineID(f1Number)
 	if err != nil {
 		return 0, fmt.Errorf("file %d: %w", f1Number, err)
 	}
-	f2, err := dm.fcc.GetLineIDsForFile(f2Number)
+	f2, err := dm.fcc.GetFileLineID(f2Number)
 	if err != nil {
 		return 0, fmt.Errorf("file %d: %w", f2Number, err)
 	}
@@ -417,7 +417,7 @@ func (fcc *FileContentsCache) GetFileContents(fileNumber int) ([]string, error)
 	return lines, nil
 }
 
-func (fcc *FileContentsCache) GetLineIDsForFile(fileNumber int) ([]uint64, error) {
+func (fcc *FileContentsCache) GetFileLineID(fileNumber int) ([]uint64, error) {
 	if cachedLineIDs, ok := fcc.lineIDs.Load(fileNumber); ok {
 		return cachedLineIDs.([]uint64), nil
 	}
@@ -455,17 +455,29 @@ func makeFileLinesID(fileLines []string) []uint64 {
 
 func compareFileLineIDs(f1, f2 []uint64) float64 {
 	var (
-		f2Index    int
+		i, j       = 0, 0
+		count      int
 		similarity int
 	)
-	for _, lineID := range f1 {
-		if f2Index < len(f2) && f2[f2Index] == lineID {
-			similarity += 2
-			f2Index++
+	for i < len(f1) && j < len(f2) {
+		count++
+		if f1[i] == f2[j] {
+			similarity++
+			i++
+			j++
+		} else if f1[i] < f2[j] {
+			i++
+		} else {
+			j++
 		}
 	}
 
-	return float64(similarity) / float64(len(f1)+len(f2))
+	count += len(f1) - i + len(f2) - j
+	if count == 0 {
+		return 0
+	}
+
+	return float64(similarity) / float64(count)
 }
 
 // ClearFilesExcept removes the contents of the fileContentsCache except for the
@@ -518,14 +530,20 @@ func readFileTime(filepath string) (int, error) {
 // the map can be iterated in order of time. This allows stepping through the
 // history of the files from the beginning. Using this, we can construct a
 // "chain" of evolution for a given document.
-func orderFiles(dir string) (map[int][]int, []int, error) {
+func orderFiles(dir string, shouldReportStatus bool) (map[int][]int, []int, error) {
+	if shouldReportStatus {
+		defer fmt.Fprintln(os.Stderr)
+	}
 	timeMap := make(map[int][]int)
 
 	dirEntries, err := os.ReadDir(dir)
 	if err != nil {
 		return nil, nil, fmt.Errorf("reading directory %s: %w", dir, err)
 	}
-	for _, entry := range dirEntries {
+	for i, entry := range dirEntries {
+		if shouldReportStatus {
+			fmt.Fprintf(os.Stderr, "\rReading directory %d of %d...", i+1, len(dirEntries))
+		}
 		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") {
 			continue
 		}