Add numeric comparison proof of concept

After speaking with Jacob, he led me to a more efficient way to compare files, which is a hot path in the program's execution. Using hashmaps with long strings as keys is pretty inefficient. It would be faster to alias each line with a numeric ID and represent file contents as a list of line IDs. Then comparing file contents can be done simply by comparing two lists of numbers. If the lists are sorted, they can be stepped through using two indices to determine their similarity. Currently, the actual comparison is broken, and is over-reporting the number of actual documents in the provided corpus by nearly 2x. This is likely because the index for file 1 always increases, whereas the index for file 2 is only conditionally increased. Both indices need to be incremented under different conditions, to allow the one that is "behind" to catch up to the one that is "ahead" (which we can do because the lists are sorted).
2024-05-23 01:03:12 -07:00 · 2024-05-23 01:03:12 -07:00 · 42db2d544f
parent 50edb5d3f7
commit 42db2d544f
2 changed files with 68 additions and 17 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,4 +2,5 @@
 output.*.txt
 .vscode
 files
-files.*/
+files.*
+*.prof
--- a/main.go
+++ b/main.go
@ -9,6 +9,7 @@ import (
 	"os"
 	"path"
 	"runtime"
+	"runtime/pprof"
 	"slices"
 	"strconv"
 	"strings"
@ -43,6 +44,16 @@ func main() {
 		start   = time.Now()
 		padding = "\n"
 	)
+
+	f, err := os.Create("cpu.prof")
+	if err != nil {
+		panic(err)
+	}
+	if err := pprof.StartCPUProfile(f); err != nil {
+		panic(err)
+	}
+	defer pprof.StopCPUProfile()
+
 	documents, output, err := run(os.Args)
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "error: %v\n", err)
@ -308,29 +319,16 @@ func (dm *DocumentManager) maybeAssociateFileWithDocument(workItem WorkItem, wor
 // on the number of line-centric differences between the contents of the two
 // files.
 func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) {
-	f1, err := dm.fcc.GetFileContents(f1Number)
+	f1, err := dm.fcc.GetLineIDsForFile(f1Number)
 	if err != nil {
 		return 0, fmt.Errorf("file %d: %w", f1Number, err)
 	}
-	f2, err := dm.fcc.GetFileContents(f2Number)
+	f2, err := dm.fcc.GetLineIDsForFile(f2Number)
 	if err != nil {
 		return 0, fmt.Errorf("file %d: %w", f2Number, err)
 	}

-	histogram := make(map[string]int)
-	for _, line := range f1 {
-		histogram[line]++
-	}
-	for _, line := range f2 {
-		histogram[line]--
-	}
-
-	var differences float64
-	for _, v := range histogram {
-		differences += math.Abs(float64(v))
-	}
-
-	similarity := 1 - (differences / float64(len(f1)+len(f2)))
+	similarity := compareFileLineIDs(f1, f2)
 	return similarity, nil
 }

@ -382,6 +380,7 @@ func (d *Document) SortAssociatedFiles() {
 type FileContentsCache struct {
 	BaseDir string
 	cache   sync.Map
+	lineIDs sync.Map // sorted line-ID slices ([]uint64) keyed by file number; filled by GetLineIDsForFile
 }

 // GetFileContents returns the contents of a file, excluding the first timestamp
@ -418,6 +417,57 @@ func (fcc *FileContentsCache) GetFileContents(fileNumber int) ([]string, error)
 	return lines, nil
 }

+// GetLineIDsForFile returns the sorted numeric line IDs for the given file.
+// A previously computed result is served from fcc.lineIDs; otherwise the
+// file's lines are fetched with GetFileContents, converted with
+// makeFileLinesID, stored in fcc.lineIDs, and returned. An error from
+// GetFileContents is wrapped and returned with a nil slice.
+func (fcc *FileContentsCache) GetLineIDsForFile(fileNumber int) ([]uint64, error) {
+	hit, found := fcc.lineIDs.Load(fileNumber)
+	if found {
+		return hit.([]uint64), nil
+	}
+
+	contents, readErr := fcc.GetFileContents(fileNumber)
+	if readErr != nil {
+		return nil, fmt.Errorf("adding line IDs for file %d: %w", fileNumber, readErr)
+	}
+
+	ids := makeFileLinesID(contents)
+	fcc.lineIDs.Store(fileNumber, ids)
+	return ids, nil
+}
+
+// Package-level state used by makeFileLinesID to give each distinct line of
+// text a numeric ID.
+var (
+	fileLinesCache   sync.Map      // line text (string) -> assigned line ID (uint64)
+	fileLineIDSource atomic.Uint64 // incremented by 1 to mint the ID for each newly seen line
+)
+
+// makeFileLinesID converts a file's lines into a sorted slice of numeric line
+// IDs, one entry per input line (duplicate lines yield duplicate IDs). Each
+// distinct line string is registered in fileLinesCache the first time it is
+// seen, with an ID minted from fileLineIDSource, and the same ID is reused for
+// every later occurrence. The result is sorted ascending so two files can be
+// compared by stepping through their ID slices in order.
+func makeFileLinesID(fileLines []string) []uint64 {
+	fileLineIDs := make([]uint64, 0, len(fileLines))
+	for _, line := range fileLines {
+		lineID, ok := fileLinesCache.Load(line)
+		if !ok {
+			// LoadOrStore instead of Store: if another goroutine registered
+			// this line between the Load above and this call, adopt its ID
+			// rather than overwriting it. A plain Store here could leave two
+			// different IDs in circulation for the same line, so identical
+			// lines in different files would no longer compare equal. The ID
+			// minted by the losing goroutine is simply left unused.
+			lineID, _ = fileLinesCache.LoadOrStore(line, fileLineIDSource.Add(1))
+		}
+		fileLineIDs = append(fileLineIDs, lineID.(uint64))
+	}
+	slices.Sort(fileLineIDs)
+	return fileLineIDs
+}
+
+// compareFileLineIDs returns the similarity of two files given their line IDs
+// as ascending-sorted slices (as produced by makeFileLinesID). The result is
+// 2*matches / (len(f1)+len(f2)), where matches is the number of line IDs the
+// two slices have in common, counting duplicates (an ID appearing a times in
+// f1 and b times in f2 contributes min(a, b)). This is 1 for identical
+// contents and 0 for files with no lines in common. Two empty inputs are
+// treated as identical and yield 1 rather than the NaN a 0/0 division would
+// produce.
+func compareFileLineIDs(f1, f2 []uint64) float64 {
+	total := len(f1) + len(f2)
+	if total == 0 {
+		return 1
+	}
+
+	var i, j, matched int
+	// Walk both sorted slices together, always advancing whichever index
+	// points at the smaller ID so the one that is "behind" can catch up.
+	// Advancing only on a match would stall on the first unmatched ID.
+	for i < len(f1) && j < len(f2) {
+		switch {
+		case f1[i] < f2[j]:
+			i++
+		case f1[i] > f2[j]:
+			j++
+		default:
+			// The same line is present in both files: count it once per side.
+			matched += 2
+			i++
+			j++
+		}
+	}
+
+	return float64(matched) / float64(total)
+}
+
 // ClearFilesExcept removes the contents of the fileContentsCache except for the
 // provided file numbers. This helps conserve memory by removing the contents of
 // files that are no longer of interest, which we can be sure of since we are