Compare commits


2 Commits

Author SHA1 Message Date
Ian Molee 42d297263b Fix comparison algorithm
Fix the file line ID comparison algorithm to use a Jaccard Index. This
correctly identifies the number of documents in the corpus.

Also add a little status output when files are being ordered by
timestamp.
2024-05-24 02:13:06 -07:00
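
(For reference: the Jaccard Index of two line-ID lists A and B is
|A ∩ B| / |A ∪ B|, i.e. the number of IDs the lists share divided by the
number of distinct IDs across both. For example, with A = [1, 2, 3] and
B = [2, 3, 4], the intersection is {2, 3} and the union is {1, 2, 3, 4},
so the similarity is 2 / 4 = 0.5. The implementation is
compareFileLineIDs in the main.go diff below.)
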
Ian Molee 42db2d544f Add numeric comparison proof of concept
After speaking with Jacob, I landed on a more efficient way to compare
files, which is a hot path in the program's execution. Using hashmaps
with long strings as keys is pretty inefficient. It is faster to alias
each line with a numeric ID and represent file contents as a list of
line IDs. Comparing file contents then reduces to comparing two lists of
numbers. If the lists are sorted, they can be stepped through with two
indices to determine their similarity.

Currently, the actual comparison is broken and over-reports the number
of documents in the provided corpus by nearly 2x. This is likely because
the index for file 1 always increases, while the index for file 2 is
only conditionally increased. Both indices need to be incremented under
different conditions, so that whichever one is "behind" can catch up to
the one that is "ahead" (which we can do because the lists are sorted).
2024-05-23 01:03:12 -07:00
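
(To illustrate the two-index walk described above, here is a minimal,
self-contained sketch; it is not the project's code, and the names are
illustrative. Because both lists are sorted, whichever index points at
the smaller ID is "behind" and is the one advanced so it can catch up;
on a match both advance. The project's version is compareFileLineIDs in
the main.go diff below.)

package main

import "fmt"

// similarity walks two sorted line-ID lists with two indices and returns
// matches / distinct IDs (a Jaccard-style similarity).
func similarity(f1, f2 []uint64) float64 {
	var i, j, matches, total int
	for i < len(f1) && j < len(f2) {
		total++
		switch {
		case f1[i] == f2[j]: // same line in both files: advance both
			matches++
			i++
			j++
		case f1[i] < f2[j]: // f1 is behind: advance it to catch up
			i++
		default: // f2 is behind: advance it to catch up
			j++
		}
	}
	total += (len(f1) - i) + (len(f2) - j) // unmatched tails still count
	if total == 0 {
		return 0
	}
	return float64(matches) / float64(total)
}

func main() {
	fmt.Println(similarity([]uint64{1, 2, 3}, []uint64{2, 3, 4})) // 0.5
}
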
2 changed files with 89 additions and 20 deletions

.gitignore (3 changed lines)

@@ -2,4 +2,5 @@
 output.*.txt
 .vscode
 files
-files.*/
+files.*
+*.prof

main.go (106 changed lines)

@@ -9,6 +9,7 @@ import (
 	"os"
 	"path"
 	"runtime"
+	"runtime/pprof"
 	"slices"
 	"strconv"
 	"strings"
@@ -43,6 +44,16 @@ func main() {
 		start   = time.Now()
 		padding = "\n"
 	)
+
+	f, err := os.Create("cpu.prof")
+	if err != nil {
+		panic(err)
+	}
+	if err := pprof.StartCPUProfile(f); err != nil {
+		panic(err)
+	}
+	defer pprof.StopCPUProfile()
+
 	documents, output, err := run(os.Args)
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "error: %v\n", err)
@@ -95,7 +106,7 @@ func run(args []string) ([]*Document, *os.File, error) {
 	// The files need to be processed in order of time, so determine the
 	// timestamp of each file and sort them by time.
-	fileTimes, times, err := orderFiles(dataFilePath)
+	fileTimes, times, err := orderFiles(dataFilePath, shouldReportStatus)
 	if err != nil {
 		return nil, nil, err
 	}
@@ -308,29 +319,16 @@ func (dm *DocumentManager) maybeAssociateFileWithDocument(workItem WorkItem, wor
 // on the number of line-centric differences between the contents of the two
 // files.
 func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) {
-	f1, err := dm.fcc.GetFileContents(f1Number)
+	f1, err := dm.fcc.GetFileLineID(f1Number)
 	if err != nil {
 		return 0, fmt.Errorf("file %d: %w", f1Number, err)
 	}
-	f2, err := dm.fcc.GetFileContents(f2Number)
+	f2, err := dm.fcc.GetFileLineID(f2Number)
 	if err != nil {
 		return 0, fmt.Errorf("file %d: %w", f2Number, err)
 	}
-	histogram := make(map[string]int)
-	for _, line := range f1 {
-		histogram[line]++
-	}
-	for _, line := range f2 {
-		histogram[line]--
-	}
-	var differences float64
-	for _, v := range histogram {
-		differences += math.Abs(float64(v))
-	}
-	similarity := 1 - (differences / float64(len(f1)+len(f2)))
+	similarity := compareFileLineIDs(f1, f2)
 	return similarity, nil
 }
@@ -382,6 +380,7 @@ func (d *Document) SortAssociatedFiles() {
 type FileContentsCache struct {
 	BaseDir string
 	cache   sync.Map
+	lineIDs sync.Map
 }

 // GetFileContents returns the contents of a file, excluding the first timestamp
@@ -418,6 +417,69 @@ func (fcc *FileContentsCache) GetFileContents(fileNumber int) ([]string, error)
 	return lines, nil
 }

+func (fcc *FileContentsCache) GetFileLineID(fileNumber int) ([]uint64, error) {
+	if cachedLineIDs, ok := fcc.lineIDs.Load(fileNumber); ok {
+		return cachedLineIDs.([]uint64), nil
+	}
+	lines, err := fcc.GetFileContents(fileNumber)
+	if err != nil {
+		return nil, fmt.Errorf("adding line IDs for file %d: %w", fileNumber, err)
+	}
+	lineIDs := makeFileLinesID(lines)
+	fcc.lineIDs.Store(fileNumber, lineIDs)
+	return lineIDs, nil
+}
+
+var (
+	fileLinesCache   sync.Map
+	fileLineIDSource atomic.Uint64
+)
+
+func makeFileLinesID(fileLines []string) []uint64 {
+	fileLineIDs := make([]uint64, 0, len(fileLines))
+	for _, line := range fileLines {
+		lineID, ok := fileLinesCache.Load(line)
+		if !ok {
+			newID := fileLineIDSource.Add(1)
+			fileLinesCache.Store(line, newID)
+			fileLineIDs = append(fileLineIDs, newID)
+		} else {
+			fileLineIDs = append(fileLineIDs, lineID.(uint64))
+		}
+	}
+	slices.Sort(fileLineIDs)
+	return fileLineIDs
+}
+
+func compareFileLineIDs(f1, f2 []uint64) float64 {
+	var (
+		i, j       = 0, 0
+		count      int
+		similarity int
+	)
+	for i < len(f1) && j < len(f2) {
+		count++
+		if f1[i] == f2[j] {
+			similarity++
+			i++
+			j++
+		} else if f1[i] < f2[j] {
+			i++
+		} else {
+			j++
+		}
+	}
+	count += len(f1) - i + len(f2) - j
+	if count == 0 {
+		return 0
+	}
+	return float64(similarity) / float64(count)
+}
+
 // ClearFilesExcept removes the contents of the fileContentsCache except for the
 // provided file numbers. This helps conserve memory by removing the contents of
 // files that are no longer of interest, which we can be sure of since we are
@@ -468,14 +530,20 @@ func readFileTime(filepath string) (int, error) {
 // the map can be iterated in order of time. This allows stepping through the
 // history of the files from the beginning. Using this, we can construct a
 // "chain" of evolution for a given document.
-func orderFiles(dir string) (map[int][]int, []int, error) {
+func orderFiles(dir string, shouldReportStatus bool) (map[int][]int, []int, error) {
+	if shouldReportStatus {
+		defer fmt.Fprintln(os.Stderr)
+	}
 	timeMap := make(map[int][]int)
 	dirEntries, err := os.ReadDir(dir)
 	if err != nil {
 		return nil, nil, fmt.Errorf("reading directory %s: %w", dir, err)
 	}
-	for _, entry := range dirEntries {
+	for i, entry := range dirEntries {
+		if shouldReportStatus {
+			fmt.Fprintf(os.Stderr, "\rReading directory %d of %d...", i+1, len(dirEntries))
+		}
 		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") {
 			continue
 		}