From 42d297263b42180b28adb7e8f0f9fe367742ac13 Mon Sep 17 00:00:00 2001 From: Ian Molee Date: Fri, 24 May 2024 02:13:06 -0700 Subject: [PATCH] Fix comparison algorithm Fix the file line ID comparison algorithm to use a Jaccard Index. This correctly identifies the number of documents in the corpus. Also add a little status output when files are being ordered by timestamp. --- main.go | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/main.go b/main.go index 1f231db..22357fd 100644 --- a/main.go +++ b/main.go @@ -106,7 +106,7 @@ func run(args []string) ([]*Document, *os.File, error) { // The files need to be processed in order of time, so determine the // timestamp of each file and sort them by time. - fileTimes, times, err := orderFiles(dataFilePath) + fileTimes, times, err := orderFiles(dataFilePath, shouldReportStatus) if err != nil { return nil, nil, err } @@ -319,11 +319,11 @@ func (dm *DocumentManager) maybeAssociateFileWithDocument(workItem WorkItem, wor // on the number of line-centric differences between the contents of the two // files. func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) { - f1, err := dm.fcc.GetLineIDsForFile(f1Number) + f1, err := dm.fcc.GetFileLineID(f1Number) if err != nil { return 0, fmt.Errorf("file %d: %w", f1Number, err) } - f2, err := dm.fcc.GetLineIDsForFile(f2Number) + f2, err := dm.fcc.GetFileLineID(f2Number) if err != nil { return 0, fmt.Errorf("file %d: %w", f2Number, err) } @@ -417,7 +417,7 @@ func (fcc *FileContentsCache) GetFileContents(fileNumber int) ([]string, error) return lines, nil } -func (fcc *FileContentsCache) GetLineIDsForFile(fileNumber int) ([]uint64, error) { +func (fcc *FileContentsCache) GetFileLineID(fileNumber int) ([]uint64, error) { if cachedLineIDs, ok := fcc.lineIDs.Load(fileNumber); ok { return cachedLineIDs.([]uint64), nil } @@ -455,17 +455,29 @@ func makeFileLinesID(fileLines []string) []uint64 { func compareFileLineIDs(f1, f2 []uint64) float64 { var ( - f2Index int + i, j = 0, 0 + count int similarity int ) - for _, lineID := range f1 { - if f2Index < len(f2) && f2[f2Index] == lineID { - similarity += 2 - f2Index++ + for i < len(f1) && j < len(f2) { + count++ + if f1[i] == f2[j] { + similarity++ + i++ + j++ + } else if f1[i] < f2[j] { + i++ + } else { + j++ } } - return float64(similarity) / float64(len(f1)+len(f2)) + count += len(f1) - i + len(f2) - j + if count == 0 { + return 0 + } + + return float64(similarity) / float64(count) } // ClearFilesExcept removes the contents of the fileContentsCache except for the @@ -518,14 +530,20 @@ func readFileTime(filepath string) (int, error) { // the map can be iterated in order of time. This allows stepping through the // history of the files from the beginning. Using this, we can construct a // "chain" of evolution for a given document. -func orderFiles(dir string) (map[int][]int, []int, error) { +func orderFiles(dir string, shouldReportStatus bool) (map[int][]int, []int, error) { + if shouldReportStatus { + defer fmt.Fprintln(os.Stderr) + } timeMap := make(map[int][]int) dirEntries, err := os.ReadDir(dir) if err != nil { return nil, nil, fmt.Errorf("reading directory %s: %w", dir, err) } - for _, entry := range dirEntries { + for i, entry := range dirEntries { + if shouldReportStatus { + fmt.Fprintf(os.Stderr, "\rReading directory %d of %d...", i+1, len(dirEntries)) + } if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") { continue }