diff --git a/.gitignore b/.gitignore index 475cb0d..32d32dc 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ output.*.txt .vscode files -files.*/ +files.* +*.prof diff --git a/main.go b/main.go index fba1887..1f231db 100644 --- a/main.go +++ b/main.go @@ -9,6 +9,7 @@ import ( "os" "path" "runtime" + "runtime/pprof" "slices" "strconv" "strings" @@ -43,6 +44,16 @@ func main() { start = time.Now() padding = "\n" ) + + f, err := os.Create("cpu.prof") + if err != nil { + panic(err) + } + if err := pprof.StartCPUProfile(f); err != nil { + panic(err) + } + defer pprof.StopCPUProfile() + documents, output, err := run(os.Args) if err != nil { fmt.Fprintf(os.Stderr, "error: %v\n", err) @@ -308,29 +319,16 @@ func (dm *DocumentManager) maybeAssociateFileWithDocument(workItem WorkItem, wor // on the number of line-centric differences between the contents of the two // files. func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) { - f1, err := dm.fcc.GetFileContents(f1Number) + f1, err := dm.fcc.GetLineIDsForFile(f1Number) if err != nil { return 0, fmt.Errorf("file %d: %w", f1Number, err) } - f2, err := dm.fcc.GetFileContents(f2Number) + f2, err := dm.fcc.GetLineIDsForFile(f2Number) if err != nil { return 0, fmt.Errorf("file %d: %w", f2Number, err) } - histogram := make(map[string]int) - for _, line := range f1 { - histogram[line]++ - } - for _, line := range f2 { - histogram[line]-- - } - - var differences float64 - for _, v := range histogram { - differences += math.Abs(float64(v)) - } - - similarity := 1 - (differences / float64(len(f1)+len(f2))) + similarity := compareFileLineIDs(f1, f2) return similarity, nil } @@ -382,6 +380,7 @@ func (d *Document) SortAssociatedFiles() { type FileContentsCache struct { BaseDir string cache sync.Map + lineIDs sync.Map } // GetFileContents returns the contents of a file, excluding the first timestamp @@ -418,6 +417,57 @@ func (fcc *FileContentsCache) GetFileContents(fileNumber int) ([]string, error) return lines, nil } +func (fcc *FileContentsCache) GetLineIDsForFile(fileNumber int) ([]uint64, error) { + if cachedLineIDs, ok := fcc.lineIDs.Load(fileNumber); ok { + return cachedLineIDs.([]uint64), nil + } + + lines, err := fcc.GetFileContents(fileNumber) + if err != nil { + return nil, fmt.Errorf("adding line IDs for file %d: %w", fileNumber, err) + } + + lineIDs := makeFileLinesID(lines) + fcc.lineIDs.Store(fileNumber, lineIDs) + return lineIDs, nil +} + +var ( + fileLinesCache sync.Map + fileLineIDSource atomic.Uint64 +) + +func makeFileLinesID(fileLines []string) []uint64 { + fileLineIDs := make([]uint64, 0, len(fileLines)) + for _, line := range fileLines { + lineID, ok := fileLinesCache.Load(line) + if !ok { + newID := fileLineIDSource.Add(1) + fileLinesCache.Store(line, newID) + fileLineIDs = append(fileLineIDs, newID) + } else { + fileLineIDs = append(fileLineIDs, lineID.(uint64)) + } + } + slices.Sort(fileLineIDs) + return fileLineIDs +} + +func compareFileLineIDs(f1, f2 []uint64) float64 { + var ( + f2Index int + similarity int + ) + for _, lineID := range f1 { + if f2Index < len(f2) && f2[f2Index] == lineID { + similarity += 2 + f2Index++ + } + } + + return float64(similarity) / float64(len(f1)+len(f2)) +} + // ClearFilesExcept removes the contents of the fileContentsCache except for the // provided file numbers. This helps conserve memory by removing the contents of // files that are no longer of interest, which we can be sure of since we are