Add numeric comparison proof of concept

After speaking with Jacob, he led me to a more efficient way to compare
files, which was a hot path for the program execution. Using hashmaps
with long strings as keys is pretty inefficient. It would be faster to alias
lines with a numeric ID, and represent file contents as a list of line
IDs. Then comparing file contents could be done simply by comparing two
lists of numbers. If the lists are sorted, they can be stepped through
using two indices to determine their similarity.

Currently, the actual comparison is broken, and is over-reporting the
number of actual documents in the provided corpus by nearly 2x. This is
likely because the index for file 1 always increases, where the index
for file 2 is conditionally increased. Both indices need to be
incremented under different conditions, to allow the one that is
"behind" to catch up to the one that is "ahead" (which we can do because
the lists are sorted).
This commit is contained in:
Ian Molee 2024-05-23 01:03:12 -07:00
parent 50edb5d3f7
commit 42db2d544f
2 changed files with 68 additions and 17 deletions

3
.gitignore vendored
View File

@ -2,4 +2,5 @@
output.*.txt output.*.txt
.vscode .vscode
files files
files.*/ files.*
*.prof

82
main.go
View File

@ -9,6 +9,7 @@ import (
"os" "os"
"path" "path"
"runtime" "runtime"
"runtime/pprof"
"slices" "slices"
"strconv" "strconv"
"strings" "strings"
@ -43,6 +44,16 @@ func main() {
start = time.Now() start = time.Now()
padding = "\n" padding = "\n"
) )
f, err := os.Create("cpu.prof")
if err != nil {
panic(err)
}
if err := pprof.StartCPUProfile(f); err != nil {
panic(err)
}
defer pprof.StopCPUProfile()
documents, output, err := run(os.Args) documents, output, err := run(os.Args)
if err != nil { if err != nil {
fmt.Fprintf(os.Stderr, "error: %v\n", err) fmt.Fprintf(os.Stderr, "error: %v\n", err)
@ -308,29 +319,16 @@ func (dm *DocumentManager) maybeAssociateFileWithDocument(workItem WorkItem, wor
// on the number of line-centric differences between the contents of the two // on the number of line-centric differences between the contents of the two
// files. // files.
func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) { func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) {
f1, err := dm.fcc.GetFileContents(f1Number) f1, err := dm.fcc.GetLineIDsForFile(f1Number)
if err != nil { if err != nil {
return 0, fmt.Errorf("file %d: %w", f1Number, err) return 0, fmt.Errorf("file %d: %w", f1Number, err)
} }
f2, err := dm.fcc.GetFileContents(f2Number) f2, err := dm.fcc.GetLineIDsForFile(f2Number)
if err != nil { if err != nil {
return 0, fmt.Errorf("file %d: %w", f2Number, err) return 0, fmt.Errorf("file %d: %w", f2Number, err)
} }
histogram := make(map[string]int) similarity := compareFileLineIDs(f1, f2)
for _, line := range f1 {
histogram[line]++
}
for _, line := range f2 {
histogram[line]--
}
var differences float64
for _, v := range histogram {
differences += math.Abs(float64(v))
}
similarity := 1 - (differences / float64(len(f1)+len(f2)))
return similarity, nil return similarity, nil
} }
@ -382,6 +380,7 @@ func (d *Document) SortAssociatedFiles() {
type FileContentsCache struct { type FileContentsCache struct {
BaseDir string BaseDir string
cache sync.Map cache sync.Map
lineIDs sync.Map
} }
// GetFileContents returns the contents of a file, excluding the first timestamp // GetFileContents returns the contents of a file, excluding the first timestamp
@ -418,6 +417,57 @@ func (fcc *FileContentsCache) GetFileContents(fileNumber int) ([]string, error)
return lines, nil return lines, nil
} }
// GetLineIDsForFile returns the numeric line IDs for the given file, sorted in
// ascending order. The result is memoized per file number in fcc.lineIDs, so
// the file's contents are only converted to IDs on the first request.
//
// An error is returned if the file's contents cannot be obtained from
// GetFileContents.
func (fcc *FileContentsCache) GetLineIDsForFile(fileNumber int) ([]uint64, error) {
	cached, found := fcc.lineIDs.Load(fileNumber)
	if found {
		return cached.([]uint64), nil
	}

	contents, err := fcc.GetFileContents(fileNumber)
	if err != nil {
		return nil, fmt.Errorf("adding line IDs for file %d: %w", fileNumber, err)
	}

	// Remember the IDs so later comparisons involving this file skip the
	// line-to-ID translation entirely.
	ids := makeFileLinesID(contents)
	fcc.lineIDs.Store(fileNumber, ids)
	return ids, nil
}
// Process-wide line interning state used by makeFileLinesID.
var (
	// fileLinesCache maps a line's text (string) to its numeric ID (uint64).
	fileLinesCache sync.Map
	// fileLineIDSource issues new line IDs; the first ID handed out is 1.
	fileLineIDSource atomic.Uint64
)

// makeFileLinesID converts the lines of a file into their numeric IDs and
// returns those IDs sorted in ascending order. Identical line text always maps
// to the same ID, including when called concurrently from several goroutines;
// a line that appears more than once yields its ID once per occurrence.
func makeFileLinesID(fileLines []string) []uint64 {
	fileLineIDs := make([]uint64, 0, len(fileLines))
	for _, line := range fileLines {
		// Fast path: the line has been seen before.
		lineID, ok := fileLinesCache.Load(line)
		if !ok {
			// A separate Load followed by Store would let two goroutines that
			// see the same new line at once each publish a different ID.
			// LoadOrStore makes exactly one of them win and gives the winner's
			// ID to both. An ID drawn by the loser is simply never used, which
			// is harmless because IDs only need to be unique, not dense.
			lineID, _ = fileLinesCache.LoadOrStore(line, fileLineIDSource.Add(1))
		}
		fileLineIDs = append(fileLineIDs, lineID.(uint64))
	}
	// Sorted output lets two files be compared by stepping through both lists.
	slices.Sort(fileLineIDs)
	return fileLineIDs
}
// compareFileLineIDs returns the similarity of two files as a value in [0, 1],
// where each file is given as a list of line IDs. Both lists must be sorted in
// ascending order.
//
// Every line ID occurrence present in both lists counts as two matched lines
// (one from each file); the result is the matched count divided by the total
// number of lines across both files. Two empty files are considered identical
// and yield 1.
func compareFileLineIDs(f1, f2 []uint64) float64 {
	total := len(f1) + len(f2)
	if total == 0 {
		// Avoid 0/0 (NaN): two files without lines have identical contents.
		return 1
	}

	var f1Index, f2Index, matched int
	for f1Index < len(f1) && f2Index < len(f2) {
		switch {
		case f1[f1Index] < f2[f2Index]:
			// f1 is behind; this ID cannot appear later in f2.
			f1Index++
		case f1[f1Index] > f2[f2Index]:
			// f2 is behind; let it catch up to f1.
			f2Index++
		default:
			matched += 2
			f1Index++
			f2Index++
		}
	}
	return float64(matched) / float64(total)
}
// ClearFilesExcept removes the contents of the fileContentsCache except for the // ClearFilesExcept removes the contents of the fileContentsCache except for the
// provided file numbers. This helps conserve memory by removing the contents of // provided file numbers. This helps conserve memory by removing the contents of
// files that are no longer of interest, which we can be sure of since we are // files that are no longer of interest, which we can be sure of since we are