Compare commits


2 Commits

Author SHA1 Message Date
Ian Molee 42d297263b Fix comparison algorithm
Fix the file line ID comparison algorithm to use a Jaccard Index. This
correctly identifies the number of documents in the corpus.

Also add a little status output when files are being ordered by
timestamp.
2024-05-24 02:13:06 -07:00
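
(For reference: the Jaccard Index of two line-ID lists A and B is
|A ∩ B| / |A ∪ B|, i.e. the number of IDs the lists share divided by the
number of distinct IDs across both. For example, with A = [1, 2, 3] and
B = [2, 3, 4], the intersection is {2, 3} and the union is {1, 2, 3, 4},
so the similarity is 2 / 4 = 0.5. The implementation is
compareFileLineIDs in the main.go diff below.)
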
Ian Molee 42db2d544f Add numeric comparison proof of concept
After speaking with Jacob, I landed on a more efficient way to compare
files, which is a hot path in the program's execution. Using hashmaps
with long strings as keys is pretty inefficient. It is faster to alias
each line with a numeric ID and represent file contents as a list of
line IDs. Comparing file contents then reduces to comparing two lists of
numbers. If the lists are sorted, they can be stepped through with two
indices to determine their similarity.

Currently, the actual comparison is broken and over-reports the number
of documents in the provided corpus by nearly 2x. This is likely because
the index for file 1 always increases, while the index for file 2 is
only conditionally increased. Both indices need to be incremented under
different conditions, so that whichever one is "behind" can catch up to
the one that is "ahead" (which we can do because the lists are sorted).
2024-05-23 01:03:12 -07:00
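
(To illustrate the two-index walk described above, here is a minimal,
self-contained sketch; it is not the project's code, and the names are
illustrative. Because both lists are sorted, whichever index points at
the smaller ID is "behind" and is the one advanced so it can catch up;
on a match both advance. The project's version is compareFileLineIDs in
the main.go diff below.)

package main

import "fmt"

// similarity walks two sorted line-ID lists with two indices and returns
// matches / distinct IDs (a Jaccard-style similarity).
func similarity(f1, f2 []uint64) float64 {
	var i, j, matches, total int
	for i < len(f1) && j < len(f2) {
		total++
		switch {
		case f1[i] == f2[j]: // same line in both files: advance both
			matches++
			i++
			j++
		case f1[i] < f2[j]: // f1 is behind: advance it to catch up
			i++
		default: // f2 is behind: advance it to catch up
			j++
		}
	}
	total += (len(f1) - i) + (len(f2) - j) // unmatched tails still count
	if total == 0 {
		return 0
	}
	return float64(matches) / float64(total)
}

func main() {
	fmt.Println(similarity([]uint64{1, 2, 3}, []uint64{2, 3, 4})) // 0.5
}
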
2 changed files with 89 additions and 20 deletions

.gitignore (3 changed lines)

@@ -2,4 +2,5 @@
 output.*.txt
 .vscode
 files
-files.*/
+files.*
+*.prof

main.go (106 changed lines)

@@ -9,6 +9,7 @@ import (
 	"os"
 	"path"
 	"runtime"
+	"runtime/pprof"
 	"slices"
 	"strconv"
 	"strings"
@@ -43,6 +44,16 @@ func main() {
 		start   = time.Now()
 		padding = "\n"
 	)
+
+	f, err := os.Create("cpu.prof")
+	if err != nil {
+		panic(err)
+	}
+	if err := pprof.StartCPUProfile(f); err != nil {
+		panic(err)
+	}
+	defer pprof.StopCPUProfile()
+
 	documents, output, err := run(os.Args)
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "error: %v\n", err)
@@ -95,7 +106,7 @@ func run(args []string) ([]*Document, *os.File, error) {
 	// The files need to be processed in order of time, so determine the
 	// timestamp of each file and sort them by time.
-	fileTimes, times, err := orderFiles(dataFilePath)
+	fileTimes, times, err := orderFiles(dataFilePath, shouldReportStatus)
 	if err != nil {
 		return nil, nil, err
 	}
@@ -308,29 +319,16 @@ func (dm *DocumentManager) maybeAssociateFileWithDocument(workItem WorkItem, wor
 // on the number of line-centric differences between the contents of the two
 // files.
 func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) {
-	f1, err := dm.fcc.GetFileContents(f1Number)
+	f1, err := dm.fcc.GetFileLineID(f1Number)
 	if err != nil {
 		return 0, fmt.Errorf("file %d: %w", f1Number, err)
 	}
-	f2, err := dm.fcc.GetFileContents(f2Number)
+	f2, err := dm.fcc.GetFileLineID(f2Number)
 	if err != nil {
 		return 0, fmt.Errorf("file %d: %w", f2Number, err)
 	}
-	histogram := make(map[string]int)
-	for _, line := range f1 {
-		histogram[line]++
-	}
-	for _, line := range f2 {
-		histogram[line]--
-	}
-	var differences float64
-	for _, v := range histogram {
-		differences += math.Abs(float64(v))
-	}
-	similarity := 1 - (differences / float64(len(f1)+len(f2)))
+	similarity := compareFileLineIDs(f1, f2)
 	return similarity, nil
 }
@@ -382,6 +380,7 @@ func (d *Document) SortAssociatedFiles() {
 type FileContentsCache struct {
 	BaseDir string
 	cache   sync.Map
+	lineIDs sync.Map
 }

 // GetFileContents returns the contents of a file, excluding the first timestamp
@@ -418,6 +417,69 @@ func (fcc *FileContentsCache) GetFileContents(fileNumber int) ([]string, error)
 	return lines, nil
 }

+func (fcc *FileContentsCache) GetFileLineID(fileNumber int) ([]uint64, error) {
+	if cachedLineIDs, ok := fcc.lineIDs.Load(fileNumber); ok {
+		return cachedLineIDs.([]uint64), nil
+	}
+	lines, err := fcc.GetFileContents(fileNumber)
+	if err != nil {
+		return nil, fmt.Errorf("adding line IDs for file %d: %w", fileNumber, err)
+	}
+	lineIDs := makeFileLinesID(lines)
+	fcc.lineIDs.Store(fileNumber, lineIDs)
+	return lineIDs, nil
+}
+
+var (
+	fileLinesCache   sync.Map
+	fileLineIDSource atomic.Uint64
+)
+
+func makeFileLinesID(fileLines []string) []uint64 {
+	fileLineIDs := make([]uint64, 0, len(fileLines))
+	for _, line := range fileLines {
+		lineID, ok := fileLinesCache.Load(line)
+		if !ok {
+			newID := fileLineIDSource.Add(1)
+			fileLinesCache.Store(line, newID)
+			fileLineIDs = append(fileLineIDs, newID)
+		} else {
+			fileLineIDs = append(fileLineIDs, lineID.(uint64))
+		}
+	}
+	slices.Sort(fileLineIDs)
+	return fileLineIDs
+}
+
+func compareFileLineIDs(f1, f2 []uint64) float64 {
+	var (
+		i, j       = 0, 0
+		count      int
+		similarity int
+	)
+	for i < len(f1) && j < len(f2) {
+		count++
+		if f1[i] == f2[j] {
+			similarity++
+			i++
+			j++
+		} else if f1[i] < f2[j] {
+			i++
+		} else {
+			j++
+		}
+	}
+	count += len(f1) - i + len(f2) - j
+	if count == 0 {
+		return 0
+	}
+	return float64(similarity) / float64(count)
+}
+
 // ClearFilesExcept removes the contents of the fileContentsCache except for the
 // provided file numbers. This helps conserve memory by removing the contents of
 // files that are no longer of interest, which we can be sure of since we are
@@ -468,14 +530,20 @@ func readFileTime(filepath string) (int, error) {
 // the map can be iterated in order of time. This allows stepping through the
 // history of the files from the beginning. Using this, we can construct a
 // "chain" of evolution for a given document.
-func orderFiles(dir string) (map[int][]int, []int, error) {
+func orderFiles(dir string, shouldReportStatus bool) (map[int][]int, []int, error) {
+	if shouldReportStatus {
+		defer fmt.Fprintln(os.Stderr)
+	}
 	timeMap := make(map[int][]int)
 	dirEntries, err := os.ReadDir(dir)
 	if err != nil {
 		return nil, nil, fmt.Errorf("reading directory %s: %w", dir, err)
 	}
-	for _, entry := range dirEntries {
+	for i, entry := range dirEntries {
+		if shouldReportStatus {
+			fmt.Fprintf(os.Stderr, "\rReading directory %d of %d...", i+1, len(dirEntries))
+		}
 		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") {
 			continue
 		}