Fix comparison algorithm

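Fix the file line ID comparison algorithm to use a Jaccard index: the
number of line IDs shared by both files divided by the number of distinct
line IDs across the two files. With this change, the number of documents
in the corpus is identified correctly.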

Also add a little status output when files are being ordered by
timestamp.
This commit is contained in:
Ian Molee 2024-05-24 02:13:06 -07:00
parent 42db2d544f
commit 42d297263b
1 changed file with 30 additions and 12 deletions

42
main.go
View File

@ -106,7 +106,7 @@ func run(args []string) ([]*Document, *os.File, error) {
// The files need to be processed in order of time, so determine the
// timestamp of each file and sort them by time.
fileTimes, times, err := orderFiles(dataFilePath)
fileTimes, times, err := orderFiles(dataFilePath, shouldReportStatus)
if err != nil {
return nil, nil, err
}
@ -319,11 +319,11 @@ func (dm *DocumentManager) maybeAssociateFileWithDocument(workItem WorkItem, wor
// on the number of line-centric differences between the contents of the two
// files.
func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) {
f1, err := dm.fcc.GetLineIDsForFile(f1Number)
f1, err := dm.fcc.GetFileLineID(f1Number)
if err != nil {
return 0, fmt.Errorf("file %d: %w", f1Number, err)
}
f2, err := dm.fcc.GetLineIDsForFile(f2Number)
f2, err := dm.fcc.GetFileLineID(f2Number)
if err != nil {
return 0, fmt.Errorf("file %d: %w", f2Number, err)
}
@ -417,7 +417,7 @@ func (fcc *FileContentsCache) GetFileContents(fileNumber int) ([]string, error)
return lines, nil
}
func (fcc *FileContentsCache) GetLineIDsForFile(fileNumber int) ([]uint64, error) {
func (fcc *FileContentsCache) GetFileLineID(fileNumber int) ([]uint64, error) {
if cachedLineIDs, ok := fcc.lineIDs.Load(fileNumber); ok {
return cachedLineIDs.([]uint64), nil
}
@ -455,17 +455,29 @@ func makeFileLinesID(fileLines []string) []uint64 {
func compareFileLineIDs(f1, f2 []uint64) float64 {
var (
f2Index int
i, j = 0, 0
count int
similarity int
)
for _, lineID := range f1 {
if f2Index < len(f2) && f2[f2Index] == lineID {
similarity += 2
f2Index++
for i < len(f1) && j < len(f2) {
count++
if f1[i] == f2[j] {
similarity++
i++
j++
} else if f1[i] < f2[j] {
i++
} else {
j++
}
}
return float64(similarity) / float64(len(f1)+len(f2))
count += len(f1) - i + len(f2) - j
if count == 0 {
return 0
}
return float64(similarity) / float64(count)
}
// ClearFilesExcept removes the contents of the fileContentsCache except for the
@ -518,14 +530,20 @@ func readFileTime(filepath string) (int, error) {
// the map can be iterated in order of time. This allows stepping through the
// history of the files from the beginning. Using this, we can construct a
// "chain" of evolution for a given document.
func orderFiles(dir string) (map[int][]int, []int, error) {
func orderFiles(dir string, shouldReportStatus bool) (map[int][]int, []int, error) {
if shouldReportStatus {
defer fmt.Fprintln(os.Stderr)
}
timeMap := make(map[int][]int)
dirEntries, err := os.ReadDir(dir)
if err != nil {
return nil, nil, fmt.Errorf("reading directory %s: %w", dir, err)
}
for _, entry := range dirEntries {
for i, entry := range dirEntries {
if shouldReportStatus {
fmt.Fprintf(os.Stderr, "\rReading directory %d of %d...", i+1, len(dirEntries))
}
if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") {
continue
}