Compare commits


No commits in common. "numeric-comparison" and "master" have entirely different histories.

2 changed files with 20 additions and 89 deletions

.gitignore (vendored): 3 lines changed

@@ -2,5 +2,4 @@
 output.*.txt
 .vscode
 files
-files.*
-*.prof
+files.*/

main.go: 106 lines changed

@@ -9,7 +9,6 @@ import (
     "os"
     "path"
     "runtime"
-    "runtime/pprof"
     "slices"
     "strconv"
     "strings"
@@ -44,16 +43,6 @@ func main() {
         start   = time.Now()
         padding = "\n"
     )
-
-    f, err := os.Create("cpu.prof")
-    if err != nil {
-        panic(err)
-    }
-    if err := pprof.StartCPUProfile(f); err != nil {
-        panic(err)
-    }
-    defer pprof.StopCPUProfile()
-
     documents, output, err := run(os.Args)
     if err != nil {
         fmt.Fprintf(os.Stderr, "error: %v\n", err)
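For context on the removed block: it wrote a CPU profile to cpu.prof for the lifetime of the process, which is also why the branch ignored *.prof in .gitignore above. A profile produced this way can be inspected with the standard tooling, e.g. go tool pprof cpu.prof.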
@@ -106,7 +95,7 @@ func run(args []string) ([]*Document, *os.File, error) {
     // The files need to be processed in order of time, so determine the
     // timestamp of each file and sort them by time.
-    fileTimes, times, err := orderFiles(dataFilePath, shouldReportStatus)
+    fileTimes, times, err := orderFiles(dataFilePath)
     if err != nil {
         return nil, nil, err
     }
@@ -319,16 +308,29 @@ func (dm *DocumentManager) maybeAssociateFileWithDocument(workItem WorkItem, wor
 // on the number of line-centric differences between the contents of the two
 // files.
 func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) {
-    f1, err := dm.fcc.GetFileLineID(f1Number)
+    f1, err := dm.fcc.GetFileContents(f1Number)
     if err != nil {
         return 0, fmt.Errorf("file %d: %w", f1Number, err)
     }
-    f2, err := dm.fcc.GetFileLineID(f2Number)
+    f2, err := dm.fcc.GetFileContents(f2Number)
     if err != nil {
         return 0, fmt.Errorf("file %d: %w", f2Number, err)
     }
-    similarity := compareFileLineIDs(f1, f2)
+    histogram := make(map[string]int)
+    for _, line := range f1 {
+        histogram[line]++
+    }
+    for _, line := range f2 {
+        histogram[line]--
+    }
+    var differences float64
+    for _, v := range histogram {
+        differences += math.Abs(float64(v))
+    }
+    similarity := 1 - (differences / float64(len(f1)+len(f2)))
     return similarity, nil
 }
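The histogram comparison introduced in this hunk can be read in isolation: count every line of the first file up, count every line of the second file down, and treat whatever does not cancel as a difference. A minimal, self-contained sketch of that metric (the lineSimilarity helper name, the zero-length guard, and the sample inputs are illustrative, not part of the repository):

package main

import (
    "fmt"
    "math"
)

// lineSimilarity mirrors the histogram comparison from compareFiles:
// increment a counter for each line of f1, decrement it for each line
// of f2, and treat the leftover counts as differences.
func lineSimilarity(f1, f2 []string) float64 {
    histogram := make(map[string]int)
    for _, line := range f1 {
        histogram[line]++
    }
    for _, line := range f2 {
        histogram[line]--
    }
    var differences float64
    for _, v := range histogram {
        differences += math.Abs(float64(v))
    }
    if len(f1)+len(f2) == 0 {
        return 0 // guard added for the standalone sketch; compareFiles assumes non-empty files
    }
    return 1 - differences/float64(len(f1)+len(f2))
}

func main() {
    f1 := []string{"alpha", "beta", "gamma"}
    f2 := []string{"alpha", "beta", "delta"}
    // Two of three lines cancel out, so this prints 1 - 2/6 ≈ 0.67.
    fmt.Println(lineSimilarity(f1, f2))
}

Identical files score 1 and files with no lines in common score 0, so the value slots directly into the existing similarity threshold logic.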
@@ -380,7 +382,6 @@ func (d *Document) SortAssociatedFiles() {
 type FileContentsCache struct {
     BaseDir string
     cache   sync.Map
-    lineIDs sync.Map
 }

 // GetFileContents returns the contents of a file, excluding the first timestamp
@@ -417,69 +418,6 @@ func (fcc *FileContentsCache) GetFileContents(fileNumber int) ([]string, error)
     return lines, nil
 }

-func (fcc *FileContentsCache) GetFileLineID(fileNumber int) ([]uint64, error) {
-    if cachedLineIDs, ok := fcc.lineIDs.Load(fileNumber); ok {
-        return cachedLineIDs.([]uint64), nil
-    }
-
-    lines, err := fcc.GetFileContents(fileNumber)
-    if err != nil {
-        return nil, fmt.Errorf("adding line IDs for file %d: %w", fileNumber, err)
-    }
-
-    lineIDs := makeFileLinesID(lines)
-    fcc.lineIDs.Store(fileNumber, lineIDs)
-    return lineIDs, nil
-}
-
-var (
-    fileLinesCache   sync.Map
-    fileLineIDSource atomic.Uint64
-)
-
-func makeFileLinesID(fileLines []string) []uint64 {
-    fileLineIDs := make([]uint64, 0, len(fileLines))
-    for _, line := range fileLines {
-        lineID, ok := fileLinesCache.Load(line)
-        if !ok {
-            newID := fileLineIDSource.Add(1)
-            fileLinesCache.Store(line, newID)
-            fileLineIDs = append(fileLineIDs, newID)
-        } else {
-            fileLineIDs = append(fileLineIDs, lineID.(uint64))
-        }
-    }
-    slices.Sort(fileLineIDs)
-    return fileLineIDs
-}
-
-func compareFileLineIDs(f1, f2 []uint64) float64 {
-    var (
-        i, j       = 0, 0
-        count      int
-        similarity int
-    )
-    for i < len(f1) && j < len(f2) {
-        count++
-        if f1[i] == f2[j] {
-            similarity++
-            i++
-            j++
-        } else if f1[i] < f2[j] {
-            i++
-        } else {
-            j++
-        }
-    }
-    count += len(f1) - i + len(f2) - j
-    if count == 0 {
-        return 0
-    }
-    return float64(similarity) / float64(count)
-}
-
 // ClearFilesExcept removes the contents of the fileContentsCache except for the
 // provided file numbers. This helps conserve memory by removing the contents of
 // files that are no longer of interest, which we can be sure of since we are
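For contrast, the code removed in the hunk above interned each distinct line into a numeric ID and then compared two sorted ID slices with a merge-style walk. The core of that metric, extracted into a runnable sketch (matchRatio and the hand-interned sample IDs are illustrative only; the real code derived the IDs via makeFileLinesID):

package main

import (
    "fmt"
    "slices"
)

// matchRatio reproduces the removed compareFileLineIDs logic: walk two
// sorted ID slices in step, count positions where the IDs agree, and
// divide by the number of comparisons plus any leftover elements.
func matchRatio(f1, f2 []uint64) float64 {
    var i, j, count, matches int
    for i < len(f1) && j < len(f2) {
        count++
        switch {
        case f1[i] == f2[j]:
            matches++
            i++
            j++
        case f1[i] < f2[j]:
            i++
        default:
            j++
        }
    }
    count += len(f1) - i + len(f2) - j
    if count == 0 {
        return 0
    }
    return float64(matches) / float64(count)
}

func main() {
    // Hand-interned IDs standing in for three lines per file, two shared.
    f1 := []uint64{1, 2, 3}
    f2 := []uint64{1, 2, 4}
    slices.Sort(f1) // the removed makeFileLinesID sorted IDs before comparing
    slices.Sort(f2)
    fmt.Println(matchRatio(f1, f2)) // 2 matches over 4 comparisons: 0.5
}

On the sample, two IDs match across four total comparisons, so the result is 0.5, whereas the histogram metric gives roughly 0.67 for the same inputs; the two measures are similar in spirit but not numerically identical.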
@@ -530,20 +468,14 @@ func readFileTime(filepath string) (int, error) {
 // the map can be iterated in order of time. This allows stepping through the
 // history of the files from the beginning. Using this, we can construct a
 // "chain" of evolution for a given document.
-func orderFiles(dir string, shouldReportStatus bool) (map[int][]int, []int, error) {
-    if shouldReportStatus {
-        defer fmt.Fprintln(os.Stderr)
-    }
+func orderFiles(dir string) (map[int][]int, []int, error) {
     timeMap := make(map[int][]int)
     dirEntries, err := os.ReadDir(dir)
     if err != nil {
         return nil, nil, fmt.Errorf("reading directory %s: %w", dir, err)
     }
-    for i, entry := range dirEntries {
-        if shouldReportStatus {
-            fmt.Fprintf(os.Stderr, "\rReading directory %d of %d...", i+1, len(dirEntries))
-        }
+    for _, entry := range dirEntries {
         if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") {
             continue
         }