Compare commits
2 Commits
master
...
numeric-co
| Author | SHA1 | Date |
|---|---|---|
|
|
42d297263b | |
|
|
42db2d544f |
|
|
@ -2,4 +2,5 @@
|
||||||
output.*.txt
|
output.*.txt
|
||||||
.vscode
|
.vscode
|
||||||
files
|
files
|
||||||
files.*/
|
files.*
|
||||||
|
*.prof
|
||||||
|
|
|
||||||
106
main.go
106
main.go
|
|
@ -9,6 +9,7 @@ import (
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
"runtime"
|
"runtime"
|
||||||
|
"runtime/pprof"
|
||||||
"slices"
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
@ -43,6 +44,16 @@ func main() {
|
||||||
start = time.Now()
|
start = time.Now()
|
||||||
padding = "\n"
|
padding = "\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
f, err := os.Create("cpu.prof")
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
if err := pprof.StartCPUProfile(f); err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
defer pprof.StopCPUProfile()
|
||||||
|
|
||||||
documents, output, err := run(os.Args)
|
documents, output, err := run(os.Args)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||||
|
|
@ -95,7 +106,7 @@ func run(args []string) ([]*Document, *os.File, error) {
|
||||||
|
|
||||||
// The files need to be processed in order of time, so determine the
|
// The files need to be processed in order of time, so determine the
|
||||||
// timestamp of each file and sort them by time.
|
// timestamp of each file and sort them by time.
|
||||||
fileTimes, times, err := orderFiles(dataFilePath)
|
fileTimes, times, err := orderFiles(dataFilePath, shouldReportStatus)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, err
|
return nil, nil, err
|
||||||
}
|
}
|
||||||
|
|
@ -308,29 +319,16 @@ func (dm *DocumentManager) maybeAssociateFileWithDocument(workItem WorkItem, wor
|
||||||
// on the number of line-centric differences between the contents of the two
|
// on the number of line-centric differences between the contents of the two
|
||||||
// files.
|
// files.
|
||||||
func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) {
|
func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) {
|
||||||
f1, err := dm.fcc.GetFileContents(f1Number)
|
f1, err := dm.fcc.GetFileLineID(f1Number)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, fmt.Errorf("file %d: %w", f1Number, err)
|
return 0, fmt.Errorf("file %d: %w", f1Number, err)
|
||||||
}
|
}
|
||||||
f2, err := dm.fcc.GetFileContents(f2Number)
|
f2, err := dm.fcc.GetFileLineID(f2Number)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, fmt.Errorf("file %d: %w", f2Number, err)
|
return 0, fmt.Errorf("file %d: %w", f2Number, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
histogram := make(map[string]int)
|
similarity := compareFileLineIDs(f1, f2)
|
||||||
for _, line := range f1 {
|
|
||||||
histogram[line]++
|
|
||||||
}
|
|
||||||
for _, line := range f2 {
|
|
||||||
histogram[line]--
|
|
||||||
}
|
|
||||||
|
|
||||||
var differences float64
|
|
||||||
for _, v := range histogram {
|
|
||||||
differences += math.Abs(float64(v))
|
|
||||||
}
|
|
||||||
|
|
||||||
similarity := 1 - (differences / float64(len(f1)+len(f2)))
|
|
||||||
return similarity, nil
|
return similarity, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -382,6 +380,7 @@ func (d *Document) SortAssociatedFiles() {
|
||||||
type FileContentsCache struct {
|
type FileContentsCache struct {
|
||||||
BaseDir string
|
BaseDir string
|
||||||
cache sync.Map
|
cache sync.Map
|
||||||
|
lineIDs sync.Map
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetFileContents returns the contents of a file, excluding the first timestamp
|
// GetFileContents returns the contents of a file, excluding the first timestamp
|
||||||
|
|
@ -418,6 +417,69 @@ func (fcc *FileContentsCache) GetFileContents(fileNumber int) ([]string, error)
|
||||||
return lines, nil
|
return lines, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (fcc *FileContentsCache) GetFileLineID(fileNumber int) ([]uint64, error) {
|
||||||
|
if cachedLineIDs, ok := fcc.lineIDs.Load(fileNumber); ok {
|
||||||
|
return cachedLineIDs.([]uint64), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
lines, err := fcc.GetFileContents(fileNumber)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("adding line IDs for file %d: %w", fileNumber, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
lineIDs := makeFileLinesID(lines)
|
||||||
|
fcc.lineIDs.Store(fileNumber, lineIDs)
|
||||||
|
return lineIDs, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
	// fileLinesCache maps the text of a line (string) to the numeric ID
	// (uint64) assigned to that text.
	fileLinesCache sync.Map
	// fileLineIDSource is the counter from which new line IDs are drawn.
	fileLineIDSource atomic.Uint64
)

// makeFileLinesID converts the lines of a file into a sorted slice of numeric
// line IDs. Every distinct line text is assigned exactly one ID, shared across
// all files, so two files can be compared by their IDs instead of by their
// strings. Duplicate lines within a file produce duplicate IDs in the result.
//
// The returned slice has one entry per input line and is sorted ascending.
func makeFileLinesID(fileLines []string) []uint64 {
	fileLineIDs := make([]uint64, 0, len(fileLines))
	for _, line := range fileLines {
		lineID, ok := fileLinesCache.Load(line)
		if !ok {
			// LoadOrStore makes the "assign an ID to a new line" step
			// atomic: if another goroutine registered the same line in the
			// meantime, its ID wins and is returned here, so one line text
			// can never end up with two different IDs. The ID drawn from the
			// counter is simply left unused in that case.
			lineID, _ = fileLinesCache.LoadOrStore(line, fileLineIDSource.Add(1))
		}
		fileLineIDs = append(fileLineIDs, lineID.(uint64))
	}
	slices.Sort(fileLineIDs)
	return fileLineIDs
}
|
||||||
|
|
||||||
|
// compareFileLineIDs scores how similar two files are, given their line IDs.
// Both slices are walked in lockstep the way a merge of two sorted lists
// would be, so they are expected to be sorted ascending. The score is the
// number of IDs the two slices have in common divided by the number of IDs in
// their combined (multiset) union: 1 means identical, 0 means nothing in
// common. Two empty inputs score 0.
func compareFileLineIDs(f1, f2 []uint64) float64 {
	matched := 0
	for a, b := 0, 0; a < len(f1) && b < len(f2); {
		switch {
		case f1[a] == f2[b]:
			matched++
			a++
			b++
		case f1[a] < f2[b]:
			a++
		default:
			b++
		}
	}

	// Every matched pair collapses two entries into one, so the union size is
	// the total number of entries minus the number of matches.
	total := len(f1) + len(f2) - matched
	if total == 0 {
		return 0
	}

	return float64(matched) / float64(total)
}
|
||||||
|
|
||||||
// ClearFilesExcept removes the contents of the fileContentsCache except for the
|
// ClearFilesExcept removes the contents of the fileContentsCache except for the
|
||||||
// provided file numbers. This helps conserve memory by removing the contents of
|
// provided file numbers. This helps conserve memory by removing the contents of
|
||||||
// files that are no longer of interest, which we can be sure of since we are
|
// files that are no longer of interest, which we can be sure of since we are
|
||||||
|
|
@ -468,14 +530,20 @@ func readFileTime(filepath string) (int, error) {
|
||||||
// the map can be iterated in order of time. This allows stepping through the
|
// the map can be iterated in order of time. This allows stepping through the
|
||||||
// history of the files from the beginning. Using this, we can construct a
|
// history of the files from the beginning. Using this, we can construct a
|
||||||
// "chain" of evolution for a given document.
|
// "chain" of evolution for a given document.
|
||||||
func orderFiles(dir string) (map[int][]int, []int, error) {
|
func orderFiles(dir string, shouldReportStatus bool) (map[int][]int, []int, error) {
|
||||||
|
if shouldReportStatus {
|
||||||
|
defer fmt.Fprintln(os.Stderr)
|
||||||
|
}
|
||||||
timeMap := make(map[int][]int)
|
timeMap := make(map[int][]int)
|
||||||
|
|
||||||
dirEntries, err := os.ReadDir(dir)
|
dirEntries, err := os.ReadDir(dir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, fmt.Errorf("reading directory %s: %w", dir, err)
|
return nil, nil, fmt.Errorf("reading directory %s: %w", dir, err)
|
||||||
}
|
}
|
||||||
for _, entry := range dirEntries {
|
for i, entry := range dirEntries {
|
||||||
|
if shouldReportStatus {
|
||||||
|
fmt.Fprintf(os.Stderr, "\rReading directory %d of %d...", i+1, len(dirEntries))
|
||||||
|
}
|
||||||
if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") {
|
if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue