commit 5f1a8bc256a7e257515751b9d2e5c9918f424ceb Author: Ian Molee Date: Sat Mar 23 20:13:30 2024 -0700 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..397b4a7 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.log diff --git a/ProjectDescription.pdf b/ProjectDescription.pdf new file mode 100644 index 0000000..8c549fd Binary files /dev/null and b/ProjectDescription.pdf differ diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..49f2ba2 --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module github.com/ianfoo/steelray-docgrouper + +go 1.22.1 + +require github.com/adrg/strutil v0.3.1 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..e81f627 --- /dev/null +++ b/go.sum @@ -0,0 +1,18 @@ +github.com/adrg/strutil v0.3.1 h1:OLvSS7CSJO8lBii4YmBt8jiK9QOtB9CzCzwl4Ic/Fz4= +github.com/adrg/strutil v0.3.1/go.mod h1:8h90y18QLrs11IBffcGX3NW/GFBXCMcNg4M7H6MspPA= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/main.go b/main.go new file mode 100644 index 0000000..7fe8cb1 --- /dev/null +++ b/main.go @@ -0,0 +1,373 @@ +// Timesheet +// Previously ~1h +// March 13, 2024: 00:00-02:30 +// March 19, 2024: 15:00-19:00 +// March 23, 2024: 20:00- +package main + +import ( + "bufio" + "flag" + "fmt" + "log/slog" + "os" + "path" + "slices" + "strconv" + "strings" + "sync" + "sync/atomic" +) + +// DataFilePath describes the default location where the file pool can be found. +const DefaultDataFilePath = "files" + +// Command line options +var ( + DataFilePath string + UseDocPrefix bool + Verbose bool +) + +func main() { + if err := run(os.Args); err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(1) + } +} + +// run is the main entry point for the program. +func run(args []string) error { + flags := flag.NewFlagSet(args[0], flag.ExitOnError) + flags.StringVar(&DataFilePath, "path", DefaultDataFilePath, "path to the file pool") + flags.BoolVar(&UseDocPrefix, "prefix", false, "use '[doc ###]' prefix for output") + flags.BoolVar(&Verbose, "verbose", false, "enable verbose logging") + flags.Parse(args[1:]) + + // SimilarityThreshold is the minimum similarity required for two files + // to be considered related. This value is arbitrary and could be adjusted + // based on the specific requirements of the problem. + const SimilarityThreshold = 0.5 + + fileTimes, times, err := orderFiles() + if err != nil { + return err + } + + var ( + // documents is the master list of documents that will be built up. + documents []*Document + + // fcc handles reading files and caching contents. + fcc = make(fileContentsCache) + ) + + for i, timestamp := range times { + _ = i + // fmt.Printf("\rProcessing timestamp %d/%d", i+1, len(fileTimes)) + // Track the files at this timestamp that have been associated with documents, so + // we can identify unassociated files later and then create new documents for them. + var ( + wg sync.WaitGroup + associatedFiles sync.Map + ) + + wg.Add(len(documents)) + log("processing timestamp", "timestamp", timestamp, "numWorkers", len(documents)) + for _, doc := range documents { + // Start a goroutine for each document, to parallelize the + // comparison with the files in the current timestamp. A more robust + // solution would limit the number of concurrent goroutines to avoid + // exhausting system resources, but for this problem we won't have + // more than a couple thousand documents. Goroutines are + // lightweight enough (2K stack) that we can start them pretty + // capriciously. + go func(doc *Document, files []int) { + defer wg.Done() + for _, candidateFileNumber := range files { + // Check to be certain this file hasn't been associated with another + // document already. If it has been, continue to the next file. + if _, ok := associatedFiles.Load(candidateFileNumber); ok { + continue + } + + latestFileNumber := doc.LatestAssociatedFile() + overlap, err := compareFiles(fcc, latestFileNumber, candidateFileNumber) + if err != nil { + fmt.Fprintf( + os.Stderr, + "error comparing files %d and %d: %v\n", + latestFileNumber, candidateFileNumber, err, + ) + } + if overlap >= SimilarityThreshold { + // Add file to Document associated list + doc.AssociateFile(candidateFileNumber, timestamp) + associatedFiles.Store(candidateFileNumber, struct{}{}) + + // We know this document won't be associated with any other files + // with this timestamp, so we can stop looking at files with this + // timestamp, for this document. + return + } + } + }(doc, fileTimes[timestamp]) + } + + // Wait for all document comparisons to complete for this timestamp. + wg.Wait() + + // If we haven't associated all the files with existing documents, we need + // to create new documents for those that remain. + currentNumDocs := len(documents) + for _, fileNumber := range fileTimes[timestamp] { + if _, ok := associatedFiles.Load(fileNumber); !ok { + doc := NewDocument(fileNumber, timestamp) + documents = append(documents, &doc) + } + } + if len(documents) > currentNumDocs { + log("created new documents", "numAdded", len(documents)-currentNumDocs, "timestamp", timestamp) + } + + // Now we can clear the cache of file contents for files that aren't associated with + // a document, to conserve memory. + var latestDocumentFiles []int + for _, doc := range documents { + latestDocumentFiles = append(latestDocumentFiles, doc.LatestAssociatedFile()) + } + fcc.clearFilesExcept(latestDocumentFiles) + } + + // Output the list of documents, showing their associated files in ascending order. + // Order the documents by their first associated file. + slices.SortFunc(documents, func(a, b *Document) int { + return a.AssociatedFiles[0] - b.AssociatedFiles[0] + }) + for _, doc := range documents { + doc.SortAssociatedFiles() + fmt.Println(doc) + } + + return nil +} + +// DocumentIDSource is a concurrency-safe source from which to identify +// documents. This could easily be something other than an integer, but using +// this allows us to just use the standard library. +var DocumentIDSource atomic.Uint32 + +// Document stores a document ID and a list of associated files. +type Document struct { + AssociatedFiles []int + LatestTimestamp int + ID uint32 +} + +// String formats a document for output in the format described in the +// requirements. +func (d Document) String() string { + var sb strings.Builder + for _, f := range d.AssociatedFiles { + sb.WriteString(fmt.Sprintf("%d ", f)) + } + if UseDocPrefix { + return fmt.Sprintf("[doc %4d] %s", d.ID, sb.String()) + } + return sb.String() +} + +// AssociateFile adds a file number to the list of associated files for a +// document, and also records the latest timestamp now associated with the +// document. +func (d *Document) AssociateFile(fileNumber, timestamp int) { + d.AssociatedFiles = append(d.AssociatedFiles, fileNumber) + d.LatestTimestamp = timestamp +} + +// LatestAssociatedFile returns the most recent file associated with a document. +func (d Document) LatestAssociatedFile() int { + return d.AssociatedFiles[len(d.AssociatedFiles)-1] +} + +// SortAssociatedFiles sorts the list of associated files for a document, +// since the requirements stipulate output in ascending order. +func (d *Document) SortAssociatedFiles() { + slices.Sort(d.AssociatedFiles) +} + +// NewDocument creates a new Document struct and initializes an ID and records +// the first file and timestamp associated with it. +func NewDocument(fileNumber, timestamp int) Document { + return Document{ + ID: DocumentIDSource.Add(1), + LatestTimestamp: timestamp, + AssociatedFiles: []int{fileNumber}, + } +} + +// readFileTime reads the first line of the file, which represents a +// time/version. The integer value will be returned. +func readFileTime(filepath string) (int, error) { + file, err := os.Open(filepath) + if err != nil { + return 0, err + } + defer file.Close() + + s := bufio.NewScanner(file) + var firstLine string + if s.Scan() { + firstLine = s.Text() + } + if err := s.Err(); err != nil { + return 0, err + } + + time, err := strconv.Atoi(firstLine) + if err != nil { + return 0, fmt.Errorf("invalid time %s: %w", firstLine, err) + } + return time, nil +} + +// compareFiles computes how much two files overlap, on a scale +// of 0 to 1 by iterating through the files and identifying lines +// that are duplicated. +func compareFiles(fcc fileContentsCache, f1Number, f2Number int) (float64, error) { + f1, err := fcc.getFileContents(f1Number) + if err != nil { + return 0, fmt.Errorf("file %d: %w", f1Number, err) + } + f2, err := fcc.getFileContents(f2Number) + if err != nil { + return 0, fmt.Errorf("file %d: %w", f2Number, err) + } + + histogram := make(map[string]int) + for _, lines := range [][]string{f1, f2} { + for _, line := range lines { + histogram[line]++ + } + } + + var overlap int + for _, v := range histogram { + if v == 2 { + overlap++ + } + } + return float64(overlap) / float64(len(histogram)), nil +} + +// fileContentsCache is a cache of file contents, keyed by file number, +// to avoid reading the same file from disk multiple times. +type fileContentsCache map[int][]string + +// getFileContents returns the contents of a file, excluding the first timestamp +// line. If the file is already in the cache, the contents are returned from +// there, otherwise the file is read from disk and the contents are cached. +func (fcc fileContentsCache) getFileContents(fileNumber int) ([]string, error) { + if contents, ok := fcc[fileNumber]; ok { + return contents, nil + } + var ( + fileName = makeFilePath(fileNumber) + lines []string + ) + + f, err := os.Open(fileName) + if err != nil { + return nil, err + } + + s := bufio.NewScanner(f) + + // Ignore first line that's just a timestamp. + if !s.Scan() { + fcc[fileNumber] = []string{} + return []string{}, nil + } + + for s.Scan() { + lines = append(lines, s.Text()) + } + if err := s.Err(); err != nil { + return nil, err + } + return lines, nil +} + +// clearFilesExcept removes the contents of the fileContentsCache except for the +// provided file numbers. This helps conserve memory by removing the contents of +// files that are no longer of interest, which we can be sure of since we are +// proceeding in order of time. +func (fcc fileContentsCache) clearFilesExcept(fileNumbers []int) { + for fNum := range fcc { + if !slices.Contains(fileNumbers, fNum) { + delete(fcc, fNum) + } + } +} + +func makeFileName(number int) string { + return fmt.Sprintf("%d.txt", number) +} + +func makeFilePath(number int) string { + return path.Join(DataFilePath, makeFileName(number)) +} + +// orderFiles determines the timestamp version of each file and creates a map of +// time to file numbers. It sorts the times (since maps are not ordered) so that +// the map can be iterated in order of time. This allows stepping through the +// history of the files from the beginning. Using this, we can construct a +// "chain" of evolution for a given document. +func orderFiles() (map[int][]int, []int, error) { + timeMap := make(map[int][]int) + + dirEntries, err := os.ReadDir(DataFilePath) + if err != nil { + return nil, nil, fmt.Errorf("reading directory %s: %w", DataFilePath, err) + } + for _, entry := range dirEntries { + if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") { + continue + } + + // Get the numeric representation of the file number. + var fileNumber int + { + numberStr := strings.TrimSuffix(entry.Name(), ".txt") + var err error + if fileNumber, err = strconv.Atoi(numberStr); err != nil { + return nil, nil, fmt.Errorf("invalid file number in file name %q: %w", entry.Name(), err) + } + } + + filePath := path.Join(DataFilePath, entry.Name()) + modTime, err := readFileTime(filePath) + if err != nil { + return nil, nil, err + } + if timeMap[modTime] == nil { + timeMap[modTime] = make([]int, 0) + } + timeMap[modTime] = append(timeMap[modTime], fileNumber) + } + + // Now make a slice of the times and sort them, so we can iterate through + // them in order. + timeSlice := make([]int, 0, len(timeMap)) + for k := range timeMap { + timeSlice = append(timeSlice, k) + } + slices.Sort(timeSlice) + return timeMap, timeSlice, nil +} + +func log(msg string, args ...any) { + if Verbose { + slog.Info(msg, args...) + } +}