// Timesheet // Previously ~1h // March 13, 2024: 00:00-02:30 // March 19, 2024: 15:00-19:00 // March 23, 2024: 20:00- package main import ( "bufio" "flag" "fmt" "log/slog" "os" "path" "slices" "strconv" "strings" "sync" "sync/atomic" ) // DataFilePath describes the default location where the file pool can be found. const DefaultDataFilePath = "files" // Command line options var ( DataFilePath string UseDocPrefix bool Verbose bool ) func main() { if err := run(os.Args); err != nil { fmt.Fprintf(os.Stderr, "error: %v\n", err) os.Exit(1) } } // run is the main entry point for the program. func run(args []string) error { flags := flag.NewFlagSet(args[0], flag.ExitOnError) flags.StringVar(&DataFilePath, "path", DefaultDataFilePath, "path to the file pool") flags.BoolVar(&UseDocPrefix, "prefix", false, "use '[doc ###]' prefix for output") flags.BoolVar(&Verbose, "verbose", false, "enable verbose logging") flags.Parse(args[1:]) // SimilarityThreshold is the minimum similarity required for two files // to be considered related. This value is arbitrary and could be adjusted // based on the specific requirements of the problem. const SimilarityThreshold = 0.5 fileTimes, times, err := orderFiles() if err != nil { return err } var ( // documents is the master list of documents that will be built up. documents []*Document // fcc handles reading files and caching contents. fcc = make(fileContentsCache) ) for i, timestamp := range times { _ = i // fmt.Printf("\rProcessing timestamp %d/%d", i+1, len(fileTimes)) // Track the files at this timestamp that have been associated with documents, so // we can identify unassociated files later and then create new documents for them. var ( wg sync.WaitGroup associatedFiles sync.Map ) wg.Add(len(documents)) log("processing timestamp", "timestamp", timestamp, "numWorkers", len(documents)) for _, doc := range documents { // Start a goroutine for each document, to parallelize the // comparison with the files in the current timestamp. 
A more robust // solution would limit the number of concurrent goroutines to avoid // exhausting system resources, but for this problem we won't have // more than a couple thousand documents. Goroutines are // lightweight enough (2K stack) that we can start them pretty // capriciously. go func(doc *Document, files []int) { defer wg.Done() for _, candidateFileNumber := range files { // Check to be certain this file hasn't been associated with another // document already. If it has been, continue to the next file. if _, ok := associatedFiles.Load(candidateFileNumber); ok { continue } latestFileNumber := doc.LatestAssociatedFile() overlap, err := compareFiles(fcc, latestFileNumber, candidateFileNumber) if err != nil { fmt.Fprintf( os.Stderr, "error comparing files %d and %d: %v\n", latestFileNumber, candidateFileNumber, err, ) } if overlap >= SimilarityThreshold { // Add file to Document associated list doc.AssociateFile(candidateFileNumber, timestamp) associatedFiles.Store(candidateFileNumber, struct{}{}) // We know this document won't be associated with any other files // with this timestamp, so we can stop looking at files with this // timestamp, for this document. return } } }(doc, fileTimes[timestamp]) } // Wait for all document comparisons to complete for this timestamp. wg.Wait() // If we haven't associated all the files with existing documents, we need // to create new documents for those that remain. currentNumDocs := len(documents) for _, fileNumber := range fileTimes[timestamp] { if _, ok := associatedFiles.Load(fileNumber); !ok { doc := NewDocument(fileNumber, timestamp) documents = append(documents, &doc) } } if len(documents) > currentNumDocs { log("created new documents", "numAdded", len(documents)-currentNumDocs, "timestamp", timestamp) } // Now we can clear the cache of file contents for files that aren't associated with // a document, to conserve memory. 
var latestDocumentFiles []int for _, doc := range documents { latestDocumentFiles = append(latestDocumentFiles, doc.LatestAssociatedFile()) } fcc.clearFilesExcept(latestDocumentFiles) } // Output the list of documents, showing their associated files in ascending order. // Order the documents by their first associated file. slices.SortFunc(documents, func(a, b *Document) int { return a.AssociatedFiles[0] - b.AssociatedFiles[0] }) for _, doc := range documents { doc.SortAssociatedFiles() fmt.Println(doc) } return nil } // DocumentIDSource is a concurrency-safe source from which to identify // documents. This could easily be something other than an integer, but using // this allows us to just use the standard library. var DocumentIDSource atomic.Uint32 // Document stores a document ID and a list of associated files. type Document struct { AssociatedFiles []int LatestTimestamp int ID uint32 } // String formats a document for output in the format described in the // requirements. func (d Document) String() string { var sb strings.Builder for _, f := range d.AssociatedFiles { sb.WriteString(fmt.Sprintf("%d ", f)) } if UseDocPrefix { return fmt.Sprintf("[doc %4d] %s", d.ID, sb.String()) } return sb.String() } // AssociateFile adds a file number to the list of associated files for a // document, and also records the latest timestamp now associated with the // document. func (d *Document) AssociateFile(fileNumber, timestamp int) { d.AssociatedFiles = append(d.AssociatedFiles, fileNumber) d.LatestTimestamp = timestamp } // LatestAssociatedFile returns the most recent file associated with a document. func (d Document) LatestAssociatedFile() int { return d.AssociatedFiles[len(d.AssociatedFiles)-1] } // SortAssociatedFiles sorts the list of associated files for a document, // since the requirements stipulate output in ascending order. 
func (d *Document) SortAssociatedFiles() {
	slices.Sort(d.AssociatedFiles)
}

// NewDocument creates a new Document struct and initializes an ID and records
// the first file and timestamp associated with it.
func NewDocument(fileNumber, timestamp int) Document {
	return Document{
		ID:              DocumentIDSource.Add(1),
		LatestTimestamp: timestamp,
		AssociatedFiles: []int{fileNumber},
	}
}

// readFileTime reads the first line of the file, which represents a
// time/version. The integer value will be returned. An unreadable file or a
// non-numeric first line (including an empty file) yields an error.
func readFileTime(filepath string) (int, error) {
	file, err := os.Open(filepath)
	if err != nil {
		return 0, err
	}
	defer file.Close()

	s := bufio.NewScanner(file)
	var firstLine string
	if s.Scan() {
		firstLine = s.Text()
	}
	if err := s.Err(); err != nil {
		return 0, err
	}

	time, err := strconv.Atoi(firstLine)
	if err != nil {
		return 0, fmt.Errorf("invalid time %s: %w", firstLine, err)
	}
	return time, nil
}

// compareFiles computes how much two files overlap, on a scale of 0 to 1, as
// the Jaccard similarity of the files' sets of distinct lines: the number of
// distinct lines present in both files divided by the number of distinct
// lines present in either.
func compareFiles(fcc fileContentsCache, f1Number, f2Number int) (float64, error) {
	f1, err := fcc.getFileContents(f1Number)
	if err != nil {
		return 0, fmt.Errorf("file %d: %w", f1Number, err)
	}
	f2, err := fcc.getFileContents(f2Number)
	if err != nil {
		return 0, fmt.Errorf("file %d: %w", f2Number, err)
	}

	// Deduplicate lines within each file first. The previous histogram
	// approach (count == 2 across the concatenation of both files) miscounted
	// a line appearing twice within a single file as an overlap with the
	// other file.
	set1 := make(map[string]struct{}, len(f1))
	for _, line := range f1 {
		set1[line] = struct{}{}
	}

	var overlap, union int
	seen2 := make(map[string]struct{}, len(f2))
	for _, line := range f2 {
		if _, dup := seen2[line]; dup {
			continue
		}
		seen2[line] = struct{}{}
		if _, ok := set1[line]; ok {
			overlap++
		} else {
			// Distinct to f2; contributes to the union only.
			union++
		}
	}
	union += len(set1)

	// Two empty files have no basis for comparison; report zero similarity
	// rather than dividing by zero (which previously produced NaN).
	if union == 0 {
		return 0, nil
	}
	return float64(overlap) / float64(union), nil
}

// fileContentsCache is a cache of file contents, keyed by file number,
// to avoid reading the same file from disk multiple times.
type fileContentsCache map[int][]string

// getFileContents returns the contents of a file, excluding the first
// timestamp line.
If the file is already in the cache, the contents are returned from // there, otherwise the file is read from disk and the contents are cached. func (fcc fileContentsCache) getFileContents(fileNumber int) ([]string, error) { if contents, ok := fcc[fileNumber]; ok { return contents, nil } var ( fileName = makeFilePath(fileNumber) lines []string ) f, err := os.Open(fileName) if err != nil { return nil, err } s := bufio.NewScanner(f) // Ignore first line that's just a timestamp. if !s.Scan() { fcc[fileNumber] = []string{} return []string{}, nil } for s.Scan() { lines = append(lines, s.Text()) } if err := s.Err(); err != nil { return nil, err } return lines, nil } // clearFilesExcept removes the contents of the fileContentsCache except for the // provided file numbers. This helps conserve memory by removing the contents of // files that are no longer of interest, which we can be sure of since we are // proceeding in order of time. func (fcc fileContentsCache) clearFilesExcept(fileNumbers []int) { for fNum := range fcc { if !slices.Contains(fileNumbers, fNum) { delete(fcc, fNum) } } } func makeFileName(number int) string { return fmt.Sprintf("%d.txt", number) } func makeFilePath(number int) string { return path.Join(DataFilePath, makeFileName(number)) } // orderFiles determines the timestamp version of each file and creates a map of // time to file numbers. It sorts the times (since maps are not ordered) so that // the map can be iterated in order of time. This allows stepping through the // history of the files from the beginning. Using this, we can construct a // "chain" of evolution for a given document. 
func orderFiles() (map[int][]int, []int, error) { timeMap := make(map[int][]int) dirEntries, err := os.ReadDir(DataFilePath) if err != nil { return nil, nil, fmt.Errorf("reading directory %s: %w", DataFilePath, err) } for _, entry := range dirEntries { if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") { continue } // Get the numeric representation of the file number. var fileNumber int { numberStr := strings.TrimSuffix(entry.Name(), ".txt") var err error if fileNumber, err = strconv.Atoi(numberStr); err != nil { return nil, nil, fmt.Errorf("invalid file number in file name %q: %w", entry.Name(), err) } } filePath := path.Join(DataFilePath, entry.Name()) modTime, err := readFileTime(filePath) if err != nil { return nil, nil, err } if timeMap[modTime] == nil { timeMap[modTime] = make([]int, 0) } timeMap[modTime] = append(timeMap[modTime], fileNumber) } // Now make a slice of the times and sort them, so we can iterate through // them in order. timeSlice := make([]int, 0, len(timeMap)) for k := range timeMap { timeSlice = append(timeSlice, k) } slices.Sort(timeSlice) return timeMap, timeSlice, nil } func log(msg string, args ...any) { if Verbose { slog.Info(msg, args...) } }