diff --git a/main.go b/main.go index a3b4746..01e6413 100644 --- a/main.go +++ b/main.go @@ -79,7 +79,7 @@ func run(args []string) ([]*Document, error) { // documents, so we can identify unassociated files later and then // create new documents for them. This needs to be distinct for each // timestamp, so it's created inside the timestamp loop's scope. - associatedFiles sync.Map + claimedFiles sync.Map // We might need to create new documents for files that weren't // associated with any document at this timestamp, so we need to make @@ -93,11 +93,11 @@ func run(args []string) ([]*Document, error) { for i, doc := range dm.Documents { wg.Add(1) dm.WorkCh <- WorkItem{ - doc: doc, - fileNumbers: fileTimes[timestamp], - timestamp: timestamp, - associatedFiles: &associatedFiles, - wg: &wg, + doc: doc, + fileNumbers: fileTimes[timestamp], + timestamp: timestamp, + claimedFiles: &claimedFiles, + wg: &wg, } log( "submitted work", @@ -115,7 +115,7 @@ func run(args []string) ([]*Document, error) { // documents for them. var docsAdded int for _, fileNumber := range fileTimes[timestamp] { - if _, ok := associatedFiles.Load(fileNumber); !ok { + if _, ok := claimedFiles.Load(fileNumber); !ok { dm.AddNewDocument(fileNumber, timestamp) docsAdded++ } @@ -134,11 +134,11 @@ func run(args []string) ([]*Document, error) { // WorkItem is what will be sent to the the workers in the worker pool. type WorkItem struct { - doc *Document - fileNumbers []int - timestamp int - associatedFiles *sync.Map - wg *sync.WaitGroup + doc *Document + fileNumbers []int + timestamp int + claimedFiles *sync.Map + wg *sync.WaitGroup } // DocumentManager handles the processing of documents and files. It maintains a @@ -229,50 +229,46 @@ func (dm *DocumentManager) SortedDocuments() []*Document { // document against each file and if a match is found, associate the file with the // document sent in the work item, and record the file as having been matched. func (dm *DocumentManager) ComparisonWorker(workerID int) { + defer dm.wg.Done() for workItem := range dm.WorkCh { - for _, fileNumber := range workItem.fileNumbers { - if _, ok := workItem.associatedFiles.Load(fileNumber); ok { - // This file has already been matched; skip it. - continue - } - latestFileNumber := workItem.doc.LatestAssociatedFile() - similarity, err := dm.compareFiles(latestFileNumber, fileNumber) - if err != nil { - // Simplistic error handling: log the error and continue. - slog.Error( - "error comparing files", - "file1", latestFileNumber, - "file2", fileNumber, - "document", workItem.doc.ID, - "worker", workerID, - ) - } + dm.maybeAssociateFileWithDocument(workItem, workerID) + } +} - // If current file doesn't match current document, skip to the next file. - if similarity < dm.similarityThreshold { - continue - } +func (dm *DocumentManager) maybeAssociateFileWithDocument(workItem WorkItem, workerID int) { + defer workItem.wg.Done() + for _, candidateFileNumber := range workItem.fileNumbers { + if _, ok := workItem.claimedFiles.Load(candidateFileNumber); ok { + // This file has already been matched with another document, so skip it. + continue + } + latestFileNumber := workItem.doc.LatestAssociatedFile() + similarity, err := dm.compareFiles(latestFileNumber, candidateFileNumber) + if err != nil { + // Simplistic error handling: log the error and continue. + slog.Error( + "error comparing files", + "latestAssociatedFile", latestFileNumber, + "candidateFile", candidateFileNumber, + "document", workItem.doc.ID, + "worker", workerID, + ) + } - // Current file matches current document, so record this. - workItem.doc.AssociateFile(fileNumber, workItem.timestamp) - workItem.associatedFiles.Store(fileNumber, struct{}{}) + // If current file matches current document, record it and exit. + if similarity >= dm.similarityThreshold { + workItem.doc.AssociateFile(candidateFileNumber, workItem.timestamp) + workItem.claimedFiles.Store(candidateFileNumber, struct{}{}) log( "match found", "document", workItem.doc.ID, - "file", fileNumber, + "file", candidateFileNumber, "time", workItem.timestamp, "worker", workerID, ) - - // We don't need to consider this document anymore since we've found - // a match. End processing and wait for more work. - break + return } - workItem.wg.Done() } - - // Report that this worker is shutting down. - dm.wg.Done() } // compareFiles computes how much two files overlap, on a scale