Split up worker and worker logic
Break the worker function into one that ranges over the channel and one that actually does the work of associating the file with a document if it is determined to match.
This commit is contained in:
parent
b6de64cde6
commit
03c0840041
50
main.go
50
main.go
|
|
@ -79,7 +79,7 @@ func run(args []string) ([]*Document, error) {
|
|||
// documents, so we can identify unassociated files later and then
|
||||
// create new documents for them. This needs to be distinct for each
|
||||
// timestamp, so it's created inside the timestamp loop's scope.
|
||||
associatedFiles sync.Map
|
||||
claimedFiles sync.Map
|
||||
|
||||
// We might need to create new documents for files that weren't
|
||||
// associated with any document at this timestamp, so we need to make
|
||||
|
|
@ -96,7 +96,7 @@ func run(args []string) ([]*Document, error) {
|
|||
doc: doc,
|
||||
fileNumbers: fileTimes[timestamp],
|
||||
timestamp: timestamp,
|
||||
associatedFiles: &associatedFiles,
|
||||
claimedFiles: &claimedFiles,
|
||||
wg: &wg,
|
||||
}
|
||||
log(
|
||||
|
|
@ -115,7 +115,7 @@ func run(args []string) ([]*Document, error) {
|
|||
// documents for them.
|
||||
var docsAdded int
|
||||
for _, fileNumber := range fileTimes[timestamp] {
|
||||
if _, ok := associatedFiles.Load(fileNumber); !ok {
|
||||
if _, ok := claimedFiles.Load(fileNumber); !ok {
|
||||
dm.AddNewDocument(fileNumber, timestamp)
|
||||
docsAdded++
|
||||
}
|
||||
|
|
@ -137,7 +137,7 @@ type WorkItem struct {
|
|||
doc *Document
|
||||
fileNumbers []int
|
||||
timestamp int
|
||||
associatedFiles *sync.Map
|
||||
claimedFiles *sync.Map
|
||||
wg *sync.WaitGroup
|
||||
}
|
||||
|
||||
|
|
@ -229,50 +229,46 @@ func (dm *DocumentManager) SortedDocuments() []*Document {
|
|||
// document against each file and if a match is found, associate the file with the
|
||||
// document sent in the work item, and record the file as having been matched.
|
||||
func (dm *DocumentManager) ComparisonWorker(workerID int) {
|
||||
defer dm.wg.Done()
|
||||
for workItem := range dm.WorkCh {
|
||||
for _, fileNumber := range workItem.fileNumbers {
|
||||
if _, ok := workItem.associatedFiles.Load(fileNumber); ok {
|
||||
// This file has already been matched; skip it.
|
||||
dm.maybeAssociateFileWithDocument(workItem, workerID)
|
||||
}
|
||||
}
|
||||
|
||||
func (dm *DocumentManager) maybeAssociateFileWithDocument(workItem WorkItem, workerID int) {
|
||||
defer workItem.wg.Done()
|
||||
for _, candidateFileNumber := range workItem.fileNumbers {
|
||||
if _, ok := workItem.claimedFiles.Load(candidateFileNumber); ok {
|
||||
// This file has already been matched with another document, so skip it.
|
||||
continue
|
||||
}
|
||||
latestFileNumber := workItem.doc.LatestAssociatedFile()
|
||||
similarity, err := dm.compareFiles(latestFileNumber, fileNumber)
|
||||
similarity, err := dm.compareFiles(latestFileNumber, candidateFileNumber)
|
||||
if err != nil {
|
||||
// Simplistic error handling: log the error and continue.
|
||||
slog.Error(
|
||||
"error comparing files",
|
||||
"file1", latestFileNumber,
|
||||
"file2", fileNumber,
|
||||
"latestAssociatedFile", latestFileNumber,
|
||||
"candidateFile", candidateFileNumber,
|
||||
"document", workItem.doc.ID,
|
||||
"worker", workerID,
|
||||
)
|
||||
}
|
||||
|
||||
// If current file doesn't match current document, skip to the next file.
|
||||
if similarity < dm.similarityThreshold {
|
||||
continue
|
||||
}
|
||||
|
||||
// Current file matches current document, so record this.
|
||||
workItem.doc.AssociateFile(fileNumber, workItem.timestamp)
|
||||
workItem.associatedFiles.Store(fileNumber, struct{}{})
|
||||
// If current file matches current document, record it and exit.
|
||||
if similarity >= dm.similarityThreshold {
|
||||
workItem.doc.AssociateFile(candidateFileNumber, workItem.timestamp)
|
||||
workItem.claimedFiles.Store(candidateFileNumber, struct{}{})
|
||||
log(
|
||||
"match found",
|
||||
"document", workItem.doc.ID,
|
||||
"file", fileNumber,
|
||||
"file", candidateFileNumber,
|
||||
"time", workItem.timestamp,
|
||||
"worker", workerID,
|
||||
)
|
||||
|
||||
// We don't need to consider this document anymore since we've found
|
||||
// a match. End processing and wait for more work.
|
||||
break
|
||||
return
|
||||
}
|
||||
workItem.wg.Done()
|
||||
}
|
||||
|
||||
// Report that this worker is shutting down.
|
||||
dm.wg.Done()
|
||||
}
|
||||
|
||||
// compareFiles computes how much two files overlap, on a scale
|
||||
|
|
|
|||
Loading…
Reference in New Issue