Split up worker and worker logic
Break the worker function into one that ranges over the channel and one that actually does the work of associating the file with a document if it is determined to match.
This commit is contained in:
parent
b6de64cde6
commit
03c0840041
86
main.go
86
main.go
|
|
@ -79,7 +79,7 @@ func run(args []string) ([]*Document, error) {
|
||||||
// documents, so we can identify unassociated files later and then
|
// documents, so we can identify unassociated files later and then
|
||||||
// create new documents for them. This needs to be distinct for each
|
// create new documents for them. This needs to be distinct for each
|
||||||
// timestamp, so it's created inside the timestamp loop's scope.
|
// timestamp, so it's created inside the timestamp loop's scope.
|
||||||
associatedFiles sync.Map
|
claimedFiles sync.Map
|
||||||
|
|
||||||
// We might need to create new documents for files that weren't
|
// We might need to create new documents for files that weren't
|
||||||
// associated with any document at this timestamp, so we need to make
|
// associated with any document at this timestamp, so we need to make
|
||||||
|
|
@ -93,11 +93,11 @@ func run(args []string) ([]*Document, error) {
|
||||||
for i, doc := range dm.Documents {
|
for i, doc := range dm.Documents {
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
dm.WorkCh <- WorkItem{
|
dm.WorkCh <- WorkItem{
|
||||||
doc: doc,
|
doc: doc,
|
||||||
fileNumbers: fileTimes[timestamp],
|
fileNumbers: fileTimes[timestamp],
|
||||||
timestamp: timestamp,
|
timestamp: timestamp,
|
||||||
associatedFiles: &associatedFiles,
|
claimedFiles: &claimedFiles,
|
||||||
wg: &wg,
|
wg: &wg,
|
||||||
}
|
}
|
||||||
log(
|
log(
|
||||||
"submitted work",
|
"submitted work",
|
||||||
|
|
@ -115,7 +115,7 @@ func run(args []string) ([]*Document, error) {
|
||||||
// documents for them.
|
// documents for them.
|
||||||
var docsAdded int
|
var docsAdded int
|
||||||
for _, fileNumber := range fileTimes[timestamp] {
|
for _, fileNumber := range fileTimes[timestamp] {
|
||||||
if _, ok := associatedFiles.Load(fileNumber); !ok {
|
if _, ok := claimedFiles.Load(fileNumber); !ok {
|
||||||
dm.AddNewDocument(fileNumber, timestamp)
|
dm.AddNewDocument(fileNumber, timestamp)
|
||||||
docsAdded++
|
docsAdded++
|
||||||
}
|
}
|
||||||
|
|
@ -134,11 +134,11 @@ func run(args []string) ([]*Document, error) {
|
||||||
|
|
||||||
// WorkItem is what will be sent to the the workers in the worker pool.
|
// WorkItem is what will be sent to the the workers in the worker pool.
|
||||||
type WorkItem struct {
|
type WorkItem struct {
|
||||||
doc *Document
|
doc *Document
|
||||||
fileNumbers []int
|
fileNumbers []int
|
||||||
timestamp int
|
timestamp int
|
||||||
associatedFiles *sync.Map
|
claimedFiles *sync.Map
|
||||||
wg *sync.WaitGroup
|
wg *sync.WaitGroup
|
||||||
}
|
}
|
||||||
|
|
||||||
// DocumentManager handles the processing of documents and files. It maintains a
|
// DocumentManager handles the processing of documents and files. It maintains a
|
||||||
|
|
@ -229,50 +229,46 @@ func (dm *DocumentManager) SortedDocuments() []*Document {
|
||||||
// document against each file and if a match is found, associate the file with the
|
// document against each file and if a match is found, associate the file with the
|
||||||
// document sent in the work item, and record the file as having been matched.
|
// document sent in the work item, and record the file as having been matched.
|
||||||
func (dm *DocumentManager) ComparisonWorker(workerID int) {
|
func (dm *DocumentManager) ComparisonWorker(workerID int) {
|
||||||
|
defer dm.wg.Done()
|
||||||
for workItem := range dm.WorkCh {
|
for workItem := range dm.WorkCh {
|
||||||
for _, fileNumber := range workItem.fileNumbers {
|
dm.maybeAssociateFileWithDocument(workItem, workerID)
|
||||||
if _, ok := workItem.associatedFiles.Load(fileNumber); ok {
|
}
|
||||||
// This file has already been matched; skip it.
|
}
|
||||||
continue
|
|
||||||
}
|
|
||||||
latestFileNumber := workItem.doc.LatestAssociatedFile()
|
|
||||||
similarity, err := dm.compareFiles(latestFileNumber, fileNumber)
|
|
||||||
if err != nil {
|
|
||||||
// Simplistic error handling: log the error and continue.
|
|
||||||
slog.Error(
|
|
||||||
"error comparing files",
|
|
||||||
"file1", latestFileNumber,
|
|
||||||
"file2", fileNumber,
|
|
||||||
"document", workItem.doc.ID,
|
|
||||||
"worker", workerID,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
// If current file doesn't match current document, skip to the next file.
|
func (dm *DocumentManager) maybeAssociateFileWithDocument(workItem WorkItem, workerID int) {
|
||||||
if similarity < dm.similarityThreshold {
|
defer workItem.wg.Done()
|
||||||
continue
|
for _, candidateFileNumber := range workItem.fileNumbers {
|
||||||
}
|
if _, ok := workItem.claimedFiles.Load(candidateFileNumber); ok {
|
||||||
|
// This file has already been matched with another document, so skip it.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
latestFileNumber := workItem.doc.LatestAssociatedFile()
|
||||||
|
similarity, err := dm.compareFiles(latestFileNumber, candidateFileNumber)
|
||||||
|
if err != nil {
|
||||||
|
// Simplistic error handling: log the error and continue.
|
||||||
|
slog.Error(
|
||||||
|
"error comparing files",
|
||||||
|
"latestAssociatedFile", latestFileNumber,
|
||||||
|
"candidateFile", candidateFileNumber,
|
||||||
|
"document", workItem.doc.ID,
|
||||||
|
"worker", workerID,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
// Current file matches current document, so record this.
|
// If current file matches current document, record it and exit.
|
||||||
workItem.doc.AssociateFile(fileNumber, workItem.timestamp)
|
if similarity >= dm.similarityThreshold {
|
||||||
workItem.associatedFiles.Store(fileNumber, struct{}{})
|
workItem.doc.AssociateFile(candidateFileNumber, workItem.timestamp)
|
||||||
|
workItem.claimedFiles.Store(candidateFileNumber, struct{}{})
|
||||||
log(
|
log(
|
||||||
"match found",
|
"match found",
|
||||||
"document", workItem.doc.ID,
|
"document", workItem.doc.ID,
|
||||||
"file", fileNumber,
|
"file", candidateFileNumber,
|
||||||
"time", workItem.timestamp,
|
"time", workItem.timestamp,
|
||||||
"worker", workerID,
|
"worker", workerID,
|
||||||
)
|
)
|
||||||
|
return
|
||||||
// We don't need to consider this document anymore since we've found
|
|
||||||
// a match. End processing and wait for more work.
|
|
||||||
break
|
|
||||||
}
|
}
|
||||||
workItem.wg.Done()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Report that this worker is shutting down.
|
|
||||||
dm.wg.Done()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// compareFiles computes how much two files overlap, on a scale
|
// compareFiles computes how much two files overlap, on a scale
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue