Major refactor: use worker pool
Use a bounded worker pool to prevent creation of hundreds of goroutines contending for scheduling. Add some tests, a Dockerfile, a Makefile, and a readme.
This commit is contained in:
parent
5f1a8bc256
commit
b6de64cde6
|
|
@ -1 +1,3 @@
|
|||
*.log
|
||||
output.*.txt
|
||||
.vscode
|
||||
|
|
|
|||
|
|
@ -0,0 +1,11 @@
|
|||
# Build stage: run the test suite, then compile a static Linux binary.
FROM golang:1.22 AS builder
WORKDIR /go/src/docgrouper
COPY testdata testdata/
COPY *.go go.mod go.sum ./
RUN go mod download
# The output name must match the path copied into the runtime stage below.
RUN go test -v ./... && CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o docgrouper .

# Runtime stage: minimal image that carries only the compiled binary.
FROM gcr.io/distroless/base-nossl-debian12
COPY --from=builder /go/src/docgrouper/docgrouper /bin/docgrouper
VOLUME [ "/files" ]
# Use the absolute path of the copied binary so startup does not depend on PATH.
ENTRYPOINT [ "/bin/docgrouper" ]
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
# Image name, local binary name, and the host directory mounted as the file pool.
DOCKER_IMAGE := "steelray-docgrouper"
BINNAME := "steelray-docgrouper"
FILES_PATH := "./files.200"

# Compile the binary from the Go sources in this directory.
build: *.go
	go build -o $(BINNAME) .

# Run the unit and end-to-end tests.
test:
	go test -v ./...

# Remove the locally built binary.
clean:
	rm -f $(BINNAME)

# Build the container image (this also runs the tests inside the build stage).
docker-build:
	docker build -t $(DOCKER_IMAGE) .

# Run the container with the file pool mounted at /files.
docker-run:
	docker run -v $(FILES_PATH):/files $(DOCKER_IMAGE):latest

# None of these targets produce a file named after themselves, so declare them
# all phony; otherwise a stray file called "build", "test" or "clean" would
# make the corresponding target a no-op.
.PHONY: build test clean docker-build docker-run
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
# Docgrouper
|
||||
|
||||
Given a set of files, each with an integer timestamp as its first line, identify
|
||||
the set of documents that the files represent at various points in each document's life.
|
||||
|
||||
## Building
|
||||
|
||||
Building **docgrouper** requires [Go](https://go.dev), and can be built by
|
||||
running `make build`. Because Go might not be installed, a `Dockerfile` is
|
||||
provided to test and build a container image. The docker image can be built via
|
||||
the `docker-build` Makefile target.
|
||||
|
||||
## Running
|
||||
|
||||
If running via Docker, the directory where the file pool exists must be mounted
|
||||
into the container, via the `-v` or `--volume` switch, like so:
|
||||
|
||||
```
|
||||
docker run --volume ./host-files:/files steelray-docgrouper
|
||||
```
|
||||
|
||||
This invocation is made available via the `docker-run` Makefile target, but this
|
||||
will only invoke docgrouper with the default command line arguments since
|
||||
arguments cannot be passed to a Makefile target.
|
||||
|
||||
## Options
|
||||
|
||||
```
|
||||
-path string
|
||||
path to the file pool (default "files")
|
||||
-prefix
|
||||
use '[doc ###]' prefix for output
|
||||
-threshold float
|
||||
similarity threshold (default 0.5)
|
||||
-verbose
|
||||
enable verbose logging
|
||||
-workers int
|
||||
number of workers to use (default 2*<number-of-cores>)
|
||||
```
|
||||
4
go.mod
4
go.mod
|
|
@ -1,5 +1,3 @@
|
|||
module github.com/ianfoo/steelray-docgrouper
|
||||
|
||||
go 1.22.1
|
||||
|
||||
require github.com/adrg/strutil v0.3.1
|
||||
go 1.22.2
|
||||
|
|
|
|||
547
main.go
547
main.go
|
|
@ -1,8 +1,11 @@
|
|||
// Timesheet
|
||||
// Time sheet
|
||||
// Previously ~1h
|
||||
// March 13, 2024: 00:00-02:30
|
||||
// March 19, 2024: 15:00-19:00
|
||||
// March 23, 2024: 20:00-
|
||||
// March 23, 2024: 20:00-22:00
|
||||
// April 02, 2024: 12:30-17:00
|
||||
// April 04, 2024: 21:00-23:30
|
||||
// April 05, 2024: 00:00-02:00
|
||||
package main
|
||||
|
||||
import (
|
||||
|
|
@ -12,6 +15,7 @@ import (
|
|||
"log/slog"
|
||||
"os"
|
||||
"path"
|
||||
"runtime"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
|
@ -19,143 +23,290 @@ import (
|
|||
"sync/atomic"
|
||||
)
|
||||
|
||||
// DataFilePath describes the default location where the file pool can be found.
|
||||
const DefaultDataFilePath = "files"
|
||||
const (
|
||||
// defaultSimilarityThreshold is the default minimum similarity required for
|
||||
// two files to be considered related. This value is arbitrary and could be
|
||||
// adjusted based on the specific requirements of the problem.
|
||||
defaultSimilarityThreshold = 0.5
|
||||
|
||||
// defaultDataFilePath describes the default location where the file pool can be
|
||||
// found.
|
||||
defaultDataFilePath = "files"
|
||||
)
|
||||
|
||||
// Command line options
|
||||
var (
|
||||
DataFilePath string
|
||||
UseDocPrefix bool
|
||||
Verbose bool
|
||||
dataFilePath string
|
||||
similarityThreshold float64
|
||||
useDocPrefix bool
|
||||
verbose bool
|
||||
numWorkers int
|
||||
)
|
||||
|
||||
func main() {
|
||||
if err := run(os.Args); err != nil {
|
||||
documents, err := run(os.Args)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
os.Exit(-1)
|
||||
}
|
||||
for _, doc := range documents {
|
||||
fmt.Println(doc)
|
||||
}
|
||||
}
|
||||
|
||||
// run is the main entry point for the program.
|
||||
func run(args []string) error {
|
||||
func run(args []string) ([]*Document, error) {
|
||||
flags := flag.NewFlagSet(args[0], flag.ExitOnError)
|
||||
flags.StringVar(&DataFilePath, "path", DefaultDataFilePath, "path to the file pool")
|
||||
flags.BoolVar(&UseDocPrefix, "prefix", false, "use '[doc ###]' prefix for output")
|
||||
flags.BoolVar(&Verbose, "verbose", false, "enable verbose logging")
|
||||
flags.Parse(args[1:])
|
||||
flags.StringVar(&dataFilePath, "path", defaultDataFilePath, "path to the file pool")
|
||||
flags.Float64Var(&similarityThreshold, "threshold", defaultSimilarityThreshold, "similarity threshold")
|
||||
flags.IntVar(&numWorkers, "workers", runtime.NumCPU()*2, "number of workers to use")
|
||||
flags.BoolVar(&useDocPrefix, "prefix", false, "use '[doc ###]' prefix for output")
|
||||
flags.BoolVar(&verbose, "verbose", false, "enable verbose logging")
|
||||
_ = flags.Parse(args[1:])
|
||||
|
||||
// SimilarityThreshold is the minimum similarity required for two files
|
||||
// to be considered related. This value is arbitrary and could be adjusted
|
||||
// based on the specific requirements of the problem.
|
||||
const SimilarityThreshold = 0.5
|
||||
|
||||
fileTimes, times, err := orderFiles()
|
||||
// The files need to be processed in order of time, so determine the
|
||||
// timestamp of each file and sort them by time.
|
||||
fileTimes, times, err := orderFiles(dataFilePath)
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var (
|
||||
// documents is the master list of documents that will be built up.
|
||||
documents []*Document
|
||||
|
||||
// fcc handles reading files and caching contents.
|
||||
fcc = make(fileContentsCache)
|
||||
)
|
||||
dm := NewDocumentManager(dataFilePath, similarityThreshold, numWorkers)
|
||||
|
||||
for i, timestamp := range times {
|
||||
_ = i
|
||||
// fmt.Printf("\rProcessing timestamp %d/%d", i+1, len(fileTimes))
|
||||
// Track the files at this timestamp that have been associated with documents, so
|
||||
// we can identify unassociated files later and then create new documents for them.
|
||||
var (
|
||||
wg sync.WaitGroup
|
||||
// Track the files at this timestamp that have been associated with
|
||||
// documents, so we can identify unassociated files later and then
|
||||
// create new documents for them. This needs to be distinct for each
|
||||
// timestamp, so it's created inside the timestamp loop's scope.
|
||||
associatedFiles sync.Map
|
||||
|
||||
// We might need to create new documents for files that weren't
|
||||
// associated with any document at this timestamp, so we need to make
|
||||
// sure that this timestamp has been entirely processed first. We do
|
||||
// this by waiting for the workers to indicate they've finished a work
|
||||
// item.
|
||||
wg sync.WaitGroup
|
||||
)
|
||||
|
||||
wg.Add(len(documents))
|
||||
log("processing timestamp", "timestamp", timestamp, "numWorkers", len(documents))
|
||||
for _, doc := range documents {
|
||||
// Start a goroutine for each document, to parallelize the
|
||||
// comparison with the files in the current timestamp. A more robust
|
||||
// solution would limit the number of concurrent goroutines to avoid
|
||||
// exhausting system resources, but for this problem we won't have
|
||||
// more than a couple thousand documents. Goroutines are
|
||||
// lightweight enough (2K stack) that we can start them pretty
|
||||
// capriciously.
|
||||
go func(doc *Document, files []int) {
|
||||
defer wg.Done()
|
||||
for _, candidateFileNumber := range files {
|
||||
// Check to be certain this file hasn't been associated with another
|
||||
// document already. If it has been, continue to the next file.
|
||||
if _, ok := associatedFiles.Load(candidateFileNumber); ok {
|
||||
log("processing timestamp", "timestamp", timestamp, "timestampIndex", i, "totalTimestamps", len(times))
|
||||
for i, doc := range dm.Documents {
|
||||
wg.Add(1)
|
||||
dm.WorkCh <- WorkItem{
|
||||
doc: doc,
|
||||
fileNumbers: fileTimes[timestamp],
|
||||
timestamp: timestamp,
|
||||
associatedFiles: &associatedFiles,
|
||||
wg: &wg,
|
||||
}
|
||||
log(
|
||||
"submitted work",
|
||||
"documentNumber", i+1,
|
||||
"documentID", doc.ID,
|
||||
"totalDocs", len(dm.Documents),
|
||||
"timestamp", timestamp,
|
||||
)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
// Now that this timestamp has been fully processed, we can check to see
|
||||
// what files haven't been associated with existing documents, and create new
|
||||
// documents for them.
|
||||
var docsAdded int
|
||||
for _, fileNumber := range fileTimes[timestamp] {
|
||||
if _, ok := associatedFiles.Load(fileNumber); !ok {
|
||||
dm.AddNewDocument(fileNumber, timestamp)
|
||||
docsAdded++
|
||||
}
|
||||
}
|
||||
if docsAdded > 0 {
|
||||
log("created new documents", "numAdded", docsAdded, "timestamp", timestamp)
|
||||
}
|
||||
|
||||
// Free up memory.
|
||||
dm.ShrinkCache()
|
||||
}
|
||||
|
||||
dm.Shutdown()
|
||||
return dm.SortedDocuments(), nil
|
||||
}
|
||||
|
||||
// WorkItem is what will be sent to the workers in the worker pool.
|
||||
type WorkItem struct {
|
||||
doc *Document
|
||||
fileNumbers []int
|
||||
timestamp int
|
||||
associatedFiles *sync.Map
|
||||
wg *sync.WaitGroup
|
||||
}
|
||||
|
||||
// DocumentManager handles the processing of documents and files. It maintains a
|
||||
// list of documents and a cache of file contents, and uses a pool of workers to
|
||||
// compare documents against files.
|
||||
type DocumentManager struct {
|
||||
// Documents is the list of documents that have been identified.
|
||||
Documents []*Document
|
||||
|
||||
// WorkCh is the channel through which work items are submitted to the workers.
|
||||
WorkCh chan WorkItem
|
||||
|
||||
// docIDSource is a concurrency-safe source from which to identify documents.
|
||||
// This could easily be something other than an integer, but using this allows
|
||||
// us to just use the standard library.
|
||||
docIDSource atomic.Uint32
|
||||
|
||||
similarityThreshold float64
|
||||
fcc *FileContentsCache
|
||||
wg sync.WaitGroup
|
||||
}
|
||||
|
||||
// NewDocumentManager creates a new DocumentManager with the specified base path
|
||||
// for the file pool and the specified number of workers.
|
||||
func NewDocumentManager(fileBasePath string, similarityThreshold float64, numWorkers int) *DocumentManager {
|
||||
dm := &DocumentManager{
|
||||
Documents: make([]*Document, 0),
|
||||
similarityThreshold: similarityThreshold,
|
||||
fcc: &FileContentsCache{BaseDir: fileBasePath},
|
||||
WorkCh: make(chan WorkItem),
|
||||
}
|
||||
|
||||
// Start workers.
|
||||
for wID := range numWorkers {
|
||||
go dm.ComparisonWorker(wID + 1)
|
||||
}
|
||||
dm.wg.Add(numWorkers)
|
||||
|
||||
return dm
|
||||
}
|
||||
|
||||
// Shutdown cleans up the document manager by closing the work channel to
|
||||
// trigger workers to exit and then waits for all workers to exit.
|
||||
func (dm *DocumentManager) Shutdown() {
|
||||
close(dm.WorkCh)
|
||||
dm.wg.Wait()
|
||||
}
|
||||
|
||||
func (dm *DocumentManager) AddNewDocument(fileNumber, timestamp int) {
|
||||
doc := Document{
|
||||
ID: dm.docIDSource.Add(1),
|
||||
LatestTimestamp: timestamp,
|
||||
AssociatedFiles: []int{fileNumber},
|
||||
}
|
||||
dm.Documents = append(dm.Documents, &doc)
|
||||
}
|
||||
|
||||
// ShrinkCache removes files from the cache that will never be used again, by
|
||||
// evicting those files that are not associated with any document. Note that
|
||||
// this is not concurrent-safe and should only be called when operations that
|
||||
// could modify the document list are not running. This is an optimization, but
|
||||
// could be removed if memory usage is not a concern.
|
||||
func (dm *DocumentManager) ShrinkCache() {
|
||||
var latestDocumentFiles []int
|
||||
for _, doc := range dm.Documents {
|
||||
latestDocumentFiles = append(latestDocumentFiles, doc.LatestAssociatedFile())
|
||||
}
|
||||
dm.fcc.ClearFilesExcept(latestDocumentFiles)
|
||||
}
|
||||
|
||||
// Return the list of documents with their associated files in ascending order,
|
||||
// and the documents themselves ordered by the documents by their first
|
||||
// associated file.
|
||||
func (dm *DocumentManager) SortedDocuments() []*Document {
|
||||
// Sort the associated files for each document.
|
||||
for _, doc := range dm.Documents {
|
||||
doc.SortAssociatedFiles()
|
||||
}
|
||||
// Sort the documents by their first associated file number.
|
||||
slices.SortFunc(dm.Documents, func(a, b *Document) int {
|
||||
return a.AssociatedFiles[0] - b.AssociatedFiles[0]
|
||||
})
|
||||
return dm.Documents
|
||||
}
|
||||
|
||||
// ComparisonWorker is a function that receives work items describing a document
|
||||
// and a list of candidate file IDs to compare against. It will compare the
|
||||
// document against each file and if a match is found, associate the file with the
|
||||
// document sent in the work item, and record the file as having been matched.
|
||||
func (dm *DocumentManager) ComparisonWorker(workerID int) {
|
||||
for workItem := range dm.WorkCh {
|
||||
for _, fileNumber := range workItem.fileNumbers {
|
||||
if _, ok := workItem.associatedFiles.Load(fileNumber); ok {
|
||||
// This file has already been matched; skip it.
|
||||
continue
|
||||
}
|
||||
latestFileNumber := workItem.doc.LatestAssociatedFile()
|
||||
similarity, err := dm.compareFiles(latestFileNumber, fileNumber)
|
||||
if err != nil {
|
||||
// Simplistic error handling: log the error and continue.
|
||||
slog.Error(
|
||||
"error comparing files",
|
||||
"file1", latestFileNumber,
|
||||
"file2", fileNumber,
|
||||
"document", workItem.doc.ID,
|
||||
"worker", workerID,
|
||||
)
|
||||
}
|
||||
|
||||
// If current file doesn't match current document, skip to the next file.
|
||||
if similarity < dm.similarityThreshold {
|
||||
continue
|
||||
}
|
||||
|
||||
latestFileNumber := doc.LatestAssociatedFile()
|
||||
overlap, err := compareFiles(fcc, latestFileNumber, candidateFileNumber)
|
||||
if err != nil {
|
||||
fmt.Fprintf(
|
||||
os.Stderr,
|
||||
"error comparing files %d and %d: %v\n",
|
||||
latestFileNumber, candidateFileNumber, err,
|
||||
// Current file matches current document, so record this.
|
||||
workItem.doc.AssociateFile(fileNumber, workItem.timestamp)
|
||||
workItem.associatedFiles.Store(fileNumber, struct{}{})
|
||||
log(
|
||||
"match found",
|
||||
"document", workItem.doc.ID,
|
||||
"file", fileNumber,
|
||||
"time", workItem.timestamp,
|
||||
"worker", workerID,
|
||||
)
|
||||
}
|
||||
if overlap >= SimilarityThreshold {
|
||||
// Add file to Document associated list
|
||||
doc.AssociateFile(candidateFileNumber, timestamp)
|
||||
associatedFiles.Store(candidateFileNumber, struct{}{})
|
||||
|
||||
// We know this document won't be associated with any other files
|
||||
// with this timestamp, so we can stop looking at files with this
|
||||
// timestamp, for this document.
|
||||
return
|
||||
// We don't need to consider this document anymore since we've found
|
||||
// a match. End processing and wait for more work.
|
||||
break
|
||||
}
|
||||
}
|
||||
}(doc, fileTimes[timestamp])
|
||||
workItem.wg.Done()
|
||||
}
|
||||
|
||||
// Wait for all document comparisons to complete for this timestamp.
|
||||
wg.Wait()
|
||||
|
||||
// If we haven't associated all the files with existing documents, we need
|
||||
// to create new documents for those that remain.
|
||||
currentNumDocs := len(documents)
|
||||
for _, fileNumber := range fileTimes[timestamp] {
|
||||
if _, ok := associatedFiles.Load(fileNumber); !ok {
|
||||
doc := NewDocument(fileNumber, timestamp)
|
||||
documents = append(documents, &doc)
|
||||
}
|
||||
}
|
||||
if len(documents) > currentNumDocs {
|
||||
log("created new documents", "numAdded", len(documents)-currentNumDocs, "timestamp", timestamp)
|
||||
}
|
||||
|
||||
// Now we can clear the cache of file contents for files that aren't associated with
|
||||
// a document, to conserve memory.
|
||||
var latestDocumentFiles []int
|
||||
for _, doc := range documents {
|
||||
latestDocumentFiles = append(latestDocumentFiles, doc.LatestAssociatedFile())
|
||||
}
|
||||
fcc.clearFilesExcept(latestDocumentFiles)
|
||||
}
|
||||
|
||||
// Output the list of documents, showing their associated files in ascending order.
|
||||
// Order the documents by their first associated file.
|
||||
slices.SortFunc(documents, func(a, b *Document) int {
|
||||
return a.AssociatedFiles[0] - b.AssociatedFiles[0]
|
||||
})
|
||||
for _, doc := range documents {
|
||||
doc.SortAssociatedFiles()
|
||||
fmt.Println(doc)
|
||||
}
|
||||
|
||||
return nil
|
||||
// Report that this worker is shutting down.
|
||||
dm.wg.Done()
|
||||
}
|
||||
|
||||
// DocumentIDSource is a concurrency-safe source from which to identify
|
||||
// documents. This could easily be something other than an integer, but using
|
||||
// this allows us to just use the standard library.
|
||||
var DocumentIDSource atomic.Uint32
|
||||
// compareFiles computes how much two files overlap, on a scale
|
||||
// of 0 to 1 by iterating through the files and identifying lines
|
||||
// that are duplicated.
|
||||
func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) {
|
||||
f1, err := dm.fcc.GetFileContents(f1Number)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("file %d: %w", f1Number, err)
|
||||
}
|
||||
f2, err := dm.fcc.GetFileContents(f2Number)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("file %d: %w", f2Number, err)
|
||||
}
|
||||
|
||||
histogram := make(map[string]int)
|
||||
for _, lines := range [][]string{f1, f2} {
|
||||
for _, line := range lines {
|
||||
// Skip blank lines, which can throw off the count.
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
histogram[line]++
|
||||
}
|
||||
}
|
||||
|
||||
var overlap int
|
||||
for _, v := range histogram {
|
||||
if v == 2 {
|
||||
overlap++
|
||||
}
|
||||
}
|
||||
return float64(overlap) / float64(len(histogram)), nil
|
||||
}
|
||||
|
||||
// Document stores a document ID and a list of associated files.
|
||||
type Document struct {
|
||||
|
|
@ -171,8 +322,8 @@ func (d Document) String() string {
|
|||
for _, f := range d.AssociatedFiles {
|
||||
sb.WriteString(fmt.Sprintf("%d ", f))
|
||||
}
|
||||
if UseDocPrefix {
|
||||
return fmt.Sprintf("[doc %4d] %s", d.ID, sb.String())
|
||||
if useDocPrefix {
|
||||
return fmt.Sprintf("[doc %4d] %s", d.ID, strings.TrimSpace(sb.String()))
|
||||
}
|
||||
return sb.String()
|
||||
}
|
||||
|
|
@ -186,23 +337,78 @@ func (d *Document) AssociateFile(fileNumber, timestamp int) {
|
|||
}
|
||||
|
||||
// LatestAssociatedFile returns the most recent file associated with a document.
|
||||
// Note that this presumes that the list of associated files is sorted in
|
||||
// temporal order based on the timestamp at the head of the file.
|
||||
func (d Document) LatestAssociatedFile() int {
|
||||
return d.AssociatedFiles[len(d.AssociatedFiles)-1]
|
||||
}
|
||||
|
||||
// SortAssociatedFiles sorts the list of associated files for a document,
|
||||
// since the requirements stipulate output in ascending order.
|
||||
// SortAssociatedFiles sorts the list of associated files for a document, since
|
||||
// the requirements stipulate output in ascending numerical order. Note that
|
||||
// this changes the order of associated files from their original temporal
|
||||
// order, so must only be invoked when the work is entirely finished.
|
||||
func (d *Document) SortAssociatedFiles() {
|
||||
slices.Sort(d.AssociatedFiles)
|
||||
}
|
||||
|
||||
// NewDocument creates a new Document struct and initializes an ID and records
|
||||
// the first file and timestamp associated with it.
|
||||
func NewDocument(fileNumber, timestamp int) Document {
|
||||
return Document{
|
||||
ID: DocumentIDSource.Add(1),
|
||||
LatestTimestamp: timestamp,
|
||||
AssociatedFiles: []int{fileNumber},
|
||||
// FileContentsCache is a cache of file contents, keyed by file number,
|
||||
// to avoid reading the same file from disk multiple times.
|
||||
type FileContentsCache struct {
|
||||
BaseDir string
|
||||
cache sync.Map
|
||||
}
|
||||
|
||||
// GetFileContents returns the contents of a file, excluding the first timestamp
|
||||
// line. If the file is already in the cache, the contents are returned from
|
||||
// there, otherwise the file is read from disk and the contents are cached.
|
||||
func (fcc *FileContentsCache) GetFileContents(fileNumber int) ([]string, error) {
|
||||
if contents, ok := fcc.cache.Load(fileNumber); ok {
|
||||
return contents.([]string), nil
|
||||
}
|
||||
var (
|
||||
fileName = makeFilePath(fcc.BaseDir, fileNumber)
|
||||
lines []string
|
||||
)
|
||||
|
||||
f, err := os.Open(fileName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
s := bufio.NewScanner(f)
|
||||
|
||||
// Read first line and ignore it since it's just the timestamp.
|
||||
_ = s.Scan()
|
||||
|
||||
// Read file and store contents in cache.
|
||||
for s.Scan() {
|
||||
lines = append(lines, s.Text())
|
||||
}
|
||||
if err := s.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fcc.cache.Store(fileNumber, lines)
|
||||
return lines, nil
|
||||
}
|
||||
|
||||
// ClearFilesExcept removes the contents of the fileContentsCache except for the
|
||||
// provided file numbers. This helps conserve memory by removing the contents of
|
||||
// files that are no longer of interest, which we can be sure of since we are
|
||||
// proceeding in order of time.
|
||||
func (fcc *FileContentsCache) ClearFilesExcept(fileNumbers []int) {
|
||||
// Build up a list of entries to delete to avoid modifying the concurrent
|
||||
// map while iterating over it.
|
||||
var toDelete []int
|
||||
fcc.cache.Range(func(key, _ any) bool {
|
||||
storedFileNum := key.(int)
|
||||
if !slices.Contains(fileNumbers, storedFileNum) {
|
||||
toDelete = append(toDelete, storedFileNum)
|
||||
}
|
||||
return true
|
||||
})
|
||||
for _, k := range toDelete {
|
||||
fcc.cache.Delete(k)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -231,104 +437,17 @@ func readFileTime(filepath string) (int, error) {
|
|||
return time, nil
|
||||
}
|
||||
|
||||
// compareFiles computes how much two files overlap, on a scale
|
||||
// of 0 to 1 by iterating through the files and identifying lines
|
||||
// that are duplicated.
|
||||
func compareFiles(fcc fileContentsCache, f1Number, f2Number int) (float64, error) {
|
||||
f1, err := fcc.getFileContents(f1Number)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("file %d: %w", f1Number, err)
|
||||
}
|
||||
f2, err := fcc.getFileContents(f2Number)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("file %d: %w", f2Number, err)
|
||||
}
|
||||
|
||||
histogram := make(map[string]int)
|
||||
for _, lines := range [][]string{f1, f2} {
|
||||
for _, line := range lines {
|
||||
histogram[line]++
|
||||
}
|
||||
}
|
||||
|
||||
var overlap int
|
||||
for _, v := range histogram {
|
||||
if v == 2 {
|
||||
overlap++
|
||||
}
|
||||
}
|
||||
return float64(overlap) / float64(len(histogram)), nil
|
||||
}
|
||||
|
||||
// fileContentsCache is a cache of file contents, keyed by file number,
|
||||
// to avoid reading the same file from disk multiple times.
|
||||
type fileContentsCache map[int][]string
|
||||
|
||||
// getFileContents returns the contents of a file, excluding the first timestamp
|
||||
// line. If the file is already in the cache, the contents are returned from
|
||||
// there, otherwise the file is read from disk and the contents are cached.
|
||||
func (fcc fileContentsCache) getFileContents(fileNumber int) ([]string, error) {
|
||||
if contents, ok := fcc[fileNumber]; ok {
|
||||
return contents, nil
|
||||
}
|
||||
var (
|
||||
fileName = makeFilePath(fileNumber)
|
||||
lines []string
|
||||
)
|
||||
|
||||
f, err := os.Open(fileName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
s := bufio.NewScanner(f)
|
||||
|
||||
// Ignore first line that's just a timestamp.
|
||||
if !s.Scan() {
|
||||
fcc[fileNumber] = []string{}
|
||||
return []string{}, nil
|
||||
}
|
||||
|
||||
for s.Scan() {
|
||||
lines = append(lines, s.Text())
|
||||
}
|
||||
if err := s.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return lines, nil
|
||||
}
|
||||
|
||||
// clearFilesExcept removes the contents of the fileContentsCache except for the
|
||||
// provided file numbers. This helps conserve memory by removing the contents of
|
||||
// files that are no longer of interest, which we can be sure of since we are
|
||||
// proceeding in order of time.
|
||||
func (fcc fileContentsCache) clearFilesExcept(fileNumbers []int) {
|
||||
for fNum := range fcc {
|
||||
if !slices.Contains(fileNumbers, fNum) {
|
||||
delete(fcc, fNum)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func makeFileName(number int) string {
|
||||
return fmt.Sprintf("%d.txt", number)
|
||||
}
|
||||
|
||||
func makeFilePath(number int) string {
|
||||
return path.Join(DataFilePath, makeFileName(number))
|
||||
}
|
||||
|
||||
// orderFiles determines the timestamp version of each file and creates a map of
|
||||
// time to file numbers. It sorts the times (since maps are not ordered) so that
|
||||
// the map can be iterated in order of time. This allows stepping through the
|
||||
// history of the files from the beginning. Using this, we can construct a
|
||||
// "chain" of evolution for a given document.
|
||||
func orderFiles() (map[int][]int, []int, error) {
|
||||
func orderFiles(dir string) (map[int][]int, []int, error) {
|
||||
timeMap := make(map[int][]int)
|
||||
|
||||
dirEntries, err := os.ReadDir(DataFilePath)
|
||||
dirEntries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("reading directory %s: %w", DataFilePath, err)
|
||||
return nil, nil, fmt.Errorf("reading directory %s: %w", dir, err)
|
||||
}
|
||||
for _, entry := range dirEntries {
|
||||
if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") {
|
||||
|
|
@ -345,7 +464,7 @@ func orderFiles() (map[int][]int, []int, error) {
|
|||
}
|
||||
}
|
||||
|
||||
filePath := path.Join(DataFilePath, entry.Name())
|
||||
filePath := path.Join(dir, entry.Name())
|
||||
modTime, err := readFileTime(filePath)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
|
|
@ -366,8 +485,16 @@ func orderFiles() (map[int][]int, []int, error) {
|
|||
return timeMap, timeSlice, nil
|
||||
}
|
||||
|
||||
func makeFileName(number int) string {
|
||||
return fmt.Sprintf("%d.txt", number)
|
||||
}
|
||||
|
||||
func makeFilePath(dataFilePath string, number int) string {
|
||||
return path.Join(dataFilePath, makeFileName(number))
|
||||
}
|
||||
|
||||
func log(msg string, args ...any) {
|
||||
if Verbose {
|
||||
if verbose {
|
||||
slog.Info(msg, args...)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,123 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestReadFileTime(t *testing.T) {
|
||||
tt := []struct {
|
||||
fileName string
|
||||
expectedTimestamp int
|
||||
}{
|
||||
{"1.txt", 3},
|
||||
{"2.txt", 5},
|
||||
{"3.txt", 11},
|
||||
}
|
||||
for _, tc := range tt {
|
||||
t.Run(tc.fileName, func(t *testing.T) {
|
||||
filePath := filepath.Join("testdata", tc.fileName)
|
||||
timestamp, err := readFileTime(filePath)
|
||||
if err != nil {
|
||||
t.Fatal("error reading file time: ", err)
|
||||
}
|
||||
if timestamp != tc.expectedTimestamp {
|
||||
t.Errorf("expected %d, got %d", tc.expectedTimestamp, timestamp)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestOrderFiles(t *testing.T) {
|
||||
var (
|
||||
expectedOrder = []int{3, 5, 11}
|
||||
expectedMap = map[int][]int{
|
||||
3: {1},
|
||||
5: {2, 7},
|
||||
11: {3, 4, 5},
|
||||
}
|
||||
)
|
||||
fileMap, order, err := orderFiles("testdata")
|
||||
if err != nil {
|
||||
t.Fatal("error ordering files: ", err)
|
||||
}
|
||||
if !reflect.DeepEqual(order, expectedOrder) {
|
||||
t.Errorf("expected %v, got %v", expectedOrder, order)
|
||||
}
|
||||
if !reflect.DeepEqual(fileMap, expectedMap) {
|
||||
t.Errorf("expected %v, got %v", expectedMap, fileMap)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFileContentsCache(t *testing.T) {
|
||||
fcc := FileContentsCache{BaseDir: "testdata"}
|
||||
cases := []struct {
|
||||
fileNumber int
|
||||
contents string
|
||||
}{
|
||||
{1, "foo foo foo"},
|
||||
{2, "bar bar bar"},
|
||||
{3, "baz baz baz"},
|
||||
}
|
||||
|
||||
// Test initial reads.
|
||||
for _, c := range cases {
|
||||
t.Run(fmt.Sprintf("initial read %d", c.fileNumber), func(t *testing.T) {
|
||||
got, err := fcc.GetFileContents(c.fileNumber)
|
||||
if err != nil {
|
||||
t.Fatal("error getting file contents: ", err)
|
||||
}
|
||||
if !reflect.DeepEqual(got, []string{c.contents}) {
|
||||
t.Errorf("expected %q, got %q", c.contents, got)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Ensure files are actually stored in cache.
|
||||
for _, c := range cases {
|
||||
t.Run(fmt.Sprintf("cache check %d", c.fileNumber), func(t *testing.T) {
|
||||
if got, ok := fcc.cache.Load(c.fileNumber); !ok {
|
||||
t.Fatalf("file %d not found in cache", c.fileNumber)
|
||||
} else if !reflect.DeepEqual(got, []string{c.contents}) {
|
||||
t.Fatalf("expected %q, got %q", c.contents, got)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Test clear-except operation.
|
||||
t.Run("clear except", func(t *testing.T) {
|
||||
fcc.ClearFilesExcept([]int{1})
|
||||
if _, ok := fcc.cache.Load(1); !ok {
|
||||
t.Fatal("file 1 not found in cache, expected to be kept")
|
||||
}
|
||||
if _, ok := fcc.cache.Load(2); ok {
|
||||
t.Fatal("file 2 found in cache, expected to be cleared")
|
||||
}
|
||||
if _, ok := fcc.cache.Load(3); ok {
|
||||
t.Fatal("file 3 found in cache, expected to be cleared")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestEndToEnd(t *testing.T) {
|
||||
docs, err := run([]string{"argv0", "-path", "testdata/e2e"})
|
||||
want := []int{1, 6, 9, 12, 14, 18}
|
||||
if err != nil {
|
||||
t.Fatal("error running program: ", err)
|
||||
}
|
||||
if len(docs) != 1 {
|
||||
t.Fatalf("expected %d documents, got %d", 1, len(docs))
|
||||
}
|
||||
doc := docs[0]
|
||||
if doc.ID != 1 {
|
||||
t.Errorf("expected ID %d, got %d", 0, doc.ID)
|
||||
}
|
||||
if doc.LatestTimestamp != 5 {
|
||||
t.Errorf("expected latest timestamp %d, got %d", 3, doc.LatestTimestamp)
|
||||
}
|
||||
if !reflect.DeepEqual(doc.AssociatedFiles, want) {
|
||||
t.Errorf("expected associated files %v, got %v", want, doc.AssociatedFiles)
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
3
|
||||
foo foo foo
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
5
|
||||
bar bar bar
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
11
|
||||
baz baz baz
|
||||
|
|
@ -0,0 +1 @@
|
|||
11
|
||||
|
|
@ -0,0 +1 @@
|
|||
11
|
||||
|
|
@ -0,0 +1 @@
|
|||
5
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
0
|
||||
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
|
||||
That Funky Music”. When it was later released as its own single, it became an
|
||||
international smash hit.
|
||||
|
||||
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
|
||||
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
|
||||
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
|
||||
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
|
||||
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
|
||||
over Queen’s “Under Pressure”, was the B-side.
|
||||
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
5
|
||||
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
|
||||
That Funky Music”. When it was later released as its own single, it became an
|
||||
international smash hit.
|
||||
|
||||
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
|
||||
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
|
||||
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
|
||||
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
|
||||
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
|
||||
over Queen’s “Under Pressure”, was the B-side.
|
||||
|
||||
A DJ at an FM station in Georgia liked the B-side better and played it on air.
|
||||
Soon it became that station’s #1 requested song, leading stations in Tennessee
|
||||
and Texas to do the same. It also became Video Jukebox’s most requested video.
|
||||
After SBK Records' founder was played the song over the phone, he signed Ice the
|
||||
next day. In August of 1990, his label released “Ice Ice Baby” as the A-side
|
||||
with “Play That Funky Music” as its flipside.
|
||||
|
||||
The song began climbing charts around the world, eventually reaching the top 10
|
||||
in twelve countries – hitting #1 in six of them including the UK and the US
|
||||
(where it became the first chart-topping rap single in history).
|
||||
|
||||
“Ice Ice Baby” contains an uncleared sample of Queen’s “Under Pressure”, so when
|
||||
confronted about it, Ice claimed he’d altered it, but he later admitted he
|
||||
actually hadn’t. The parties settled out of court for an undisclosed sum, and
|
||||
members of Queen, plus the guest vocalist on the original song David Bowie, were
|
||||
also given songwriting credits.
|
||||
|
||||
The song is credited for ‘making hip-hop an acceptable genre to mainstream
|
||||
media’ and continues to be popular into the 2000s. It was certified Gold in 2005
|
||||
for selling 500K digital downloads and named by VH1 the #29 top song of the 90s.
|
||||
|
||||
But it also has its share of negative feedback, with MTV ranking it the #9 worst
|
||||
video in history, and Houston Press calling it the worst song to come from
|
||||
Texas. In 2012, actor/comedian Adam Scott discussed the song’s opening lyrics on
|
||||
Conan, pointing out how ridiculous they sound when analyzed.
|
||||
|
|
@ -0,0 +1,18 @@
|
|||
1
|
||||
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
|
||||
That Funky Music”. When it was later released as its own single, it became an
|
||||
international smash hit.
|
||||
|
||||
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
|
||||
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
|
||||
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
|
||||
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
|
||||
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
|
||||
over Queen’s “Under Pressure”, was the B-side.
|
||||
|
||||
A DJ at an FM station in Georgia liked the B-side better and played it on air.
|
||||
Soon it became that station’s #1 requested song, leading stations in Tennessee
|
||||
and Texas to do the same. It also became Video Jukebox’s most requested video.
|
||||
After SBK Records' founder was played the song over the phone, he signed Ice the
|
||||
next day. In August of 1990, his label released “Ice Ice Baby” as the A-side
|
||||
with “Play That Funky Music” as its flipside.
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
4
|
||||
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
|
||||
That Funky Music”. When it was later released as its own single, it became an
|
||||
international smash hit.
|
||||
|
||||
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
|
||||
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
|
||||
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
|
||||
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
|
||||
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
|
||||
over Queen’s “Under Pressure”, was the B-side.
|
||||
|
||||
A DJ at an FM station in Georgia liked the B-side better and played it on air.
|
||||
Soon it became that station’s #1 requested song, leading stations in Tennessee
|
||||
and Texas to do the same. It also became Video Jukebox’s most requested video.
|
||||
After SBK Records' founder was played the song over the phone, he signed Ice the
|
||||
next day. In August of 1990, his label released “Ice Ice Baby” as the A-side
|
||||
with “Play That Funky Music” as its flipside.
|
||||
|
||||
The song began climbing charts around the world, eventually reaching the top 10
|
||||
in twelve countries – hitting #1 in six of them including the UK and the US
|
||||
(where it became the first chart-topping rap single in history).
|
||||
|
||||
That year “Ice Ice Baby” was certified Platinum two months after its release and
|
||||
was ranked the #45 song of 1990 by Billboard. The song was also nominated for a
|
||||
Grammy that year for Best Solo Rap Performance, but lost to MC Hammer’s “U Can’t
|
||||
Touch This”.
|
||||
|
||||
Death Row Records' CEO, Suge Knight, learned that Marvin “Chocolate” Johnson, a
|
||||
Death Row signee, was a co-writer on the track, so he invited Ice to his Los
|
||||
Angeles hotel room to negotiate payment. It has been rumored that Vanilla Ice
|
||||
was hung over a balcony during the negotations, but Ice has denied these rumors
|
||||
several times, insisting, “He didn’t have to hang me from no balcony or slap me
|
||||
around or nothing”.
|
||||
|
||||
“Ice Ice Baby” contains an uncleared sample of Queen’s “Under Pressure”, so when
|
||||
confronted about it, Ice claimed he’d altered it, but he later admitted he
|
||||
actually hadn’t. The parties settled out of court for an undisclosed sum, and
|
||||
members of Queen, plus the guest vocalist on the original song David Bowie, were
|
||||
also given songwriting credits.
|
||||
|
||||
The song is credited for ‘making hip-hop an acceptable genre to mainstream
|
||||
media’ and continues to be popular into the 2000s. It was certified Gold in 2005
|
||||
for selling 500K digital downloads and named by VH1 the #29 top song of the 90s.
|
||||
|
||||
But it also has its share of negative feedback, with MTV ranking it the #9 worst
|
||||
video in history, and Houston Press calling it the worst song to come from
|
||||
Texas. In 2012, actor/comedian Adam Scott discussed the song’s opening lyrics on
|
||||
Conan, pointing out how ridiculous they sound when analyzed.
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
2
|
||||
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
|
||||
That Funky Music”. When it was later released as its own single, it became an
|
||||
international smash hit.
|
||||
|
||||
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
|
||||
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
|
||||
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
|
||||
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
|
||||
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
|
||||
over Queen’s “Under Pressure”, was the B-side.
|
||||
|
||||
A DJ at an FM station in Georgia liked the B-side better and played it on air.
|
||||
Soon it became that station’s #1 requested song, leading stations in Tennessee
|
||||
and Texas to do the same. It also became Video Jukebox’s most requested video.
|
||||
After SBK Records' founder was played the song over the phone, he signed Ice the
|
||||
next day. In August of 1990, his label released “Ice Ice Baby” as the A-side
|
||||
with “Play That Funky Music” as its flipside.
|
||||
|
||||
The song began climbing charts around the world, eventually reaching the top 10
|
||||
in twelve countries – hitting #1 in six of them including the UK and the US
|
||||
(where it became the first chart-topping rap single in history).
|
||||
|
||||
That year “Ice Ice Baby” was certified Platinum two months after its release and
|
||||
was ranked the #45 song of 1990 by Billboard. The song was also nominated for a
|
||||
Grammy that year for Best Solo Rap Performance, but lost to MC Hammer’s “U Can’t
|
||||
Touch This”.
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
3
|
||||
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
|
||||
That Funky Music”. When it was later released as its own single, it became an
|
||||
international smash hit.
|
||||
|
||||
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
|
||||
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
|
||||
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
|
||||
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
|
||||
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
|
||||
over Queen’s “Under Pressure”, was the B-side.
|
||||
|
||||
A DJ at an FM station in Georgia liked the B-side better and played it on air.
|
||||
Soon it became that station’s #1 requested song, leading stations in Tennessee
|
||||
and Texas to do the same. It also became Video Jukebox’s most requested video.
|
||||
After SBK Records' founder was played the song over the phone, he signed Ice the
|
||||
next day. In August of 1990, his label released “Ice Ice Baby” as the A-side
|
||||
with “Play That Funky Music” as its flipside.
|
||||
|
||||
The song began climbing charts around the world, eventually reaching the top 10
|
||||
in twelve countries – hitting #1 in six of them including the UK and the US
|
||||
(where it became the first chart-topping rap single in history).
|
||||
|
||||
That year “Ice Ice Baby” was certified Platinum two months after its release and
|
||||
was ranked the #45 song of 1990 by Billboard. The song was also nominated for a
|
||||
Grammy that year for Best Solo Rap Performance, but lost to MC Hammer’s “U Can’t
|
||||
Touch This”.
|
||||
|
||||
Death Row Records' CEO, Suge Knight, learned that Marvin “Chocolate” Johnson, a
|
||||
Death Row signee, was a co-writer on the track, so he invited Ice to his Los
|
||||
Angeles hotel room to negotiate payment. It has been rumored that Vanilla Ice
|
||||
was hung over a balcony during the negotations, but Ice has denied these rumors
|
||||
several times, insisting, “He didn’t have to hang me from no balcony or slap me
|
||||
around or nothing”.
|
||||
|
||||
“Ice Ice Baby” contains an uncleared sample of Queen’s “Under Pressure”, so when
|
||||
confronted about it, Ice claimed he’d altered it, but he later admitted he
|
||||
actually hadn’t. The parties settled out of court for an undisclosed sum, and
|
||||
members of Queen, plus the guest vocalist on the original song David Bowie, were
|
||||
also given songwriting credits.
|
||||
Loading…
Reference in New Issue