From b6de64cde69837a6e9bd294ddf1946aa9eb8d445 Mon Sep 17 00:00:00 2001 From: Ian Molee Date: Fri, 5 Apr 2024 02:03:14 -0700 Subject: [PATCH] Major refactor: use worker pool Use a bounded worker pool to prevent creation of hundreds of goroutines contending for scheduling. Add some tests, a Dockerfile, a Makefile, and a readme. --- .gitignore | 2 + Dockerfile | 11 + Makefile | 20 ++ README.md | 39 ++++ go.mod | 4 +- main.go | 539 +++++++++++++++++++++++++++----------------- main_test.go | 123 ++++++++++ testdata/1.txt | 2 + testdata/2.txt | 2 + testdata/3.txt | 2 + testdata/4.txt | 1 + testdata/5.txt | 1 + testdata/7.txt | 1 + testdata/e2e/1.txt | 12 + testdata/e2e/12.txt | 37 +++ testdata/e2e/14.txt | 18 ++ testdata/e2e/18.txt | 49 ++++ testdata/e2e/6.txt | 27 +++ testdata/e2e/9.txt | 40 ++++ 19 files changed, 721 insertions(+), 209 deletions(-) create mode 100644 Dockerfile create mode 100644 Makefile create mode 100644 README.md create mode 100644 main_test.go create mode 100644 testdata/1.txt create mode 100644 testdata/2.txt create mode 100644 testdata/3.txt create mode 100644 testdata/4.txt create mode 100644 testdata/5.txt create mode 100644 testdata/7.txt create mode 100644 testdata/e2e/1.txt create mode 100644 testdata/e2e/12.txt create mode 100644 testdata/e2e/14.txt create mode 100644 testdata/e2e/18.txt create mode 100644 testdata/e2e/6.txt create mode 100644 testdata/e2e/9.txt diff --git a/.gitignore b/.gitignore index 397b4a7..29596f3 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ *.log +output.*.txt +.vscode diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..26d7e89 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM golang:1.22 as builder +WORKDIR /go/src/docgrouper +COPY testdata testdata/ +COPY *.go go.mod go.sum ./ +RUN go mod download +RUN go test -v ./... 
&& CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o docgroup + +FROM gcr.io/distroless/base-nossl-debian12 +COPY --from=builder /go/src/docgrouper/docgrouper /bin/docgrouper +VOLUME [ "/files" ] +ENTRYPOINT [ "docgroup" ] \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d663dc3 --- /dev/null +++ b/Makefile @@ -0,0 +1,20 @@ +DOCKER_IMAGE := "steelray-docgrouper" +BINNAME := "steelray-docgrouper" +FILES_PATH := "./files.200" + +build: *.go + go build -o $(BINNAME) . + +test: + go test -v ./... + +clean: + rm -f $(BINNAME) + +docker-build: + docker build -t $(DOCKER_IMAGE) . + +docker-run: + docker run -v $(FILES_PATH):/files $(DOCKER_IMAGE):latest + +.PHONY: docker-build docker-run \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..af5676a --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +# Docgrouper + +Given a set of files, each with an integer timestamp as its first line, identify the set +of documents that they represent at various points of each document's life. + +## Building + +Building **docgrouper** requires [Go](https://go.dev); it can be built by +running `make build`. Because Go might not be installed, a `Dockerfile` is +provided to test and build a container image. The Docker image can be built via +the `docker-build` Makefile target. + +## Running + +If running via Docker, the directory where the file pool exists must be mounted +into the container, via the `-v` or `--volume` switch, like so: + +``` +docker run --volume ./host-files:/files steelray-docgrouper +``` + +This invocation is made available via the `docker-run` Makefile target, but this +will only invoke docgrouper with the default command line arguments since +arguments cannot be passed to a Makefile target. 
+ +## Options + +``` + -path string + path to the file pool (default "files") + -prefix + use '[doc ###]' prefix for output + -threshold float + similarity threshold (default 0.5) + -verbose + enable verbose logging + -workers int + number of workers to use (default 2 * number of CPUs) +``` diff --git a/go.mod b/go.mod index 49f2ba2..fab0fac 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,3 @@ module github.com/ianfoo/steelray-docgrouper -go 1.22.1 - -require github.com/adrg/strutil v0.3.1 +go 1.22.2 diff --git a/main.go b/main.go index 7fe8cb1..a3b4746 100644 --- a/main.go +++ b/main.go @@ -1,8 +1,11 @@ -// Timesheet +// Time sheet // Previously ~1h // March 13, 2024: 00:00-02:30 // March 19, 2024: 15:00-19:00 -// March 23, 2024: 20:00- +// March 23, 2024: 20:00-22:00 +// April 02, 2024: 12:30-17:00 +// April 04, 2024: 21:00-23:30 +// April 05, 2024: 00:00-02:00 package main import ( @@ -12,6 +15,7 @@ import ( "log/slog" "os" "path" + "runtime" "slices" "strconv" "strings" @@ -19,143 +23,290 @@ import ( "sync/atomic" ) -// DataFilePath describes the default location where the file pool can be found. -const DefaultDataFilePath = "files" +const ( + // defaultSimilarityThreshold is the default minimum similarity required for + // two files to be considered related. This value is arbitrary and could be + // adjusted based on the specific requirements of the problem. + defaultSimilarityThreshold = 0.5 + + // defaultDataFilePath describes the default location where the file pool can be + // found. 
+ defaultDataFilePath = "files" +) // Command line options var ( - DataFilePath string - UseDocPrefix bool - Verbose bool + dataFilePath string + similarityThreshold float64 + useDocPrefix bool + verbose bool + numWorkers int ) func main() { - if err := run(os.Args); err != nil { + documents, err := run(os.Args) + if err != nil { fmt.Fprintf(os.Stderr, "error: %v\n", err) - os.Exit(1) + os.Exit(-1) + } + for _, doc := range documents { + fmt.Println(doc) } } // run is the main entry point for the program. -func run(args []string) error { +func run(args []string) ([]*Document, error) { flags := flag.NewFlagSet(args[0], flag.ExitOnError) - flags.StringVar(&DataFilePath, "path", DefaultDataFilePath, "path to the file pool") - flags.BoolVar(&UseDocPrefix, "prefix", false, "use '[doc ###]' prefix for output") - flags.BoolVar(&Verbose, "verbose", false, "enable verbose logging") - flags.Parse(args[1:]) + flags.StringVar(&dataFilePath, "path", defaultDataFilePath, "path to the file pool") + flags.Float64Var(&similarityThreshold, "threshold", defaultSimilarityThreshold, "similarity threshold") + flags.IntVar(&numWorkers, "workers", runtime.NumCPU()*2, "number of workers to use") + flags.BoolVar(&useDocPrefix, "prefix", false, "use '[doc ###]' prefix for output") + flags.BoolVar(&verbose, "verbose", false, "enable verbose logging") + _ = flags.Parse(args[1:]) - // SimilarityThreshold is the minimum similarity required for two files - // to be considered related. This value is arbitrary and could be adjusted - // based on the specific requirements of the problem. - const SimilarityThreshold = 0.5 - - fileTimes, times, err := orderFiles() + // The files need to be processed in order of time, so determine the + // timestamp of each file and sort them by time. + fileTimes, times, err := orderFiles(dataFilePath) if err != nil { - return err + return nil, err } - var ( - // documents is the master list of documents that will be built up. 
- documents []*Document - - // fcc handles reading files and caching contents. - fcc = make(fileContentsCache) - ) + dm := NewDocumentManager(dataFilePath, similarityThreshold, numWorkers) for i, timestamp := range times { - _ = i - // fmt.Printf("\rProcessing timestamp %d/%d", i+1, len(fileTimes)) - // Track the files at this timestamp that have been associated with documents, so - // we can identify unassociated files later and then create new documents for them. var ( - wg sync.WaitGroup + // Track the files at this timestamp that have been associated with + // documents, so we can identify unassociated files later and then + // create new documents for them. This needs to be distinct for each + // timestamp, so it's created inside the timestamp loop's scope. associatedFiles sync.Map + + // We might need to create new documents for files that weren't + // associated with any document at this timestamp, so we need to make + // sure that this timestamp has been entirely processed first. We do + // this by waiting for the workers to indicate they've finished a work + // item. + wg sync.WaitGroup ) - wg.Add(len(documents)) - log("processing timestamp", "timestamp", timestamp, "numWorkers", len(documents)) - for _, doc := range documents { - // Start a goroutine for each document, to parallelize the - // comparison with the files in the current timestamp. A more robust - // solution would limit the number of concurrent goroutines to avoid - // exhausting system resources, but for this problem we won't have - // more than a couple thousand documents. Goroutines are - // lightweight enough (2K stack) that we can start them pretty - // capriciously. - go func(doc *Document, files []int) { - defer wg.Done() - for _, candidateFileNumber := range files { - // Check to be certain this file hasn't been associated with another - // document already. If it has been, continue to the next file. 
- if _, ok := associatedFiles.Load(candidateFileNumber); ok { - continue - } - - latestFileNumber := doc.LatestAssociatedFile() - overlap, err := compareFiles(fcc, latestFileNumber, candidateFileNumber) - if err != nil { - fmt.Fprintf( - os.Stderr, - "error comparing files %d and %d: %v\n", - latestFileNumber, candidateFileNumber, err, - ) - } - if overlap >= SimilarityThreshold { - // Add file to Document associated list - doc.AssociateFile(candidateFileNumber, timestamp) - associatedFiles.Store(candidateFileNumber, struct{}{}) - - // We know this document won't be associated with any other files - // with this timestamp, so we can stop looking at files with this - // timestamp, for this document. - return - } - } - }(doc, fileTimes[timestamp]) + log("processing timestamp", "timestamp", timestamp, "timestampIndex", i, "totalTimestamps", len(times)) + for i, doc := range dm.Documents { + wg.Add(1) + dm.WorkCh <- WorkItem{ + doc: doc, + fileNumbers: fileTimes[timestamp], + timestamp: timestamp, + associatedFiles: &associatedFiles, + wg: &wg, + } + log( + "submitted work", + "documentNumber", i+1, + "documentID", doc.ID, + "totalDocs", len(dm.Documents), + "timestamp", timestamp, + ) } - // Wait for all document comparisons to complete for this timestamp. wg.Wait() - // If we haven't associated all the files with existing documents, we need - // to create new documents for those that remain. - currentNumDocs := len(documents) + // Now that this timestamp has been fully processed, we can check to see + // what files haven't been associated with existing documents, and create new + // documents for them. 
+ var docsAdded int for _, fileNumber := range fileTimes[timestamp] { if _, ok := associatedFiles.Load(fileNumber); !ok { - doc := NewDocument(fileNumber, timestamp) - documents = append(documents, &doc) + dm.AddNewDocument(fileNumber, timestamp) + docsAdded++ } } - if len(documents) > currentNumDocs { - log("created new documents", "numAdded", len(documents)-currentNumDocs, "timestamp", timestamp) + if docsAdded > 0 { + log("created new documents", "numAdded", docsAdded, "timestamp", timestamp) } - // Now we can clear the cache of file contents for files that aren't associated with - // a document, to conserve memory. - var latestDocumentFiles []int - for _, doc := range documents { - latestDocumentFiles = append(latestDocumentFiles, doc.LatestAssociatedFile()) - } - fcc.clearFilesExcept(latestDocumentFiles) + // Free up memory. + dm.ShrinkCache() } - // Output the list of documents, showing their associated files in ascending order. - // Order the documents by their first associated file. - slices.SortFunc(documents, func(a, b *Document) int { - return a.AssociatedFiles[0] - b.AssociatedFiles[0] - }) - for _, doc := range documents { - doc.SortAssociatedFiles() - fmt.Println(doc) - } - - return nil + dm.Shutdown() + return dm.SortedDocuments(), nil } -// DocumentIDSource is a concurrency-safe source from which to identify -// documents. This could easily be something other than an integer, but using -// this allows us to just use the standard library. -var DocumentIDSource atomic.Uint32 +// WorkItem is what will be sent to the workers in the worker pool. +type WorkItem struct { + doc *Document + fileNumbers []int + timestamp int + associatedFiles *sync.Map + wg *sync.WaitGroup +} + +// DocumentManager handles the processing of documents and files. It maintains a +// list of documents and a cache of file contents, and uses a pool of workers to +// compare documents against files. 
+type DocumentManager struct { + // Documents is the list of documents that have been identified. + Documents []*Document + + // WorkCh is the channel through which work items are submitted to the workers. + WorkCh chan WorkItem + + // docIDSource is a concurrency-safe source from which to identify documents. + // This could easily be something other than an integer, but using this allows + // us to just use the standard library. + docIDSource atomic.Uint32 + + similarityThreshold float64 + fcc *FileContentsCache + wg sync.WaitGroup +} + +// NewDocumentManager creates a new DocumentManager with the specified base path +// for the file pool and the specified number of workers. +func NewDocumentManager(fileBasePath string, similarityThreshold float64, numWorkers int) *DocumentManager { + dm := &DocumentManager{ + Documents: make([]*Document, 0), + similarityThreshold: similarityThreshold, + fcc: &FileContentsCache{BaseDir: fileBasePath}, + WorkCh: make(chan WorkItem), + } + + // Start workers. + for wID := range numWorkers { + go dm.ComparisonWorker(wID + 1) + } + dm.wg.Add(numWorkers) + + return dm +} + +// Shutdown cleans up the document manager by closing the work channel to +// trigger workers to exit and then waits for all workers to exit. +func (dm *DocumentManager) Shutdown() { + close(dm.WorkCh) + dm.wg.Wait() +} + +func (dm *DocumentManager) AddNewDocument(fileNumber, timestamp int) { + doc := Document{ + ID: dm.docIDSource.Add(1), + LatestTimestamp: timestamp, + AssociatedFiles: []int{fileNumber}, + } + dm.Documents = append(dm.Documents, &doc) +} + +// ShrinkCache removes files from the cache that will never be used again, by +// evicting those files that are not associated with any document. Note that +// this is not concurrent-safe and should only be called when operations that +// could modify the document list are not running. This is an optimization, but +// could be removed if memory usage is not a concern. 
+func (dm *DocumentManager) ShrinkCache() { + var latestDocumentFiles []int + for _, doc := range dm.Documents { + latestDocumentFiles = append(latestDocumentFiles, doc.LatestAssociatedFile()) + } + dm.fcc.ClearFilesExcept(latestDocumentFiles) +} + +// SortedDocuments returns the list of documents with their associated files in +// ascending order, and the documents themselves ordered by their first +// associated file. +func (dm *DocumentManager) SortedDocuments() []*Document { + // Sort the associated files for each document. + for _, doc := range dm.Documents { + doc.SortAssociatedFiles() + } + // Sort the documents by their first associated file number. + slices.SortFunc(dm.Documents, func(a, b *Document) int { + return a.AssociatedFiles[0] - b.AssociatedFiles[0] + }) + return dm.Documents +} + +// ComparisonWorker is a function that receives work items describing a document +// and a list of candidate file IDs to compare against. It will compare the +// document against each file and if a match is found, associate the file with the +// document sent in the work item, and record the file as having been matched. +func (dm *DocumentManager) ComparisonWorker(workerID int) { + for workItem := range dm.WorkCh { + for _, fileNumber := range workItem.fileNumbers { + if _, ok := workItem.associatedFiles.Load(fileNumber); ok { + // This file has already been matched; skip it. + continue + } + latestFileNumber := workItem.doc.LatestAssociatedFile() + similarity, err := dm.compareFiles(latestFileNumber, fileNumber) + if err != nil { + // Simplistic error handling: log the error and continue. + slog.Error( + "error comparing files", + "file1", latestFileNumber, + "file2", fileNumber, + "document", workItem.doc.ID, + "worker", workerID, + ) + } + + // If current file doesn't match current document, skip to the next file. + if similarity < dm.similarityThreshold { + continue + } + + // Current file matches current document, so record this. 
+ workItem.doc.AssociateFile(fileNumber, workItem.timestamp) + workItem.associatedFiles.Store(fileNumber, struct{}{}) + log( + "match found", + "document", workItem.doc.ID, + "file", fileNumber, + "time", workItem.timestamp, + "worker", workerID, + ) + + // We don't need to consider this document anymore since we've found + // a match. End processing and wait for more work. + break + } + workItem.wg.Done() + } + + // Report that this worker is shutting down. + dm.wg.Done() +} + +// compareFiles computes how much two files overlap, on a scale +// of 0 to 1 by iterating through the files and identifying lines +// that are duplicated. +func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) { + f1, err := dm.fcc.GetFileContents(f1Number) + if err != nil { + return 0, fmt.Errorf("file %d: %w", f1Number, err) + } + f2, err := dm.fcc.GetFileContents(f2Number) + if err != nil { + return 0, fmt.Errorf("file %d: %w", f2Number, err) + } + + histogram := make(map[string]int) + for _, lines := range [][]string{f1, f2} { + for _, line := range lines { + // Skip blank lines, which can throw off the count. + if line == "" { + continue + } + histogram[line]++ + } + } + + var overlap int + for _, v := range histogram { + if v == 2 { + overlap++ + } + } + return float64(overlap) / float64(len(histogram)), nil +} // Document stores a document ID and a list of associated files. type Document struct { @@ -171,8 +322,8 @@ func (d Document) String() string { for _, f := range d.AssociatedFiles { sb.WriteString(fmt.Sprintf("%d ", f)) } - if UseDocPrefix { - return fmt.Sprintf("[doc %4d] %s", d.ID, sb.String()) + if useDocPrefix { + return fmt.Sprintf("[doc %4d] %s", d.ID, strings.TrimSpace(sb.String())) } return sb.String() } @@ -186,23 +337,78 @@ func (d *Document) AssociateFile(fileNumber, timestamp int) { } // LatestAssociatedFile returns the most recent file associated with a document. 
+// Note that this presumes that the list of associated files is sorted in +// temporal order based on the timestamp at the head of the file. func (d Document) LatestAssociatedFile() int { return d.AssociatedFiles[len(d.AssociatedFiles)-1] } -// SortAssociatedFiles sorts the list of associated files for a document, -// since the requirements stipulate output in ascending order. +// SortAssociatedFiles sorts the list of associated files for a document, since +// the requirements stipulate output in ascending numerical order. Note that +// this changes the order of associated files from their original temporal +// order, so must only be invoked when the work is entirely finished. func (d *Document) SortAssociatedFiles() { slices.Sort(d.AssociatedFiles) } -// NewDocument creates a new Document struct and initializes an ID and records -// the first file and timestamp associated with it. -func NewDocument(fileNumber, timestamp int) Document { - return Document{ - ID: DocumentIDSource.Add(1), - LatestTimestamp: timestamp, - AssociatedFiles: []int{fileNumber}, +// FileContentsCache is a cache of file contents, keyed by file number, +// to avoid reading the same file from disk multiple times. +type FileContentsCache struct { + BaseDir string + cache sync.Map +} + +// GetFileContents returns the contents of a file, excluding the first timestamp +// line. If the file is already in the cache, the contents are returned from +// there, otherwise the file is read from disk and the contents are cached. +func (fcc *FileContentsCache) GetFileContents(fileNumber int) ([]string, error) { + if contents, ok := fcc.cache.Load(fileNumber); ok { + return contents.([]string), nil + } + var ( + fileName = makeFilePath(fcc.BaseDir, fileNumber) + lines []string + ) + + f, err := os.Open(fileName) + if err != nil { + return nil, err + } + + s := bufio.NewScanner(f) + + // Read first line and ignore it since it's just the timestamp. + _ = s.Scan() + + // Read file and store contents in cache. 
+ for s.Scan() { + lines = append(lines, s.Text()) + } + if err := s.Err(); err != nil { + return nil, err + } + + fcc.cache.Store(fileNumber, lines) + return lines, nil +} + +// ClearFilesExcept removes the contents of the fileContentsCache except for the +// provided file numbers. This helps conserve memory by removing the contents of +// files that are no longer of interest, which we can be sure of since we are +// proceeding in order of time. +func (fcc *FileContentsCache) ClearFilesExcept(fileNumbers []int) { + // Build up a list of entries to delete to avoid modifying the concurrent + // map while iterating over it. + var toDelete []int + fcc.cache.Range(func(key, _ any) bool { + storedFileNum := key.(int) + if !slices.Contains(fileNumbers, storedFileNum) { + toDelete = append(toDelete, storedFileNum) + } + return true + }) + for _, k := range toDelete { + fcc.cache.Delete(k) } } @@ -231,104 +437,17 @@ func readFileTime(filepath string) (int, error) { return time, nil } -// compareFiles computes how much two files overlap, on a scale -// of 0 to 1 by iterating through the files and identifying lines -// that are duplicated. -func compareFiles(fcc fileContentsCache, f1Number, f2Number int) (float64, error) { - f1, err := fcc.getFileContents(f1Number) - if err != nil { - return 0, fmt.Errorf("file %d: %w", f1Number, err) - } - f2, err := fcc.getFileContents(f2Number) - if err != nil { - return 0, fmt.Errorf("file %d: %w", f2Number, err) - } - - histogram := make(map[string]int) - for _, lines := range [][]string{f1, f2} { - for _, line := range lines { - histogram[line]++ - } - } - - var overlap int - for _, v := range histogram { - if v == 2 { - overlap++ - } - } - return float64(overlap) / float64(len(histogram)), nil -} - -// fileContentsCache is a cache of file contents, keyed by file number, -// to avoid reading the same file from disk multiple times. 
-type fileContentsCache map[int][]string - -// getFileContents returns the contents of a file, excluding the first timestamp -// line. If the file is already in the cache, the contents are returned from -// there, otherwise the file is read from disk and the contents are cached. -func (fcc fileContentsCache) getFileContents(fileNumber int) ([]string, error) { - if contents, ok := fcc[fileNumber]; ok { - return contents, nil - } - var ( - fileName = makeFilePath(fileNumber) - lines []string - ) - - f, err := os.Open(fileName) - if err != nil { - return nil, err - } - - s := bufio.NewScanner(f) - - // Ignore first line that's just a timestamp. - if !s.Scan() { - fcc[fileNumber] = []string{} - return []string{}, nil - } - - for s.Scan() { - lines = append(lines, s.Text()) - } - if err := s.Err(); err != nil { - return nil, err - } - return lines, nil -} - -// clearFilesExcept removes the contents of the fileContentsCache except for the -// provided file numbers. This helps conserve memory by removing the contents of -// files that are no longer of interest, which we can be sure of since we are -// proceeding in order of time. -func (fcc fileContentsCache) clearFilesExcept(fileNumbers []int) { - for fNum := range fcc { - if !slices.Contains(fileNumbers, fNum) { - delete(fcc, fNum) - } - } -} - -func makeFileName(number int) string { - return fmt.Sprintf("%d.txt", number) -} - -func makeFilePath(number int) string { - return path.Join(DataFilePath, makeFileName(number)) -} - // orderFiles determines the timestamp version of each file and creates a map of // time to file numbers. It sorts the times (since maps are not ordered) so that // the map can be iterated in order of time. This allows stepping through the // history of the files from the beginning. Using this, we can construct a // "chain" of evolution for a given document. 
-func orderFiles() (map[int][]int, []int, error) { +func orderFiles(dir string) (map[int][]int, []int, error) { timeMap := make(map[int][]int) - dirEntries, err := os.ReadDir(DataFilePath) + dirEntries, err := os.ReadDir(dir) if err != nil { - return nil, nil, fmt.Errorf("reading directory %s: %w", DataFilePath, err) + return nil, nil, fmt.Errorf("reading directory %s: %w", dir, err) } for _, entry := range dirEntries { if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") { @@ -345,7 +464,7 @@ func orderFiles() (map[int][]int, []int, error) { } } - filePath := path.Join(DataFilePath, entry.Name()) + filePath := path.Join(dir, entry.Name()) modTime, err := readFileTime(filePath) if err != nil { return nil, nil, err @@ -366,8 +485,16 @@ func orderFiles() (map[int][]int, []int, error) { return timeMap, timeSlice, nil } +func makeFileName(number int) string { + return fmt.Sprintf("%d.txt", number) +} + +func makeFilePath(dataFilePath string, number int) string { + return path.Join(dataFilePath, makeFileName(number)) +} + func log(msg string, args ...any) { - if Verbose { + if verbose { slog.Info(msg, args...) 
} } diff --git a/main_test.go b/main_test.go new file mode 100644 index 0000000..79590fb --- /dev/null +++ b/main_test.go @@ -0,0 +1,123 @@ +package main + +import ( + "fmt" + "path/filepath" + "reflect" + "testing" +) + +func TestReadFileTime(t *testing.T) { + tt := []struct { + fileName string + expectedTimestamp int + }{ + {"1.txt", 3}, + {"2.txt", 5}, + {"3.txt", 11}, + } + for _, tc := range tt { + t.Run(tc.fileName, func(t *testing.T) { + filePath := filepath.Join("testdata", tc.fileName) + timestamp, err := readFileTime(filePath) + if err != nil { + t.Fatal("error reading file time: ", err) + } + if timestamp != tc.expectedTimestamp { + t.Errorf("expected %d, got %d", tc.expectedTimestamp, timestamp) + } + }) + } +} + +func TestOrderFiles(t *testing.T) { + var ( + expectedOrder = []int{3, 5, 11} + expectedMap = map[int][]int{ + 3: {1}, + 5: {2, 7}, + 11: {3, 4, 5}, + } + ) + fileMap, order, err := orderFiles("testdata") + if err != nil { + t.Fatal("error ordering files: ", err) + } + if !reflect.DeepEqual(order, expectedOrder) { + t.Errorf("expected %v, got %v", expectedOrder, order) + } + if !reflect.DeepEqual(fileMap, expectedMap) { + t.Errorf("expected %v, got %v", expectedMap, fileMap) + } +} + +func TestFileContentsCache(t *testing.T) { + fcc := FileContentsCache{BaseDir: "testdata"} + cases := []struct { + fileNumber int + contents string + }{ + {1, "foo foo foo"}, + {2, "bar bar bar"}, + {3, "baz baz baz"}, + } + + // Test initial reads. + for _, c := range cases { + t.Run(fmt.Sprintf("initial read %d", c.fileNumber), func(t *testing.T) { + got, err := fcc.GetFileContents(c.fileNumber) + if err != nil { + t.Fatal("error getting file contents: ", err) + } + if !reflect.DeepEqual(got, []string{c.contents}) { + t.Errorf("expected %q, got %q", c.contents, got) + } + }) + } + + // Ensure files are actually stored in cache. 
+ for _, c := range cases { + t.Run(fmt.Sprintf("cache check %d", c.fileNumber), func(t *testing.T) { + if got, ok := fcc.cache.Load(c.fileNumber); !ok { + t.Fatalf("file %d not found in cache", c.fileNumber) + } else if !reflect.DeepEqual(got, []string{c.contents}) { + t.Fatalf("expected %q, got %q", c.contents, got) + } + }) + } + + // Test clear-except operation. + t.Run("clear except", func(t *testing.T) { + fcc.ClearFilesExcept([]int{1}) + if _, ok := fcc.cache.Load(1); !ok { + t.Fatal("file 1 not found in cache, expected to be kept") + } + if _, ok := fcc.cache.Load(2); ok { + t.Fatal("file 2 found in cache, expected to be cleared") + } + if _, ok := fcc.cache.Load(3); ok { + t.Fatal("file 3 found in cache, expected to be cleared") + } + }) +} + +func TestEndToEnd(t *testing.T) { + docs, err := run([]string{"argv0", "-path", "testdata/e2e"}) + want := []int{1, 6, 9, 12, 14, 18} + if err != nil { + t.Fatal("error running program: ", err) + } + if len(docs) != 1 { + t.Fatalf("expected %d documents, got %d", 1, len(docs)) + } + doc := docs[0] + if doc.ID != 1 { + t.Errorf("expected ID %d, got %d", 0, doc.ID) + } + if doc.LatestTimestamp != 5 { + t.Errorf("expected latest timestamp %d, got %d", 3, doc.LatestTimestamp) + } + if !reflect.DeepEqual(doc.AssociatedFiles, want) { + t.Errorf("expected associated files %v, got %v", want, doc.AssociatedFiles) + } +} diff --git a/testdata/1.txt b/testdata/1.txt new file mode 100644 index 0000000..227ed62 --- /dev/null +++ b/testdata/1.txt @@ -0,0 +1,2 @@ +3 +foo foo foo diff --git a/testdata/2.txt b/testdata/2.txt new file mode 100644 index 0000000..80fc3d3 --- /dev/null +++ b/testdata/2.txt @@ -0,0 +1,2 @@ +5 +bar bar bar diff --git a/testdata/3.txt b/testdata/3.txt new file mode 100644 index 0000000..8158e44 --- /dev/null +++ b/testdata/3.txt @@ -0,0 +1,2 @@ +11 +baz baz baz \ No newline at end of file diff --git a/testdata/4.txt b/testdata/4.txt new file mode 100644 index 0000000..b4de394 --- /dev/null +++ 
b/testdata/4.txt @@ -0,0 +1 @@ +11 diff --git a/testdata/5.txt b/testdata/5.txt new file mode 100644 index 0000000..b4de394 --- /dev/null +++ b/testdata/5.txt @@ -0,0 +1 @@ +11 diff --git a/testdata/7.txt b/testdata/7.txt new file mode 100644 index 0000000..7ed6ff8 --- /dev/null +++ b/testdata/7.txt @@ -0,0 +1 @@ +5 diff --git a/testdata/e2e/1.txt b/testdata/e2e/1.txt new file mode 100644 index 0000000..c719ae4 --- /dev/null +++ b/testdata/e2e/1.txt @@ -0,0 +1,12 @@ +0 +“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play +That Funky Music”. When it was later released as its own single, it became an +international smash hit. + +After every record company turned Vanilla Ice’s original demos down, Tommy Quon +(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce +two tracks for release on his label Ultrax Records. “Play That Funky Music”, +based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”, +based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken +over Queen’s “Under Pressure”, was the B-side. + diff --git a/testdata/e2e/12.txt b/testdata/e2e/12.txt new file mode 100644 index 0000000..f575c02 --- /dev/null +++ b/testdata/e2e/12.txt @@ -0,0 +1,37 @@ +5 +“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play +That Funky Music”. When it was later released as its own single, it became an +international smash hit. + +After every record company turned Vanilla Ice’s original demos down, Tommy Quon +(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce +two tracks for release on his label Ultrax Records. “Play That Funky Music”, +based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”, +based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken +over Queen’s “Under Pressure”, was the B-side. + +A DJ at an FM station in Georgia liked the B-side better and played it on air. 
+Soon it became that station’s #1 requested song, leading stations in Tennessee +and Texas to do the same. It also became Video Jukebox’s most requested video. +After SBK Records' founder was played the song over the phone, he signed Ice the +next day. In August of 1990, his label released “Ice Ice Baby” as the A-side +with “Play That Funky Music” as its flipside. + +The song began climbing charts around the world, eventually reaching the top 10 +in twelve countries – hitting #1 in six of them including the UK and the US +(where it became the first chart-topping rap single in history). + +“Ice Ice Baby” contains an uncleared sample of Queen’s “Under Pressure”, so when +confronted about it, Ice claimed he’d altered it, but he later admitted he +actually hadn’t. The parties settled out of court for an undisclosed sum, and +members of Queen, plus the guest vocalist on the original song David Bowie, were +also given songwriting credits. + +The song is credited for ‘making hip-hop an acceptable genre to mainstream +media’ and continues to be popular into the 2000s. It was certified Gold in 2005 +for selling 500K digital downloads and named by VH1 the #29 top song of the 90s. + +But it also has its share of negative feedback, with MTV ranking it the #9 worst +video in history, and Houston Press calling it the worst song to come from +Texas. In 2012, actor/comedian Adam Scott discussed the song’s opening lyrics on +Conan, pointing out how ridiculous they sound when analyzed. \ No newline at end of file diff --git a/testdata/e2e/14.txt b/testdata/e2e/14.txt new file mode 100644 index 0000000..bf2c1cb --- /dev/null +++ b/testdata/e2e/14.txt @@ -0,0 +1,18 @@ +1 +“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play +That Funky Music”. When it was later released as its own single, it became an +international smash hit. 
+ +After every record company turned Vanilla Ice’s original demos down, Tommy Quon +(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce +two tracks for release on his label Ultrax Records. “Play That Funky Music”, +based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”, +based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken +over Queen’s “Under Pressure”, was the B-side. + +A DJ at an FM station in Georgia liked the B-side better and played it on air. +Soon it became that station’s #1 requested song, leading stations in Tennessee +and Texas to do the same. It also became Video Jukebox’s most requested video. +After SBK Records' founder was played the song over the phone, he signed Ice the +next day. In August of 1990, his label released “Ice Ice Baby” as the A-side +with “Play That Funky Music” as its flipside. \ No newline at end of file diff --git a/testdata/e2e/18.txt b/testdata/e2e/18.txt new file mode 100644 index 0000000..6c73b9b --- /dev/null +++ b/testdata/e2e/18.txt @@ -0,0 +1,49 @@ +4 +“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play +That Funky Music”. When it was later released as its own single, it became an +international smash hit. + +After every record company turned Vanilla Ice’s original demos down, Tommy Quon +(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce +two tracks for release on his label Ultrax Records. “Play That Funky Music”, +based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”, +based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken +over Queen’s “Under Pressure”, was the B-side. + +A DJ at an FM station in Georgia liked the B-side better and played it on air. +Soon it became that station’s #1 requested song, leading stations in Tennessee +and Texas to do the same. It also became Video Jukebox’s most requested video. 
+After SBK Records' founder was played the song over the phone, he signed Ice the +next day. In August of 1990, his label released “Ice Ice Baby” as the A-side +with “Play That Funky Music” as its flipside. + +The song began climbing charts around the world, eventually reaching the top 10 +in twelve countries – hitting #1 in six of them including the UK and the US +(where it became the first chart-topping rap single in history). + +That year “Ice Ice Baby” was certified Platinum two months after its release and +was ranked the #45 song of 1990 by Billboard. The song was also nominated for a +Grammy that year for Best Solo Rap Performance, but lost to MC Hammer’s “U Can’t +Touch This”. + +Death Row Records' CEO, Suge Knight, learned that Marvin “Chocolate” Johnson, a +Death Row signee, was a co-writer on the track, so he invited Ice to his Los +Angeles hotel room to negotiate payment. It has been rumored that Vanilla Ice +was hung over a balcony during the negotations, but Ice has denied these rumors +several times, insisting, “He didn’t have to hang me from no balcony or slap me +around or nothing”. + +“Ice Ice Baby” contains an uncleared sample of Queen’s “Under Pressure”, so when +confronted about it, Ice claimed he’d altered it, but he later admitted he +actually hadn’t. The parties settled out of court for an undisclosed sum, and +members of Queen, plus the guest vocalist on the original song David Bowie, were +also given songwriting credits. + +The song is credited for ‘making hip-hop an acceptable genre to mainstream +media’ and continues to be popular into the 2000s. It was certified Gold in 2005 +for selling 500K digital downloads and named by VH1 the #29 top song of the 90s. + +But it also has its share of negative feedback, with MTV ranking it the #9 worst +video in history, and Houston Press calling it the worst song to come from +Texas. 
In 2012, actor/comedian Adam Scott discussed the song’s opening lyrics on +Conan, pointing out how ridiculous they sound when analyzed. \ No newline at end of file diff --git a/testdata/e2e/6.txt b/testdata/e2e/6.txt new file mode 100644 index 0000000..e713bdd --- /dev/null +++ b/testdata/e2e/6.txt @@ -0,0 +1,27 @@ +2 +“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play +That Funky Music”. When it was later released as its own single, it became an +international smash hit. + +After every record company turned Vanilla Ice’s original demos down, Tommy Quon +(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce +two tracks for release on his label Ultrax Records. “Play That Funky Music”, +based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”, +based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken +over Queen’s “Under Pressure”, was the B-side. + +A DJ at an FM station in Georgia liked the B-side better and played it on air. +Soon it became that station’s #1 requested song, leading stations in Tennessee +and Texas to do the same. It also became Video Jukebox’s most requested video. +After SBK Records' founder was played the song over the phone, he signed Ice the +next day. In August of 1990, his label released “Ice Ice Baby” as the A-side +with “Play That Funky Music” as its flipside. + +The song began climbing charts around the world, eventually reaching the top 10 +in twelve countries – hitting #1 in six of them including the UK and the US +(where it became the first chart-topping rap single in history). + +That year “Ice Ice Baby” was certified Platinum two months after its release and +was ranked the #45 song of 1990 by Billboard. The song was also nominated for a +Grammy that year for Best Solo Rap Performance, but lost to MC Hammer’s “U Can’t +Touch This”. 
\ No newline at end of file diff --git a/testdata/e2e/9.txt b/testdata/e2e/9.txt new file mode 100644 index 0000000..32ed45f --- /dev/null +++ b/testdata/e2e/9.txt @@ -0,0 +1,40 @@ +3 +“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play +That Funky Music”. When it was later released as its own single, it became an +international smash hit. + +After every record company turned Vanilla Ice’s original demos down, Tommy Quon +(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce +two tracks for release on his label Ultrax Records. “Play That Funky Music”, +based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”, +based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken +over Queen’s “Under Pressure”, was the B-side. + +A DJ at an FM station in Georgia liked the B-side better and played it on air. +Soon it became that station’s #1 requested song, leading stations in Tennessee +and Texas to do the same. It also became Video Jukebox’s most requested video. +After SBK Records' founder was played the song over the phone, he signed Ice the +next day. In August of 1990, his label released “Ice Ice Baby” as the A-side +with “Play That Funky Music” as its flipside. + +The song began climbing charts around the world, eventually reaching the top 10 +in twelve countries – hitting #1 in six of them including the UK and the US +(where it became the first chart-topping rap single in history). + +That year “Ice Ice Baby” was certified Platinum two months after its release and +was ranked the #45 song of 1990 by Billboard. The song was also nominated for a +Grammy that year for Best Solo Rap Performance, but lost to MC Hammer’s “U Can’t +Touch This”. + +Death Row Records' CEO, Suge Knight, learned that Marvin “Chocolate” Johnson, a +Death Row signee, was a co-writer on the track, so he invited Ice to his Los +Angeles hotel room to negotiate payment. 
It has been rumored that Vanilla Ice +was hung over a balcony during the negotations, but Ice has denied these rumors +several times, insisting, “He didn’t have to hang me from no balcony or slap me +around or nothing”. + +“Ice Ice Baby” contains an uncleared sample of Queen’s “Under Pressure”, so when +confronted about it, Ice claimed he’d altered it, but he later admitted he +actually hadn’t. The parties settled out of court for an undisclosed sum, and +members of Queen, plus the guest vocalist on the original song David Bowie, were +also given songwriting credits. \ No newline at end of file