Major refactor: use worker pool

Use a bounded worker pool instead of creating hundreds of goroutines that
contend for scheduling. Add tests, a Dockerfile, a Makefile, and a README.
Ian Molee 2024-04-05 02:03:14 -07:00
parent 5f1a8bc256
commit b6de64cde6
19 changed files with 721 additions and 209 deletions
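As a rough illustration of the bounded worker-pool pattern this commit adopts, here is a minimal Go sketch (the names `runPool` and `process` are placeholders; the actual implementation is the `DocumentManager`/`ComparisonWorker` pair in main.go below):

```
package main

import (
	"fmt"
	"sync"
)

// runPool processes jobs with a fixed number of workers reading from a shared
// channel, rather than starting one goroutine per job, which bounds scheduler
// and memory pressure.
func runPool(jobs []int, numWorkers int, process func(int)) {
	work := make(chan int)
	var wg sync.WaitGroup
	wg.Add(numWorkers)
	for i := 0; i < numWorkers; i++ {
		go func() {
			defer wg.Done()
			for j := range work {
				process(j)
			}
		}()
	}
	for _, j := range jobs {
		work <- j
	}
	close(work) // lets each worker's range loop end so the worker exits
	wg.Wait()
}

func main() {
	runPool([]int{1, 2, 3, 4, 5}, 2, func(j int) { fmt.Println("processed", j) })
}
```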

2
.gitignore vendored
@@ -1 +1,3 @@
*.log
output.*.txt
.vscode

11
Dockerfile Normal file
@@ -0,0 +1,11 @@
FROM golang:1.22 AS builder
WORKDIR /go/src/docgrouper
COPY testdata testdata/
COPY *.go go.mod go.sum ./
RUN go mod download
RUN go test -v ./... && CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o docgrouper

FROM gcr.io/distroless/base-nossl-debian12
COPY --from=builder /go/src/docgrouper/docgrouper /bin/docgrouper
VOLUME [ "/files" ]
ENTRYPOINT [ "/bin/docgrouper" ]

20
Makefile Normal file
@@ -0,0 +1,20 @@
DOCKER_IMAGE := "steelray-docgrouper"
BINNAME := "steelray-docgrouper"
FILES_PATH := "./files.200"

build: *.go
	go build -o $(BINNAME) .

test:
	go test -v ./...

clean:
	rm -f $(BINNAME)

docker-build:
	docker build -t $(DOCKER_IMAGE) .

docker-run:
	docker run -v $(FILES_PATH):/files $(DOCKER_IMAGE):latest

.PHONY: test clean docker-build docker-run

39
README.md Normal file
@@ -0,0 +1,39 @@
# Docgrouper
Given a set of files, each with an integer timestamp as its first line, identify
the set of documents they represent at various points in each document's life.
## Building
Building **docgrouper** requires [Go](https://go.dev): run `make build`. If Go
is not installed, the provided `Dockerfile` can be used to test and build a
container image instead; the Docker image is built via the `docker-build`
Makefile target.
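For example, assuming `make` is installed along with either Go or Docker:
```
make build         # compile the steelray-docgrouper binary locally (requires Go)
make docker-build  # build the container image instead (requires Docker)
```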
## Running
If running via Docker, the directory containing the file pool must be mounted
into the container with the `-v` or `--volume` switch, like so:
```
docker run --volume ./host-files:/files steelray-docgrouper
```
This invocation is available via the `docker-run` Makefile target, but it only
runs docgrouper with its default command-line arguments, since the target does
not forward extra arguments.
## Options
```
-path string
path to the file pool (default "files")
-prefix
use '[doc ###]' prefix for output
-threshold float
similarity threshold (default 0.5)
-verbose
enable verbose logging
-workers int
number of workers to use (default 2*<number-of-cores>)
```
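For example, to point docgrouper at a different file pool with a stricter
similarity threshold and more workers (the flag values here are purely
illustrative):
```
./steelray-docgrouper -path ./files.200 -threshold 0.6 -workers 8 -verbose
```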

4
go.mod
@@ -1,5 +1,3 @@
module github.com/ianfoo/steelray-docgrouper
go 1.22.1
require github.com/adrg/strutil v0.3.1
go 1.22.2

533
main.go
@@ -2,7 +2,10 @@
// Previously ~1h
// March 13, 2024: 00:00-02:30
// March 19, 2024: 15:00-19:00
// March 23, 2024: 20:00-
// March 23, 2024: 20:00-22:00
// April 02, 2024: 12:30-17:00
// April 04, 2024: 21:00-23:30
// April 05, 2024: 00:00-02:00
package main
import (
@@ -12,6 +15,7 @@ import (
"log/slog"
"os"
"path"
"runtime"
"slices"
"strconv"
"strings"
@@ -19,143 +23,290 @@ import (
"sync/atomic"
)
// DataFilePath describes the default location where the file pool can be found.
const DefaultDataFilePath = "files"
const (
// defaultSimilarityThreshold is the default minimum similarity required for
// two files to be considered related. This value is arbitrary and could be
// adjusted based on the specific requirements of the problem.
defaultSimilarityThreshold = 0.5
// defaultDataFilePath describes the default location where the file pool can be
// found.
defaultDataFilePath = "files"
)
// Command line options
var (
DataFilePath string
UseDocPrefix bool
Verbose bool
dataFilePath string
similarityThreshold float64
useDocPrefix bool
verbose bool
numWorkers int
)
func main() {
if err := run(os.Args); err != nil {
documents, err := run(os.Args)
if err != nil {
fmt.Fprintf(os.Stderr, "error: %v\n", err)
os.Exit(1)
os.Exit(-1)
}
for _, doc := range documents {
fmt.Println(doc)
}
}
// run is the main entry point for the program.
func run(args []string) error {
func run(args []string) ([]*Document, error) {
flags := flag.NewFlagSet(args[0], flag.ExitOnError)
flags.StringVar(&DataFilePath, "path", DefaultDataFilePath, "path to the file pool")
flags.BoolVar(&UseDocPrefix, "prefix", false, "use '[doc ###]' prefix for output")
flags.BoolVar(&Verbose, "verbose", false, "enable verbose logging")
flags.Parse(args[1:])
flags.StringVar(&dataFilePath, "path", defaultDataFilePath, "path to the file pool")
flags.Float64Var(&similarityThreshold, "threshold", defaultSimilarityThreshold, "similarity threshold")
flags.IntVar(&numWorkers, "workers", runtime.NumCPU()*2, "number of workers to use")
flags.BoolVar(&useDocPrefix, "prefix", false, "use '[doc ###]' prefix for output")
flags.BoolVar(&verbose, "verbose", false, "enable verbose logging")
_ = flags.Parse(args[1:])
// SimilarityThreshold is the minimum similarity required for two files
// to be considered related. This value is arbitrary and could be adjusted
// based on the specific requirements of the problem.
const SimilarityThreshold = 0.5
fileTimes, times, err := orderFiles()
// The files need to be processed in order of time, so determine the
// timestamp of each file and sort them by time.
fileTimes, times, err := orderFiles(dataFilePath)
if err != nil {
return err
return nil, err
}
var (
// documents is the master list of documents that will be built up.
documents []*Document
// fcc handles reading files and caching contents.
fcc = make(fileContentsCache)
)
dm := NewDocumentManager(dataFilePath, similarityThreshold, numWorkers)
for i, timestamp := range times {
_ = i
// fmt.Printf("\rProcessing timestamp %d/%d", i+1, len(fileTimes))
// Track the files at this timestamp that have been associated with documents, so
// we can identify unassociated files later and then create new documents for them.
var (
wg sync.WaitGroup
// Track the files at this timestamp that have been associated with
// documents, so we can identify unassociated files later and then
// create new documents for them. This needs to be distinct for each
// timestamp, so it's created inside the timestamp loop's scope.
associatedFiles sync.Map
// We might need to create new documents for files that weren't
// associated with any document at this timestamp, so we need to make
// sure that this timestamp has been entirely processed first. We do
// this by waiting for the workers to indicate they've finished a work
// item.
wg sync.WaitGroup
)
wg.Add(len(documents))
log("processing timestamp", "timestamp", timestamp, "numWorkers", len(documents))
for _, doc := range documents {
// Start a goroutine for each document, to parallelize the
// comparison with the files in the current timestamp. A more robust
// solution would limit the number of concurrent goroutines to avoid
// exhausting system resources, but for this problem we won't have
// more than a couple thousand documents. Goroutines are
// lightweight enough (2K stack) that we can start them pretty
// capriciously.
go func(doc *Document, files []int) {
defer wg.Done()
for _, candidateFileNumber := range files {
// Check to be certain this file hasn't been associated with another
// document already. If it has been, continue to the next file.
if _, ok := associatedFiles.Load(candidateFileNumber); ok {
log("processing timestamp", "timestamp", timestamp, "timestampIndex", i, "totalTimestamps", len(times))
for i, doc := range dm.Documents {
wg.Add(1)
dm.WorkCh <- WorkItem{
doc: doc,
fileNumbers: fileTimes[timestamp],
timestamp: timestamp,
associatedFiles: &associatedFiles,
wg: &wg,
}
log(
"submitted work",
"documentNumber", i+1,
"documentID", doc.ID,
"totalDocs", len(dm.Documents),
"timestamp", timestamp,
)
}
wg.Wait()
// Now that this timestamp has been fully processed, we can check to see
// what files haven't been associated existing documents, and create new
// documents for them.
var docsAdded int
for _, fileNumber := range fileTimes[timestamp] {
if _, ok := associatedFiles.Load(fileNumber); !ok {
dm.AddNewDocument(fileNumber, timestamp)
docsAdded++
}
}
if docsAdded > 0 {
log("created new documents", "numAdded", docsAdded, "timestamp", timestamp)
}
// Free up memory.
dm.ShrinkCache()
}
dm.Shutdown()
return dm.SortedDocuments(), nil
}
// WorkItem is what will be sent to the workers in the worker pool.
type WorkItem struct {
doc *Document
fileNumbers []int
timestamp int
associatedFiles *sync.Map
wg *sync.WaitGroup
}
// DocumentManager handles the processing of documents and files. It maintains a
// list of documents and a cache of file contents, and uses a pool of workers to
// compare documents against files.
type DocumentManager struct {
// Documents is the list of documents that have been identified.
Documents []*Document
// WorkCh is the channel through which work items are submitted to the workers.
WorkCh chan WorkItem
// docIDSource is a concurrency-safe source of document IDs. The ID could be
// something other than an integer, but an atomic counter lets us rely on the
// standard library alone.
docIDSource atomic.Uint32
similarityThreshold float64
fcc *FileContentsCache
wg sync.WaitGroup
}
// NewDocumentManager creates a new DocumentManager with the specified base path
// for the file pool and the specified number of workers.
func NewDocumentManager(fileBasePath string, similarityThreshold float64, numWorkers int) *DocumentManager {
dm := &DocumentManager{
Documents: make([]*Document, 0),
similarityThreshold: similarityThreshold,
fcc: &FileContentsCache{BaseDir: fileBasePath},
WorkCh: make(chan WorkItem),
}
// Start workers. Register them with the WaitGroup before launching so the
// count is in place before any worker can call Done.
dm.wg.Add(numWorkers)
for wID := range numWorkers {
go dm.ComparisonWorker(wID + 1)
}
return dm
}
// Shutdown cleans up the document manager by closing the work channel, which
// signals the workers to exit, and then waiting for them all to finish.
func (dm *DocumentManager) Shutdown() {
close(dm.WorkCh)
dm.wg.Wait()
}
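// AddNewDocument creates a Document with a fresh ID for the given file and
// timestamp and appends it to the manager's list. It is called only between
// timestamps, after the workers have drained the current batch of work, so it
// does not need to be concurrency-safe.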
func (dm *DocumentManager) AddNewDocument(fileNumber, timestamp int) {
doc := Document{
ID: dm.docIDSource.Add(1),
LatestTimestamp: timestamp,
AssociatedFiles: []int{fileNumber},
}
dm.Documents = append(dm.Documents, &doc)
}
// ShrinkCache removes files from the cache that will never be needed again, by
// evicting every file that is not the latest file associated with some
// document. Note that this is not concurrency-safe and should only be called
// while no operations that could modify the document list are running. This is
// an optimization and could be removed if memory usage is not a concern.
func (dm *DocumentManager) ShrinkCache() {
var latestDocumentFiles []int
for _, doc := range dm.Documents {
latestDocumentFiles = append(latestDocumentFiles, doc.LatestAssociatedFile())
}
dm.fcc.ClearFilesExcept(latestDocumentFiles)
}
// SortedDocuments returns the list of documents with each document's associated
// files sorted in ascending order, and the documents themselves ordered by
// their first associated file.
func (dm *DocumentManager) SortedDocuments() []*Document {
// Sort the associated files for each document.
for _, doc := range dm.Documents {
doc.SortAssociatedFiles()
}
// Sort the documents by their first associated file number.
slices.SortFunc(dm.Documents, func(a, b *Document) int {
return a.AssociatedFiles[0] - b.AssociatedFiles[0]
})
return dm.Documents
}
// ComparisonWorker receives work items, each describing a document and a list of
// candidate file numbers to compare against. It compares the document with each
// candidate and, when a match is found, associates the file with the document
// from the work item and records the file as having been matched.
func (dm *DocumentManager) ComparisonWorker(workerID int) {
for workItem := range dm.WorkCh {
for _, fileNumber := range workItem.fileNumbers {
if _, ok := workItem.associatedFiles.Load(fileNumber); ok {
// This file has already been matched; skip it.
continue
}
latestFileNumber := workItem.doc.LatestAssociatedFile()
similarity, err := dm.compareFiles(latestFileNumber, fileNumber)
if err != nil {
// Simplistic error handling: log the error and continue.
slog.Error(
"error comparing files",
"file1", latestFileNumber,
"file2", fileNumber,
"document", workItem.doc.ID,
"worker", workerID,
"error", err,
)
}
// If current file doesn't match current document, skip to the next file.
if similarity < dm.similarityThreshold {
continue
}
latestFileNumber := doc.LatestAssociatedFile()
overlap, err := compareFiles(fcc, latestFileNumber, candidateFileNumber)
if err != nil {
fmt.Fprintf(
os.Stderr,
"error comparing files %d and %d: %v\n",
latestFileNumber, candidateFileNumber, err,
// Current file matches current document, so record this.
workItem.doc.AssociateFile(fileNumber, workItem.timestamp)
workItem.associatedFiles.Store(fileNumber, struct{}{})
log(
"match found",
"document", workItem.doc.ID,
"file", fileNumber,
"time", workItem.timestamp,
"worker", workerID,
)
}
if overlap >= SimilarityThreshold {
// Add file to Document associated list
doc.AssociateFile(candidateFileNumber, timestamp)
associatedFiles.Store(candidateFileNumber, struct{}{})
// We know this document won't be associated with any other files
// with this timestamp, so we can stop looking at files with this
// timestamp, for this document.
return
// We don't need to consider this document anymore since we've found
// a match. End processing and wait for more work.
break
}
}
}(doc, fileTimes[timestamp])
workItem.wg.Done()
}
// Wait for all document comparisons to complete for this timestamp.
wg.Wait()
// If we haven't associated all the files with existing documents, we need
// to create new documents for those that remain.
currentNumDocs := len(documents)
for _, fileNumber := range fileTimes[timestamp] {
if _, ok := associatedFiles.Load(fileNumber); !ok {
doc := NewDocument(fileNumber, timestamp)
documents = append(documents, &doc)
}
}
if len(documents) > currentNumDocs {
log("created new documents", "numAdded", len(documents)-currentNumDocs, "timestamp", timestamp)
// Report that this worker is shutting down.
dm.wg.Done()
}
// Now we can clear the cache of file contents for files that aren't associated with
// a document, to conserve memory.
var latestDocumentFiles []int
for _, doc := range documents {
latestDocumentFiles = append(latestDocumentFiles, doc.LatestAssociatedFile())
// compareFiles computes how much two files overlap, on a scale
// of 0 to 1 by iterating through the files and identifying lines
// that are duplicated.
func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) {
f1, err := dm.fcc.GetFileContents(f1Number)
if err != nil {
return 0, fmt.Errorf("file %d: %w", f1Number, err)
}
fcc.clearFilesExcept(latestDocumentFiles)
f2, err := dm.fcc.GetFileContents(f2Number)
if err != nil {
return 0, fmt.Errorf("file %d: %w", f2Number, err)
}
// Output the list of documents, showing their associated files in ascending order.
// Order the documents by their first associated file.
slices.SortFunc(documents, func(a, b *Document) int {
return a.AssociatedFiles[0] - b.AssociatedFiles[0]
})
for _, doc := range documents {
doc.SortAssociatedFiles()
fmt.Println(doc)
histogram := make(map[string]int)
for _, lines := range [][]string{f1, f2} {
for _, line := range lines {
// Skip blank lines, which can throw off the count.
if line == "" {
continue
}
histogram[line]++
}
}
return nil
var overlap int
for _, v := range histogram {
if v == 2 {
overlap++
}
}
return float64(overlap) / float64(len(histogram)), nil
}
// DocumentIDSource is a concurrency-safe source from which to identify
// documents. This could easily be something other than an integer, but using
// this allows us to just use the standard library.
var DocumentIDSource atomic.Uint32
// Document stores a document ID and a list of associated files.
type Document struct {
@@ -171,8 +322,8 @@ func (d Document) String() string {
for _, f := range d.AssociatedFiles {
sb.WriteString(fmt.Sprintf("%d ", f))
}
if UseDocPrefix {
return fmt.Sprintf("[doc %4d] %s", d.ID, sb.String())
if useDocPrefix {
return fmt.Sprintf("[doc %4d] %s", d.ID, strings.TrimSpace(sb.String()))
}
return sb.String()
}
@@ -186,23 +337,78 @@ func (d *Document) AssociateFile(fileNumber, timestamp int) {
}
// LatestAssociatedFile returns the most recent file associated with a document.
// Note that this presumes that the list of associated files is sorted in
// temporal order based on the timestamp at the head of the file.
func (d Document) LatestAssociatedFile() int {
return d.AssociatedFiles[len(d.AssociatedFiles)-1]
}
// SortAssociatedFiles sorts the list of associated files for a document,
// since the requirements stipulate output in ascending order.
// SortAssociatedFiles sorts the list of associated files for a document, since
// the requirements stipulate output in ascending numerical order. Note that
// this changes the order of associated files from their original temporal
// order, so must only be invoked when the work is entirely finished.
func (d *Document) SortAssociatedFiles() {
slices.Sort(d.AssociatedFiles)
}
// NewDocument creates a new Document struct and initializes an ID and records
// the first file and timestamp associated with it.
func NewDocument(fileNumber, timestamp int) Document {
return Document{
ID: DocumentIDSource.Add(1),
LatestTimestamp: timestamp,
AssociatedFiles: []int{fileNumber},
// FileContentsCache is a cache of file contents, keyed by file number,
// to avoid reading the same file from disk multiple times.
type FileContentsCache struct {
BaseDir string
cache sync.Map
}
// GetFileContents returns the contents of a file, excluding the first timestamp
// line. If the file is already in the cache, the contents are returned from
// there, otherwise the file is read from disk and the contents are cached.
func (fcc *FileContentsCache) GetFileContents(fileNumber int) ([]string, error) {
if contents, ok := fcc.cache.Load(fileNumber); ok {
return contents.([]string), nil
}
var (
fileName = makeFilePath(fcc.BaseDir, fileNumber)
lines []string
)
f, err := os.Open(fileName)
if err != nil {
return nil, err
}
// The contents are cached in memory, so the file handle can be closed when
// this function returns.
defer f.Close()
s := bufio.NewScanner(f)
// Read first line and ignore it since it's just the timestamp.
_ = s.Scan()
// Read file and store contents in cache.
for s.Scan() {
lines = append(lines, s.Text())
}
if err := s.Err(); err != nil {
return nil, err
}
fcc.cache.Store(fileNumber, lines)
return lines, nil
}
// ClearFilesExcept removes the contents of the fileContentsCache except for the
// provided file numbers. This helps conserve memory by removing the contents of
// files that are no longer of interest, which we can be sure of since we are
// proceeding in order of time.
func (fcc *FileContentsCache) ClearFilesExcept(fileNumbers []int) {
// Build up a list of entries to delete to avoid modifying the concurrent
// map while iterating over it.
var toDelete []int
fcc.cache.Range(func(key, _ any) bool {
storedFileNum := key.(int)
if !slices.Contains(fileNumbers, storedFileNum) {
toDelete = append(toDelete, storedFileNum)
}
return true
})
for _, k := range toDelete {
fcc.cache.Delete(k)
}
}
@@ -231,104 +437,17 @@ func readFileTime(filepath string) (int, error) {
return time, nil
}
// compareFiles computes how much two files overlap, on a scale
// of 0 to 1 by iterating through the files and identifying lines
// that are duplicated.
func compareFiles(fcc fileContentsCache, f1Number, f2Number int) (float64, error) {
f1, err := fcc.getFileContents(f1Number)
if err != nil {
return 0, fmt.Errorf("file %d: %w", f1Number, err)
}
f2, err := fcc.getFileContents(f2Number)
if err != nil {
return 0, fmt.Errorf("file %d: %w", f2Number, err)
}
histogram := make(map[string]int)
for _, lines := range [][]string{f1, f2} {
for _, line := range lines {
histogram[line]++
}
}
var overlap int
for _, v := range histogram {
if v == 2 {
overlap++
}
}
return float64(overlap) / float64(len(histogram)), nil
}
// fileContentsCache is a cache of file contents, keyed by file number,
// to avoid reading the same file from disk multiple times.
type fileContentsCache map[int][]string
// getFileContents returns the contents of a file, excluding the first timestamp
// line. If the file is already in the cache, the contents are returned from
// there, otherwise the file is read from disk and the contents are cached.
func (fcc fileContentsCache) getFileContents(fileNumber int) ([]string, error) {
if contents, ok := fcc[fileNumber]; ok {
return contents, nil
}
var (
fileName = makeFilePath(fileNumber)
lines []string
)
f, err := os.Open(fileName)
if err != nil {
return nil, err
}
s := bufio.NewScanner(f)
// Ignore first line that's just a timestamp.
if !s.Scan() {
fcc[fileNumber] = []string{}
return []string{}, nil
}
for s.Scan() {
lines = append(lines, s.Text())
}
if err := s.Err(); err != nil {
return nil, err
}
return lines, nil
}
// clearFilesExcept removes the contents of the fileContentsCache except for the
// provided file numbers. This helps conserve memory by removing the contents of
// files that are no longer of interest, which we can be sure of since we are
// proceeding in order of time.
func (fcc fileContentsCache) clearFilesExcept(fileNumbers []int) {
for fNum := range fcc {
if !slices.Contains(fileNumbers, fNum) {
delete(fcc, fNum)
}
}
}
func makeFileName(number int) string {
return fmt.Sprintf("%d.txt", number)
}
func makeFilePath(number int) string {
return path.Join(DataFilePath, makeFileName(number))
}
// orderFiles determines the timestamp version of each file and creates a map of
// time to file numbers. It sorts the times (since maps are not ordered) so that
// the map can be iterated in order of time. This allows stepping through the
// history of the files from the beginning. Using this, we can construct a
// "chain" of evolution for a given document.
func orderFiles() (map[int][]int, []int, error) {
func orderFiles(dir string) (map[int][]int, []int, error) {
timeMap := make(map[int][]int)
dirEntries, err := os.ReadDir(DataFilePath)
dirEntries, err := os.ReadDir(dir)
if err != nil {
return nil, nil, fmt.Errorf("reading directory %s: %w", DataFilePath, err)
return nil, nil, fmt.Errorf("reading directory %s: %w", dir, err)
}
for _, entry := range dirEntries {
if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") {
@@ -345,7 +464,7 @@ func orderFiles() (map[int][]int, []int, error) {
}
}
filePath := path.Join(DataFilePath, entry.Name())
filePath := path.Join(dir, entry.Name())
modTime, err := readFileTime(filePath)
if err != nil {
return nil, nil, err
@@ -366,8 +485,16 @@ func orderFiles() (map[int][]int, []int, error) {
return timeMap, timeSlice, nil
}
func makeFileName(number int) string {
return fmt.Sprintf("%d.txt", number)
}
func makeFilePath(dataFilePath string, number int) string {
return path.Join(dataFilePath, makeFileName(number))
}
func log(msg string, args ...any) {
if Verbose {
if verbose {
slog.Info(msg, args...)
}
}

123
main_test.go Normal file
@@ -0,0 +1,123 @@
package main
import (
"fmt"
"path/filepath"
"reflect"
"testing"
)
func TestReadFileTime(t *testing.T) {
tt := []struct {
fileName string
expectedTimestamp int
}{
{"1.txt", 3},
{"2.txt", 5},
{"3.txt", 11},
}
for _, tc := range tt {
t.Run(tc.fileName, func(t *testing.T) {
filePath := filepath.Join("testdata", tc.fileName)
timestamp, err := readFileTime(filePath)
if err != nil {
t.Fatal("error reading file time: ", err)
}
if timestamp != tc.expectedTimestamp {
t.Errorf("expected %d, got %d", tc.expectedTimestamp, timestamp)
}
})
}
}
func TestOrderFiles(t *testing.T) {
var (
expectedOrder = []int{3, 5, 11}
expectedMap = map[int][]int{
3: {1},
5: {2, 7},
11: {3, 4, 5},
}
)
fileMap, order, err := orderFiles("testdata")
if err != nil {
t.Fatal("error ordering files: ", err)
}
if !reflect.DeepEqual(order, expectedOrder) {
t.Errorf("expected %v, got %v", expectedOrder, order)
}
if !reflect.DeepEqual(fileMap, expectedMap) {
t.Errorf("expected %v, got %v", expectedMap, fileMap)
}
}
func TestFileContentsCache(t *testing.T) {
fcc := FileContentsCache{BaseDir: "testdata"}
cases := []struct {
fileNumber int
contents string
}{
{1, "foo foo foo"},
{2, "bar bar bar"},
{3, "baz baz baz"},
}
// Test initial reads.
for _, c := range cases {
t.Run(fmt.Sprintf("initial read %d", c.fileNumber), func(t *testing.T) {
got, err := fcc.GetFileContents(c.fileNumber)
if err != nil {
t.Fatal("error getting file contents: ", err)
}
if !reflect.DeepEqual(got, []string{c.contents}) {
t.Errorf("expected %q, got %q", c.contents, got)
}
})
}
// Ensure files are actually stored in cache.
for _, c := range cases {
t.Run(fmt.Sprintf("cache check %d", c.fileNumber), func(t *testing.T) {
if got, ok := fcc.cache.Load(c.fileNumber); !ok {
t.Fatalf("file %d not found in cache", c.fileNumber)
} else if !reflect.DeepEqual(got, []string{c.contents}) {
t.Fatalf("expected %q, got %q", c.contents, got)
}
})
}
// Test clear-except operation.
t.Run("clear except", func(t *testing.T) {
fcc.ClearFilesExcept([]int{1})
if _, ok := fcc.cache.Load(1); !ok {
t.Fatal("file 1 not found in cache, expected to be kept")
}
if _, ok := fcc.cache.Load(2); ok {
t.Fatal("file 2 found in cache, expected to be cleared")
}
if _, ok := fcc.cache.Load(3); ok {
t.Fatal("file 3 found in cache, expected to be cleared")
}
})
}
func TestEndToEnd(t *testing.T) {
docs, err := run([]string{"argv0", "-path", "testdata/e2e"})
want := []int{1, 6, 9, 12, 14, 18}
if err != nil {
t.Fatal("error running program: ", err)
}
if len(docs) != 1 {
t.Fatalf("expected %d documents, got %d", 1, len(docs))
}
doc := docs[0]
if doc.ID != 1 {
t.Errorf("expected ID %d, got %d", 1, doc.ID)
}
if doc.LatestTimestamp != 5 {
t.Errorf("expected latest timestamp %d, got %d", 5, doc.LatestTimestamp)
}
if !reflect.DeepEqual(doc.AssociatedFiles, want) {
t.Errorf("expected associated files %v, got %v", want, doc.AssociatedFiles)
}
}

2
testdata/1.txt vendored Normal file
@@ -0,0 +1,2 @@
3
foo foo foo

2
testdata/2.txt vendored Normal file
@@ -0,0 +1,2 @@
5
bar bar bar

2
testdata/3.txt vendored Normal file
@@ -0,0 +1,2 @@
11
baz baz baz

1
testdata/4.txt vendored Normal file
@@ -0,0 +1 @@
11

1
testdata/5.txt vendored Normal file
@@ -0,0 +1 @@
11

1
testdata/7.txt vendored Normal file
@@ -0,0 +1 @@
5

12
testdata/e2e/1.txt vendored Normal file
@@ -0,0 +1,12 @@
0
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
That Funky Music”. When it was later released as its own single, it became an
international smash hit.
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
over Queen’s “Under Pressure”, was the B-side.

37
testdata/e2e/12.txt vendored Normal file
@@ -0,0 +1,37 @@
5
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
That Funky Music”. When it was later released as its own single, it became an
international smash hit.
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
over Queen’s “Under Pressure”, was the B-side.
A DJ at an FM station in Georgia liked the B-side better and played it on air.
Soon it became that station’s #1 requested song, leading stations in Tennessee
and Texas to do the same. It also became Video Jukebox’s most requested video.
After SBK Records' founder was played the song over the phone, he signed Ice the
next day. In August of 1990, his label released “Ice Ice Baby” as the A-side
with “Play That Funky Music” as its flipside.
The song began climbing charts around the world, eventually reaching the top 10
in twelve countries hitting #1 in six of them including the UK and the US
(where it became the first chart-topping rap single in history).
“Ice Ice Baby” contains an uncleared sample of Queen’s “Under Pressure”, so when
confronted about it, Ice claimed he’d altered it, but he later admitted he
actually hadn’t. The parties settled out of court for an undisclosed sum, and
members of Queen, plus the guest vocalist on the original song David Bowie, were
also given songwriting credits.
The song is credited for making hip-hop an acceptable genre to mainstream
media and continues to be popular into the 2000s. It was certified Gold in 2005
for selling 500K digital downloads and named by VH1 the #29 top song of the 90s.
But it also has its share of negative feedback, with MTV ranking it the #9 worst
video in history, and Houston Press calling it the worst song to come from
Texas. In 2012, actor/comedian Adam Scott discussed the song’s opening lyrics on
Conan, pointing out how ridiculous they sound when analyzed.

18
testdata/e2e/14.txt vendored Normal file
@@ -0,0 +1,18 @@
1
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
That Funky Music”. When it was later released as its own single, it became an
international smash hit.
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
over Queen’s “Under Pressure”, was the B-side.
A DJ at an FM station in Georgia liked the B-side better and played it on air.
Soon it became that station’s #1 requested song, leading stations in Tennessee
and Texas to do the same. It also became Video Jukebox’s most requested video.
After SBK Records' founder was played the song over the phone, he signed Ice the
next day. In August of 1990, his label released “Ice Ice Baby” as the A-side
with “Play That Funky Music” as its flipside.

49
testdata/e2e/18.txt vendored Normal file
@@ -0,0 +1,49 @@
4
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
That Funky Music”. When it was later released as its own single, it became an
international smash hit.
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
over Queen’s “Under Pressure”, was the B-side.
A DJ at an FM station in Georgia liked the B-side better and played it on air.
Soon it became that station’s #1 requested song, leading stations in Tennessee
and Texas to do the same. It also became Video Jukebox’s most requested video.
After SBK Records' founder was played the song over the phone, he signed Ice the
next day. In August of 1990, his label released “Ice Ice Baby” as the A-side
with “Play That Funky Music” as its flipside.
The song began climbing charts around the world, eventually reaching the top 10
in twelve countries hitting #1 in six of them including the UK and the US
(where it became the first chart-topping rap single in history).
That year “Ice Ice Baby” was certified Platinum two months after its release and
was ranked the #45 song of 1990 by Billboard. The song was also nominated for a
Grammy that year for Best Solo Rap Performance, but lost to MC Hammer’s “U Can’t
Touch This”.
Death Row Records' CEO, Suge Knight, learned that Marvin “Chocolate” Johnson, a
Death Row signee, was a co-writer on the track, so he invited Ice to his Los
Angeles hotel room to negotiate payment. It has been rumored that Vanilla Ice
was hung over a balcony during the negotiations, but Ice has denied these rumors
several times, insisting, “He didn’t have to hang me from no balcony or slap me
around or nothing”.
“Ice Ice Baby” contains an uncleared sample of Queen’s “Under Pressure”, so when
confronted about it, Ice claimed he’d altered it, but he later admitted he
actually hadn’t. The parties settled out of court for an undisclosed sum, and
members of Queen, plus the guest vocalist on the original song David Bowie, were
also given songwriting credits.
The song is credited for making hip-hop an acceptable genre to mainstream
media and continues to be popular into the 2000s. It was certified Gold in 2005
for selling 500K digital downloads and named by VH1 the #29 top song of the 90s.
But it also has its share of negative feedback, with MTV ranking it the #9 worst
video in history, and Houston Press calling it the worst song to come from
Texas. In 2012, actor/comedian Adam Scott discussed the song’s opening lyrics on
Conan, pointing out how ridiculous they sound when analyzed.

27
testdata/e2e/6.txt vendored Normal file
@@ -0,0 +1,27 @@
2
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
That Funky Music”. When it was later released as its own single, it became an
international smash hit.
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
over Queen’s “Under Pressure”, was the B-side.
A DJ at an FM station in Georgia liked the B-side better and played it on air.
Soon it became that station’s #1 requested song, leading stations in Tennessee
and Texas to do the same. It also became Video Jukebox’s most requested video.
After SBK Records' founder was played the song over the phone, he signed Ice the
next day. In August of 1990, his label released “Ice Ice Baby” as the A-side
with “Play That Funky Music” as its flipside.
The song began climbing charts around the world, eventually reaching the top 10
in twelve countries hitting #1 in six of them including the UK and the US
(where it became the first chart-topping rap single in history).
That year “Ice Ice Baby” was certified Platinum two months after its release and
was ranked the #45 song of 1990 by Billboard. The song was also nominated for a
Grammy that year for Best Solo Rap Performance, but lost to MC Hammer’s “U Can’t
Touch This”.

40
testdata/e2e/9.txt vendored Normal file
@@ -0,0 +1,40 @@
3
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
That Funky Music”. When it was later released as its own single, it became an
international smash hit.
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
over Queen’s “Under Pressure”, was the B-side.
A DJ at an FM station in Georgia liked the B-side better and played it on air.
Soon it became that station’s #1 requested song, leading stations in Tennessee
and Texas to do the same. It also became Video Jukebox’s most requested video.
After SBK Records' founder was played the song over the phone, he signed Ice the
next day. In August of 1990, his label released “Ice Ice Baby” as the A-side
with “Play That Funky Music” as its flipside.
The song began climbing charts around the world, eventually reaching the top 10
in twelve countries hitting #1 in six of them including the UK and the US
(where it became the first chart-topping rap single in history).
That year “Ice Ice Baby” was certified Platinum two months after its release and
was ranked the #45 song of 1990 by Billboard. The song was also nominated for a
Grammy that year for Best Solo Rap Performance, but lost to MC Hammer’s “U Can’t
Touch This”.
Death Row Records' CEO, Suge Knight, learned that Marvin “Chocolate” Johnson, a
Death Row signee, was a co-writer on the track, so he invited Ice to his Los
Angeles hotel room to negotiate payment. It has been rumored that Vanilla Ice
was hung over a balcony during the negotiations, but Ice has denied these rumors
several times, insisting, “He didn’t have to hang me from no balcony or slap me
around or nothing”.
“Ice Ice Baby” contains an uncleared sample of Queen’s “Under Pressure”, so when
confronted about it, Ice claimed he’d altered it, but he later admitted he
actually hadn’t. The parties settled out of court for an undisclosed sum, and
members of Queen, plus the guest vocalist on the original song David Bowie, were
also given songwriting credits.