docgrouper/main.go

// Time sheet
// Previously ~1h
// March 13, 2024: 00:00-02:30
// March 19, 2024: 15:00-19:00
// March 23, 2024: 20:00-22:00
// April 02, 2024: 12:30-17:00
// April 04, 2024: 21:00-23:30
// April 05, 2024: 00:00-02:00
package main

import (
	"bufio"
	"flag"
	"fmt"
	"log/slog"
	"os"
	"path"
	"runtime"
	"slices"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
)

const (
	// defaultSimilarityThreshold is the default minimum similarity required for
	// two files to be considered related; it is the default value of the
	// -threshold flag. This value is arbitrary and could be adjusted based on
	// the specific requirements of the problem.
	defaultSimilarityThreshold = 0.5

	// defaultDataFilePath describes the default location where the file pool can be
	// found; it is the default value of the -path flag.
	defaultDataFilePath = "files"
)

// Command line options. run binds each of these to a flag before any work
// starts:
//
//   - dataFilePath: directory containing the file pool (-path).
//   - similarityThreshold: minimum similarity for a file to be matched to a
//     document (-threshold).
//   - useDocPrefix: prepend a "[doc ###]" prefix to each output line (-prefix).
//   - verbose: enable the informational logging done through log (-verbose).
//   - numWorkers: number of comparison workers to start (-workers).
var (
	dataFilePath        string
	similarityThreshold float64
	useDocPrefix        bool
	verbose             bool
	numWorkers          int
)

// main runs the grouping over the process arguments and prints one line per
// resulting document to standard output. If run fails, the error is written to
// standard error and the process exits with status 1.
func main() {
	documents, err := run(os.Args)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
		// Use a conventional failure status: os.Exit documents that portable
		// status codes lie in the range [0, 125], which excludes -1.
		os.Exit(1)
	}
	for _, doc := range documents {
		fmt.Println(doc)
	}
}

// run is the program's real entry point. It parses the command line in args
// (args[0] is the program name), orders the files in the pool by the timestamp
// on their first line, and then walks the timestamps in ascending order. At
// each timestamp every known document is handed to the worker pool to look for
// a matching file; files left unmatched become new documents. The returned
// documents are sorted for output.
func run(args []string) ([]*Document, error) {
	fs := flag.NewFlagSet(args[0], flag.ExitOnError)
	fs.StringVar(&dataFilePath, "path", defaultDataFilePath, "path to the file pool")
	fs.Float64Var(&similarityThreshold, "threshold", defaultSimilarityThreshold, "similarity threshold")
	fs.IntVar(&numWorkers, "workers", runtime.NumCPU()*2, "number of workers to use")
	fs.BoolVar(&useDocPrefix, "prefix", false, "use '[doc ###]' prefix for output")
	fs.BoolVar(&verbose, "verbose", false, "enable verbose logging")
	// The flag set uses flag.ExitOnError, so Parse terminates the process on
	// bad input and there is no error left to act on here.
	_ = fs.Parse(args[1:])

	// Files must be visited in time order, so first learn each file's
	// timestamp and get the distinct timestamps in ascending order.
	filesByTime, timestamps, err := orderFiles(dataFilePath)
	if err != nil {
		return nil, err
	}

	manager := NewDocumentManager(dataFilePath, similarityThreshold, numWorkers)

	for tsIndex, ts := range timestamps {
		candidates := filesByTime[ts]

		// matched records which of this timestamp's files were claimed by an
		// existing document. It is declared inside the loop body so that each
		// timestamp starts with an empty set.
		var matched sync.Map

		// pending lets us block until every work item for this timestamp has
		// been processed; only then do we know which files are left over and
		// need documents of their own.
		var pending sync.WaitGroup

		log("processing timestamp", "timestamp", ts, "timestampIndex", tsIndex, "totalTimestamps", len(timestamps))

		totalDocs := len(manager.Documents)
		for docIndex, doc := range manager.Documents {
			pending.Add(1)
			manager.WorkCh <- WorkItem{
				doc:             doc,
				fileNumbers:     candidates,
				timestamp:       ts,
				associatedFiles: &matched,
				wg:              &pending,
			}
			log(
				"submitted work",
				"documentNumber", docIndex+1,
				"documentID", doc.ID,
				"totalDocs", totalDocs,
				"timestamp", ts,
			)
		}

		pending.Wait()

		// Every file at this timestamp that no existing document claimed
		// starts a new document.
		created := 0
		for _, fileNumber := range candidates {
			if _, claimed := matched.Load(fileNumber); claimed {
				continue
			}
			manager.AddNewDocument(fileNumber, ts)
			created++
		}
		if created > 0 {
			log("created new documents", "numAdded", created, "timestamp", ts)
		}

		// Free up memory.
		manager.ShrinkCache()
	}

	manager.Shutdown()
	return manager.SortedDocuments(), nil
}

// WorkItem is what will be sent to the workers in the worker pool. Each item
// asks a worker to compare one document against the candidate files of one
// timestamp:
//
//   - doc is the document whose latest associated file is compared.
//   - fileNumbers are the candidate files for the timestamp being processed.
//   - timestamp is recorded on the document when a candidate matches.
//   - associatedFiles is the per-timestamp set of files already matched to a
//     document; workers both consult and update it.
//   - wg is signalled with Done once the worker has finished the item.
type WorkItem struct {
	doc             *Document
	fileNumbers     []int
	timestamp       int
	associatedFiles *sync.Map
	wg              *sync.WaitGroup
}

// DocumentManager handles the processing of documents and files. It maintains a
// list of documents and a cache of file contents, and uses a pool of workers to
// compare documents against files.
//
// The unexported fields are: similarityThreshold, the minimum similarity at
// which a worker associates a file with a document; fcc, the cache through
// which file contents are read; and wg, which tracks the running workers so
// that Shutdown can wait for them to exit.
type DocumentManager struct {
	// Documents is the list of documents that have been identified.
	Documents []*Document

	// WorkCh is the channel through which work items are submitted to the workers.
	// Closing it (see Shutdown) makes the workers exit.
	WorkCh chan WorkItem

	// docIDSource is a concurrency-safe source from which to identify documents.
	// This could easily be something other than an integer, but using this allows
	// us to just use the standard library.
	docIDSource atomic.Uint32

	similarityThreshold float64
	fcc                 *FileContentsCache
	wg                  sync.WaitGroup
}

// NewDocumentManager creates a new DocumentManager that reads files from
// fileBasePath, treats similarityThreshold as the minimum similarity for a
// match, and starts numWorkers comparison workers listening on WorkCh.
//
// A numWorkers value below 1 is treated as 1: with no workers every send on
// the unbuffered WorkCh would block forever, and a negative count would make
// sync.WaitGroup.Add panic.
//
// Call Shutdown when finished so that the workers exit.
func NewDocumentManager(fileBasePath string, similarityThreshold float64, numWorkers int) *DocumentManager {
	numWorkers = max(numWorkers, 1)

	dm := &DocumentManager{
		Documents:           make([]*Document, 0),
		similarityThreshold: similarityThreshold,
		fcc:                 &FileContentsCache{BaseDir: fileBasePath},
		WorkCh:              make(chan WorkItem),
	}

	// Register the workers before starting them, so that a worker's Done can
	// never be observed ahead of the matching Add.
	dm.wg.Add(numWorkers)
	for wID := range numWorkers {
		go dm.ComparisonWorker(wID + 1)
	}

	return dm
}

// Shutdown stops the worker pool: it closes the work channel, which ends each
// worker's receive loop, and returns once every worker has reported that it
// has exited. No work may be submitted after Shutdown has been called.
func (dm *DocumentManager) Shutdown() {
	// Deferred so the wait happens after the channel has been closed.
	defer dm.wg.Wait()
	close(dm.WorkCh)
}

// AddNewDocument appends a new document to the manager's list. The document
// receives the next ID from docIDSource, has fileNumber as its only associated
// file, and has timestamp as its latest timestamp. The append to Documents is
// not synchronized.
func (dm *DocumentManager) AddNewDocument(fileNumber, timestamp int) {
	id := dm.docIDSource.Add(1)
	dm.Documents = append(dm.Documents, &Document{
		AssociatedFiles: []int{fileNumber},
		LatestTimestamp: timestamp,
		ID:              id,
	})
}

// ShrinkCache trims the file contents cache so that it only retains the most
// recently associated file of each document; those are the only cached files
// that later comparisons read on the document side. Note that this is not
// concurrent-safe and should only be called when operations that could modify
// the document list are not running. This is an optimization, but could be
// removed if memory usage is not a concern.
func (dm *DocumentManager) ShrinkCache() {
	var keep []int
	for i := range dm.Documents {
		keep = append(keep, dm.Documents[i].LatestAssociatedFile())
	}
	dm.fcc.ClearFilesExcept(keep)
}

// SortedDocuments returns the manager's documents ready for output: each
// document's associated files are sorted in ascending order, and the documents
// themselves are ordered by their first (lowest) associated file number.
//
// Sorting happens in place and discards the temporal order of the associated
// files, so this must only be called once all processing has finished.
func (dm *DocumentManager) SortedDocuments() []*Document {
	// Sort the associated files for each document.
	for _, doc := range dm.Documents {
		doc.SortAssociatedFiles()
	}
	// Sort the documents by their first associated file number. An explicit
	// three-way comparison is used instead of subtracting the two numbers,
	// because a subtraction can overflow and report the wrong order.
	slices.SortFunc(dm.Documents, func(a, b *Document) int {
		x, y := a.AssociatedFiles[0], b.AssociatedFiles[0]
		switch {
		case x < y:
			return -1
		case x > y:
			return 1
		default:
			return 0
		}
	})
	return dm.Documents
}

// ComparisonWorker receives work items from WorkCh until the channel is
// closed. Each item names a document and a list of candidate file numbers; the
// worker compares the document's latest file against each candidate and
// associates the first sufficiently similar, still unclaimed candidate with
// the document. workerID is only used to label log output.
//
// When the channel is closed the worker reports its exit on the manager's
// WaitGroup.
func (dm *DocumentManager) ComparisonWorker(workerID int) {
	// Report that this worker is shutting down, whichever way the loop ends.
	defer dm.wg.Done()

	for workItem := range dm.WorkCh {
		dm.processWorkItem(workItem, workerID)
	}
}

// processWorkItem handles a single work item on behalf of ComparisonWorker and
// signals the item's WaitGroup when it is finished.
func (dm *DocumentManager) processWorkItem(item WorkItem, workerID int) {
	defer item.wg.Done()

	// The document's latest file cannot change during this item: the only
	// change made here is the association that ends the loop.
	latestFileNumber := item.doc.LatestAssociatedFile()

	for _, fileNumber := range item.fileNumbers {
		if _, ok := item.associatedFiles.Load(fileNumber); ok {
			// This file has already been matched; skip the comparison.
			continue
		}

		similarity, err := dm.compareFiles(latestFileNumber, fileNumber)
		if err != nil {
			// Simplistic error handling: log the error and move on to the
			// next file. A file that could not be compared must never count
			// as a match, whatever the threshold is.
			slog.Error(
				"error comparing files",
				"file1", latestFileNumber,
				"file2", fileNumber,
				"document", item.doc.ID,
				"worker", workerID,
				"error", err,
			)
			continue
		}

		// If current file doesn't match current document, skip to the next file.
		if similarity < dm.similarityThreshold {
			continue
		}

		// Claim the file atomically. A separate Load followed by Store would
		// let two workers both see the file as free and attach it to two
		// different documents.
		if _, taken := item.associatedFiles.LoadOrStore(fileNumber, struct{}{}); taken {
			continue
		}

		// Current file matches current document, so record this.
		item.doc.AssociateFile(fileNumber, item.timestamp)
		log(
			"match found",
			"document", item.doc.ID,
			"file", fileNumber,
			"time", item.timestamp,
			"worker", workerID,
		)

		// We don't need to consider this document anymore since we've found
		// a match. End processing and wait for more work.
		return
	}
}

// compareFiles computes how much two files overlap, on a scale of 0 to 1. The
// result is the number of distinct non-blank lines that occur in both files
// divided by the number of distinct non-blank lines that occur in either file.
// Repeating a line within one file therefore has no effect on the score.
//
// If neither file has any non-blank line the similarity is 0 rather than the
// NaN that 0/0 would produce. An error is returned if either file cannot be
// read.
func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) {
	f1, err := dm.fcc.GetFileContents(f1Number)
	if err != nil {
		return 0, fmt.Errorf("file %d: %w", f1Number, err)
	}
	f2, err := dm.fcc.GetFileContents(f2Number)
	if err != nil {
		return 0, fmt.Errorf("file %d: %w", f2Number, err)
	}

	// Bit flags recording which file(s) a line was seen in.
	const (
		inFirst  uint8 = 1
		inSecond uint8 = 2
		inBoth         = inFirst | inSecond
	)

	seen := make(map[string]uint8, len(f1)+len(f2))
	for _, line := range f1 {
		// Skip blank lines, which can throw off the count.
		if line != "" {
			seen[line] |= inFirst
		}
	}
	for _, line := range f2 {
		if line != "" {
			seen[line] |= inSecond
		}
	}

	if len(seen) == 0 {
		return 0, nil
	}

	var overlap int
	for _, where := range seen {
		if where == inBoth {
			overlap++
		}
	}
	return float64(overlap) / float64(len(seen)), nil
}

// Document stores a document ID and a list of associated files.
//
// AssociatedFiles holds the file numbers in the order they were associated
// until SortAssociatedFiles is called; LatestTimestamp is the timestamp passed
// with the most recent association; ID is the identifier assigned when the
// document was created.
type Document struct {
	AssociatedFiles []int
	LatestTimestamp int
	ID              uint32
}

// String renders the document as its associated file numbers separated by
// spaces. When useDocPrefix is set, the line is prefixed with "[doc N] " (the
// ID right-aligned to a width of four) and surrounding whitespace is trimmed
// from the number list; otherwise the list is returned as built, which leaves
// a single space after the last number.
func (d Document) String() string {
	var files strings.Builder
	for _, fileNumber := range d.AssociatedFiles {
		fmt.Fprintf(&files, "%d ", fileNumber)
	}
	if !useDocPrefix {
		return files.String()
	}
	return fmt.Sprintf("[doc %4d] %s", d.ID, strings.TrimSpace(files.String()))
}

// AssociateFile records that fileNumber belongs to the document: timestamp
// becomes the document's latest timestamp and fileNumber is appended to the
// end of the associated files, making it the document's latest file.
func (d *Document) AssociateFile(fileNumber, timestamp int) {
	d.LatestTimestamp = timestamp
	d.AssociatedFiles = append(d.AssociatedFiles, fileNumber)
}

// LatestAssociatedFile returns the last entry of the document's associated
// files, which is the most recently associated file as long as the list is
// still in the order in which files were associated (that is, before
// SortAssociatedFiles has been called). It panics if the document has no
// associated files.
func (d Document) LatestAssociatedFile() int {
	last := len(d.AssociatedFiles) - 1
	return d.AssociatedFiles[last]
}

// SortAssociatedFiles puts the document's associated files into ascending
// numerical order in place, as the output requirements stipulate. Doing so
// destroys the temporal order that LatestAssociatedFile relies on, so it must
// only be invoked when the work is entirely finished.
func (d *Document) SortAssociatedFiles() {
	files := d.AssociatedFiles
	slices.Sort(files)
}

// FileContentsCache is a cache of file contents, keyed by file number,
// to avoid reading the same file from disk multiple times.
//
// BaseDir is the directory in which "<number>.txt" files are looked up. cache
// maps an int file number to the []string of lines read from that file, not
// including its first (timestamp) line.
type FileContentsCache struct {
	BaseDir string
	cache   sync.Map
}

// GetFileContents returns the lines of the file with the given number,
// excluding the first line, which holds the timestamp. If the file is already
// in the cache, the contents are returned from there; otherwise the file is
// read from disk and the contents are cached for later calls.
//
// An error is returned if the file cannot be opened or read; nothing is cached
// in that case.
func (fcc *FileContentsCache) GetFileContents(fileNumber int) ([]string, error) {
	if contents, ok := fcc.cache.Load(fileNumber); ok {
		return contents.([]string), nil
	}

	f, err := os.Open(makeFilePath(fcc.BaseDir, fileNumber))
	if err != nil {
		return nil, err
	}
	// Close the file on every return path; otherwise each uncached read would
	// leak a file descriptor for the rest of the run.
	defer f.Close()

	s := bufio.NewScanner(f)

	// Read first line and ignore it since it's just the timestamp. If this
	// scan fails, the loop below does not run and s.Err reports the failure.
	_ = s.Scan()

	// Read the remaining lines.
	var lines []string
	for s.Scan() {
		lines = append(lines, s.Text())
	}
	if err := s.Err(); err != nil {
		return nil, err
	}

	fcc.cache.Store(fileNumber, lines)
	return lines, nil
}

// ClearFilesExcept removes every entry from the cache whose file number is not
// listed in fileNumbers. This helps conserve memory by dropping the contents
// of files that are no longer of interest, which we can be sure of since we
// are proceeding in order of time.
func (fcc *FileContentsCache) ClearFilesExcept(fileNumbers []int) {
	// Index the numbers to keep so that each cached key costs one map lookup
	// instead of a scan over the whole list.
	keep := make(map[int]struct{}, len(fileNumbers))
	for _, n := range fileNumbers {
		keep[n] = struct{}{}
	}

	// sync.Map permits calling its methods, Delete included, from inside the
	// Range callback, so entries can be removed as they are visited.
	fcc.cache.Range(func(key, _ any) bool {
		if _, ok := keep[key.(int)]; !ok {
			fcc.cache.Delete(key)
		}
		return true
	})
}

// readFileTime reads the first line of the file at filepath, which represents
// a time/version, and returns it as an integer. Whitespace around the number
// is ignored.
//
// An error is returned if the file cannot be opened or read, or if the first
// line is not a valid integer (which includes an empty file). Errors name the
// offending file, and the invalid value is quoted so that an empty first line
// is visible in the message.
func readFileTime(filepath string) (int, error) {
	file, err := os.Open(filepath)
	if err != nil {
		return 0, err
	}
	defer file.Close()

	s := bufio.NewScanner(file)
	var firstLine string
	if s.Scan() {
		firstLine = strings.TrimSpace(s.Text())
	}
	if err := s.Err(); err != nil {
		return 0, fmt.Errorf("reading time from %s: %w", filepath, err)
	}

	timestamp, err := strconv.Atoi(firstLine)
	if err != nil {
		return 0, fmt.Errorf("reading time from %s: invalid time %q: %w", filepath, firstLine, err)
	}
	return timestamp, nil
}

// orderFiles scans dir for regular "<number>.txt" entries, reads the timestamp
// from the first line of each, and returns two values: a map from timestamp to
// the file numbers carrying that timestamp, and the distinct timestamps in
// ascending order. Maps are unordered, so the sorted slice is what lets the
// caller step through the history of the files from the beginning and build a
// "chain" of evolution for each document.
//
// Directories and entries without a ".txt" suffix are skipped. An error is
// returned if the directory cannot be read, if a ".txt" name is not a number,
// or if a file's timestamp cannot be read.
func orderFiles(dir string) (map[int][]int, []int, error) {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return nil, nil, fmt.Errorf("reading directory %s: %w", dir, err)
	}

	filesByTime := make(map[int][]int)
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		name := entry.Name()
		stem, isText := strings.CutSuffix(name, ".txt")
		if !isText {
			continue
		}

		// The part of the name before ".txt" is the file number.
		fileNumber, err := strconv.Atoi(stem)
		if err != nil {
			return nil, nil, fmt.Errorf("invalid file number in file name %q: %w", name, err)
		}

		fileTime, err := readFileTime(path.Join(dir, name))
		if err != nil {
			return nil, nil, err
		}
		// Appending to a missing (nil) entry creates it.
		filesByTime[fileTime] = append(filesByTime[fileTime], fileNumber)
	}

	// Collect the distinct timestamps and put them in ascending order.
	times := make([]int, 0, len(filesByTime))
	for t := range filesByTime {
		times = append(times, t)
	}
	slices.Sort(times)
	return filesByTime, times, nil
}

// makeFileName returns the name of the pool file with the given number: the
// number in decimal followed by ".txt".
func makeFileName(number int) string {
	const extension = ".txt"
	return fmt.Sprint(number) + extension
}

// makeFilePath returns the slash-separated path of the pool file with the
// given number inside the directory dataFilePath.
func makeFilePath(dataFilePath string, number int) string {
	name := makeFileName(number)
	return path.Join(dataFilePath, name)
}

// log writes an informational record through slog.Info, passing msg and the
// alternating key/value args through unchanged, but only when verbose is set;
// otherwise it does nothing.
func log(msg string, args ...any) {
	if !verbose {
		return
	}
	slog.Info(msg, args...)
}