docgrouper/main.go

// Timesheet
// Previously ~1h
// March 13, 2024: 00:00-02:30
// March 19, 2024: 15:00-19:00
// March 23, 2024: 20:00-
package main

import (
	"bufio"
	"flag"
	"fmt"
	"log/slog"
	"os"
	"path"
	"slices"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
)

// DataFilePath describes the default location where the file pool can be found.
const DefaultDataFilePath = "files"

// Command line options
var (
	DataFilePath string
	UseDocPrefix bool
	Verbose      bool
)

func main() {
	if err := run(os.Args); err != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
		os.Exit(1)
	}
}

// run is the main entry point for the program.
func run(args []string) error {
	flags := flag.NewFlagSet(args[0], flag.ExitOnError)
	flags.StringVar(&DataFilePath, "path", DefaultDataFilePath, "path to the file pool")
	flags.BoolVar(&UseDocPrefix, "prefix", false, "use '[doc ###]' prefix for output")
	flags.BoolVar(&Verbose, "verbose", false, "enable verbose logging")
	flags.Parse(args[1:])

	// SimilarityThreshold is the minimum similarity required for two files
	// to be considered related. This value is arbitrary and could be adjusted
	// based on the specific requirements of the problem.
	const SimilarityThreshold = 0.5

	fileTimes, times, err := orderFiles()
	if err != nil {
		return err
	}

	var (
		// documents is the master list of documents that will be built up.
		documents []*Document

		// fcc handles reading files and caching contents.
		fcc = make(fileContentsCache)
	)

	for i, timestamp := range times {
		_ = i
		// fmt.Printf("\rProcessing timestamp %d/%d", i+1, len(fileTimes))
		// Track the files at this timestamp that have been associated with documents, so
		// we can identify unassociated files later and then create new documents for them.
		var (
			wg              sync.WaitGroup
			associatedFiles sync.Map
		)

		wg.Add(len(documents))
		log("processing timestamp", "timestamp", timestamp, "numWorkers", len(documents))
		for _, doc := range documents {
			// Start a goroutine for each document, to parallelize the
			// comparison with the files in the current timestamp. A more robust
			// solution would limit the number of concurrent goroutines to avoid
			// exhausting system resources, but for this problem we won't have
			// more than a couple thousand documents. Goroutines are
			// lightweight enough (2K stack) that we can start them pretty
			// capriciously.
			go func(doc *Document, files []int) {
				defer wg.Done()
				for _, candidateFileNumber := range files {
					// Check to be certain this file hasn't been associated with another
					// document already. If it has been, continue to the next file.
					if _, ok := associatedFiles.Load(candidateFileNumber); ok {
						continue
					}

					latestFileNumber := doc.LatestAssociatedFile()
					overlap, err := compareFiles(fcc, latestFileNumber, candidateFileNumber)
					if err != nil {
						fmt.Fprintf(
							os.Stderr,
							"error comparing files %d and %d: %v\n",
							latestFileNumber, candidateFileNumber, err,
						)
					}
					if overlap >= SimilarityThreshold {
						// Add file to Document associated list
						doc.AssociateFile(candidateFileNumber, timestamp)
						associatedFiles.Store(candidateFileNumber, struct{}{})

						// We know this document won't be associated with any other files
						// with this timestamp, so we can stop looking at files with this
						// timestamp, for this document.
						return
					}
				}
			}(doc, fileTimes[timestamp])
		}

		// Wait for all document comparisons to complete for this timestamp.
		wg.Wait()

		// If we haven't associated all the files with existing documents, we need
		// to create new documents for those that remain.
		currentNumDocs := len(documents)
		for _, fileNumber := range fileTimes[timestamp] {
			if _, ok := associatedFiles.Load(fileNumber); !ok {
				doc := NewDocument(fileNumber, timestamp)
				documents = append(documents, &doc)
			}
		}
		if len(documents) > currentNumDocs {
			log("created new documents", "numAdded", len(documents)-currentNumDocs, "timestamp", timestamp)
		}

		// Now we can clear the cache of file contents for files that aren't associated with
		// a document, to conserve memory.
		var latestDocumentFiles []int
		for _, doc := range documents {
			latestDocumentFiles = append(latestDocumentFiles, doc.LatestAssociatedFile())
		}
		fcc.clearFilesExcept(latestDocumentFiles)
	}

	// Output the list of documents, showing their associated files in ascending order.
	// Order the documents by their first associated file.
	slices.SortFunc(documents, func(a, b *Document) int {
		return a.AssociatedFiles[0] - b.AssociatedFiles[0]
	})
	for _, doc := range documents {
		doc.SortAssociatedFiles()
		fmt.Println(doc)
	}

	return nil
}

// DocumentIDSource is a concurrency-safe source from which to identify
// documents. This could easily be something other than an integer, but using
// this allows us to just use the standard library.
var DocumentIDSource atomic.Uint32

// Document stores a document ID and a list of associated files.
type Document struct {
	AssociatedFiles []int
	LatestTimestamp int
	ID              uint32
}

// String formats a document for output in the format described in the
// requirements.
func (d Document) String() string {
	var sb strings.Builder
	for _, f := range d.AssociatedFiles {
		sb.WriteString(fmt.Sprintf("%d ", f))
	}
	if UseDocPrefix {
		return fmt.Sprintf("[doc %4d] %s", d.ID, sb.String())
	}
	return sb.String()
}

// AssociateFile adds a file number to the list of associated files for a
// document, and also records the latest timestamp now associated with the
// document.
func (d *Document) AssociateFile(fileNumber, timestamp int) {
	d.AssociatedFiles = append(d.AssociatedFiles, fileNumber)
	d.LatestTimestamp = timestamp
}

// LatestAssociatedFile returns the most recent file associated with a document.
func (d Document) LatestAssociatedFile() int {
	return d.AssociatedFiles[len(d.AssociatedFiles)-1]
}

// SortAssociatedFiles sorts the list of associated files for a document,
// since the requirements stipulate output in ascending order.
func (d *Document) SortAssociatedFiles() {
	slices.Sort(d.AssociatedFiles)
}

// NewDocument creates a new Document struct and initializes an ID and records
// the first file and timestamp associated with it.
func NewDocument(fileNumber, timestamp int) Document {
	return Document{
		ID:              DocumentIDSource.Add(1),
		LatestTimestamp: timestamp,
		AssociatedFiles: []int{fileNumber},
	}
}

// readFileTime reads the first line of the file, which represents a
// time/version. The integer value will be returned.
func readFileTime(filepath string) (int, error) {
	file, err := os.Open(filepath)
	if err != nil {
		return 0, err
	}
	defer file.Close()

	s := bufio.NewScanner(file)
	var firstLine string
	if s.Scan() {
		firstLine = s.Text()
	}
	if err := s.Err(); err != nil {
		return 0, err
	}

	time, err := strconv.Atoi(firstLine)
	if err != nil {
		return 0, fmt.Errorf("invalid time %s: %w", firstLine, err)
	}
	return time, nil
}

// compareFiles computes how much two files overlap, on a scale
// of 0 to 1 by iterating through the files and identifying lines
// that are duplicated.
func compareFiles(fcc fileContentsCache, f1Number, f2Number int) (float64, error) {
	f1, err := fcc.getFileContents(f1Number)
	if err != nil {
		return 0, fmt.Errorf("file %d: %w", f1Number, err)
	}
	f2, err := fcc.getFileContents(f2Number)
	if err != nil {
		return 0, fmt.Errorf("file %d: %w", f2Number, err)
	}

	histogram := make(map[string]int)
	for _, lines := range [][]string{f1, f2} {
		for _, line := range lines {
			histogram[line]++
		}
	}

	var overlap int
	for _, v := range histogram {
		if v == 2 {
			overlap++
		}
	}
	return float64(overlap) / float64(len(histogram)), nil
}

// fileContentsCache is a cache of file contents, keyed by file number,
// to avoid reading the same file from disk multiple times.
type fileContentsCache map[int][]string

// getFileContents returns the contents of a file, excluding the first timestamp
// line. If the file is already in the cache, the contents are returned from
// there, otherwise the file is read from disk and the contents are cached.
func (fcc fileContentsCache) getFileContents(fileNumber int) ([]string, error) {
	if contents, ok := fcc[fileNumber]; ok {
		return contents, nil
	}
	var (
		fileName = makeFilePath(fileNumber)
		lines    []string
	)

	f, err := os.Open(fileName)
	if err != nil {
		return nil, err
	}

	s := bufio.NewScanner(f)

	// Ignore first line that's just a timestamp.
	if !s.Scan() {
		fcc[fileNumber] = []string{}
		return []string{}, nil
	}

	for s.Scan() {
		lines = append(lines, s.Text())
	}
	if err := s.Err(); err != nil {
		return nil, err
	}
	return lines, nil
}

// clearFilesExcept removes the contents of the fileContentsCache except for the
// provided file numbers. This helps conserve memory by removing the contents of
// files that are no longer of interest, which we can be sure of since we are
// proceeding in order of time.
func (fcc fileContentsCache) clearFilesExcept(fileNumbers []int) {
	for fNum := range fcc {
		if !slices.Contains(fileNumbers, fNum) {
			delete(fcc, fNum)
		}
	}
}

func makeFileName(number int) string {
	return fmt.Sprintf("%d.txt", number)
}

func makeFilePath(number int) string {
	return path.Join(DataFilePath, makeFileName(number))
}

// orderFiles determines the timestamp version of each file and creates a map of
// time to file numbers. It sorts the times (since maps are not ordered) so that
// the map can be iterated in order of time. This allows stepping through the
// history of the files from the beginning. Using this, we can construct a
// "chain" of evolution for a given document.
func orderFiles() (map[int][]int, []int, error) {
	timeMap := make(map[int][]int)

	dirEntries, err := os.ReadDir(DataFilePath)
	if err != nil {
		return nil, nil, fmt.Errorf("reading directory %s: %w", DataFilePath, err)
	}
	for _, entry := range dirEntries {
		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") {
			continue
		}

		// Get the numeric representation of the file number.
		var fileNumber int
		{
			numberStr := strings.TrimSuffix(entry.Name(), ".txt")
			var err error
			if fileNumber, err = strconv.Atoi(numberStr); err != nil {
				return nil, nil, fmt.Errorf("invalid file number in file name %q: %w", entry.Name(), err)
			}
		}

		filePath := path.Join(DataFilePath, entry.Name())
		modTime, err := readFileTime(filePath)
		if err != nil {
			return nil, nil, err
		}
		if timeMap[modTime] == nil {
			timeMap[modTime] = make([]int, 0)
		}
		timeMap[modTime] = append(timeMap[modTime], fileNumber)
	}

	// Now make a slice of the times and sort them, so we can iterate through
	// them in order.
	timeSlice := make([]int, 0, len(timeMap))
	for k := range timeMap {
		timeSlice = append(timeSlice, k)
	}
	slices.Sort(timeSlice)
	return timeMap, timeSlice, nil
}

func log(msg string, args ...any) {
	if Verbose {
		slog.Info(msg, args...)
	}
}