commit 5f1a8bc256a7e257515751b9d2e5c9918f424ceb
Author: Ian Molee <imolee@gmail.com>
Date:   Sat Mar 23 20:13:30 2024 -0700

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..397b4a7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.log
diff --git a/ProjectDescription.pdf b/ProjectDescription.pdf
new file mode 100644
index 0000000..8c549fd
Binary files /dev/null and b/ProjectDescription.pdf differ
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..49f2ba2
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,5 @@
+module github.com/ianfoo/steelray-docgrouper
+
+go 1.22.1
+
+require github.com/adrg/strutil v0.3.1
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..e81f627
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,18 @@
+github.com/adrg/strutil v0.3.1 h1:OLvSS7CSJO8lBii4YmBt8jiK9QOtB9CzCzwl4Ic/Fz4=
+github.com/adrg/strutil v0.3.1/go.mod h1:8h90y18QLrs11IBffcGX3NW/GFBXCMcNg4M7H6MspPA=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
+github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..7fe8cb1
--- /dev/null
+++ b/main.go
@@ -0,0 +1,373 @@
+// Timesheet
+// Previously ~1h
+// March 13, 2024: 00:00-02:30
+// March 19, 2024: 15:00-19:00
+// March 23, 2024: 20:00-
+package main
+
+import (
+	"bufio"
+	"flag"
+	"fmt"
+	"log/slog"
+	"os"
+	"path"
+	"slices"
+	"strconv"
+	"strings"
+	"sync"
+	"sync/atomic"
+)
+
+// DefaultDataFilePath describes the default location where the file pool can be found.
+const DefaultDataFilePath = "files"
+
+// Command line options; run binds each one to the flag named in its comment.
+var (
+	DataFilePath string // -path: path to the file pool
+	UseDocPrefix bool   // -prefix: use '[doc ###]' prefix for output
+	Verbose      bool   // -verbose: enable verbose logging
+)
+
+func main() { // hands the raw command line to run and reports any failure it returns
+	if err := run(os.Args); err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(1) // non-zero exit status signals the failure to the caller
+	}
+}
+
+// run is the main entry point for the program. It parses the flags, steps
+// through the file pool in ascending timestamp order, attaches each file to a
+// document whose latest file it overlaps by at least SimilarityThreshold (or
+// starts a new document for it), and prints one line per document.
+func run(args []string) error {
+	flags := flag.NewFlagSet(args[0], flag.ExitOnError)
+	flags.StringVar(&DataFilePath, "path", DefaultDataFilePath, "path to the file pool")
+	flags.BoolVar(&UseDocPrefix, "prefix", false, "use '[doc ###]' prefix for output")
+	flags.BoolVar(&Verbose, "verbose", false, "enable verbose logging")
+	flags.Parse(args[1:]) // flag.ExitOnError: a bad flag exits the process inside Parse
+
+	// SimilarityThreshold is the minimum similarity required for two files
+	// to be considered related. This value is arbitrary and could be adjusted
+	// based on the specific requirements of the problem.
+	const SimilarityThreshold = 0.5
+
+	fileTimes, times, err := orderFiles()
+	if err != nil {
+		return err
+	}
+
+	var (
+		// documents is the master list of documents that will be built up.
+		documents []*Document
+
+		// fcc handles reading files and caching contents. NOTE(review): all the
+		// goroutines below share it; confirm getFileContents guards its map access.
+		fcc = make(fileContentsCache)
+	)
+
+	for _, timestamp := range times {
+		// Track the files at this timestamp that have been associated with documents, so
+		// we can identify unassociated files later and then create new documents for them.
+		var (
+			wg              sync.WaitGroup
+			associatedFiles sync.Map
+		)
+
+		wg.Add(len(documents))
+		log("processing timestamp", "timestamp", timestamp, "numWorkers", len(documents))
+		for _, doc := range documents {
+			// Start a goroutine for each document, to parallelize the comparison with the
+			// files in the current timestamp. A more robust solution would limit the number
+			// of concurrent goroutines to avoid exhausting system resources, but for this
+			// problem we won't have more than a couple thousand documents. Goroutines are
+			// lightweight enough (2K stack) that we can start them pretty capriciously.
+			go func(doc *Document, files []int) {
+				defer wg.Done()
+				latestFileNumber := doc.LatestAssociatedFile()
+				for _, candidateFileNumber := range files {
+					// Skip files that another document has already claimed.
+					if _, ok := associatedFiles.Load(candidateFileNumber); ok {
+						continue
+					}
+
+					overlap, err := compareFiles(fcc, latestFileNumber, candidateFileNumber)
+					if err != nil {
+						fmt.Fprintf(os.Stderr, "error comparing files %d and %d: %v\n",
+							latestFileNumber, candidateFileNumber, err)
+						continue
+					}
+					if overlap < SimilarityThreshold {
+						continue
+					}
+
+					// Claim the file with LoadOrStore so that exactly one document wins
+					// when several match it at once; the others keep looking.
+					if _, taken := associatedFiles.LoadOrStore(candidateFileNumber, struct{}{}); taken {
+						continue
+					}
+
+					// A document takes at most one file per timestamp, so stop here.
+					doc.AssociateFile(candidateFileNumber, timestamp)
+					return
+				}
+			}(doc, fileTimes[timestamp])
+		}
+
+		// Wait for all document comparisons to complete for this timestamp.
+		wg.Wait()
+
+		// If we haven't associated all the files with existing documents, we need
+		// to create new documents for those that remain.
+		currentNumDocs := len(documents)
+		for _, fileNumber := range fileTimes[timestamp] {
+			if _, ok := associatedFiles.Load(fileNumber); !ok {
+				doc := NewDocument(fileNumber, timestamp)
+				documents = append(documents, &doc)
+			}
+		}
+		if len(documents) > currentNumDocs {
+			log("created new documents", "numAdded", len(documents)-currentNumDocs, "timestamp", timestamp)
+		}
+
+		// Now we can clear the cache of file contents for files that aren't associated with
+		// a document, to conserve memory.
+		latestDocumentFiles := make([]int, 0, len(documents))
+		for _, doc := range documents {
+			latestDocumentFiles = append(latestDocumentFiles, doc.LatestAssociatedFile())
+		}
+		fcc.clearFilesExcept(latestDocumentFiles)
+	}
+
+	// Output the list of documents, showing their associated files in ascending order.
+	// Order the documents by their first associated file.
+	slices.SortFunc(documents, func(a, b *Document) int {
+		return a.AssociatedFiles[0] - b.AssociatedFiles[0]
+	})
+	for _, doc := range documents {
+		doc.SortAssociatedFiles()
+		fmt.Println(doc)
+	}
+
+	return nil
+}
+
+// DocumentIDSource is a concurrency-safe source from which to identify
+// documents; NewDocument takes each new ID from DocumentIDSource.Add(1). It is
+// an integer counter so that only the standard library is needed.
+var DocumentIDSource atomic.Uint32
+
+// Document stores a document ID and a list of associated files.
+type Document struct {
+	AssociatedFiles []int  // file numbers, appended in the order they are associated
+	LatestTimestamp int    // timestamp given to the latest NewDocument/AssociateFile call
+	ID              uint32 // assigned by NewDocument from DocumentIDSource
+}
+
+// String renders the associated file numbers, each followed by one space (so the
+// result ends in a space); with UseDocPrefix set, a "[doc <ID>] " tag comes first.
+func (d Document) String() string {
+	var files strings.Builder
+	for _, fileNumber := range d.AssociatedFiles {
+		fmt.Fprintf(&files, "%d ", fileNumber)
+	}
+	if !UseDocPrefix {
+		return files.String()
+	}
+	return fmt.Sprintf("[doc %4d] %s", d.ID, files.String())
+}
+
+// AssociateFile makes timestamp the document's latest timestamp and appends
+// fileNumber to the end of its associated files. It does no locking of its
+// own, so concurrent calls on the same document need outside coordination.
+func (d *Document) AssociateFile(fileNumber, timestamp int) {
+	d.LatestTimestamp = timestamp
+	d.AssociatedFiles = append(d.AssociatedFiles, fileNumber)
+}
+
+// LatestAssociatedFile returns the last entry of AssociatedFiles (the most recent file until the list is sorted); it panics if the list is empty.
+func (d Document) LatestAssociatedFile() int {
+	return d.AssociatedFiles[len(d.AssociatedFiles)-1]
+}
+
+// SortAssociatedFiles sorts the document's associated files in place, in ascending order, since the
+// requirements stipulate ascending output. Afterwards the last entry is the largest file number.
+func (d *Document) SortAssociatedFiles() {
+	slices.Sort(d.AssociatedFiles)
+}
+
+// NewDocument returns a Document whose ID is the next value of DocumentIDSource
+// and whose only associated file so far is fileNumber, recorded at timestamp.
+func NewDocument(fileNumber, timestamp int) Document {
+	var doc Document
+	doc.ID = DocumentIDSource.Add(1)
+	doc.LatestTimestamp = timestamp
+	doc.AssociatedFiles = []int{fileNumber}
+	return doc
+}
+
+// readFileTime reads the first line of the file, which represents a
+// time/version, and returns it as an integer. Whitespace around the number is
+// ignored; a file with no first line at all is reported as an error.
+func readFileTime(filepath string) (int, error) {
+	file, err := os.Open(filepath)
+	if err != nil {
+		return 0, err
+	}
+	defer file.Close()
+
+	s := bufio.NewScanner(file)
+	if !s.Scan() {
+		if err := s.Err(); err != nil {
+			return 0, err
+		}
+		return 0, fmt.Errorf("invalid time: %s is empty", filepath)
+	}
+
+	version, err := strconv.Atoi(strings.TrimSpace(s.Text()))
+	if err != nil {
+		return 0, fmt.Errorf("invalid time %q: %w", s.Text(), err)
+	}
+	return version, nil
+}
+
+// compareFiles scores the overlap of two files from 0 to 1: the distinct lines found
+// in both files divided by the distinct lines found in either. A line repeated
+// within one file counts once, and two files without any lines score 0, not NaN.
+func compareFiles(fcc fileContentsCache, f1Number, f2Number int) (float64, error) {
+	f1, err := fcc.getFileContents(f1Number)
+	if err != nil {
+		return 0, fmt.Errorf("file %d: %w", f1Number, err)
+	}
+	f2, err := fcc.getFileContents(f2Number)
+	if err != nil {
+		return 0, fmt.Errorf("file %d: %w", f2Number, err)
+	}
+	seen := make(map[string]uint8, len(f1)+len(f2)) // bit 0: line is in f1, bit 1: in f2
+	for _, line := range f1 {
+		seen[line] = 1
+	}
+	var overlap int
+	for _, line := range f2 {
+		if seen[line] == 1 { // in f1 and not yet seen in f2: a new shared line
+			overlap++
+		}
+		seen[line] |= 2
+	}
+	if len(seen) == 0 {
+		return 0, nil
+	}
+	return float64(overlap) / float64(len(seen)), nil
+}
+
+// fileContentsCache is a cache of file contents (one string per line), keyed by file number, to
+// avoid reading the same file from disk multiple times. It is a plain map with no locking of its own.
+type fileContentsCache map[int][]string
+
+// fileContentsCacheMu serializes the map accesses made by getFileContents, which run reaches from many goroutines.
+var fileContentsCacheMu sync.Mutex
+
+// getFileContents returns the lines of a file, excluding the first timestamp
+// line. Cached contents are returned directly; otherwise the file is read from
+// disk, closed again, and its lines are stored in the cache for later calls.
+func (fcc fileContentsCache) getFileContents(fileNumber int) ([]string, error) {
+	fileContentsCacheMu.Lock()
+	contents, ok := fcc[fileNumber]
+	fileContentsCacheMu.Unlock()
+	if ok {
+		return contents, nil
+	}
+
+	f, err := os.Open(makeFilePath(fileNumber))
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	s := bufio.NewScanner(f)
+	s.Scan() // Ignore first line that's just a timestamp; a failure shows up in s.Err below.
+	lines := []string{}
+	for s.Scan() {
+		lines = append(lines, s.Text())
+	}
+	if err := s.Err(); err != nil {
+		return nil, err
+	}
+	fileContentsCacheMu.Lock()
+	fcc[fileNumber] = lines
+	fileContentsCacheMu.Unlock()
+	return lines, nil
+}
+
+// clearFilesExcept deletes every cache entry whose file number does not appear
+// in fileNumbers, keeping only the listed files. Dropping contents that are no
+// longer of interest conserves memory as processing moves forward in time.
+func (fcc fileContentsCache) clearFilesExcept(fileNumbers []int) {
+	for cached := range fcc {
+		if slices.Contains(fileNumbers, cached) {
+			continue
+		}
+		delete(fcc, cached)
+	}
+}
+
+func makeFileName(number int) string { // name of the pool file with this number
+	return fmt.Sprintf("%d.txt", number) // decimal file number followed by ".txt"
+}
+
+func makeFilePath(number int) string { // location of that file inside DataFilePath
+	return path.Join(DataFilePath, makeFileName(number)) // package path joins with "/" on every platform
+}
+
+// orderFiles determines the timestamp version of each file and creates a map of
+// time to file numbers. It sorts the times (since maps are not ordered) so that
+// the map can be iterated in order of time. This allows stepping through the
+// history of the files from the beginning. Using this, we can construct a
+// "chain" of evolution for a given document.
+//
+// Only non-directory entries of DataFilePath whose names end in ".txt" are
+// considered. A ".txt" name that is not a number, or a file whose time cannot
+// be read, aborts the scan with an error that names the offending file. It
+// returns the time-to-file-numbers map and the times in ascending order.
+func orderFiles() (map[int][]int, []int, error) {
+	timeMap := make(map[int][]int)
+
+	dirEntries, err := os.ReadDir(DataFilePath)
+	if err != nil {
+		return nil, nil, fmt.Errorf("reading directory %s: %w", DataFilePath, err)
+	}
+	for _, entry := range dirEntries {
+		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") {
+			continue
+		}
+
+		// Get the numeric representation of the file number.
+		fileNumber, err := strconv.Atoi(strings.TrimSuffix(entry.Name(), ".txt"))
+		if err != nil {
+			return nil, nil, fmt.Errorf("invalid file number in file name %q: %w", entry.Name(), err)
+		}
+
+		filePath := path.Join(DataFilePath, entry.Name())
+		modTime, err := readFileTime(filePath)
+		if err != nil {
+			return nil, nil, fmt.Errorf("reading time of file %s: %w", filePath, err)
+		}
+
+		// append allocates the slice on first use, so no nil check is needed.
+		timeMap[modTime] = append(timeMap[modTime], fileNumber)
+	}
+
+	// Now make a slice of the times and sort them, so we can iterate through
+	// them in order.
+	timeSlice := make([]int, 0, len(timeMap))
+	for k := range timeMap {
+		timeSlice = append(timeSlice, k)
+	}
+	slices.Sort(timeSlice)
+	return timeMap, timeSlice, nil
+}
+
+func log(msg string, args ...any) { // forwards msg and args to slog.Info, but only when Verbose is set
+	if Verbose {
+		slog.Info(msg, args...)
+	}
+}