// docgrouper/main.go
// Timesheet
// Previously ~1h
// March 13, 2024: 00:00-02:30
// March 19, 2024: 15:00-19:00
// March 23, 2024: 20:00-
package main
import (
"bufio"
"flag"
"fmt"
"log/slog"
"os"
"path"
"slices"
"strconv"
"strings"
"sync"
"sync/atomic"
)
// DefaultDataFilePath describes the default location where the file pool can be found.
// It is overridable at runtime via the -path flag (see run).
const DefaultDataFilePath = "files"
// Command line options, populated by the flag set in run.
var (
	// DataFilePath is the directory holding the numbered pool files (-path).
	DataFilePath string
	// UseDocPrefix enables the "[doc ###]" prefix on output lines (-prefix).
	UseDocPrefix bool
	// Verbose enables slog output through the log helper (-verbose).
	Verbose bool
)
// main delegates all work to run and translates a non-nil error into a
// message on stderr plus a non-zero exit status.
func main() {
	err := run(os.Args)
	if err == nil {
		return
	}
	fmt.Fprintf(os.Stderr, "error: %v\n", err)
	os.Exit(1)
}
// run is the main entry point for the program.
//
// It parses the command-line flags, orders the pool files by the timestamp
// embedded in each file's first line, and walks the timestamps in ascending
// order. At each timestamp, every existing document (one goroutine per
// document) compares its most recently associated file against the files
// carrying that timestamp and claims the first one whose line overlap reaches
// SimilarityThreshold. Files left unclaimed seed new documents. Finally all
// documents are printed, ordered by their seed file, each with its file list
// sorted ascending.
//
// Returns any error from reading the file pool; per-pair comparison errors
// inside worker goroutines are only reported to stderr.
func run(args []string) error {
	flags := flag.NewFlagSet(args[0], flag.ExitOnError)
	flags.StringVar(&DataFilePath, "path", DefaultDataFilePath, "path to the file pool")
	flags.BoolVar(&UseDocPrefix, "prefix", false, "use '[doc ###]' prefix for output")
	flags.BoolVar(&Verbose, "verbose", false, "enable verbose logging")
	flags.Parse(args[1:])
	// SimilarityThreshold is the minimum similarity required for two files
	// to be considered related. This value is arbitrary and could be adjusted
	// based on the specific requirements of the problem.
	const SimilarityThreshold = 0.5
	fileTimes, times, err := orderFiles()
	if err != nil {
		return err
	}
	var (
		// documents is the master list of documents that will be built up.
		documents []*Document
		// fcc handles reading files and caching contents.
		fcc = make(fileContentsCache)
	)
	for i, timestamp := range times {
		// i is only needed by the progress line below when re-enabled.
		_ = i
		// fmt.Printf("\rProcessing timestamp %d/%d", i+1, len(fileTimes))
		// Track the files at this timestamp that have been associated with documents, so
		// we can identify unassociated files later and then create new documents for them.
		var (
			wg sync.WaitGroup
			associatedFiles sync.Map
		)
		wg.Add(len(documents))
		log("processing timestamp", "timestamp", timestamp, "numWorkers", len(documents))
		for _, doc := range documents {
			// Start a goroutine for each document, to parallelize the
			// comparison with the files in the current timestamp. A more robust
			// solution would limit the number of concurrent goroutines to avoid
			// exhausting system resources, but for this problem we won't have
			// more than a couple thousand documents. Goroutines are
			// lightweight enough (2K stack) that we can start them pretty
			// capriciously.
			go func(doc *Document, files []int) {
				defer wg.Done()
				for _, candidateFileNumber := range files {
					// Check to be certain this file hasn't been associated with another
					// document already. If it has been, continue to the next file.
					//
					// NOTE(review): this Load and the Store below are not one
					// atomic step, so two documents could in principle both pass
					// this check and claim the same file. Confirm that window is
					// acceptable, or claim with LoadOrStore instead.
					if _, ok := associatedFiles.Load(candidateFileNumber); ok {
						continue
					}
					latestFileNumber := doc.LatestAssociatedFile()
					overlap, err := compareFiles(fcc, latestFileNumber, candidateFileNumber)
					if err != nil {
						// A comparison failure is reported but not fatal: overlap
						// is 0 in that case, so this candidate simply isn't claimed.
						fmt.Fprintf(
							os.Stderr,
							"error comparing files %d and %d: %v\n",
							latestFileNumber, candidateFileNumber, err,
						)
					}
					if overlap >= SimilarityThreshold {
						// Add file to Document associated list
						doc.AssociateFile(candidateFileNumber, timestamp)
						associatedFiles.Store(candidateFileNumber, struct{}{})
						// We know this document won't be associated with any other files
						// with this timestamp, so we can stop looking at files with this
						// timestamp, for this document.
						return
					}
				}
			}(doc, fileTimes[timestamp])
		}
		// Wait for all document comparisons to complete for this timestamp.
		wg.Wait()
		// If we haven't associated all the files with existing documents, we need
		// to create new documents for those that remain.
		currentNumDocs := len(documents)
		for _, fileNumber := range fileTimes[timestamp] {
			if _, ok := associatedFiles.Load(fileNumber); !ok {
				doc := NewDocument(fileNumber, timestamp)
				documents = append(documents, &doc)
			}
		}
		if len(documents) > currentNumDocs {
			log("created new documents", "numAdded", len(documents)-currentNumDocs, "timestamp", timestamp)
		}
		// Now we can clear the cache of file contents for files that aren't associated with
		// a document, to conserve memory.
		var latestDocumentFiles []int
		for _, doc := range documents {
			latestDocumentFiles = append(latestDocumentFiles, doc.LatestAssociatedFile())
		}
		fcc.clearFilesExcept(latestDocumentFiles)
	}
	// Output the list of documents, showing their associated files in ascending order.
	// Order the documents by their first associated file.
	//
	// Note: AssociatedFiles[0] here is the chronologically first (seed) file of
	// each document, since the per-document ascending sort happens only below.
	slices.SortFunc(documents, func(a, b *Document) int {
		return a.AssociatedFiles[0] - b.AssociatedFiles[0]
	})
	for _, doc := range documents {
		doc.SortAssociatedFiles()
		fmt.Println(doc)
	}
	return nil
}
// DocumentIDSource is a concurrency-safe source from which to identify
// documents. This could easily be something other than an integer, but using
// this allows us to just use the standard library. The zero value is ready to
// use; NewDocument draws IDs starting at 1 via Add(1).
var DocumentIDSource atomic.Uint32
// Document stores a document ID and a list of associated files.
type Document struct {
	// AssociatedFiles holds the file numbers claimed by this document, in
	// association (chronological) order until SortAssociatedFiles is called.
	AssociatedFiles []int
	// LatestTimestamp is the timestamp passed to the most recent
	// AssociateFile call (or to NewDocument).
	LatestTimestamp int
	// ID uniquely identifies the document; drawn from DocumentIDSource.
	ID uint32
}
// String formats a document for output in the format described in the
// requirements: the associated file numbers each followed by a single space,
// optionally prefixed with "[doc ###]" when -prefix is set.
//
// Note: a trailing space after the last file number is intentional and
// matches the original output format.
func (d Document) String() string {
	var sb strings.Builder
	for _, f := range d.AssociatedFiles {
		// strconv.Itoa avoids the per-element boxing/reflection cost of
		// fmt.Sprintf in what can be a long list.
		sb.WriteString(strconv.Itoa(f))
		sb.WriteByte(' ')
	}
	if UseDocPrefix {
		return fmt.Sprintf("[doc %4d] %s", d.ID, sb.String())
	}
	return sb.String()
}
// AssociateFile records fileNumber as the newest member of the document's
// associated-file list and remembers timestamp as the document's most
// recent activity.
func (d *Document) AssociateFile(fileNumber, timestamp int) {
	d.LatestTimestamp = timestamp
	d.AssociatedFiles = append(d.AssociatedFiles, fileNumber)
}
// LatestAssociatedFile returns the file number most recently appended to the
// document's associated list.
func (d Document) LatestAssociatedFile() int {
	last := len(d.AssociatedFiles) - 1
	return d.AssociatedFiles[last]
}
// SortAssociatedFiles puts the document's file numbers in ascending order,
// as the required output format stipulates.
func (d *Document) SortAssociatedFiles() {
	slices.Sort(d.AssociatedFiles)
}
// NewDocument builds a Document seeded with its first file/timestamp pair and
// a fresh ID drawn from the process-wide DocumentIDSource.
func NewDocument(fileNumber, timestamp int) Document {
	doc := Document{ID: DocumentIDSource.Add(1)}
	doc.LatestTimestamp = timestamp
	doc.AssociatedFiles = []int{fileNumber}
	return doc
}
// readFileTime reads the first line of the file, which represents a
// time/version. The integer value will be returned.
func readFileTime(filepath string) (int, error) {
file, err := os.Open(filepath)
if err != nil {
return 0, err
}
defer file.Close()
s := bufio.NewScanner(file)
var firstLine string
if s.Scan() {
firstLine = s.Text()
}
if err := s.Err(); err != nil {
return 0, err
}
time, err := strconv.Atoi(firstLine)
if err != nil {
return 0, fmt.Errorf("invalid time %s: %w", firstLine, err)
}
return time, nil
}
// compareFiles computes how much two files overlap, on a scale of 0 to 1,
// by building a histogram over the lines of both files and counting lines
// that appear exactly twice (i.e. once in each file, assuming lines are
// unique within a single file).
//
// NOTE(review): a line occurring twice within ONE file also counts toward
// overlap; confirm the input files contain no duplicate lines.
//
// Read errors from either file are returned wrapped with the file number.
func compareFiles(fcc fileContentsCache, f1Number, f2Number int) (float64, error) {
	f1, err := fcc.getFileContents(f1Number)
	if err != nil {
		return 0, fmt.Errorf("file %d: %w", f1Number, err)
	}
	f2, err := fcc.getFileContents(f2Number)
	if err != nil {
		return 0, fmt.Errorf("file %d: %w", f2Number, err)
	}
	histogram := make(map[string]int, len(f1)+len(f2))
	for _, lines := range [][]string{f1, f2} {
		for _, line := range lines {
			histogram[line]++
		}
	}
	// Two empty files have nothing to compare: report zero overlap rather
	// than dividing by zero below (which would yield NaN).
	if len(histogram) == 0 {
		return 0, nil
	}
	var overlap int
	for _, v := range histogram {
		if v == 2 {
			overlap++
		}
	}
	return float64(overlap) / float64(len(histogram)), nil
}
// fileContentsCache is a cache of file contents, keyed by file number,
// to avoid reading the same file from disk multiple times.
//
// NOTE(review): a plain map is not safe for concurrent use, yet run's worker
// goroutines call getFileContents concurrently via compareFiles — confirm
// whether this access needs a mutex.
type fileContentsCache map[int][]string
// getFileContents returns the contents of a file, excluding the first
// timestamp line. If the file is already in the cache, the contents are
// returned from there, otherwise the file is read from disk and the contents
// are cached.
func (fcc fileContentsCache) getFileContents(fileNumber int) ([]string, error) {
	if contents, ok := fcc[fileNumber]; ok {
		return contents, nil
	}
	fileName := makeFilePath(fileNumber)
	f, err := os.Open(fileName)
	if err != nil {
		return nil, err
	}
	// BUG FIX: the original leaked the file descriptor; close when done.
	defer f.Close()

	s := bufio.NewScanner(f)
	// Ignore first line that's just a timestamp.
	if !s.Scan() {
		fcc[fileNumber] = []string{}
		return []string{}, nil
	}
	var lines []string
	for s.Scan() {
		lines = append(lines, s.Text())
	}
	if err := s.Err(); err != nil {
		return nil, err
	}
	// BUG FIX: the original never stored the result, so every cache miss
	// re-read the file from disk on each call; cache it as documented.
	fcc[fileNumber] = lines
	return lines, nil
}
// clearFilesExcept removes the contents of the fileContentsCache except for
// the provided file numbers. This helps conserve memory by removing the
// contents of files that are no longer of interest, which we can be sure of
// since we are proceeding in order of time.
func (fcc fileContentsCache) clearFilesExcept(fileNumbers []int) {
	// Build a set so the retention check is O(1) per cache entry instead of
	// a linear scan of fileNumbers for every cached file (O(n*m) -> O(n+m)).
	keep := make(map[int]struct{}, len(fileNumbers))
	for _, n := range fileNumbers {
		keep[n] = struct{}{}
	}
	for fNum := range fcc {
		if _, ok := keep[fNum]; !ok {
			delete(fcc, fNum)
		}
	}
}
// makeFileName renders the on-disk name for a numbered pool file ("<n>.txt").
func makeFileName(number int) string {
	return strconv.Itoa(number) + ".txt"
}
func makeFilePath(number int) string {
return path.Join(DataFilePath, makeFileName(number))
}
// orderFiles determines the timestamp version of each file and creates a map
// of time to file numbers. It sorts the times (since maps are not ordered) so
// that the map can be iterated in order of time. This allows stepping through
// the history of the files from the beginning. Using this, we can construct a
// "chain" of evolution for a given document.
//
// Returns the time->file-numbers map, the ascending slice of times, and any
// error from reading the directory or parsing a file name/timestamp.
func orderFiles() (map[int][]int, []int, error) {
	dirEntries, err := os.ReadDir(DataFilePath)
	if err != nil {
		return nil, nil, fmt.Errorf("reading directory %s: %w", DataFilePath, err)
	}
	timeMap := make(map[int][]int)
	for _, entry := range dirEntries {
		// Only numbered "<n>.txt" pool files are of interest.
		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") {
			continue
		}
		// Get the numeric representation of the file number.
		numberStr := strings.TrimSuffix(entry.Name(), ".txt")
		fileNumber, err := strconv.Atoi(numberStr)
		if err != nil {
			return nil, nil, fmt.Errorf("invalid file number in file name %q: %w", entry.Name(), err)
		}
		filePath := path.Join(DataFilePath, entry.Name())
		modTime, err := readFileTime(filePath)
		if err != nil {
			return nil, nil, err
		}
		// append handles a nil map value, so there is no need to
		// pre-initialize the entry with an empty slice as the original did.
		timeMap[modTime] = append(timeMap[modTime], fileNumber)
	}
	// Now make a slice of the times and sort them, so we can iterate through
	// them in order.
	timeSlice := make([]int, 0, len(timeMap))
	for k := range timeMap {
		timeSlice = append(timeSlice, k)
	}
	slices.Sort(timeSlice)
	return timeMap, timeSlice, nil
}
// log emits a structured info record via slog, but only when the -verbose
// flag was set.
func log(msg string, args ...any) {
	if !Verbose {
		return
	}
	slog.Info(msg, args...)
}