initial commit
This commit is contained in:
commit
5f1a8bc256
|
|
@ -0,0 +1 @@
|
|||
*.log
|
||||
Binary file not shown.
|
|
@ -0,0 +1,5 @@
|
|||
module github.com/ianfoo/steelray-docgrouper
|
||||
|
||||
go 1.22.1
|
||||
|
||||
require github.com/adrg/strutil v0.3.1
|
||||
|
|
@ -0,0 +1,18 @@
|
|||
github.com/adrg/strutil v0.3.1 h1:OLvSS7CSJO8lBii4YmBt8jiK9QOtB9CzCzwl4Ic/Fz4=
|
||||
github.com/adrg/strutil v0.3.1/go.mod h1:8h90y18QLrs11IBffcGX3NW/GFBXCMcNg4M7H6MspPA=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
||||
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
|
||||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
|
||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
|
|
@ -0,0 +1,373 @@
|
|||
// Timesheet
|
||||
// Previously ~1h
|
||||
// March 13, 2024: 00:00-02:30
|
||||
// March 19, 2024: 15:00-19:00
|
||||
// March 23, 2024: 20:00-
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
// DefaultDataFilePath is the default directory where the file pool can be
// found, relative to the working directory. Override with the -path flag.
const DefaultDataFilePath = "files"

// Command-line options, populated by flag parsing in run.
var (
	// DataFilePath is the directory containing the numbered .txt files (-path).
	DataFilePath string
	// UseDocPrefix enables the "[doc ###]" prefix on each output line (-prefix).
	UseDocPrefix bool
	// Verbose enables informational logging via slog (-verbose).
	Verbose bool
)
|
||||
|
||||
func main() {
|
||||
if err := run(os.Args); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
// run is the main entry point for the program.
//
// It parses command-line flags, orders the file pool by timestamp, then walks
// the timestamps in ascending order. At each timestamp, every existing
// document (one goroutine per document) looks for a file whose contents
// sufficiently overlap the document's most recently associated file; a match
// is associated with that document. Files left unmatched at a timestamp
// become new documents. Finally all documents are printed, ordered by their
// first associated file, with each document's files sorted ascending.
//
// Returns an error if the file pool cannot be read or parsed.
func run(args []string) error {
	flags := flag.NewFlagSet(args[0], flag.ExitOnError)
	flags.StringVar(&DataFilePath, "path", DefaultDataFilePath, "path to the file pool")
	flags.BoolVar(&UseDocPrefix, "prefix", false, "use '[doc ###]' prefix for output")
	flags.BoolVar(&Verbose, "verbose", false, "enable verbose logging")
	// ExitOnError makes Parse terminate the process on bad flags, so its
	// error return needs no handling here.
	flags.Parse(args[1:])

	// SimilarityThreshold is the minimum similarity required for two files
	// to be considered related. This value is arbitrary and could be adjusted
	// based on the specific requirements of the problem.
	const SimilarityThreshold = 0.5

	// fileTimes maps timestamp -> file numbers carrying that timestamp;
	// times is the sorted list of timestamps for in-order iteration.
	fileTimes, times, err := orderFiles()
	if err != nil {
		return err
	}

	var (
		// documents is the master list of documents that will be built up.
		documents []*Document

		// fcc handles reading files and caching contents.
		//
		// NOTE(review): fcc is shared by the per-document goroutines below,
		// and Go map access is not safe for concurrent use — confirm whether
		// this needs a mutex or per-goroutine caches.
		fcc = make(fileContentsCache)
	)

	for i, timestamp := range times {
		_ = i // kept for the (currently disabled) progress line below
		// fmt.Printf("\rProcessing timestamp %d/%d", i+1, len(fileTimes))

		// Track the files at this timestamp that have been associated with documents, so
		// we can identify unassociated files later and then create new documents for them.
		var (
			wg              sync.WaitGroup
			associatedFiles sync.Map
		)

		wg.Add(len(documents))
		log("processing timestamp", "timestamp", timestamp, "numWorkers", len(documents))
		for _, doc := range documents {
			// Start a goroutine for each document, to parallelize the
			// comparison with the files in the current timestamp. A more robust
			// solution would limit the number of concurrent goroutines to avoid
			// exhausting system resources, but for this problem we won't have
			// more than a couple thousand documents. Goroutines are
			// lightweight enough (2K stack) that we can start them pretty
			// capriciously.
			go func(doc *Document, files []int) {
				defer wg.Done()
				for _, candidateFileNumber := range files {
					// Check to be certain this file hasn't been associated with another
					// document already. If it has been, continue to the next file.
					//
					// NOTE(review): Load here followed by Store below is not
					// atomic — two documents can both pass this check and
					// claim the same file. sync.Map.LoadOrStore would make
					// the claim atomic; confirm whether the occasional
					// double-association is acceptable.
					if _, ok := associatedFiles.Load(candidateFileNumber); ok {
						continue
					}

					// Compare the candidate against this document's most
					// recent file only; earlier revisions are assumed to have
					// drifted further from the candidate.
					latestFileNumber := doc.LatestAssociatedFile()
					overlap, err := compareFiles(fcc, latestFileNumber, candidateFileNumber)
					if err != nil {
						// Best-effort: report the failure and keep going;
						// overlap is 0 in this case, so the file is simply
						// not associated via this comparison.
						fmt.Fprintf(
							os.Stderr,
							"error comparing files %d and %d: %v\n",
							latestFileNumber, candidateFileNumber, err,
						)
					}
					if overlap >= SimilarityThreshold {
						// Add file to Document associated list
						doc.AssociateFile(candidateFileNumber, timestamp)
						associatedFiles.Store(candidateFileNumber, struct{}{})

						// We know this document won't be associated with any other files
						// with this timestamp, so we can stop looking at files with this
						// timestamp, for this document.
						return
					}
				}
			}(doc, fileTimes[timestamp])
		}

		// Wait for all document comparisons to complete for this timestamp.
		wg.Wait()

		// If we haven't associated all the files with existing documents, we need
		// to create new documents for those that remain.
		currentNumDocs := len(documents)
		for _, fileNumber := range fileTimes[timestamp] {
			if _, ok := associatedFiles.Load(fileNumber); !ok {
				doc := NewDocument(fileNumber, timestamp)
				documents = append(documents, &doc)
			}
		}
		if len(documents) > currentNumDocs {
			log("created new documents", "numAdded", len(documents)-currentNumDocs, "timestamp", timestamp)
		}

		// Now we can clear the cache of file contents for files that aren't associated with
		// a document, to conserve memory. Only each document's latest file can
		// ever be compared again, so everything else is droppable.
		var latestDocumentFiles []int
		for _, doc := range documents {
			latestDocumentFiles = append(latestDocumentFiles, doc.LatestAssociatedFile())
		}
		fcc.clearFilesExcept(latestDocumentFiles)
	}

	// Output the list of documents, showing their associated files in ascending order.
	// Order the documents by their first associated file.
	slices.SortFunc(documents, func(a, b *Document) int {
		return a.AssociatedFiles[0] - b.AssociatedFiles[0]
	})
	for _, doc := range documents {
		doc.SortAssociatedFiles()
		fmt.Println(doc)
	}

	return nil
}
|
||||
|
||||
// DocumentIDSource is a concurrency-safe source from which to identify
// documents. This could easily be something other than an integer, but using
// this allows us to just use the standard library.
var DocumentIDSource atomic.Uint32

// Document stores a document ID and a list of associated files.
type Document struct {
	// AssociatedFiles holds the numbers of the files associated with this
	// document, in association order until SortAssociatedFiles reorders
	// them ascending for output.
	AssociatedFiles []int
	// LatestTimestamp is the timestamp of the most recently associated file.
	LatestTimestamp int
	// ID uniquely identifies the document; drawn from DocumentIDSource.
	ID uint32
}
|
||||
|
||||
// String formats a document for output in the format described in the
|
||||
// requirements.
|
||||
func (d Document) String() string {
|
||||
var sb strings.Builder
|
||||
for _, f := range d.AssociatedFiles {
|
||||
sb.WriteString(fmt.Sprintf("%d ", f))
|
||||
}
|
||||
if UseDocPrefix {
|
||||
return fmt.Sprintf("[doc %4d] %s", d.ID, sb.String())
|
||||
}
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
// AssociateFile adds a file number to the list of associated files for a
|
||||
// document, and also records the latest timestamp now associated with the
|
||||
// document.
|
||||
func (d *Document) AssociateFile(fileNumber, timestamp int) {
|
||||
d.AssociatedFiles = append(d.AssociatedFiles, fileNumber)
|
||||
d.LatestTimestamp = timestamp
|
||||
}
|
||||
|
||||
// LatestAssociatedFile returns the most recent file associated with a document.
|
||||
func (d Document) LatestAssociatedFile() int {
|
||||
return d.AssociatedFiles[len(d.AssociatedFiles)-1]
|
||||
}
|
||||
|
||||
// SortAssociatedFiles sorts the list of associated files for a document,
|
||||
// since the requirements stipulate output in ascending order.
|
||||
func (d *Document) SortAssociatedFiles() {
|
||||
slices.Sort(d.AssociatedFiles)
|
||||
}
|
||||
|
||||
// NewDocument creates a new Document struct and initializes an ID and records
|
||||
// the first file and timestamp associated with it.
|
||||
func NewDocument(fileNumber, timestamp int) Document {
|
||||
return Document{
|
||||
ID: DocumentIDSource.Add(1),
|
||||
LatestTimestamp: timestamp,
|
||||
AssociatedFiles: []int{fileNumber},
|
||||
}
|
||||
}
|
||||
|
||||
// readFileTime reads the first line of the file, which represents a
|
||||
// time/version. The integer value will be returned.
|
||||
func readFileTime(filepath string) (int, error) {
|
||||
file, err := os.Open(filepath)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
s := bufio.NewScanner(file)
|
||||
var firstLine string
|
||||
if s.Scan() {
|
||||
firstLine = s.Text()
|
||||
}
|
||||
if err := s.Err(); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
time, err := strconv.Atoi(firstLine)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("invalid time %s: %w", firstLine, err)
|
||||
}
|
||||
return time, nil
|
||||
}
|
||||
|
||||
// compareFiles computes how much two files overlap, on a scale
|
||||
// of 0 to 1 by iterating through the files and identifying lines
|
||||
// that are duplicated.
|
||||
func compareFiles(fcc fileContentsCache, f1Number, f2Number int) (float64, error) {
|
||||
f1, err := fcc.getFileContents(f1Number)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("file %d: %w", f1Number, err)
|
||||
}
|
||||
f2, err := fcc.getFileContents(f2Number)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("file %d: %w", f2Number, err)
|
||||
}
|
||||
|
||||
histogram := make(map[string]int)
|
||||
for _, lines := range [][]string{f1, f2} {
|
||||
for _, line := range lines {
|
||||
histogram[line]++
|
||||
}
|
||||
}
|
||||
|
||||
var overlap int
|
||||
for _, v := range histogram {
|
||||
if v == 2 {
|
||||
overlap++
|
||||
}
|
||||
}
|
||||
return float64(overlap) / float64(len(histogram)), nil
|
||||
}
|
||||
|
||||
// fileContentsCache is a cache of file contents (one string per line,
// excluding the leading timestamp line), keyed by file number, to avoid
// reading the same file from disk multiple times.
//
// NOTE(review): a plain map is not safe for concurrent use; run shares one
// cache across its per-document goroutines via compareFiles — confirm
// whether access needs external synchronization.
type fileContentsCache map[int][]string
|
||||
|
||||
// getFileContents returns the contents of a file, excluding the first timestamp
|
||||
// line. If the file is already in the cache, the contents are returned from
|
||||
// there, otherwise the file is read from disk and the contents are cached.
|
||||
func (fcc fileContentsCache) getFileContents(fileNumber int) ([]string, error) {
|
||||
if contents, ok := fcc[fileNumber]; ok {
|
||||
return contents, nil
|
||||
}
|
||||
var (
|
||||
fileName = makeFilePath(fileNumber)
|
||||
lines []string
|
||||
)
|
||||
|
||||
f, err := os.Open(fileName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
s := bufio.NewScanner(f)
|
||||
|
||||
// Ignore first line that's just a timestamp.
|
||||
if !s.Scan() {
|
||||
fcc[fileNumber] = []string{}
|
||||
return []string{}, nil
|
||||
}
|
||||
|
||||
for s.Scan() {
|
||||
lines = append(lines, s.Text())
|
||||
}
|
||||
if err := s.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return lines, nil
|
||||
}
|
||||
|
||||
// clearFilesExcept removes the contents of the fileContentsCache except for the
|
||||
// provided file numbers. This helps conserve memory by removing the contents of
|
||||
// files that are no longer of interest, which we can be sure of since we are
|
||||
// proceeding in order of time.
|
||||
func (fcc fileContentsCache) clearFilesExcept(fileNumbers []int) {
|
||||
for fNum := range fcc {
|
||||
if !slices.Contains(fileNumbers, fNum) {
|
||||
delete(fcc, fNum)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// makeFileName returns the on-disk name for a file number, e.g. 7 -> "7.txt".
func makeFileName(number int) string {
	return strconv.Itoa(number) + ".txt"
}
|
||||
|
||||
func makeFilePath(number int) string {
|
||||
return path.Join(DataFilePath, makeFileName(number))
|
||||
}
|
||||
|
||||
// orderFiles determines the timestamp version of each file and creates a map of
|
||||
// time to file numbers. It sorts the times (since maps are not ordered) so that
|
||||
// the map can be iterated in order of time. This allows stepping through the
|
||||
// history of the files from the beginning. Using this, we can construct a
|
||||
// "chain" of evolution for a given document.
|
||||
func orderFiles() (map[int][]int, []int, error) {
|
||||
timeMap := make(map[int][]int)
|
||||
|
||||
dirEntries, err := os.ReadDir(DataFilePath)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("reading directory %s: %w", DataFilePath, err)
|
||||
}
|
||||
for _, entry := range dirEntries {
|
||||
if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") {
|
||||
continue
|
||||
}
|
||||
|
||||
// Get the numeric representation of the file number.
|
||||
var fileNumber int
|
||||
{
|
||||
numberStr := strings.TrimSuffix(entry.Name(), ".txt")
|
||||
var err error
|
||||
if fileNumber, err = strconv.Atoi(numberStr); err != nil {
|
||||
return nil, nil, fmt.Errorf("invalid file number in file name %q: %w", entry.Name(), err)
|
||||
}
|
||||
}
|
||||
|
||||
filePath := path.Join(DataFilePath, entry.Name())
|
||||
modTime, err := readFileTime(filePath)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
if timeMap[modTime] == nil {
|
||||
timeMap[modTime] = make([]int, 0)
|
||||
}
|
||||
timeMap[modTime] = append(timeMap[modTime], fileNumber)
|
||||
}
|
||||
|
||||
// Now make a slice of the times and sort them, so we can iterate through
|
||||
// them in order.
|
||||
timeSlice := make([]int, 0, len(timeMap))
|
||||
for k := range timeMap {
|
||||
timeSlice = append(timeSlice, k)
|
||||
}
|
||||
slices.Sort(timeSlice)
|
||||
return timeMap, timeSlice, nil
|
||||
}
|
||||
|
||||
func log(msg string, args ...any) {
|
||||
if Verbose {
|
||||
slog.Info(msg, args...)
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue