diff --git a/main.go b/main.go
index 287307c..7196048 100644
--- a/main.go
+++ b/main.go
@@ -13,6 +13,7 @@ import (
 	"flag"
 	"fmt"
 	"log/slog"
+	"math"
 	"os"
 	"path"
 	"runtime"
@@ -21,6 +22,7 @@ import (
 	"strings"
 	"sync"
 	"sync/atomic"
+	"time"
 )
 
 const (
@@ -38,37 +40,72 @@ const (
 var (
 	dataFilePath        string
 	similarityThreshold float64
+	outputFile          string
 	useDocPrefix        bool
 	verbose             bool
 	numWorkers          int
 )
 
 func main() {
-	documents, err := run(os.Args)
+	var (
+		start   = time.Now()
+		padding = "\n"
+	)
+	documents, output, err := run(os.Args)
+	if output != os.Stdout {
+		defer output.Close()
+		padding = ""
+	}
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "error: %v\n", err)
 		os.Exit(-1)
 	}
+	duration := time.Since(start)
+
+	v := make(Visualizer)
 	for _, doc := range documents {
-		fmt.Println(doc)
+		fmt.Fprintln(output, doc)
+		v.Add(*doc)
 	}
+	fmt.Fprintf(
+		os.Stderr,
+		"%s%d documents identified in %s\n\n%s\n",
+		padding,
+		len(documents),
+		duration.Truncate(time.Millisecond),
+		v.Render("Distribution for Number of Associated Files per Document", 60),
+	)
 }
 
 // run is the main entry point for the program.
-func run(args []string) ([]*Document, error) {
+func run(args []string) ([]*Document, *os.File, error) {
 	flags := flag.NewFlagSet(args[0], flag.ExitOnError)
 	flags.StringVar(&dataFilePath, "path", defaultDataFilePath, "path to the file pool")
 	flags.Float64Var(&similarityThreshold, "threshold", defaultSimilarityThreshold, "similarity threshold")
+	flags.StringVar(&outputFile, "output", "", "output file (default is stdout)")
 	flags.IntVar(&numWorkers, "workers", runtime.NumCPU()*2, "number of workers to use")
 	flags.BoolVar(&useDocPrefix, "prefix", false, "use '[doc ###]' prefix for output")
 	flags.BoolVar(&verbose, "verbose", false, "enable verbose logging")
 	_ = flags.Parse(args[1:])
 
+	output := os.Stdout
+	if outputFile != "" {
+		var err error
+		output, err = os.Create(outputFile)
+		if err != nil {
+			return nil, nil, fmt.Errorf("creating output file: %w", err)
+		}
+	}
+	shouldReportStatus := !verbose && output != os.Stdout
+	if shouldReportStatus {
+		defer fmt.Fprintln(os.Stderr)
+	}
+
 	// The files need to be processed in order of time, so determine the
 	// timestamp of each file and sort them by time.
 	fileTimes, times, err := orderFiles(dataFilePath)
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 
 	dm := NewDocumentManager(dataFilePath, similarityThreshold, numWorkers)
@@ -90,6 +127,9 @@ func run(args []string) ([]*Document, error) {
 		)
 		log("processing timestamp", "timestamp", timestamp, "timestampIndex", i, "totalTimestamps", len(times))
 
+		if shouldReportStatus {
+			fmt.Fprintf(os.Stderr, "\rProcessing timestamp %d of %d...", i+1, len(times))
+		}
 		for i, doc := range dm.Documents {
 			wg.Add(1)
 			dm.WorkCh <- WorkItem{
@@ -129,7 +169,7 @@ func run(args []string) ([]*Document, error) {
 	}
 
 	dm.Shutdown()
-	return dm.SortedDocuments(), nil
+	return dm.SortedDocuments(), output, nil
 }
 
 // WorkItem is what will be sent to the the workers in the worker pool.
@@ -286,8 +326,8 @@ func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error)
 
 	histogram := make(map[string]int)
 	for _, line := range f1 {
-			histogram[line]++
-		}
+		histogram[line]++
+	}
 	for _, line := range f2 {
 		histogram[line]--
 	}
@@ -478,6 +518,58 @@ func orderFiles(dir string) (map[int][]int, []int, error) {
 	return timeMap, timeSlice, nil
 }
 
+// Visualizer is a utility to provide insight into the shape of the data
+// processed.
+type Visualizer map[int]int
+
+func (v Visualizer) Add(d Document) {
+	numAssocFiles := len(d.AssociatedFiles)
+	v[numAssocFiles]++
+}
+
+func (v Visualizer) Render(title string, width int) string {
+	if len(v) == 0 {
+		return ""
+	}
+
+	type pair struct {
+		numAssocFiles           int
+		numDocsWithThisNumFiles int
+	}
+
+	var (
+		slicedMap    []pair
+		totalNumDocs int
+	)
+	for naf, nd := range v {
+		slicedMap = append(slicedMap, pair{numAssocFiles: naf, numDocsWithThisNumFiles: nd})
+		totalNumDocs += nd
+	}
+	slices.SortFunc(slicedMap, func(a, b pair) int {
+		return a.numDocsWithThisNumFiles - b.numDocsWithThisNumFiles
+	})
+	slices.Reverse(slicedMap)
+
+	var sb strings.Builder
+	sb.WriteString(title)
+	sb.WriteRune('\n')
+	sb.WriteString(strings.Repeat("=", width))
+	sb.WriteRune('\n')
+
+	scaleFactor := float64(totalNumDocs) / float64(slicedMap[0].numDocsWithThisNumFiles)
+	for _, p := range slicedMap {
+		ratio := float64(p.numDocsWithThisNumFiles) / float64(totalNumDocs)
+		numChars := int(math.Ceil(ratio * float64(width-12) * scaleFactor))
+		sb.WriteString(fmt.Sprintf(
+			"%3d | %s (%.0f%%)\n",
+			p.numAssocFiles,
+			strings.Repeat("*", numChars),
+			float64(p.numDocsWithThisNumFiles)*100/float64(totalNumDocs),
+		))
+	}
+	return sb.String()[0 : sb.Len()-1]
+}
+
 func makeFileName(number int) string {
 	return fmt.Sprintf("%d.txt", number)
 }
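// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch above: a standalone toy program
// showing how the new Visualizer is fed and how Render sizes its bars.
// "Document" here is a hypothetical stand-in exposing only the AssociatedFiles
// field that Visualizer.Add reads; the real Document type (and Render itself)
// live elsewhere in this repository, and the []int element type is an
// assumption made for this example.
// ---------------------------------------------------------------------------
package main

import (
	"fmt"
	"math"
	"strings"
)

// Document is a minimal stand-in for the repository's Document type.
type Document struct {
	AssociatedFiles []int
}

// Visualizer mirrors the type added above: it maps "number of associated
// files" to "number of documents with that many files".
type Visualizer map[int]int

// Add records one document, keyed by how many files it is associated with.
func (v Visualizer) Add(d Document) {
	v[len(d.AssociatedFiles)]++
}

func main() {
	v := make(Visualizer)

	// Three documents with one associated file each, one document with three.
	v.Add(Document{AssociatedFiles: []int{1}})
	v.Add(Document{AssociatedFiles: []int{2}})
	v.Add(Document{AssociatedFiles: []int{3}})
	v.Add(Document{AssociatedFiles: []int{4, 5, 6}})

	// Render's bar width, ceil((count/total) * (width-12) * (total/maxCount)),
	// reduces to ceil((count/maxCount) * (width-12)) up to floating-point
	// rounding, so the most common bucket always spans width-12 columns and
	// the other buckets scale relative to it. Render also sorts buckets by
	// document count; plain map iteration here is unordered.
	const width = 60
	total, maxCount := 4, 3
	for bucket, count := range v {
		bar := int(math.Ceil(float64(count) / float64(maxCount) * float64(width-12)))
		fmt.Printf("%3d | %s (%.0f%%)\n",
			bucket,
			strings.Repeat("*", bar),
			float64(count)*100/float64(total))
	}
}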