Allow file-based output and add some cosmetics

Allow the important data to be explicitly written to a file via a
command line switch. The default is still stdout, and redirecting
output will still only redirect the important data to the file, ignoring
summary data on stderr.

Add status during runtime and summary upon completion, for a better user
experience.
This commit is contained in:
Ian Molee 2024-04-05 04:57:29 -07:00
parent c8c2d9a9e0
commit e11464082b
1 changed files with 99 additions and 7 deletions

102
main.go
View File

@ -13,6 +13,7 @@ import (
"flag" "flag"
"fmt" "fmt"
"log/slog" "log/slog"
"math"
"os" "os"
"path" "path"
"runtime" "runtime"
@ -21,6 +22,7 @@ import (
"strings" "strings"
"sync" "sync"
"sync/atomic" "sync/atomic"
"time"
) )
const ( const (
@ -38,37 +40,72 @@ const (
var ( var (
dataFilePath string dataFilePath string
similarityThreshold float64 similarityThreshold float64
outputFile string
useDocPrefix bool useDocPrefix bool
verbose bool verbose bool
numWorkers int numWorkers int
) )
func main() { func main() {
documents, err := run(os.Args) var (
start = time.Now()
padding = "\n"
)
documents, output, err := run(os.Args)
if output != os.Stdout {
defer output.Close()
padding = ""
}
if err != nil { if err != nil {
fmt.Fprintf(os.Stderr, "error: %v\n", err) fmt.Fprintf(os.Stderr, "error: %v\n", err)
os.Exit(-1) os.Exit(-1)
} }
duration := time.Since(start)
v := make(Visualizer)
for _, doc := range documents { for _, doc := range documents {
fmt.Println(doc) fmt.Fprintln(output, doc)
v.Add(*doc)
} }
fmt.Fprintf(
os.Stderr,
"%s%d documents identified in %s\n\n%s\n",
padding,
len(documents),
duration.Truncate(time.Millisecond),
v.Render("Distribution for Number of Associated Files per Document", 60),
)
} }
// run is the main entry point for the program. // run is the main entry point for the program.
func run(args []string) ([]*Document, error) { func run(args []string) ([]*Document, *os.File, error) {
flags := flag.NewFlagSet(args[0], flag.ExitOnError) flags := flag.NewFlagSet(args[0], flag.ExitOnError)
flags.StringVar(&dataFilePath, "path", defaultDataFilePath, "path to the file pool") flags.StringVar(&dataFilePath, "path", defaultDataFilePath, "path to the file pool")
flags.Float64Var(&similarityThreshold, "threshold", defaultSimilarityThreshold, "similarity threshold") flags.Float64Var(&similarityThreshold, "threshold", defaultSimilarityThreshold, "similarity threshold")
flags.StringVar(&outputFile, "output", "", "output file (default is stdout)")
flags.IntVar(&numWorkers, "workers", runtime.NumCPU()*2, "number of workers to use") flags.IntVar(&numWorkers, "workers", runtime.NumCPU()*2, "number of workers to use")
flags.BoolVar(&useDocPrefix, "prefix", false, "use '[doc ###]' prefix for output") flags.BoolVar(&useDocPrefix, "prefix", false, "use '[doc ###]' prefix for output")
flags.BoolVar(&verbose, "verbose", false, "enable verbose logging") flags.BoolVar(&verbose, "verbose", false, "enable verbose logging")
_ = flags.Parse(args[1:]) _ = flags.Parse(args[1:])
output := os.Stdout
if outputFile != "" {
var err error
output, err = os.Create(outputFile)
if err != nil {
return nil, nil, fmt.Errorf("creating output file: %w", err)
}
}
shouldReportStatus := !verbose && output != os.Stdout
if shouldReportStatus {
defer fmt.Fprintln(os.Stderr)
}
// The files need to be processed in order of time, so determine the // The files need to be processed in order of time, so determine the
// timestamp of each file and sort them by time. // timestamp of each file and sort them by time.
fileTimes, times, err := orderFiles(dataFilePath) fileTimes, times, err := orderFiles(dataFilePath)
if err != nil { if err != nil {
return nil, err return nil, nil, err
} }
dm := NewDocumentManager(dataFilePath, similarityThreshold, numWorkers) dm := NewDocumentManager(dataFilePath, similarityThreshold, numWorkers)
@ -90,6 +127,9 @@ func run(args []string) ([]*Document, error) {
) )
log("processing timestamp", "timestamp", timestamp, "timestampIndex", i, "totalTimestamps", len(times)) log("processing timestamp", "timestamp", timestamp, "timestampIndex", i, "totalTimestamps", len(times))
if shouldReportStatus {
fmt.Fprintf(os.Stderr, "\rProcessing timestamp %d of %d...", i+1, len(times))
}
for i, doc := range dm.Documents { for i, doc := range dm.Documents {
wg.Add(1) wg.Add(1)
dm.WorkCh <- WorkItem{ dm.WorkCh <- WorkItem{
@ -129,7 +169,7 @@ func run(args []string) ([]*Document, error) {
} }
dm.Shutdown() dm.Shutdown()
return dm.SortedDocuments(), nil return dm.SortedDocuments(), output, nil
} }
// WorkItem is what will be sent to the workers in the worker pool. // WorkItem is what will be sent to the workers in the worker pool.
@ -478,6 +518,58 @@ func orderFiles(dir string) (map[int][]int, []int, error) {
return timeMap, timeSlice, nil return timeMap, timeSlice, nil
} }
// Visualizer is a utility to provide insight into the shape of the data
// processed. It is a histogram: each key is a number of associated files, and
// each value is the number of documents added with that many associated files.
type Visualizer map[int]int

// Add records d in the histogram by incrementing the bucket that corresponds
// to its number of associated files.
func (v Visualizer) Add(d Document) {
	v[len(d.AssociatedFiles)]++
}

// Render returns the histogram as a text bar chart headed by title and a rule
// of width '=' characters. There is one row per bucket, ordered by descending
// document count; buckets with equal counts are ordered by ascending number of
// associated files so the output is deterministic. The bucket with the most
// documents gets a bar of width-12 characters and the other bars are scaled
// relative to it, rounding up. Each row ends with the bucket's share of all
// documents as a percentage.
//
// Render returns the empty string when no documents have been recorded. A
// width too small to fit any bar yields empty bars rather than a panic. The
// result has no trailing newline.
func (v Visualizer) Render(title string, width int) string {
	type bucket struct {
		numAssocFiles int
		numDocs       int
	}

	var (
		buckets   = make([]bucket, 0, len(v))
		totalDocs int
	)
	for numAssocFiles, numDocs := range v {
		buckets = append(buckets, bucket{numAssocFiles: numAssocFiles, numDocs: numDocs})
		totalDocs += numDocs
	}
	// Covers both an empty map and a map holding only zero counts; the latter
	// would otherwise divide by zero below.
	if totalDocs <= 0 {
		return ""
	}

	// Map iteration order is random and SortFunc is not stable, so ties must
	// be broken explicitly to keep the chart reproducible.
	slices.SortFunc(buckets, func(a, b bucket) int {
		switch {
		case a.numDocs > b.numDocs:
			return -1
		case a.numDocs < b.numDocs:
			return 1
		case a.numAssocFiles < b.numAssocFiles:
			return -1
		case a.numAssocFiles > b.numAssocFiles:
			return 1
		}
		return 0
	})

	// strings.Repeat panics on a negative count, so clamp both widths.
	if width < 0 {
		width = 0
	}
	barWidth := width - 12 // room left after the label and percentage columns
	if barWidth < 0 {
		barWidth = 0
	}

	var sb strings.Builder
	sb.WriteString(title)
	sb.WriteRune('\n')
	sb.WriteString(strings.Repeat("=", width))
	sb.WriteRune('\n')

	// totalDocs > 0 guarantees the largest bucket, now first, is positive.
	maxDocs := float64(buckets[0].numDocs)
	for _, b := range buckets {
		// Scale directly against the largest bucket. Multiplying before
		// dividing keeps the largest bar at exactly barWidth instead of
		// letting accumulated rounding push Ceil one character too far.
		numChars := int(math.Ceil(float64(b.numDocs) * float64(barWidth) / maxDocs))
		if numChars < 0 {
			numChars = 0
		}
		fmt.Fprintf(
			&sb,
			"%3d | %s (%.0f%%)\n",
			b.numAssocFiles,
			strings.Repeat("*", numChars),
			float64(b.numDocs)*100/float64(totalDocs),
		)
	}

	return strings.TrimSuffix(sb.String(), "\n")
}
// makeFileName returns the file name for the given number: its decimal
// representation followed by the ".txt" extension.
func makeFileName(number int) string {
	const extension = ".txt"
	return fmt.Sprint(number) + extension
}