Allow file-based output and add some cosmetics
Allow the important data to be written explicitly to a file via a command-line switch. The default is still stdout, and redirecting stdout still captures only the important data, because the summary data is written to stderr. Also add a progress status during runtime and a summary upon completion, for a better user experience.
This commit is contained in:
parent
c8c2d9a9e0
commit
e11464082b
106
main.go
106
main.go
|
|
@ -13,6 +13,7 @@ import (
|
|||
"flag"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"math"
|
||||
"os"
|
||||
"path"
|
||||
"runtime"
|
||||
|
|
@ -21,6 +22,7 @@ import (
|
|||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
|
||||
|
|
@ -38,37 +40,72 @@ const (
|
|||
var (
|
||||
dataFilePath string
|
||||
similarityThreshold float64
|
||||
outputFile string
|
||||
useDocPrefix bool
|
||||
verbose bool
|
||||
numWorkers int
|
||||
)
|
||||
|
||||
func main() {
|
||||
documents, err := run(os.Args)
|
||||
var (
|
||||
start = time.Now()
|
||||
padding = "\n"
|
||||
)
|
||||
documents, output, err := run(os.Args)
|
||||
if output != os.Stdout {
|
||||
defer output.Close()
|
||||
padding = ""
|
||||
}
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(-1)
|
||||
}
|
||||
duration := time.Since(start)
|
||||
|
||||
v := make(Visualizer)
|
||||
for _, doc := range documents {
|
||||
fmt.Println(doc)
|
||||
fmt.Fprintln(output, doc)
|
||||
v.Add(*doc)
|
||||
}
|
||||
fmt.Fprintf(
|
||||
os.Stderr,
|
||||
"%s%d documents identified in %s\n\n%s\n",
|
||||
padding,
|
||||
len(documents),
|
||||
duration.Truncate(time.Millisecond),
|
||||
v.Render("Distribution for Number of Associated Files per Document", 60),
|
||||
)
|
||||
}
|
||||
|
||||
// run is the main entry point for the program.
|
||||
func run(args []string) ([]*Document, error) {
|
||||
func run(args []string) ([]*Document, *os.File, error) {
|
||||
flags := flag.NewFlagSet(args[0], flag.ExitOnError)
|
||||
flags.StringVar(&dataFilePath, "path", defaultDataFilePath, "path to the file pool")
|
||||
flags.Float64Var(&similarityThreshold, "threshold", defaultSimilarityThreshold, "similarity threshold")
|
||||
flags.StringVar(&outputFile, "output", "", "output file (default is stdout)")
|
||||
flags.IntVar(&numWorkers, "workers", runtime.NumCPU()*2, "number of workers to use")
|
||||
flags.BoolVar(&useDocPrefix, "prefix", false, "use '[doc ###]' prefix for output")
|
||||
flags.BoolVar(&verbose, "verbose", false, "enable verbose logging")
|
||||
_ = flags.Parse(args[1:])
|
||||
|
||||
output := os.Stdout
|
||||
if outputFile != "" {
|
||||
var err error
|
||||
output, err = os.Create(outputFile)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("creating output file: %w", err)
|
||||
}
|
||||
}
|
||||
shouldReportStatus := !verbose && output != os.Stdout
|
||||
if shouldReportStatus {
|
||||
defer fmt.Fprintln(os.Stderr)
|
||||
}
|
||||
|
||||
// The files need to be processed in order of time, so determine the
|
||||
// timestamp of each file and sort them by time.
|
||||
fileTimes, times, err := orderFiles(dataFilePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
dm := NewDocumentManager(dataFilePath, similarityThreshold, numWorkers)
|
||||
|
|
@ -90,6 +127,9 @@ func run(args []string) ([]*Document, error) {
|
|||
)
|
||||
|
||||
log("processing timestamp", "timestamp", timestamp, "timestampIndex", i, "totalTimestamps", len(times))
|
||||
if shouldReportStatus {
|
||||
fmt.Fprintf(os.Stderr, "\rProcessing timestamp %d of %d...", i+1, len(times))
|
||||
}
|
||||
for i, doc := range dm.Documents {
|
||||
wg.Add(1)
|
||||
dm.WorkCh <- WorkItem{
|
||||
|
|
@ -129,7 +169,7 @@ func run(args []string) ([]*Document, error) {
|
|||
}
|
||||
|
||||
dm.Shutdown()
|
||||
return dm.SortedDocuments(), nil
|
||||
return dm.SortedDocuments(), output, nil
|
||||
}
|
||||
|
||||
// WorkItem is what will be sent to the workers in the worker pool.
|
||||
|
|
@ -286,8 +326,8 @@ func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error)
|
|||
|
||||
histogram := make(map[string]int)
|
||||
for _, line := range f1 {
|
||||
histogram[line]++
|
||||
}
|
||||
histogram[line]++
|
||||
}
|
||||
for _, line := range f2 {
|
||||
histogram[line]--
|
||||
}
|
||||
|
|
@ -478,6 +518,58 @@ func orderFiles(dir string) (map[int][]int, []int, error) {
|
|||
return timeMap, timeSlice, nil
|
||||
}
|
||||
|
||||
// Visualizer is a utility to provide insight into the shape of the data
|
||||
// processed.
|
||||
type Visualizer map[int]int
|
||||
|
||||
func (v Visualizer) Add(d Document) {
|
||||
numAssocFiles := len(d.AssociatedFiles)
|
||||
v[numAssocFiles]++
|
||||
}
|
||||
|
||||
func (v Visualizer) Render(title string, width int) string {
|
||||
if len(v) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
type pair struct {
|
||||
numAssocFiles int
|
||||
numDocsWithThisNumFiles int
|
||||
}
|
||||
|
||||
var (
|
||||
slicedMap []pair
|
||||
totalNumDocs int
|
||||
)
|
||||
for naf, nd := range v {
|
||||
slicedMap = append(slicedMap, pair{numAssocFiles: naf, numDocsWithThisNumFiles: nd})
|
||||
totalNumDocs += nd
|
||||
}
|
||||
slices.SortFunc(slicedMap, func(a, b pair) int {
|
||||
return a.numDocsWithThisNumFiles - b.numDocsWithThisNumFiles
|
||||
})
|
||||
slices.Reverse(slicedMap)
|
||||
|
||||
var sb strings.Builder
|
||||
sb.WriteString(title)
|
||||
sb.WriteRune('\n')
|
||||
sb.WriteString(strings.Repeat("=", width))
|
||||
sb.WriteRune('\n')
|
||||
|
||||
scaleFactor := float64(totalNumDocs) / float64(slicedMap[0].numDocsWithThisNumFiles)
|
||||
for _, p := range slicedMap {
|
||||
ratio := float64(p.numDocsWithThisNumFiles) / float64(totalNumDocs)
|
||||
numChars := int(math.Ceil(ratio * float64(width-12) * scaleFactor))
|
||||
sb.WriteString(fmt.Sprintf(
|
||||
"%3d | %s (%.0f%%)\n",
|
||||
p.numAssocFiles,
|
||||
strings.Repeat("*", numChars),
|
||||
float64(p.numDocsWithThisNumFiles)*100/float64(totalNumDocs),
|
||||
))
|
||||
}
|
||||
return sb.String()[0 : sb.Len()-1]
|
||||
}
|
||||
|
||||
// makeFileName returns the name of the file with the given number, which is
// the decimal number followed by the ".txt" extension.
func makeFileName(number int) string {
	name := fmt.Sprintf("%d.txt", number)
	return name
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue