Major refactor: use worker pool
Use a bounded worker pool to prevent creation of hundreds of goroutines contending for scheduling. Add some tests, a Dockerfile, a Makefile, and a readme.
This commit is contained in:
parent
5f1a8bc256
commit
b6de64cde6
|
|
@ -1 +1,3 @@
|
||||||
*.log
|
*.log
|
||||||
|
output.*.txt
|
||||||
|
.vscode
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,11 @@
|
||||||
|
FROM golang:1.22 as builder
|
||||||
|
WORKDIR /go/src/docgrouper
|
||||||
|
COPY testdata testdata/
|
||||||
|
COPY *.go go.mod go.sum ./
|
||||||
|
RUN go mod download
|
||||||
|
RUN go test -v ./... && CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o docgroup
|
||||||
|
|
||||||
|
FROM gcr.io/distroless/base-nossl-debian12
|
||||||
|
# The builder stage writes the binary as "docgroup" (go build -o docgroup) and
# ENTRYPOINT runs "docgroup", so copy that file to a directory on PATH under
# the same name.
COPY --from=builder /go/src/docgrouper/docgroup /bin/docgroup
|
||||||
|
VOLUME [ "/files" ]
|
||||||
|
ENTRYPOINT [ "docgroup" ]
|
||||||
|
|
@ -0,0 +1,20 @@
|
||||||
|
DOCKER_IMAGE := "steelray-docgrouper"
|
||||||
|
BINNAME := "steelray-docgrouper"
|
||||||
|
FILES_PATH := "./files.200"
|
||||||
|
|
||||||
|
build: *.go
|
||||||
|
go build -o $(BINNAME) .
|
||||||
|
|
||||||
|
test:
|
||||||
|
go test -v ./...
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f $(BINNAME)
|
||||||
|
|
||||||
|
docker-build:
|
||||||
|
docker build -t $(DOCKER_IMAGE) .
|
||||||
|
|
||||||
|
docker-run:
|
||||||
|
docker run -v $(FILES_PATH):/files $(DOCKER_IMAGE):latest
|
||||||
|
|
||||||
|
# None of these targets produce a file named after themselves, so mark them all
# phony; otherwise a stray file called "test" or "clean" would disable the target.
.PHONY: build test clean docker-build docker-run
|
||||||
|
|
@ -0,0 +1,39 @@
|
||||||
|
# Docgrouper
|
||||||
|
|
||||||
|
Given a set of files, each with an integer timestamp as its first line, identify a set
|
||||||
|
of documents that they represent at various points of the document's life.
|
||||||
|
|
||||||
|
## Building
|
||||||
|
|
||||||
|
**docgrouper** requires [Go](https://go.dev) to build, and can be built by
|
||||||
|
running `make build`. Because Go might not be installed, a `Dockerfile` is
|
||||||
|
provided to test and build a container image. The docker image can be built via
|
||||||
|
the `docker-build` Makefile target.
|
||||||
|
|
||||||
|
## Running
|
||||||
|
|
||||||
|
If running via Docker, the directory where the file pool exists must be mounted
|
||||||
|
into the container, via the `-v` or `--volume` switch, like so:
|
||||||
|
|
||||||
|
```
|
||||||
|
docker run --volume ./host-files:/files steelray-docgrouper
|
||||||
|
```
|
||||||
|
|
||||||
|
This invocation is made available via the `docker-run` Makefile target, but this
|
||||||
|
will only invoke docgrouper with the default command line arguments since
|
||||||
|
arguments cannot be passed to a Makefile target.
|
||||||
|
|
||||||
|
## Options
|
||||||
|
|
||||||
|
```
|
||||||
|
-path string
|
||||||
|
path to the file pool (default "files")
|
||||||
|
-prefix
|
||||||
|
use '[doc ###]' prefix for output
|
||||||
|
-threshold float
|
||||||
|
similarity threshold (default 0.5)
|
||||||
|
-verbose
|
||||||
|
enable verbose logging
|
||||||
|
-workers int
|
||||||
|
number of workers to use (default 2*<number-of-cores>)
|
||||||
|
```
|
||||||
4
go.mod
4
go.mod
|
|
@ -1,5 +1,3 @@
|
||||||
module github.com/ianfoo/steelray-docgrouper
|
module github.com/ianfoo/steelray-docgrouper
|
||||||
|
|
||||||
go 1.22.1
|
go 1.22.2
|
||||||
|
|
||||||
require github.com/adrg/strutil v0.3.1
|
|
||||||
|
|
|
||||||
539
main.go
539
main.go
|
|
@ -1,8 +1,11 @@
|
||||||
// Timesheet
|
// Time sheet
|
||||||
// Previously ~1h
|
// Previously ~1h
|
||||||
// March 13, 2024: 00:00-02:30
|
// March 13, 2024: 00:00-02:30
|
||||||
// March 19, 2024: 15:00-19:00
|
// March 19, 2024: 15:00-19:00
|
||||||
// March 23, 2024: 20:00-
|
// March 23, 2024: 20:00-22:00
|
||||||
|
// April 02, 2024: 12:30-17:00
|
||||||
|
// April 04, 2024: 21:00-23:30
|
||||||
|
// April 05, 2024: 00:00-02:00
|
||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
|
@ -12,6 +15,7 @@ import (
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
|
"runtime"
|
||||||
"slices"
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
@ -19,143 +23,290 @@ import (
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
)
|
)
|
||||||
|
|
||||||
// DataFilePath describes the default location where the file pool can be found.
|
const (
|
||||||
const DefaultDataFilePath = "files"
|
// defaultSimilarityThreshold is the default minimum similarity required for
|
||||||
|
// two files to be considered related. This value is arbitrary and could be
|
||||||
|
// adjusted based on the specific requirements of the problem.
|
||||||
|
defaultSimilarityThreshold = 0.5
|
||||||
|
|
||||||
|
// defaultDataFilePath describes the default location where the file pool can be
|
||||||
|
// found.
|
||||||
|
defaultDataFilePath = "files"
|
||||||
|
)
|
||||||
|
|
||||||
// Command line options
|
// Command line options
|
||||||
var (
|
var (
|
||||||
DataFilePath string
|
dataFilePath string
|
||||||
UseDocPrefix bool
|
similarityThreshold float64
|
||||||
Verbose bool
|
useDocPrefix bool
|
||||||
|
verbose bool
|
||||||
|
numWorkers int
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
if err := run(os.Args); err != nil {
|
documents, err := run(os.Args)
|
||||||
|
if err != nil {
|
||||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||||
os.Exit(1)
|
// Exit codes should stay within [0, 125] for portability; -1 is reported as 255 on Unix.
os.Exit(1)
|
||||||
|
}
|
||||||
|
for _, doc := range documents {
|
||||||
|
fmt.Println(doc)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// run is the main entry point for the program.
|
// run is the main entry point for the program.
|
||||||
func run(args []string) error {
|
func run(args []string) ([]*Document, error) {
|
||||||
flags := flag.NewFlagSet(args[0], flag.ExitOnError)
|
flags := flag.NewFlagSet(args[0], flag.ExitOnError)
|
||||||
flags.StringVar(&DataFilePath, "path", DefaultDataFilePath, "path to the file pool")
|
flags.StringVar(&dataFilePath, "path", defaultDataFilePath, "path to the file pool")
|
||||||
flags.BoolVar(&UseDocPrefix, "prefix", false, "use '[doc ###]' prefix for output")
|
flags.Float64Var(&similarityThreshold, "threshold", defaultSimilarityThreshold, "similarity threshold")
|
||||||
flags.BoolVar(&Verbose, "verbose", false, "enable verbose logging")
|
flags.IntVar(&numWorkers, "workers", runtime.NumCPU()*2, "number of workers to use")
|
||||||
flags.Parse(args[1:])
|
flags.BoolVar(&useDocPrefix, "prefix", false, "use '[doc ###]' prefix for output")
|
||||||
|
flags.BoolVar(&verbose, "verbose", false, "enable verbose logging")
|
||||||
|
_ = flags.Parse(args[1:])
|
||||||
|
|
||||||
// SimilarityThreshold is the minimum similarity required for two files
|
// The files need to be processed in order of time, so determine the
|
||||||
// to be considered related. This value is arbitrary and could be adjusted
|
// timestamp of each file and sort them by time.
|
||||||
// based on the specific requirements of the problem.
|
fileTimes, times, err := orderFiles(dataFilePath)
|
||||||
const SimilarityThreshold = 0.5
|
|
||||||
|
|
||||||
fileTimes, times, err := orderFiles()
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
|
dm := NewDocumentManager(dataFilePath, similarityThreshold, numWorkers)
|
||||||
// documents is the master list of documents that will be built up.
|
|
||||||
documents []*Document
|
|
||||||
|
|
||||||
// fcc handles reading files and caching contents.
|
|
||||||
fcc = make(fileContentsCache)
|
|
||||||
)
|
|
||||||
|
|
||||||
for i, timestamp := range times {
|
for i, timestamp := range times {
|
||||||
_ = i
|
|
||||||
// fmt.Printf("\rProcessing timestamp %d/%d", i+1, len(fileTimes))
|
|
||||||
// Track the files at this timestamp that have been associated with documents, so
|
|
||||||
// we can identify unassociated files later and then create new documents for them.
|
|
||||||
var (
|
var (
|
||||||
wg sync.WaitGroup
|
// Track the files at this timestamp that have been associated with
|
||||||
|
// documents, so we can identify unassociated files later and then
|
||||||
|
// create new documents for them. This needs to be distinct for each
|
||||||
|
// timestamp, so it's created inside the timestamp loop's scope.
|
||||||
associatedFiles sync.Map
|
associatedFiles sync.Map
|
||||||
|
|
||||||
|
// We might need to create new documents for files that weren't
|
||||||
|
// associated with any document at this timestamp, so we need to make
|
||||||
|
// sure that this timestamp has been entirely processed first. We do
|
||||||
|
// this by waiting for the workers to indicate they've finished a work
|
||||||
|
// item.
|
||||||
|
wg sync.WaitGroup
|
||||||
)
|
)
|
||||||
|
|
||||||
wg.Add(len(documents))
|
log("processing timestamp", "timestamp", timestamp, "timestampIndex", i, "totalTimestamps", len(times))
|
||||||
log("processing timestamp", "timestamp", timestamp, "numWorkers", len(documents))
|
for i, doc := range dm.Documents {
|
||||||
for _, doc := range documents {
|
wg.Add(1)
|
||||||
// Start a goroutine for each document, to parallelize the
|
dm.WorkCh <- WorkItem{
|
||||||
// comparison with the files in the current timestamp. A more robust
|
doc: doc,
|
||||||
// solution would limit the number of concurrent goroutines to avoid
|
fileNumbers: fileTimes[timestamp],
|
||||||
// exhausting system resources, but for this problem we won't have
|
timestamp: timestamp,
|
||||||
// more than a couple thousand documents. Goroutines are
|
associatedFiles: &associatedFiles,
|
||||||
// lightweight enough (2K stack) that we can start them pretty
|
wg: &wg,
|
||||||
// capriciously.
|
}
|
||||||
go func(doc *Document, files []int) {
|
log(
|
||||||
defer wg.Done()
|
"submitted work",
|
||||||
for _, candidateFileNumber := range files {
|
"documentNumber", i+1,
|
||||||
// Check to be certain this file hasn't been associated with another
|
"documentID", doc.ID,
|
||||||
// document already. If it has been, continue to the next file.
|
"totalDocs", len(dm.Documents),
|
||||||
if _, ok := associatedFiles.Load(candidateFileNumber); ok {
|
"timestamp", timestamp,
|
||||||
continue
|
)
|
||||||
}
|
|
||||||
|
|
||||||
latestFileNumber := doc.LatestAssociatedFile()
|
|
||||||
overlap, err := compareFiles(fcc, latestFileNumber, candidateFileNumber)
|
|
||||||
if err != nil {
|
|
||||||
fmt.Fprintf(
|
|
||||||
os.Stderr,
|
|
||||||
"error comparing files %d and %d: %v\n",
|
|
||||||
latestFileNumber, candidateFileNumber, err,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
if overlap >= SimilarityThreshold {
|
|
||||||
// Add file to Document associated list
|
|
||||||
doc.AssociateFile(candidateFileNumber, timestamp)
|
|
||||||
associatedFiles.Store(candidateFileNumber, struct{}{})
|
|
||||||
|
|
||||||
// We know this document won't be associated with any other files
|
|
||||||
// with this timestamp, so we can stop looking at files with this
|
|
||||||
// timestamp, for this document.
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}(doc, fileTimes[timestamp])
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait for all document comparisons to complete for this timestamp.
|
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
|
|
||||||
// If we haven't associated all the files with existing documents, we need
|
// Now that this timestamp has been fully processed, we can check to see
|
||||||
// to create new documents for those that remain.
|
// which files haven't been associated with existing documents, and create new
|
||||||
currentNumDocs := len(documents)
|
// documents for them.
|
||||||
|
var docsAdded int
|
||||||
for _, fileNumber := range fileTimes[timestamp] {
|
for _, fileNumber := range fileTimes[timestamp] {
|
||||||
if _, ok := associatedFiles.Load(fileNumber); !ok {
|
if _, ok := associatedFiles.Load(fileNumber); !ok {
|
||||||
doc := NewDocument(fileNumber, timestamp)
|
dm.AddNewDocument(fileNumber, timestamp)
|
||||||
documents = append(documents, &doc)
|
docsAdded++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(documents) > currentNumDocs {
|
if docsAdded > 0 {
|
||||||
log("created new documents", "numAdded", len(documents)-currentNumDocs, "timestamp", timestamp)
|
log("created new documents", "numAdded", docsAdded, "timestamp", timestamp)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Now we can clear the cache of file contents for files that aren't associated with
|
// Free up memory.
|
||||||
// a document, to conserve memory.
|
dm.ShrinkCache()
|
||||||
var latestDocumentFiles []int
|
|
||||||
for _, doc := range documents {
|
|
||||||
latestDocumentFiles = append(latestDocumentFiles, doc.LatestAssociatedFile())
|
|
||||||
}
|
|
||||||
fcc.clearFilesExcept(latestDocumentFiles)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Output the list of documents, showing their associated files in ascending order.
|
dm.Shutdown()
|
||||||
// Order the documents by their first associated file.
|
return dm.SortedDocuments(), nil
|
||||||
slices.SortFunc(documents, func(a, b *Document) int {
|
|
||||||
return a.AssociatedFiles[0] - b.AssociatedFiles[0]
|
|
||||||
})
|
|
||||||
for _, doc := range documents {
|
|
||||||
doc.SortAssociatedFiles()
|
|
||||||
fmt.Println(doc)
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// DocumentIDSource is a concurrency-safe source from which to identify
|
// WorkItem is what will be sent to the workers in the worker pool.
|
||||||
// documents. This could easily be something other than an integer, but using
|
type WorkItem struct {
|
||||||
// this allows us to just use the standard library.
|
doc *Document
|
||||||
var DocumentIDSource atomic.Uint32
|
fileNumbers []int
|
||||||
|
timestamp int
|
||||||
|
associatedFiles *sync.Map
|
||||||
|
wg *sync.WaitGroup
|
||||||
|
}
|
||||||
|
|
||||||
|
// DocumentManager handles the processing of documents and files. It maintains a
|
||||||
|
// list of documents and a cache of file contents, and uses a pool of workers to
|
||||||
|
// compare documents against files.
|
||||||
|
type DocumentManager struct {
|
||||||
|
// Documents is the list of documents that have been identified.
|
||||||
|
Documents []*Document
|
||||||
|
|
||||||
|
// WorkCh is the channel through which work items are submitted to the workers.
|
||||||
|
WorkCh chan WorkItem
|
||||||
|
|
||||||
|
// docIDSource is a concurrency-safe source from which to identify documents.
|
||||||
|
// This could easily be something other than an integer, but using this allows
|
||||||
|
// us to just use the standard library.
|
||||||
|
docIDSource atomic.Uint32
|
||||||
|
|
||||||
|
similarityThreshold float64
|
||||||
|
fcc *FileContentsCache
|
||||||
|
wg sync.WaitGroup
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewDocumentManager creates a new DocumentManager with the specified base path
|
||||||
|
// for the file pool and the specified number of workers.
|
||||||
|
func NewDocumentManager(fileBasePath string, similarityThreshold float64, numWorkers int) *DocumentManager {
|
||||||
|
dm := &DocumentManager{
|
||||||
|
Documents: make([]*Document, 0),
|
||||||
|
similarityThreshold: similarityThreshold,
|
||||||
|
fcc: &FileContentsCache{BaseDir: fileBasePath},
|
||||||
|
WorkCh: make(chan WorkItem),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start workers.
|
||||||
|
for wID := range numWorkers {
|
||||||
|
go dm.ComparisonWorker(wID + 1)
|
||||||
|
}
|
||||||
|
dm.wg.Add(numWorkers)
|
||||||
|
|
||||||
|
return dm
|
||||||
|
}
|
||||||
|
|
||||||
|
// Shutdown cleans up the document manager by closing the work channel to
|
||||||
|
// trigger workers to exit and then waits for all workers to exit.
|
||||||
|
func (dm *DocumentManager) Shutdown() {
|
||||||
|
close(dm.WorkCh)
|
||||||
|
dm.wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (dm *DocumentManager) AddNewDocument(fileNumber, timestamp int) {
|
||||||
|
doc := Document{
|
||||||
|
ID: dm.docIDSource.Add(1),
|
||||||
|
LatestTimestamp: timestamp,
|
||||||
|
AssociatedFiles: []int{fileNumber},
|
||||||
|
}
|
||||||
|
dm.Documents = append(dm.Documents, &doc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ShrinkCache removes files from the cache that will never be used again, by
|
||||||
|
// evicting those files that are not associated with any document. Note that
|
||||||
|
// this is not concurrent-safe and should only be called when operations that
|
||||||
|
// could modify the document list are not running. This is an optimization, but
|
||||||
|
// could be removed if memory usage is not a concern.
|
||||||
|
func (dm *DocumentManager) ShrinkCache() {
|
||||||
|
var latestDocumentFiles []int
|
||||||
|
for _, doc := range dm.Documents {
|
||||||
|
latestDocumentFiles = append(latestDocumentFiles, doc.LatestAssociatedFile())
|
||||||
|
}
|
||||||
|
dm.fcc.ClearFilesExcept(latestDocumentFiles)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return the list of documents with their associated files in ascending order,
|
||||||
|
// and the documents themselves ordered by the documents by their first
|
||||||
|
// associated file.
|
||||||
|
func (dm *DocumentManager) SortedDocuments() []*Document {
|
||||||
|
// Sort the associated files for each document.
|
||||||
|
for _, doc := range dm.Documents {
|
||||||
|
doc.SortAssociatedFiles()
|
||||||
|
}
|
||||||
|
// Sort the documents by their first associated file number.
|
||||||
|
slices.SortFunc(dm.Documents, func(a, b *Document) int {
|
||||||
|
return a.AssociatedFiles[0] - b.AssociatedFiles[0]
|
||||||
|
})
|
||||||
|
return dm.Documents
|
||||||
|
}
|
||||||
|
|
||||||
|
// ComparisonWorker is a function that receives work items describing a document
|
||||||
|
// and a list of candidate file IDs to compare against. It will compare the
|
||||||
|
// document against each file and if a match is found, associate the file with the
|
||||||
|
// document sent in the work item, and record the file as having been matched.
|
||||||
|
func (dm *DocumentManager) ComparisonWorker(workerID int) {
|
||||||
|
for workItem := range dm.WorkCh {
|
||||||
|
for _, fileNumber := range workItem.fileNumbers {
|
||||||
|
if _, ok := workItem.associatedFiles.Load(fileNumber); ok {
|
||||||
|
// This file has already been matched; skip it.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
latestFileNumber := workItem.doc.LatestAssociatedFile()
|
||||||
|
similarity, err := dm.compareFiles(latestFileNumber, fileNumber)
|
||||||
|
if err != nil {
|
||||||
|
// Simplistic error handling: log the error and continue.
|
||||||
|
slog.Error(
|
||||||
|
"error comparing files",
|
||||||
|
"file1", latestFileNumber,
|
||||||
|
"file2", fileNumber,
|
||||||
|
"document", workItem.doc.ID,
|
||||||
|
"worker", workerID,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// If current file doesn't match current document, skip to the next file.
|
||||||
|
if similarity < dm.similarityThreshold {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Current file matches current document, so record this.
|
||||||
|
workItem.doc.AssociateFile(fileNumber, workItem.timestamp)
|
||||||
|
workItem.associatedFiles.Store(fileNumber, struct{}{})
|
||||||
|
log(
|
||||||
|
"match found",
|
||||||
|
"document", workItem.doc.ID,
|
||||||
|
"file", fileNumber,
|
||||||
|
"time", workItem.timestamp,
|
||||||
|
"worker", workerID,
|
||||||
|
)
|
||||||
|
|
||||||
|
// We don't need to consider this document anymore since we've found
|
||||||
|
// a match. End processing and wait for more work.
|
||||||
|
break
|
||||||
|
}
|
||||||
|
workItem.wg.Done()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Report that this worker is shutting down.
|
||||||
|
dm.wg.Done()
|
||||||
|
}
|
||||||
|
|
||||||
|
// compareFiles computes how much two files overlap, on a scale
|
||||||
|
// of 0 to 1 by iterating through the files and identifying lines
|
||||||
|
// that are duplicated.
|
||||||
|
func (dm *DocumentManager) compareFiles(f1Number, f2Number int) (float64, error) {
|
||||||
|
f1, err := dm.fcc.GetFileContents(f1Number)
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("file %d: %w", f1Number, err)
|
||||||
|
}
|
||||||
|
f2, err := dm.fcc.GetFileContents(f2Number)
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("file %d: %w", f2Number, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
histogram := make(map[string]int)
|
||||||
|
for _, lines := range [][]string{f1, f2} {
|
||||||
|
for _, line := range lines {
|
||||||
|
// Skip blank lines, which can throw off the count.
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
histogram[line]++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var overlap int
|
||||||
|
for _, v := range histogram {
|
||||||
|
if v == 2 {
|
||||||
|
overlap++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return float64(overlap) / float64(len(histogram)), nil
|
||||||
|
}
|
||||||
|
|
||||||
// Document stores a document ID and a list of associated files.
|
// Document stores a document ID and a list of associated files.
|
||||||
type Document struct {
|
type Document struct {
|
||||||
|
|
@ -171,8 +322,8 @@ func (d Document) String() string {
|
||||||
for _, f := range d.AssociatedFiles {
|
for _, f := range d.AssociatedFiles {
|
||||||
sb.WriteString(fmt.Sprintf("%d ", f))
|
sb.WriteString(fmt.Sprintf("%d ", f))
|
||||||
}
|
}
|
||||||
if UseDocPrefix {
|
if useDocPrefix {
|
||||||
return fmt.Sprintf("[doc %4d] %s", d.ID, sb.String())
|
return fmt.Sprintf("[doc %4d] %s", d.ID, strings.TrimSpace(sb.String()))
|
||||||
}
|
}
|
||||||
return sb.String()
|
return sb.String()
|
||||||
}
|
}
|
||||||
|
|
@ -186,23 +337,78 @@ func (d *Document) AssociateFile(fileNumber, timestamp int) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// LatestAssociatedFile returns the most recent file associated with a document.
|
// LatestAssociatedFile returns the most recent file associated with a document.
|
||||||
|
// Note that this presumes that the list of associated files is sorted in
|
||||||
|
// temporal order based on the timestamp at the head of the file.
|
||||||
func (d Document) LatestAssociatedFile() int {
|
func (d Document) LatestAssociatedFile() int {
|
||||||
return d.AssociatedFiles[len(d.AssociatedFiles)-1]
|
return d.AssociatedFiles[len(d.AssociatedFiles)-1]
|
||||||
}
|
}
|
||||||
|
|
||||||
// SortAssociatedFiles sorts the list of associated files for a document,
|
// SortAssociatedFiles sorts the list of associated files for a document, since
|
||||||
// since the requirements stipulate output in ascending order.
|
// the requirements stipulate output in ascending numerical order. Note that
|
||||||
|
// this changes the order of associated files from their original temporal
|
||||||
|
// order, so must only be invoked when the work is entirely finished.
|
||||||
func (d *Document) SortAssociatedFiles() {
|
func (d *Document) SortAssociatedFiles() {
|
||||||
slices.Sort(d.AssociatedFiles)
|
slices.Sort(d.AssociatedFiles)
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewDocument creates a new Document struct and initializes an ID and records
|
// FileContentsCache is a cache of file contents, keyed by file number,
|
||||||
// the first file and timestamp associated with it.
|
// to avoid reading the same file from disk multiple times.
|
||||||
func NewDocument(fileNumber, timestamp int) Document {
|
type FileContentsCache struct {
|
||||||
return Document{
|
BaseDir string
|
||||||
ID: DocumentIDSource.Add(1),
|
cache sync.Map
|
||||||
LatestTimestamp: timestamp,
|
}
|
||||||
AssociatedFiles: []int{fileNumber},
|
|
||||||
|
// GetFileContents returns the contents of a file, excluding the first timestamp
|
||||||
|
// line. If the file is already in the cache, the contents are returned from
|
||||||
|
// there, otherwise the file is read from disk and the contents are cached.
|
||||||
|
func (fcc *FileContentsCache) GetFileContents(fileNumber int) ([]string, error) {
|
||||||
|
if contents, ok := fcc.cache.Load(fileNumber); ok {
|
||||||
|
return contents.([]string), nil
|
||||||
|
}
|
||||||
|
var (
|
||||||
|
fileName = makeFilePath(fcc.BaseDir, fileNumber)
|
||||||
|
lines []string
|
||||||
|
)
|
||||||
|
|
||||||
|
f, err := os.Open(fileName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
s := bufio.NewScanner(f)
|
||||||
|
|
||||||
|
// Read first line and ignore it since it's just the timestamp.
|
||||||
|
_ = s.Scan()
|
||||||
|
|
||||||
|
// Read file and store contents in cache.
|
||||||
|
for s.Scan() {
|
||||||
|
lines = append(lines, s.Text())
|
||||||
|
}
|
||||||
|
if err := s.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
fcc.cache.Store(fileNumber, lines)
|
||||||
|
return lines, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ClearFilesExcept removes the contents of the fileContentsCache except for the
|
||||||
|
// provided file numbers. This helps conserve memory by removing the contents of
|
||||||
|
// files that are no longer of interest, which we can be sure of since we are
|
||||||
|
// proceeding in order of time.
|
||||||
|
func (fcc *FileContentsCache) ClearFilesExcept(fileNumbers []int) {
|
||||||
|
// Build up a list of entries to delete to avoid modifying the concurrent
|
||||||
|
// map while iterating over it.
|
||||||
|
var toDelete []int
|
||||||
|
fcc.cache.Range(func(key, _ any) bool {
|
||||||
|
storedFileNum := key.(int)
|
||||||
|
if !slices.Contains(fileNumbers, storedFileNum) {
|
||||||
|
toDelete = append(toDelete, storedFileNum)
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
})
|
||||||
|
for _, k := range toDelete {
|
||||||
|
fcc.cache.Delete(k)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -231,104 +437,17 @@ func readFileTime(filepath string) (int, error) {
|
||||||
return time, nil
|
return time, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// compareFiles computes how much two files overlap, on a scale
|
|
||||||
// of 0 to 1 by iterating through the files and identifying lines
|
|
||||||
// that are duplicated.
|
|
||||||
func compareFiles(fcc fileContentsCache, f1Number, f2Number int) (float64, error) {
|
|
||||||
f1, err := fcc.getFileContents(f1Number)
|
|
||||||
if err != nil {
|
|
||||||
return 0, fmt.Errorf("file %d: %w", f1Number, err)
|
|
||||||
}
|
|
||||||
f2, err := fcc.getFileContents(f2Number)
|
|
||||||
if err != nil {
|
|
||||||
return 0, fmt.Errorf("file %d: %w", f2Number, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
histogram := make(map[string]int)
|
|
||||||
for _, lines := range [][]string{f1, f2} {
|
|
||||||
for _, line := range lines {
|
|
||||||
histogram[line]++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var overlap int
|
|
||||||
for _, v := range histogram {
|
|
||||||
if v == 2 {
|
|
||||||
overlap++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return float64(overlap) / float64(len(histogram)), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// fileContentsCache is a cache of file contents, keyed by file number,
|
|
||||||
// to avoid reading the same file from disk multiple times.
|
|
||||||
type fileContentsCache map[int][]string
|
|
||||||
|
|
||||||
// getFileContents returns the contents of a file, excluding the first timestamp
|
|
||||||
// line. If the file is already in the cache, the contents are returned from
|
|
||||||
// there, otherwise the file is read from disk and the contents are cached.
|
|
||||||
func (fcc fileContentsCache) getFileContents(fileNumber int) ([]string, error) {
|
|
||||||
if contents, ok := fcc[fileNumber]; ok {
|
|
||||||
return contents, nil
|
|
||||||
}
|
|
||||||
var (
|
|
||||||
fileName = makeFilePath(fileNumber)
|
|
||||||
lines []string
|
|
||||||
)
|
|
||||||
|
|
||||||
f, err := os.Open(fileName)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
s := bufio.NewScanner(f)
|
|
||||||
|
|
||||||
// Ignore first line that's just a timestamp.
|
|
||||||
if !s.Scan() {
|
|
||||||
fcc[fileNumber] = []string{}
|
|
||||||
return []string{}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
for s.Scan() {
|
|
||||||
lines = append(lines, s.Text())
|
|
||||||
}
|
|
||||||
if err := s.Err(); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
return lines, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// clearFilesExcept removes the contents of the fileContentsCache except for the
|
|
||||||
// provided file numbers. This helps conserve memory by removing the contents of
|
|
||||||
// files that are no longer of interest, which we can be sure of since we are
|
|
||||||
// proceeding in order of time.
|
|
||||||
func (fcc fileContentsCache) clearFilesExcept(fileNumbers []int) {
|
|
||||||
for fNum := range fcc {
|
|
||||||
if !slices.Contains(fileNumbers, fNum) {
|
|
||||||
delete(fcc, fNum)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func makeFileName(number int) string {
|
|
||||||
return fmt.Sprintf("%d.txt", number)
|
|
||||||
}
|
|
||||||
|
|
||||||
func makeFilePath(number int) string {
|
|
||||||
return path.Join(DataFilePath, makeFileName(number))
|
|
||||||
}
|
|
||||||
|
|
||||||
// orderFiles determines the timestamp version of each file and creates a map of
|
// orderFiles determines the timestamp version of each file and creates a map of
|
||||||
// time to file numbers. It sorts the times (since maps are not ordered) so that
|
// time to file numbers. It sorts the times (since maps are not ordered) so that
|
||||||
// the map can be iterated in order of time. This allows stepping through the
|
// the map can be iterated in order of time. This allows stepping through the
|
||||||
// history of the files from the beginning. Using this, we can construct a
|
// history of the files from the beginning. Using this, we can construct a
|
||||||
// "chain" of evolution for a given document.
|
// "chain" of evolution for a given document.
|
||||||
func orderFiles() (map[int][]int, []int, error) {
|
func orderFiles(dir string) (map[int][]int, []int, error) {
|
||||||
timeMap := make(map[int][]int)
|
timeMap := make(map[int][]int)
|
||||||
|
|
||||||
dirEntries, err := os.ReadDir(DataFilePath)
|
dirEntries, err := os.ReadDir(dir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, fmt.Errorf("reading directory %s: %w", DataFilePath, err)
|
return nil, nil, fmt.Errorf("reading directory %s: %w", dir, err)
|
||||||
}
|
}
|
||||||
for _, entry := range dirEntries {
|
for _, entry := range dirEntries {
|
||||||
if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") {
|
if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".txt") {
|
||||||
|
|
@ -345,7 +464,7 @@ func orderFiles() (map[int][]int, []int, error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
filePath := path.Join(DataFilePath, entry.Name())
|
filePath := path.Join(dir, entry.Name())
|
||||||
modTime, err := readFileTime(filePath)
|
modTime, err := readFileTime(filePath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, nil, err
|
return nil, nil, err
|
||||||
|
|
@ -366,8 +485,16 @@ func orderFiles() (map[int][]int, []int, error) {
|
||||||
return timeMap, timeSlice, nil
|
return timeMap, timeSlice, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// makeFileName returns the name of the file with the given number, which is
// the number in decimal followed by ".txt".
func makeFileName(number int) string {
	return strconv.Itoa(number) + ".txt"
}
|
||||||
|
|
||||||
|
func makeFilePath(dataFilePath string, number int) string {
|
||||||
|
return path.Join(dataFilePath, makeFileName(number))
|
||||||
|
}
|
||||||
|
|
||||||
func log(msg string, args ...any) {
|
func log(msg string, args ...any) {
|
||||||
if Verbose {
|
if verbose {
|
||||||
slog.Info(msg, args...)
|
slog.Info(msg, args...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,123 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"path/filepath"
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestReadFileTime(t *testing.T) {
|
||||||
|
tt := []struct {
|
||||||
|
fileName string
|
||||||
|
expectedTimestamp int
|
||||||
|
}{
|
||||||
|
{"1.txt", 3},
|
||||||
|
{"2.txt", 5},
|
||||||
|
{"3.txt", 11},
|
||||||
|
}
|
||||||
|
for _, tc := range tt {
|
||||||
|
t.Run(tc.fileName, func(t *testing.T) {
|
||||||
|
filePath := filepath.Join("testdata", tc.fileName)
|
||||||
|
timestamp, err := readFileTime(filePath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal("error reading file time: ", err)
|
||||||
|
}
|
||||||
|
if timestamp != tc.expectedTimestamp {
|
||||||
|
t.Errorf("expected %d, got %d", tc.expectedTimestamp, timestamp)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestOrderFiles(t *testing.T) {
|
||||||
|
var (
|
||||||
|
expectedOrder = []int{3, 5, 11}
|
||||||
|
expectedMap = map[int][]int{
|
||||||
|
3: {1},
|
||||||
|
5: {2, 7},
|
||||||
|
11: {3, 4, 5},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
fileMap, order, err := orderFiles("testdata")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal("error ordering files: ", err)
|
||||||
|
}
|
||||||
|
if !reflect.DeepEqual(order, expectedOrder) {
|
||||||
|
t.Errorf("expected %v, got %v", expectedOrder, order)
|
||||||
|
}
|
||||||
|
if !reflect.DeepEqual(fileMap, expectedMap) {
|
||||||
|
t.Errorf("expected %v, got %v", expectedMap, fileMap)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFileContentsCache(t *testing.T) {
|
||||||
|
fcc := FileContentsCache{BaseDir: "testdata"}
|
||||||
|
cases := []struct {
|
||||||
|
fileNumber int
|
||||||
|
contents string
|
||||||
|
}{
|
||||||
|
{1, "foo foo foo"},
|
||||||
|
{2, "bar bar bar"},
|
||||||
|
{3, "baz baz baz"},
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test initial reads.
|
||||||
|
for _, c := range cases {
|
||||||
|
t.Run(fmt.Sprintf("initial read %d", c.fileNumber), func(t *testing.T) {
|
||||||
|
got, err := fcc.GetFileContents(c.fileNumber)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal("error getting file contents: ", err)
|
||||||
|
}
|
||||||
|
if !reflect.DeepEqual(got, []string{c.contents}) {
|
||||||
|
t.Errorf("expected %q, got %q", c.contents, got)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure files are actually stored in cache.
|
||||||
|
for _, c := range cases {
|
||||||
|
t.Run(fmt.Sprintf("cache check %d", c.fileNumber), func(t *testing.T) {
|
||||||
|
if got, ok := fcc.cache.Load(c.fileNumber); !ok {
|
||||||
|
t.Fatalf("file %d not found in cache", c.fileNumber)
|
||||||
|
} else if !reflect.DeepEqual(got, []string{c.contents}) {
|
||||||
|
t.Fatalf("expected %q, got %q", c.contents, got)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test clear-except operation.
|
||||||
|
t.Run("clear except", func(t *testing.T) {
|
||||||
|
fcc.ClearFilesExcept([]int{1})
|
||||||
|
if _, ok := fcc.cache.Load(1); !ok {
|
||||||
|
t.Fatal("file 1 not found in cache, expected to be kept")
|
||||||
|
}
|
||||||
|
if _, ok := fcc.cache.Load(2); ok {
|
||||||
|
t.Fatal("file 2 found in cache, expected to be cleared")
|
||||||
|
}
|
||||||
|
if _, ok := fcc.cache.Load(3); ok {
|
||||||
|
t.Fatal("file 3 found in cache, expected to be cleared")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEndToEnd(t *testing.T) {
|
||||||
|
docs, err := run([]string{"argv0", "-path", "testdata/e2e"})
|
||||||
|
want := []int{1, 6, 9, 12, 14, 18}
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal("error running program: ", err)
|
||||||
|
}
|
||||||
|
if len(docs) != 1 {
|
||||||
|
t.Fatalf("expected %d documents, got %d", 1, len(docs))
|
||||||
|
}
|
||||||
|
doc := docs[0]
|
||||||
|
if doc.ID != 1 {
|
||||||
|
t.Errorf("expected ID %d, got %d", 0, doc.ID)
|
||||||
|
}
|
||||||
|
if doc.LatestTimestamp != 5 {
|
||||||
|
t.Errorf("expected latest timestamp %d, got %d", 3, doc.LatestTimestamp)
|
||||||
|
}
|
||||||
|
if !reflect.DeepEqual(doc.AssociatedFiles, want) {
|
||||||
|
t.Errorf("expected associated files %v, got %v", want, doc.AssociatedFiles)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,2 @@
|
||||||
|
3
|
||||||
|
foo foo foo
|
||||||
|
|
@ -0,0 +1,2 @@
|
||||||
|
5
|
||||||
|
bar bar bar
|
||||||
|
|
@ -0,0 +1,2 @@
|
||||||
|
11
|
||||||
|
baz baz baz
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
11
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
11
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
5
|
||||||
|
|
@ -0,0 +1,12 @@
|
||||||
|
0
|
||||||
|
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
|
||||||
|
That Funky Music”. When it was later released as its own single, it became an
|
||||||
|
international smash hit.
|
||||||
|
|
||||||
|
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
|
||||||
|
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
|
||||||
|
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
|
||||||
|
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
|
||||||
|
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
|
||||||
|
over Queen’s “Under Pressure”, was the B-side.
|
||||||
|
|
||||||
|
|
@ -0,0 +1,37 @@
|
||||||
|
5
|
||||||
|
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
|
||||||
|
That Funky Music”. When it was later released as its own single, it became an
|
||||||
|
international smash hit.
|
||||||
|
|
||||||
|
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
|
||||||
|
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
|
||||||
|
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
|
||||||
|
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
|
||||||
|
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
|
||||||
|
over Queen’s “Under Pressure”, was the B-side.
|
||||||
|
|
||||||
|
A DJ at an FM station in Georgia liked the B-side better and played it on air.
|
||||||
|
Soon it became that station’s #1 requested song, leading stations in Tennessee
|
||||||
|
and Texas to do the same. It also became Video Jukebox’s most requested video.
|
||||||
|
After SBK Records' founder was played the song over the phone, he signed Ice the
|
||||||
|
next day. In August of 1990, his label released “Ice Ice Baby” as the A-side
|
||||||
|
with “Play That Funky Music” as its flipside.
|
||||||
|
|
||||||
|
The song began climbing charts around the world, eventually reaching the top 10
|
||||||
|
in twelve countries – hitting #1 in six of them including the UK and the US
|
||||||
|
(where it became the first chart-topping rap single in history).
|
||||||
|
|
||||||
|
“Ice Ice Baby” contains an uncleared sample of Queen’s “Under Pressure”, so when
|
||||||
|
confronted about it, Ice claimed he’d altered it, but he later admitted he
|
||||||
|
actually hadn’t. The parties settled out of court for an undisclosed sum, and
|
||||||
|
members of Queen, plus the guest vocalist on the original song David Bowie, were
|
||||||
|
also given songwriting credits.
|
||||||
|
|
||||||
|
The song is credited for ‘making hip-hop an acceptable genre to mainstream
|
||||||
|
media’ and continues to be popular into the 2000s. It was certified Gold in 2005
|
||||||
|
for selling 500K digital downloads and named by VH1 the #29 top song of the 90s.
|
||||||
|
|
||||||
|
But it also has its share of negative feedback, with MTV ranking it the #9 worst
|
||||||
|
video in history, and Houston Press calling it the worst song to come from
|
||||||
|
Texas. In 2012, actor/comedian Adam Scott discussed the song’s opening lyrics on
|
||||||
|
Conan, pointing out how ridiculous they sound when analyzed.
|
||||||
|
|
@ -0,0 +1,18 @@
|
||||||
|
1
|
||||||
|
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
|
||||||
|
That Funky Music”. When it was later released as its own single, it became an
|
||||||
|
international smash hit.
|
||||||
|
|
||||||
|
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
|
||||||
|
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
|
||||||
|
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
|
||||||
|
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
|
||||||
|
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
|
||||||
|
over Queen’s “Under Pressure”, was the B-side.
|
||||||
|
|
||||||
|
A DJ at an FM station in Georgia liked the B-side better and played it on air.
|
||||||
|
Soon it became that station’s #1 requested song, leading stations in Tennessee
|
||||||
|
and Texas to do the same. It also became Video Jukebox’s most requested video.
|
||||||
|
After SBK Records' founder was played the song over the phone, he signed Ice the
|
||||||
|
next day. In August of 1990, his label released “Ice Ice Baby” as the A-side
|
||||||
|
with “Play That Funky Music” as its flipside.
|
||||||
|
|
@ -0,0 +1,49 @@
|
||||||
|
4
|
||||||
|
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
|
||||||
|
That Funky Music”. When it was later released as its own single, it became an
|
||||||
|
international smash hit.
|
||||||
|
|
||||||
|
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
|
||||||
|
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
|
||||||
|
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
|
||||||
|
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
|
||||||
|
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
|
||||||
|
over Queen’s “Under Pressure”, was the B-side.
|
||||||
|
|
||||||
|
A DJ at an FM station in Georgia liked the B-side better and played it on air.
|
||||||
|
Soon it became that station’s #1 requested song, leading stations in Tennessee
|
||||||
|
and Texas to do the same. It also became Video Jukebox’s most requested video.
|
||||||
|
After SBK Records' founder was played the song over the phone, he signed Ice the
|
||||||
|
next day. In August of 1990, his label released “Ice Ice Baby” as the A-side
|
||||||
|
with “Play That Funky Music” as its flipside.
|
||||||
|
|
||||||
|
The song began climbing charts around the world, eventually reaching the top 10
|
||||||
|
in twelve countries – hitting #1 in six of them including the UK and the US
|
||||||
|
(where it became the first chart-topping rap single in history).
|
||||||
|
|
||||||
|
That year “Ice Ice Baby” was certified Platinum two months after its release and
|
||||||
|
was ranked the #45 song of 1990 by Billboard. The song was also nominated for a
|
||||||
|
Grammy that year for Best Solo Rap Performance, but lost to MC Hammer’s “U Can’t
|
||||||
|
Touch This”.
|
||||||
|
|
||||||
|
Death Row Records' CEO, Suge Knight, learned that Marvin “Chocolate” Johnson, a
|
||||||
|
Death Row signee, was a co-writer on the track, so he invited Ice to his Los
|
||||||
|
Angeles hotel room to negotiate payment. It has been rumored that Vanilla Ice
|
||||||
|
was hung over a balcony during the negotations, but Ice has denied these rumors
|
||||||
|
several times, insisting, “He didn’t have to hang me from no balcony or slap me
|
||||||
|
around or nothing”.
|
||||||
|
|
||||||
|
“Ice Ice Baby” contains an uncleared sample of Queen’s “Under Pressure”, so when
|
||||||
|
confronted about it, Ice claimed he’d altered it, but he later admitted he
|
||||||
|
actually hadn’t. The parties settled out of court for an undisclosed sum, and
|
||||||
|
members of Queen, plus the guest vocalist on the original song David Bowie, were
|
||||||
|
also given songwriting credits.
|
||||||
|
|
||||||
|
The song is credited for ‘making hip-hop an acceptable genre to mainstream
|
||||||
|
media’ and continues to be popular into the 2000s. It was certified Gold in 2005
|
||||||
|
for selling 500K digital downloads and named by VH1 the #29 top song of the 90s.
|
||||||
|
|
||||||
|
But it also has its share of negative feedback, with MTV ranking it the #9 worst
|
||||||
|
video in history, and Houston Press calling it the worst song to come from
|
||||||
|
Texas. In 2012, actor/comedian Adam Scott discussed the song’s opening lyrics on
|
||||||
|
Conan, pointing out how ridiculous they sound when analyzed.
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
2
|
||||||
|
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
|
||||||
|
That Funky Music”. When it was later released as its own single, it became an
|
||||||
|
international smash hit.
|
||||||
|
|
||||||
|
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
|
||||||
|
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
|
||||||
|
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
|
||||||
|
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
|
||||||
|
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
|
||||||
|
over Queen’s “Under Pressure”, was the B-side.
|
||||||
|
|
||||||
|
A DJ at an FM station in Georgia liked the B-side better and played it on air.
|
||||||
|
Soon it became that station’s #1 requested song, leading stations in Tennessee
|
||||||
|
and Texas to do the same. It also became Video Jukebox’s most requested video.
|
||||||
|
After SBK Records' founder was played the song over the phone, he signed Ice the
|
||||||
|
next day. In August of 1990, his label released “Ice Ice Baby” as the A-side
|
||||||
|
with “Play That Funky Music” as its flipside.
|
||||||
|
|
||||||
|
The song began climbing charts around the world, eventually reaching the top 10
|
||||||
|
in twelve countries – hitting #1 in six of them including the UK and the US
|
||||||
|
(where it became the first chart-topping rap single in history).
|
||||||
|
|
||||||
|
That year “Ice Ice Baby” was certified Platinum two months after its release and
|
||||||
|
was ranked the #45 song of 1990 by Billboard. The song was also nominated for a
|
||||||
|
Grammy that year for Best Solo Rap Performance, but lost to MC Hammer’s “U Can’t
|
||||||
|
Touch This”.
|
||||||
|
|
@ -0,0 +1,40 @@
|
||||||
|
3
|
||||||
|
“Ice Ice Baby” was originally the flipside to Vanilla Ice’s debut single “Play
|
||||||
|
That Funky Music”. When it was later released as its own single, it became an
|
||||||
|
international smash hit.
|
||||||
|
|
||||||
|
After every record company turned Vanilla Ice’s original demos down, Tommy Quon
|
||||||
|
(the owner of the Dallas club City Lights) had the club’s DJ Earthquake produce
|
||||||
|
two tracks for release on his label Ultrax Records. “Play That Funky Music”,
|
||||||
|
based on the 1976 hit of the same name, was the A-side, and “Ice Ice Baby”,
|
||||||
|
based on the Alpha Phi Alpha fraternity chant in the film School Daze spoken
|
||||||
|
over Queen’s “Under Pressure”, was the B-side.
|
||||||
|
|
||||||
|
A DJ at an FM station in Georgia liked the B-side better and played it on air.
|
||||||
|
Soon it became that station’s #1 requested song, leading stations in Tennessee
|
||||||
|
and Texas to do the same. It also became Video Jukebox’s most requested video.
|
||||||
|
After SBK Records' founder was played the song over the phone, he signed Ice the
|
||||||
|
next day. In August of 1990, his label released “Ice Ice Baby” as the A-side
|
||||||
|
with “Play That Funky Music” as its flipside.
|
||||||
|
|
||||||
|
The song began climbing charts around the world, eventually reaching the top 10
|
||||||
|
in twelve countries – hitting #1 in six of them including the UK and the US
|
||||||
|
(where it became the first chart-topping rap single in history).
|
||||||
|
|
||||||
|
That year “Ice Ice Baby” was certified Platinum two months after its release and
|
||||||
|
was ranked the #45 song of 1990 by Billboard. The song was also nominated for a
|
||||||
|
Grammy that year for Best Solo Rap Performance, but lost to MC Hammer’s “U Can’t
|
||||||
|
Touch This”.
|
||||||
|
|
||||||
|
Death Row Records' CEO, Suge Knight, learned that Marvin “Chocolate” Johnson, a
|
||||||
|
Death Row signee, was a co-writer on the track, so he invited Ice to his Los
|
||||||
|
Angeles hotel room to negotiate payment. It has been rumored that Vanilla Ice
|
||||||
|
was hung over a balcony during the negotations, but Ice has denied these rumors
|
||||||
|
several times, insisting, “He didn’t have to hang me from no balcony or slap me
|
||||||
|
around or nothing”.
|
||||||
|
|
||||||
|
“Ice Ice Baby” contains an uncleared sample of Queen’s “Under Pressure”, so when
|
||||||
|
confronted about it, Ice claimed he’d altered it, but he later admitted he
|
||||||
|
actually hadn’t. The parties settled out of court for an undisclosed sum, and
|
||||||
|
members of Queen, plus the guest vocalist on the original song David Bowie, were
|
||||||
|
also given songwriting credits.
|
||||||
Loading…
Reference in New Issue