package com.scaleunlimited.cascading.ml;
import cascading.operation.Identity;
import cascading.operation.state.Counter;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.pipe.assembly.CountBy;
import cascading.pipe.assembly.SumBy;
import cascading.pipe.assembly.Unique;
import cascading.tuple.Fields;
/**
 * Cascading sub-assembly intended to generate TF*IDF values for every unique term.
 *
 * <p>The {@code termsPipe} passed to the constructor must contain tuples with the
 * following fields:
 * <ul>
 * <li>a "doc" field, which is a string with a document identifier.</li>
 * <li>a "term" field, which is a string.</li>
 * <li>a "termcount" field, which is an integer.</li>
 * </ul>
 *
 * <p>The intended output is a pipe that contains tuples with the following fields:
 * <ul>
 * <li>a "term" field, which is a string.</li>
 * <li>a "tf-idf" field, which is a float.</li>
 * </ul>
 *
 * <p>Note: the constructor currently only builds the intermediate aggregation
 * branches (summed term counts, per-term document counts, and a total document
 * counter); it does not yet combine them into the "tf-idf" output field.
 */
@SuppressWarnings("serial")
public class TfIdfAssembly extends SubAssembly {

    /** Name of the input field holding the document identifier. */
    public static final String DOC_FN = "doc";

    /** Name of the field holding the term. */
    public static final String TERM_FN = "term";

    /** Name of the input field holding the term's count. */
    public static final String TERM_COUNT_FN = "termcount";

    /** Name of the intended output field holding the TF*IDF value. */
    public static final String TF_IDF_FN = "tf-idf";

    /** Counters incremented by this assembly. */
    private enum Counters {
        /** Incremented once per unique "doc" value. */
        TOTAL_DOCS
    }

    /**
     * Builds the aggregation branches on top of the supplied pipe.
     *
     * @param termsPipe pipe whose tuples carry the "doc", "term" and "termcount" fields
     */
    public TfIdfAssembly(Pipe termsPipe) {
        super(termsPipe);

        // Sum the "termcount" values, grouped by term, into "term-per-doc-count".
        // The grouping selector must use the TERM_FN constant ("term"); the quoted
        // literal "TERM_FN" names a field that does not exist in the input tuples.
        // NOTE(review): the pipe/field names suggest a per-document count, which would
        // need grouping on (DOC_FN, TERM_FN) - confirm the intended grouping.
        Pipe termPerDocCountPipe = new Pipe("term count per doc pipe", termsPipe);
        termPerDocCountPipe = new SumBy(termPerDocCountPipe,
                                        new Fields(TERM_FN),
                                        new Fields(TERM_COUNT_FN),
                                        new Fields("term-per-doc-count"),
                                        Integer.class);

        // We need the count of # of documents that contain each term, so first
        // reduce the stream to unique doc/term pairs.
        Pipe termDocCountPipe = new Pipe("term doc count pipe", termsPipe);
        termDocCountPipe = new Each(termDocCountPipe, new Fields(DOC_FN, TERM_FN), new Identity());
        termDocCountPipe = new Unique(termDocCountPipe, new Fields(DOC_FN, TERM_FN));

        // We'll also need a total document count - let's split this off the pipe after
        // we've got unique doc/term pairs, as that will have less data.
        Pipe docCountPipe = new Pipe("total doc count pipe", termDocCountPipe);
        docCountPipe = new Each(docCountPipe, new Fields(DOC_FN), new Identity());
        docCountPipe = new Unique(docCountPipe, new Fields(DOC_FN));
        docCountPipe = new Each(docCountPipe, new Counter(Counters.TOTAL_DOCS));

        // TODO add custom function that gets counter value, writes to HDFS?

        // Now continue with figuring out the document count for each term.
        termDocCountPipe = new CountBy(termDocCountPipe, new Fields(TERM_FN), new Fields("term-doc-count"));

        // NOTE(review): none of the branches above are joined into a "tf-idf" result and
        // no setTails(...) call is made; Cascading sub-assemblies are expected to register
        // their tail pipes in the constructor - confirm and complete before use.
    }
}