TfIdfAssembly.java example

Explorer

cascading.utils-master
- src
  - main
    - java
  - test
    - java
      - com
        scaleunlimited
        cascading
        AbstractPlatformTest.java
        BaseBufferTest.java
        BaseFunctionTest.java
        BaseSolrDatumTest.java
        DatumCompilerTest.java
        DatumTest.java
        FlowBreakTest.java
        FlowCountersTest.java
        FlowMonitorTest.java
        FlowResultTest.java
        FlowRunnerTest.java
        FlowUtilsTest.java
        GroupLimitTest.java
        LoggingFlowProcessTest.java
        LoggingUtilsTest.java
        MyDatumEnum.java
        MyDatumTemplate.java
        MyUUIDDatumTemplate.java
        PartitioningKeyTest.java
        PayloadDatumTest.java
        PayloadTest.java
        SomeDatumTemplate.java
        StdDeviationTest.java
        TupleLoggerTest.java
        UUIDWritableTest.java
        UniqueCountTest.java
        hadoop
        HadoopPathTest.java
        HadoopPlatformTest.java
        NullSinkTapHadoopTest.java
        test
        MiniClusterPlatformTest.java
        TestMiniDFSCluster.java
        TestMiniMRClientCluster.java
        local
        DirectoryTapTest.java
        InMemoryTapLocalTest.java
        KryoSchemeTest.java
        LocalPathTest.java
        LocalPlatformTest.java
        NullSinkTapLocalTest.java
        TextLineSchemeTest.java
        ml
        SimHashTest.java
        TopTermsByLLRTest.java
        maps
        StringMapTest.java
        StringSetTest.java

package com.scaleunlimited.cascading.ml;

import cascading.operation.Identity;
import cascading.operation.state.Counter;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.pipe.assembly.CountBy;
import cascading.pipe.assembly.SumBy;
import cascading.pipe.assembly.Unique;
import cascading.tuple.Fields;

/**
 * Cascading sub-assembly intended to generate TF*IDF values for every unique term.
 *
 * The {@code termsPipe} passed to the constructor must contain tuples with the following fields:
 *
 *  - a "doc" field ({@link #DOC_FN}), which is a string with a document identifier.
 *  - a "term" field ({@link #TERM_FN}), which is a string.
 *  - a "termcount" field ({@link #TERM_COUNT_FN}), which is an integer.
 *
 * The intended output is a pipe that contains tuples with the following fields:
 *
 *  - a "term" field ({@link #TERM_FN}), which is a string
 *  - a "tf-idf" field ({@link #TF_IDF_FN}), which is a float.
 *
 * NOTE(review): the constructor currently only builds the three intermediate branches
 * (summed term counts, per-term document counts, and a total-document counter). It does
 * not yet combine them into a "tf-idf" value and it never calls {@code setTails(...)},
 * so the output described above is not produced yet - confirm how the branches should
 * be joined before relying on this assembly.
 */
@SuppressWarnings("serial")
public class TfIdfAssembly extends SubAssembly {

    /** Name of the input field holding the document identifier. */
    public static final String DOC_FN = "doc";

    /** Name of the field holding the term; present in the input and in the intended output. */
    public static final String TERM_FN = "term";

    /** Name of the input field holding the term's count. */
    public static final String TERM_COUNT_FN = "termcount";

    /** Name of the intended output field holding the TF*IDF score (not yet computed here). */
    public static final String TF_IDF_FN = "tf-idf";

    /** Counters incremented by this assembly while the flow runs. */
    private static enum Counters {
        /** Incremented once per tuple leaving the unique-{@link #DOC_FN} branch. */
        TOTAL_DOCS
    }

    /**
     * Builds the counting branches on top of {@code termsPipe}.
     *
     * @param termsPipe pipe whose tuples carry the {@link #DOC_FN}, {@link #TERM_FN} and
     *                  {@link #TERM_COUNT_FN} fields
     */
    public TfIdfAssembly(Pipe termsPipe) {
        super(termsPipe);

        // For each term, we need to get a per-document count.
        // The grouping field must be the TERM_FN constant ("term"); the quoted literal
        // "TERM_FN" names a field that the documented input tuples do not contain.
        // NOTE(review): grouping is by term only, so this sums TERM_COUNT_FN across all
        // documents for a term - confirm whether DOC_FN should be part of the grouping key.
        Pipe termPerDocCountPipe = new Pipe("term count per doc pipe", termsPipe);
        termPerDocCountPipe = new SumBy(termPerDocCountPipe,
                                        new Fields(TERM_FN),
                                        new Fields(TERM_COUNT_FN),
                                        new Fields("term-per-doc-count"),
                                        Integer.class);

        // We need the count of # of documents that contain each term.
        // Narrow the tuples to (doc, term) first, then de-duplicate those pairs.
        Pipe termDocCountPipe = new Pipe("term doc count pipe", termsPipe);
        termDocCountPipe = new Each(termDocCountPipe, new Fields(DOC_FN, TERM_FN), new Identity());
        termDocCountPipe = new Unique(termDocCountPipe, new Fields(DOC_FN, TERM_FN));

        // We'll also need a total document count - let's split this off the pipe after we've
        // got unique doc/term pairs, as that will have less data.
        Pipe docCountPipe = new Pipe("total doc count pipe", termDocCountPipe);
        docCountPipe = new Each(docCountPipe, new Fields(DOC_FN), new Identity());
        docCountPipe = new Unique(docCountPipe, new Fields(DOC_FN));
        docCountPipe = new Each(docCountPipe, new Counter(Counters.TOTAL_DOCS));
        // TODO add custom function that gets counter value, writes to HDFS?

        // Now continue with figuring out the document count for each term.
        termDocCountPipe = new CountBy(termDocCountPipe, new Fields(TERM_FN), new Fields("term-doc-count"));

        // NOTE(review): termPerDocCountPipe, termDocCountPipe and docCountPipe are never
        // joined and no tails are registered via setTails(...); the assembly is unfinished.
    }
}