package com.scaleunlimited.cascading.ml; import java.util.ArrayList; import java.util.Iterator; import junit.framework.Assert; import org.junit.Test; import cascading.flow.Flow; import cascading.flow.local.LocalFlowConnector; import cascading.flow.local.LocalFlowProcess; import cascading.operation.Debug; import cascading.pipe.Each; import cascading.pipe.GroupBy; import cascading.pipe.Pipe; import cascading.pipe.SubAssembly; import cascading.tap.SinkMode; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; import cascading.tuple.TupleEntryCollector; import cascading.tuple.TupleEntryIterator; import com.scaleunlimited.cascading.local.InMemoryTap; public class TopTermsByLLRTest extends Assert { private static class SplitterParser implements ITermsParser { private String _text; private boolean _shingle; public SplitterParser(boolean shingle) { _shingle = shingle; } @Override public Iterator<String> iterator() { String[] words = _text.split(" "); ArrayList<String> terms = new ArrayList<String>(words.length * 2); for (int i = 0; i < words.length; i++) { terms.add(words[i]); if (_shingle && (i + 1 < words.length)) { terms.add(words[i] + " " + words[i + 1]); } } return terms.iterator(); } @Override public void reset(String text) { _text = text; } @Override public int getNumWords(String term) { return 1; } } private static class TestFilter implements ITermsFilter { @Override public boolean filter(double llrScore, String term, ITermsParser parser) { return llrScore < 1.0; } @Override public int getMaxResults() { return 20; } } @Test public void testLlrScores() throws Exception { InMemoryTap sourceTap = new InMemoryTap(new Fields("docId", "text")); TupleEntryCollector writer = sourceTap.openForWrite(new LocalFlowProcess()); writer.add(new Tuple("1", "aaa xxx")); writer.add(new Tuple("1", "xxx")); writer.add(new Tuple("2", "xxx bbb")); writer.close(); Pipe p = new Pipe("docs"); SubAssembly ttbllr = new TopTermsByLLR(p, new SplitterParser(false), 1); Pipe results = new Pipe("scores", ttbllr.getTails()[0]); results = new GroupBy(results, new Fields("docId")); results = new Each(results, new Debug("scored", true)); Fields resultFields = new Fields("docId", "terms", "scores"); InMemoryTap sinkTap = new InMemoryTap(resultFields, resultFields, SinkMode.REPLACE); Flow f = new LocalFlowConnector().connect(sourceTap, sinkTap, results); f.complete(); TupleEntryIterator iter = sinkTap.openForRead(new LocalFlowProcess()); // We should have one entry for doc "1", and one entry for doc "2". Each entry // should have a single entry assertTrue(iter.hasNext()); TupleEntry te = iter.next(); assertEquals("1", te.getString("docId")); Tuple terms = (Tuple)te.getObject("terms"); assertNotNull(terms); assertEquals(1, terms.size()); assertEquals("aaa", terms.getString(0)); // The score for the term "aaa" in document 1 should be based on // k11 = 1 (count of term in doc 1) // k12 = 2 (count of other terms in doc 1) // k21 = 0 (count of term in other docs) // k22 = 2 (count of other terms in other docs) Tuple scores = (Tuple)te.getObject("scores"); assertEquals(1, scores.size()); double score = LogLikelihood.rootLogLikelihoodRatio(1, 2, 0, 2); assertEquals(score, scores.getDouble(0), .0001); assertTrue(iter.hasNext()); te = iter.next(); assertEquals("2", te.getString("docId")); terms = (Tuple)te.getObject("terms"); assertNotNull(terms); assertEquals(1, terms.size()); assertEquals("bbb", terms.getString(0)); // The score for the term "bbb" in document 2 should be based on // k11 = 1 (count of term in doc 2) // k12 = 1 (count of other terms in doc 2) // k21 = 0 (count of term in other docs) // k22 = 3 (count of other terms in other docs) scores = (Tuple)te.getObject("scores"); assertEquals(1, scores.size()); score = LogLikelihood.rootLogLikelihoodRatio(1, 1, 0, 3); assertEquals(score, scores.getDouble(0), .0001); assertFalse(iter.hasNext()); iter.close(); } @Test public void testFiltering() throws Exception { Fields groupFields = new Fields("docnum1", "docnum2"); InMemoryTap sourceTap = new InMemoryTap(groupFields.append(new Fields("content"))); TupleEntryCollector writer = sourceTap.openForWrite(new LocalFlowProcess()); writer.add(new Tuple("1", "a", "aaa xxx xxx")); writer.add(new Tuple("1", "a", "aaa xxx")); writer.add(new Tuple("2", "b", "xxx xxx bbb bbb")); writer.close(); Pipe p = new Pipe("docs"); SubAssembly ttbllr = new TopTermsByLLR( p, new SplitterParser(true), new TestFilter(), groupFields, new Fields("content"), 100); Pipe results = new Pipe("scores", ttbllr.getTails()[0]); results = new GroupBy(results, groupFields); results = new Each(results, new Debug("scored", true)); Fields resultFields = groupFields.append(new Fields("terms", "scores")); InMemoryTap sinkTap = new InMemoryTap(resultFields, resultFields, SinkMode.REPLACE); Flow f = new LocalFlowConnector().connect(sourceTap, sinkTap, results); f.complete(); TupleEntryIterator iter = sinkTap.openForRead(new LocalFlowProcess()); // We should have one entry for doc "1", and one entry for doc "2". Each entry // should have a single entry assertTrue(iter.hasNext()); TupleEntry te = iter.next(); assertEquals("1", te.getString("docnum1")); assertEquals("a", te.getString("docnum2")); Tuple terms = (Tuple)te.getObject("terms"); assertNotNull(terms); assertEquals(2, terms.size()); assertEquals("aaa", terms.getString(0)); assertEquals("aaa xxx", terms.getString(1)); // The score for the term "aaa" in document 1 should be based on // k11 = 2 (count of term in doc 1) // k12 = 6 (count of other terms in doc 1) // k21 = 0 (count of term in other docs) // k22 = 7 (count of other terms in other docs) Tuple scores = (Tuple)te.getObject("scores"); assertEquals(2, scores.size()); double score = LogLikelihood.rootLogLikelihoodRatio(2, 6, 0, 7); assertEquals(score, scores.getDouble(0), .0001); assertTrue(iter.hasNext()); te = iter.next(); assertEquals("2", te.getString("docnum1")); assertEquals("b", te.getString("docnum2")); terms = (Tuple)te.getObject("terms"); assertNotNull(terms); assertEquals(3, terms.size()); assertEquals("bbb", terms.getString(0)); assertEquals("bbb bbb", terms.getString(1)); assertEquals("xxx bbb", terms.getString(2)); // The score for the term "bbb" in document 2 should be based on // k11 = 2 (count of term in doc 2) // k12 = 5 (count of other terms in doc 2) // k21 = 0 (count of term in other docs) // k22 = 8 (count of other terms in other docs) scores = (Tuple)te.getObject("scores"); assertEquals(3, scores.size()); score = LogLikelihood.rootLogLikelihoodRatio(2, 5, 0, 8); assertEquals(score, scores.getDouble(0), .0001); assertFalse(iter.hasNext()); iter.close(); } }