package edu.washington.escience.myria.operator.apply; import static org.junit.Assert.assertEquals; import org.junit.Test; import com.google.common.collect.ImmutableList; import edu.washington.escience.myria.DbException; import edu.washington.escience.myria.Schema; import edu.washington.escience.myria.Type; import edu.washington.escience.myria.expression.ConstantExpression; import edu.washington.escience.myria.expression.Expression; import edu.washington.escience.myria.expression.ExpressionOperator; import edu.washington.escience.myria.expression.NgramExpression; import edu.washington.escience.myria.expression.VariableExpression; import edu.washington.escience.myria.operator.Apply; import edu.washington.escience.myria.operator.BatchTupleSource; import edu.washington.escience.myria.storage.TupleBatch; import edu.washington.escience.myria.storage.TupleBatchBuffer; import edu.washington.escience.myria.storage.TupleUtils; import edu.washington.escience.myria.util.TestEnvVars; public class ApplyNgramTest { private static final long EXPECTED_RESULTS = 2 * TupleUtils.getBatchSize(Type.STRING_TYPE) + 1; private static final long NGRAM_LEN = 3; private static final long CHAR_SEQ_LEN = EXPECTED_RESULTS + NGRAM_LEN - 1; @Test public void testApply() throws DbException { final Schema schema = Schema.ofFields("char_sequence", Type.STRING_TYPE); final Schema expectedResultSchema = Schema.ofFields("ngrams", Type.STRING_TYPE); final TupleBatchBuffer input = new TupleBatchBuffer(schema); StringBuilder sb = new StringBuilder(); for (int i = 0; i < CHAR_SEQ_LEN; ++i) { sb.append((char) i); } input.putString(0, sb.toString()); ImmutableList.Builder<Expression> expressions = ImmutableList.builder(); ExpressionOperator colIdx = new VariableExpression(0); ExpressionOperator ngramLen = new ConstantExpression(NGRAM_LEN); ExpressionOperator split = new NgramExpression(colIdx, ngramLen); Expression expr = new Expression("ngrams", split); expressions.add(expr); Apply apply = new Apply(new BatchTupleSource(input), expressions.build()); apply.open(TestEnvVars.get()); int rowIdx = 0; while (!apply.eos()) { TupleBatch result = apply.nextReady(); if (result != null) { assertEquals(expectedResultSchema, result.getSchema()); for (int batchIdx = 0; batchIdx < result.numTuples(); ++batchIdx, ++rowIdx) { char[] ngramChars = new char[] {(char) rowIdx, (char) (rowIdx + 1), (char) (rowIdx + 2)}; String ngram = new String(ngramChars); assertEquals(ngram, result.getString(0, batchIdx)); } } } assertEquals(EXPECTED_RESULTS, rowIdx); apply.close(); } @Test public void testApplywithCounter() throws DbException { final Schema schema = Schema.ofFields("char_sequence", Type.STRING_TYPE); final Schema expectedResultSchema = Schema.ofFields("ngrams", Type.STRING_TYPE, "flatmapid", Type.INT_TYPE); final TupleBatchBuffer input = new TupleBatchBuffer(schema); StringBuilder sb = new StringBuilder(); for (int i = 0; i < CHAR_SEQ_LEN; ++i) { sb.append((char) i); } input.putString(0, sb.toString()); ImmutableList.Builder<Expression> expressions = ImmutableList.builder(); ExpressionOperator colIdx = new VariableExpression(0); ExpressionOperator ngramLen = new ConstantExpression(NGRAM_LEN); ExpressionOperator split = new NgramExpression(colIdx, ngramLen); Expression expr = new Expression("ngrams", split); expressions.add(expr); Apply apply = new Apply(new BatchTupleSource(input), expressions.build(), true); apply.open(TestEnvVars.get()); int rowIdx = 0; while (!apply.eos()) { TupleBatch result = apply.nextReady(); if (result != null) { assertEquals(expectedResultSchema, result.getSchema()); for (int batchIdx = 0; batchIdx < result.numTuples(); ++batchIdx, ++rowIdx) { char[] ngramChars = new char[] {(char) rowIdx, (char) (rowIdx + 1), (char) (rowIdx + 2)}; String ngram = new String(ngramChars); int fltmapid = rowIdx; assertEquals(ngram, result.getString(0, batchIdx)); assertEquals(fltmapid, result.getInt(1, batchIdx)); } } } assertEquals(EXPECTED_RESULTS, rowIdx); apply.close(); } }