package edu.umd.cloud9.integration.example.bigram;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.Random;
import junit.framework.JUnit4TestAdapter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.junit.Test;
import tl.lin.data.pair.PairOfStrings;
import com.google.common.base.Joiner;
import edu.umd.cloud9.example.bigram.BigramRelativeFrequencyJson;
import edu.umd.cloud9.integration.IntegrationUtils;
public class BigramRelativeFrequencyIT {
private static final TupleFactory TUPLE_FACTORY = TupleFactory.getInstance();
private static final Random random = new Random();
private static final Path collectionPath = new Path("data/bible+shakes.nopunc.gz");
private static final String tmpPrefix = "tmp-"
+ BigramRelativeFrequencyIT.class.getCanonicalName() + "-" + random.nextInt(10000);
@Test
public void testBigramRelativeFrequencyBase() throws Exception {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
assertTrue(fs.exists(collectionPath));
String[] args = new String[] { "hadoop --config src/test/resources/hadoop-local-conf/ jar",
IntegrationUtils.getJar("target", "cloud9"),
edu.umd.cloud9.example.bigram.BigramRelativeFrequency.class.getCanonicalName(),
"-input", collectionPath.toString(),
"-output", tmpPrefix + "-base",
"-numReducers", "1"};
IntegrationUtils.exec(Joiner.on(" ").join(args));
SequenceFile.Reader reader = new SequenceFile.Reader(conf,
SequenceFile.Reader.file(new Path(tmpPrefix + "-base/part-r-00000")));
PairOfStrings pair = new PairOfStrings();
FloatWritable f = new FloatWritable();
reader.next(pair, f);
assertEquals("&c", pair.getLeftElement());
assertEquals("*", pair.getRightElement());
assertEquals(17f, f.get(), 10e-6);
for (int i = 0; i < 100; i++) {
reader.next(pair, f);
}
assertEquals("'dear", pair.getLeftElement());
assertEquals("*", pair.getRightElement());
assertEquals(2f, f.get(), 10e-6);
reader.next(pair, f);
assertEquals("'dear", pair.getLeftElement());
assertEquals("lord", pair.getRightElement());
assertEquals(1f, f.get(), 10e-6);
reader.close();
}
@Test
public void testBigramRelativeFrequencyJson() throws Exception {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
assertTrue(fs.exists(collectionPath));
String[] args = new String[] { "hadoop --config src/test/resources/hadoop-local-conf/ jar",
IntegrationUtils.getJar("target", "cloud9"),
edu.umd.cloud9.example.bigram.BigramRelativeFrequencyJson.class.getCanonicalName(),
"-input", collectionPath.toString(),
"-output", tmpPrefix + "-json",
"-numReducers", "1"};
IntegrationUtils.exec(Joiner.on(" ").join(args));
SequenceFile.Reader reader = new SequenceFile.Reader(conf,
SequenceFile.Reader.file(new Path(tmpPrefix + "-json/part-r-00000")));
BigramRelativeFrequencyJson.MyTuple json = new BigramRelativeFrequencyJson.MyTuple();
FloatWritable f = new FloatWritable();
reader.next(json, f);
assertEquals("&c", json.getJsonObject().get("Left").getAsString());
assertEquals("*", json.getJsonObject().get("Right").getAsString());
assertEquals(17f, f.get(), 10e-6);
for (int i = 0; i < 100; i++) {
reader.next(json, f);
}
assertEquals("'dear", json.getJsonObject().get("Left").getAsString());
assertEquals("*", json.getJsonObject().get("Right").getAsString());
assertEquals(2f, f.get(), 10e-6);
reader.next(json, f);
assertEquals("'dear", json.getJsonObject().get("Left").getAsString());
assertEquals("lord", json.getJsonObject().get("Right").getAsString());
assertEquals(1f, f.get(), 10e-6);
reader.close();
}
@Test
public void testBigramRelativeFrequencyTuple() throws Exception {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
assertTrue(fs.exists(collectionPath));
String[] args = new String[] { "hadoop --config src/test/resources/hadoop-local-conf/ jar",
IntegrationUtils.getJar("target", "cloud9"),
edu.umd.cloud9.example.bigram.BigramRelativeFrequencyTuple.class.getCanonicalName(),
"-input", collectionPath.toString(),
"-output", tmpPrefix + "-tuple",
"-numReducers", "1"};
IntegrationUtils.exec(Joiner.on(" ").join(args));
SequenceFile.Reader reader = new SequenceFile.Reader(conf,
SequenceFile.Reader.file(new Path(tmpPrefix + "-tuple/part-r-00000")));
Tuple tuple = TUPLE_FACTORY.newTuple();
FloatWritable f = new FloatWritable();
reader.next(tuple, f);
assertEquals("&c", tuple.get(0).toString());
assertEquals("*", tuple.get(1).toString());
assertEquals(17f, f.get(), 10e-6);
for (int i = 0; i < 100; i++) {
reader.next(tuple, f);
}
assertEquals("'dear", tuple.get(0).toString());
assertEquals("*", tuple.get(1).toString());
assertEquals(2f, f.get(), 10e-6);
reader.next(tuple, f);
assertEquals("'dear", tuple.get(0).toString());
assertEquals("lord", tuple.get(1).toString());
assertEquals(1f, f.get(), 10e-6);
reader.close();
}
public static junit.framework.Test suite() {
return new JUnit4TestAdapter(BigramRelativeFrequencyIT.class);
}
}