/**
 *
 */
package org.archive.hadoop.pig;

import java.io.IOException;

import junit.framework.Assert;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.archive.hadoop.mapreduce.LFOnlyLineRecordReader;
import org.archive.hadoop.pig.CDXLoader;
import org.junit.Test;

/**
 * Tests for {@link CDXLoader}, driven by in-memory CDX lines rather than real input splits.
 *
 * @author kenji
 */
public class CDXLoaderTest extends Assert {

    /** CDX header line used as the first fixture line of every parsing test. */
    private static final String CDX_HEADER = " CDX N b a m s k r M S V g";

    /**
     * Record reader that serves a fixed array of lines.
     *
     * <p>Only {@link #nextKeyValue()} and {@link #getCurrentValue()} are overridden; every other
     * method is inherited from {@link LFOnlyLineRecordReader} unchanged.
     */
    public static class FixtureLineRecordReader extends LFOnlyLineRecordReader {
        /** Lines handed out one at a time, in order. */
        String[] lines;
        /** Index of the current line; -1 until {@link #nextKeyValue()} is first called. */
        int pos;

        /**
         * @param lines the lines this reader returns, in order
         */
        public FixtureLineRecordReader(String[] lines) {
            this.lines = lines;
            this.pos = -1;
        }

        /**
         * Advances to the next fixture line.
         *
         * @return {@code true} if a line is available at the new position
         */
        @Override
        public boolean nextKeyValue() throws IOException {
            pos++;
            return pos < lines.length;
        }

        /**
         * Returns the current fixture line wrapped in a new {@link Text}.
         *
         * @return the current line, or {@code null} when the reader is positioned before the
         *         first line or past the last one
         */
        @Override
        public Text getCurrentValue() {
            // Guard both ends: pos is -1 until nextKeyValue() has been called at least once.
            if (pos < 0 || pos >= lines.length) {
                return null;
            }
            return new Text(lines[pos]);
        }
    }

    /**
     * Feeds the given lines to a fresh {@link CDXLoader} and returns the first tuple it produces.
     * Fails the test with a readable message if the loader returns no tuple.
     *
     * @param lines fixture lines, header first
     * @return the first tuple returned by {@link CDXLoader#getNext()}, never {@code null}
     * @throws IOException if the loader fails while reading
     */
    private static Tuple readFirstRecord(String... lines) throws IOException {
        CDXLoader loader = new CDXLoader();
        loader.prepareToRead(new FixtureLineRecordReader(lines), null);
        // CDXLoader should skip the header line.
        Tuple tuple = loader.getNext();
        assertNotNull("CDXLoader returned no tuple for the fixture lines", tuple);
        return tuple;
    }

    /*
     * Test for catching the common error of forgetting to align the type cast in prepareToRead
     * with the InputFormat class.
     */
    @Test
    public void testPrepareToRead() throws IOException, InterruptedException {
        CDXLoader t = new CDXLoader();
        // most arguments can be null as they are not used in CDXLoader#prepareToRead()
        InputFormat<?, ?> fmt = t.getInputFormat();
        t.prepareToRead(fmt.createRecordReader(null, null), null);
    }

    /*
     * A well-formed CDX line: every field is checked; the two fields written as "-" are
     * expected to come back as null.
     */
    @Test
    public void testRegularLine() throws IOException {
        Tuple tuple = readFirstRecord(
                CDX_HEADER,
                "101,78,123,109)/robots.txt 20120103084508 http://109.123.78.101/robots.txt text/html 404 22RZA2NQT3RZUCQYJYZDPVZRNYIR72SN - - 561 55426267 WIDE-20120103083324-crawl410/WIDE-20120103083324-00000.warc.gz");

        assertEquals(new DataByteArray("101,78,123,109)/robots.txt"), tuple.get(0));
        assertEquals(new DataByteArray("20120103084508"), tuple.get(1));
        assertEquals(new DataByteArray("http://109.123.78.101/robots.txt"), tuple.get(2));
        assertEquals(new DataByteArray("text/html"), tuple.get(3));
        assertEquals(new DataByteArray("404"), tuple.get(4));
        assertEquals(new DataByteArray("22RZA2NQT3RZUCQYJYZDPVZRNYIR72SN"), tuple.get(5));
        assertNull(tuple.get(6));
        assertNull(tuple.get(7));
        assertEquals(new DataByteArray("561"), tuple.get(8));
        assertEquals(new DataByteArray("55426267"), tuple.get(9));
        assertEquals(
                new DataByteArray("WIDE-20120103083324-crawl410/WIDE-20120103083324-00000.warc.gz"),
                tuple.get(10));
    }

    /*
     * Raw space in redirect URL. The current CDX generator writes out whatever text is found in
     * the Location header and meta-refresh, with absolutely no escaping. Spaces in redirect URLs
     * are fairly common.
     */
    @Test
    public void testSpaceInRedirect() throws IOException {
        Tuple tuple = readFirstRecord(
                CDX_HEADER,
                "131,125,136,219)/ 20120103084049 http://219.136.125.131/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ http://219.136.125.131/selfLogon.do?hoge=1 2 - 449 34133512 WIDE-20120103083324-crawl410/WIDE-20120103083324-00000.warc.gz");

        assertEquals(11, tuple.size());
        assertEquals(
                new DataByteArray("http://219.136.125.131/selfLogon.do?hoge=1 2"),
                tuple.get(6));
    }

    /*
     * CR in redirect URL. This problem is frequently found in redirect URLs coming from
     * meta-refresh.
     */
    @Test
    public void testCRInRedirect() throws IOException {
        Tuple tuple = readFirstRecord(
                CDX_HEADER,
                "de,nuernberger)/rente 20110129033909 http://www.nuernberger.de/rente/ text/html 200 COWKAOTVKBT6YVW26BLW7T235FAWRSUR http://www.nuernberger.de/produkte/vorsorge_fuer_jung___alt/rente/\r - 581 468301945 COM-20110129023303-crawl306/COM-20110129025311-00174.warc.gz");

        assertEquals(11, tuple.size());
        assertEquals(
                new DataByteArray("http://www.nuernberger.de/produkte/vorsorge_fuer_jung___alt/rente/\r"),
                tuple.get(6));
    }
}