/**
 *
 */
package org.archive.hadoop.pig;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

/**
 * Pig {@link LoadFunc} that turns whitespace-delimited crawl log lines into
 * tuples of {@link DataByteArray} fields.
 *
 * <p>Expected input columns (0-based):
 * <ol start="0">
 *   <li>timestamp (YYYY-mm-ddTHH:MM:SSZ)</li>
 *   <li>status</li>
 *   <li>size (digits or "-")</li>
 *   <li>URI</li>
 *   <li>path</li>
 *   <li>via URI</li>
 *   <li>content-type</li>
 *   <li>thread (#\d+)</li>
 *   <li>starttime+duration, or "-" for exclusion case (status &lt; 0)</li>
 *   <li>content hash (sha1:BASE32)</li>
 *   <li>-</li>
 *   <li>-</li>
 * </ol>
 *
 * <p>Output tuple layout: input columns 0-7 are copied as-is (column 2 is
 * replaced by {@code "-1"} when it is not all digits); input column 8 is
 * expanded into TWO tuple fields (start time, duration), so every input
 * column after 8 appears one position later in the tuple. Lines with fewer
 * than 12 whitespace-separated fields are skipped silently.
 *
 * @author kenji
 */
public class CrawlLogLoader extends LoadFunc {
    private static final Log log = LogFactory.getLog(CrawlLogLoader.class);

    /** Minimum number of whitespace-separated fields for a line to be emitted. */
    private static final int MIN_FIELDS = 12;
    /** Index of the size column, normalized to "-1" when not numeric. */
    private static final int SIZE_COLUMN = 2;
    /** Index of the "starttime+duration" column, which is split in two. */
    private static final int START_DURATION_COLUMN = 8;

    // Regexes are compiled once: getNext() runs for every input record, and
    // String.split / Pattern.matches would otherwise recompile them per line.
    private static final Pattern FIELD_SEPARATOR = Pattern.compile("\\s+");
    private static final Pattern SIZE_PATTERN = Pattern.compile("\\d+$");
    private static final Pattern PLUS = Pattern.compile("\\+");

    private LineRecordReader in = null;
    private PigSplit split;

    /**
     * Reads lines from the underlying reader until one has at least
     * {@value #MIN_FIELDS} fields, and returns it as a tuple.
     *
     * @return the next tuple, or {@code null} when the reader is exhausted
     * @throws IOException if the underlying reader fails
     */
    @Override
    public Tuple getNext() throws IOException {
        TupleFactory tupleFactory = TupleFactory.getInstance();
        while (in.nextKeyValue()) {
            Text value = in.getCurrentValue();
            List<DataByteArray> parsed = parseLine(value.toString());
            if (parsed != null) {
                return tupleFactory.newTuple(parsed);
            }
            // too few fields: skip this line and try the next one
        }
        return null;
    }

    /**
     * Splits one raw line into tuple fields.
     *
     * @param rawLine line text as read; a single trailing '\r' is dropped
     * @return field values (entries may be {@code null}), or {@code null}
     *         if the line has fewer than {@value #MIN_FIELDS} fields
     */
    private List<DataByteArray> parseLine(String rawLine) {
        String line = rawLine;
        int length = line.length();
        if (length > 0 && line.charAt(length - 1) == '\r') {
            line = line.substring(0, length - 1);
        }

        String[] fields = FIELD_SEPARATOR.split(line);
        if (fields.length < MIN_FIELDS) {
            return null;
        }

        // +1 because the start/duration column yields two entries
        List<DataByteArray> values = new ArrayList<DataByteArray>(fields.length + 1);
        for (int i = 0; i < fields.length; i++) {
            if (i == SIZE_COLUMN) {
                // non-numeric size (e.g. "-") is normalized to "-1"
                String size = SIZE_PATTERN.matcher(fields[i]).matches() ? fields[i] : "-1";
                values.add(new DataByteArray(size));
            } else if (i == START_DURATION_COLUMN) {
                appendStartDuration(values, fields[i], line);
            } else {
                values.add(new DataByteArray(fields[i]));
            }
        }
        return values;
    }

    /**
     * Appends the two values (start time, duration) derived from column 8.
     *
     * @param values list receiving exactly two entries
     * @param field  raw text of column 8
     * @param line   full line, used only in warning messages
     */
    private void appendStartDuration(List<DataByteArray> values, String field, String line) {
        // limit 2: split on the first '+' only and keep a trailing empty part
        String[] startDuration = PLUS.split(field, 2);
        if (startDuration.length == 1) {
            // column 8 is "-" for crawl exclusion (ex. by robots.txt)
            if (startDuration[0].equals("-")) {
                values.add(null);
                values.add(null);
            } else {
                // abnormal case (no "+") - should never happen
                values.add(new DataByteArray(startDuration[0]));
                values.add(null);
                log.warn("unexpected value (no '+') \"" + field + "\" in column 8 of line \"" + line + "\", split " + split);
            }
        } else {
            // normal case
            values.add(new DataByteArray(startDuration[0]));
            values.add(new DataByteArray(startDuration[1]));
            if (startDuration[0].length() == 0 || startDuration[1].length() == 0) {
                log.warn("unexpected value (empty sub-field) \"" + field + "\" in column 8 of line \"" + line + "\", split " + split);
            }
        }
    }

    /**
     * @return a new {@link TextInputFormat}, so input is read line by line
     */
    @Override
    public InputFormat<LongWritable, Text> getInputFormat() throws IOException {
        return new TextInputFormat();
    }

    /**
     * Stores the reader and split for use by {@link #getNext()}.
     *
     * @param reader record reader; cast to {@link LineRecordReader}, so any
     *               other reader type fails with a ClassCastException
     * @param split  split being read; used only in warning messages
     */
    @SuppressWarnings("unchecked")
    @Override
    public void prepareToRead(RecordReader reader, PigSplit split)
            throws IOException {
        in = (LineRecordReader) reader;
        this.split = split;
    }

    /**
     * Registers {@code location} as the job's input path(s).
     */
    @Override
    public void setLocation(String location, Job job) throws IOException {
        FileInputFormat.setInputPaths(job, location);
    }
}