package com.caseystella.util.pig.loader;

import com.caseystella.util.common.interpret.xpath.Config;
import com.caseystella.util.common.interpret.xpath.Field;
import com.caseystella.util.pig.Helper;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.*;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.jdom2.Document;
import org.jdom2.JDOMException;
import org.jdom2.input.SAXBuilder;

import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import java.util.List;
import java.util.Map;

/**
 * A Pig LoadFunc that treats each input line as an XML document and extracts
 * one chararray field per XPath expression declared in an external config file.
 *
 * Created by cstella on 9/4/14.
 */
public class XPathLoader extends LoadFunc implements LoadMetadata, LoadPushDown {
    Config config;
    String configFile = null;
    RecordReader<LongWritable, Text> reader;
    private SAXBuilder builder = new SAXBuilder();

    public XPathLoader(String configFile) {
        this.configFile = configFile;
    }

    @Override
    public void setLocation(String location, Job job) throws IOException {
        FileInputFormat.setInputPaths(job, location);
    }

    @Override
    public InputFormat getInputFormat() throws IOException {
        return new TextInputFormat();
    }

    // Lazily load the XPath config the first time it is needed.
    private Config getConfig() throws IOException {
        if (config == null) {
            config = Config.load(Helper.open(configFile, XPathLoader.class));
        }
        return config;
    }

    @Override
    public void prepareToRead(RecordReader recordReader, PigSplit pigSplit) throws IOException {
        getConfig();
        reader = recordReader;
    }

    @Override
    public Tuple getNext() throws IOException {
        Tuple t = null;
        try {
            boolean notDone = reader.nextKeyValue();
            if (!notDone) {
                // End of input: signal Pig that there are no more tuples.
                return null;
            }
            t = TupleFactory.getInstance().newTuple();
            String value = reader.getCurrentValue().toString();
            try {
                // Parse the line as an XML document and append one field per
                // configured XPath expression, in config order.
                Document doc = builder.build(new StringReader(value));
                for (Map.Entry<String, String> entry : getConfig().getContent(doc).entrySet()) {
                    t.append(entry.getValue());
                }
            } catch (JDOMException e) {
                // Propagate the parse failure as the cause so it is not lost.
                throw new RuntimeException("Unable to parse XML: " + value, e);
            }
        } catch (InterruptedException e) {
            throw new IOException("Unable to read next value", e);
        }
        return t;
    }

    @Override
    public ResourceSchema getSchema(String s, Job job) throws IOException {
        // Ship the config file with the job so it is readable on the backend.
        Helper.addFileToContext(configFile, XPathLoader.class);
        ResourceSchema ret = new ResourceSchema();
        ResourceSchema.ResourceFieldSchema[] fields =
                new ResourceSchema.ResourceFieldSchema[getConfig().getFields().length];
        int i = 0;
        for (Field f : getConfig().getFields()) {
            // Every extracted field is exposed to Pig as a chararray.
            fields[i] = new ResourceSchema.ResourceFieldSchema(
                    new Schema.FieldSchema(f.getName(), DataType.CHARARRAY));
            i++;
        }
        ret.setFields(fields);
        return ret;
    }

    @Override
    public ResourceStatistics getStatistics(String s, Job job) throws IOException {
        return null;
    }

    @Override
    public String[] getPartitionKeys(String s, Job job) throws IOException {
        return new String[0];
    }

    @Override
    public void setPartitionFilter(Expression expression) throws IOException {
    }

    @Override
    public List<OperatorSet> getFeatures() {
        // No pushdown is supported; return an empty list rather than null so
        // callers that iterate over the features do not hit a NullPointerException.
        return Collections.emptyList();
    }

    @Override
    public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList) throws FrontendException {
        // Projection pushdown is not honored; Pig applies the projection itself.
        return new RequiredFieldResponse(false);
    }
}
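
/*
 * Usage sketch: how this loader might be invoked from Pig Latin. The jar name,
 * input path, and config file name below are illustrative assumptions, not
 * taken from this project; only the loader class and its single config-file
 * constructor argument come from the code above.
 *
 *   REGISTER 'pig-util.jar';                         -- hypothetical jar name
 *   docs = LOAD '/data/records.xml'                  -- one XML document per line
 *          USING com.caseystella.util.pig.loader.XPathLoader('xpath-config.json');
 *   DESCRIBE docs;                                   -- schema from getSchema(): all chararray
 */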