package com.caseystella.util.pig.loader;

import com.caseystella.util.common.interpret.xpath.Config;
import com.caseystella.util.common.interpret.xpath.Field;
import com.caseystella.util.pig.Helper;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.*;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.jdom2.Document;
import org.jdom2.JDOMException;
import org.jdom2.input.SAXBuilder;

import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import java.util.List;
import java.util.Map;

/**
 * A Pig LoadFunc that treats each input line as an XML document and extracts
 * one chararray field per XPath expression declared in an external config file.
 *
 * Created by cstella on 9/4/14.
 */
public class XPathLoader extends LoadFunc implements LoadMetadata, LoadPushDown {
    Config config;
    String configFile = null;
    RecordReader<LongWritable, Text> reader;
    private SAXBuilder builder = new SAXBuilder();

    public XPathLoader(String configFile) {
        this.configFile = configFile;
    }

    @Override
    public void setLocation(String location, Job job) throws IOException {
        FileInputFormat.setInputPaths(job, location);
    }

    @Override
    public InputFormat getInputFormat() throws IOException {
        return new TextInputFormat();
    }

    // Lazily load the XPath config the first time it is needed.
    private Config getConfig() throws IOException {
        if (config == null) {
            config = Config.load(Helper.open(configFile, XPathLoader.class));
        }
        return config;
    }

    @Override
    public void prepareToRead(RecordReader recordReader, PigSplit pigSplit) throws IOException {
        getConfig();
        reader = recordReader;
    }

    @Override
    public Tuple getNext() throws IOException {
        Tuple t = null;
        try {
            boolean notDone = reader.nextKeyValue();
            if (!notDone) {
                // End of input: signal Pig that there are no more tuples.
                return null;
            }
            t = TupleFactory.getInstance().newTuple();
            String value = reader.getCurrentValue().toString();
            try {
                // Parse the line as an XML document and append one field per
                // configured XPath expression, in config order.
                Document doc = builder.build(new StringReader(value));
                for (Map.Entry<String, String> entry : getConfig().getContent(doc).entrySet()) {
                    t.append(entry.getValue());
                }
            } catch (JDOMException e) {
                // Propagate the parse failure as the cause so it is not lost.
                throw new RuntimeException("Unable to parse XML: " + value, e);
            }
        } catch (InterruptedException e) {
            throw new IOException("Unable to read next value", e);
        }
        return t;
    }

    @Override
    public ResourceSchema getSchema(String s, Job job) throws IOException {
        // Ship the config file with the job so it is readable on the backend.
        Helper.addFileToContext(configFile, XPathLoader.class);
        ResourceSchema ret = new ResourceSchema();
        ResourceSchema.ResourceFieldSchema[] fields =
                new ResourceSchema.ResourceFieldSchema[getConfig().getFields().length];
        int i = 0;
        for (Field f : getConfig().getFields()) {
            // Every extracted field is exposed to Pig as a chararray.
            fields[i] = new ResourceSchema.ResourceFieldSchema(
                    new Schema.FieldSchema(f.getName(), DataType.CHARARRAY));
            i++;
        }
        ret.setFields(fields);
        return ret;
    }

    @Override
    public ResourceStatistics getStatistics(String s, Job job) throws IOException {
        return null;
    }

    @Override
    public String[] getPartitionKeys(String s, Job job) throws IOException {
        return new String[0];
    }

    @Override
    public void setPartitionFilter(Expression expression) throws IOException {
    }

    @Override
    public List<OperatorSet> getFeatures() {
        // No pushdown is supported; return an empty list rather than null so
        // callers that iterate over the features do not hit a NullPointerException.
        return Collections.emptyList();
    }

    @Override
    public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList) throws FrontendException {
        // Projection pushdown is not honored; Pig applies the projection itself.
        return new RequiredFieldResponse(false);
    }
}
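
/*
 * Usage sketch: how this loader might be invoked from Pig Latin. The jar name,
 * input path, and config file name below are illustrative assumptions, not
 * taken from this project; only the loader class and its single config-file
 * constructor argument come from the code above.
 *
 *   REGISTER 'pig-util.jar';                         -- hypothetical jar name
 *   docs = LOAD '/data/records.xml'                  -- one XML document per line
 *          USING com.caseystella.util.pig.loader.XPathLoader('xpath-config.json');
 *   DESCRIBE docs;                                   -- schema from getSchema(): all chararray
 */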