package net.iponweb.hadoop.streaming.parquet;

import com.google.common.base.Joiner;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.ParquetInputSplit;
import org.apache.parquet.hadoop.ParquetRecordReader;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.hadoop.mapred.Container;
import org.apache.parquet.hadoop.util.ContextUtil;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Old-API (mapred) input format that reads Parquet files and presents each
 * record as tab-separated text, so Parquet data can be consumed by Hadoop
 * Streaming jobs. Delegates the actual reading to the new-API
 * {@link ParquetInputFormat} and wraps its splits and record reader.
 */
public class ParquetAsTextInputFormat extends org.apache.hadoop.mapred.FileInputFormat<Text, Text> {

    protected ParquetInputFormat<SimpleGroup> realInputFormat = new ParquetInputFormat<>();

    @Override
    public RecordReader<Text, Text> getRecordReader(InputSplit split, JobConf job,
                                                    Reporter reporter) throws IOException {
        return new TextRecordReaderWrapper(realInputFormat, split, job, reporter);
    }

    @Override
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
        // Delegate split computation to the new-API ParquetInputFormat and wrap
        // each resulting split for the old API. The JobID is a throwaway value,
        // needed only to construct a JobContext.
        JobContext cnt = ContextUtil.newJobContext(job, new JobID("xxx", 0));
        List<org.apache.hadoop.mapreduce.InputSplit> splits = realInputFormat.getSplits(cnt);

        if (splits == null)
            return null;

        InputSplit[] resultSplits = new InputSplit[splits.size()];
        int i = 0;
        for (org.apache.hadoop.mapreduce.InputSplit split : splits) {
            resultSplits[i++] = new StreamingParquetInputSplitWrapper(split);
        }

        return resultSplits;
    }

    public List<Footer> getFooters(JobConf job) throws IOException {
        return realInputFormat.getFooters(job, Arrays.asList(super.listStatus(job)));
    }
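
    /*
     * Text layout produced by the wrapper below: groupToStrings() flattens a
     * record into one string per leaf field; fetchKey()/fetchValue() then use
     * the first field as the key and join the remaining fields with tabs as
     * the value. For illustration (a hypothetical record, not taken from the
     * source): {id: 7, tags: ["a", "b"], score: 1.5}, where "tags" is a
     * repeated binary field, would be emitted as key "7" and value
     * "["a", "b"]<TAB>1.5".
     */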
    protected static class TextRecordReaderWrapper implements RecordReader<Text, Text> {

        private ParquetRecordReader<SimpleGroup> realReader;
        private long splitLen; // for getPos()

        protected Container<SimpleGroup> valueContainer = null;
        private boolean firstRecord = false;
        private boolean eof = false;
        private List<String> ls; // current record, one string per leaf field

        public TextRecordReaderWrapper(ParquetInputFormat<SimpleGroup> newInputFormat,
                                       InputSplit oldSplit, JobConf oldJobConf,
                                       Reporter reporter) throws IOException {

            splitLen = oldSplit.getLength();

            try {
                ReadSupport<SimpleGroup> rs = ParquetInputFormat.getReadSupportInstance(oldJobConf);
                realReader = new ParquetRecordReader<>(rs);
                realReader.initialize(((StreamingParquetInputSplitWrapper) oldSplit).realSplit,
                        oldJobConf, reporter);

                // Expose the current input file under both the old and new property names.
                String inputFile =
                        ((StreamingParquetInputSplitWrapper) oldSplit).realSplit.getPath().toString();
                oldJobConf.set("map.input.file", inputFile);
                oldJobConf.set("mapreduce.map.input.file", inputFile);

                // Read once up front so createKey()/createValue() can be built
                // from a real record.
                if (realReader.nextKeyValue()) {
                    firstRecord = true;
                    valueContainer = new Container<>();
                    SimpleGroup v = realReader.getCurrentValue();
                    valueContainer.set(v);
                    ls = groupToStrings(v);
                } else {
                    eof = true;
                }
            } catch (InterruptedException e) {
                Thread.interrupted();
                throw new IOException(e);
            }
        }

        @Override
        public void close() throws IOException {
            realReader.close();
        }

        /**
         * Flattens a record into one string per leaf field. Nested groups are
         * flattened in place; repeated primitive fields are rendered as
         * "[a, b, ...]" with binary values quoted; absent optional fields
         * become empty strings.
         */
        protected List<String> groupToStrings(SimpleGroup grp) {
            ArrayList<String> s = new ArrayList<>();

            for (int n = 0; n < grp.getType().getFieldCount(); n++) {
                Type field = grp.getType().getType(n);
                try {
                    if (!field.isPrimitive())
                        s.addAll(groupToStrings((SimpleGroup) grp.getGroup(n, 0))); // array of groups not (yet) supported
                    else if (field.getRepetition() == Type.Repetition.REPEATED) {
                        boolean isBinary = field.asPrimitiveType().getPrimitiveTypeName()
                                == PrimitiveType.PrimitiveTypeName.BINARY;
                        StringBuilder sb = new StringBuilder("[");
                        ArrayList<String> arr = new ArrayList<>();
                        for (int i = 0; i < grp.getFieldRepetitionCount(n); i++)
                            arr.add(isBinary ? "\"" + grp.getValueToString(n, i) + "\""
                                             : grp.getValueToString(n, i));
                        sb.append(Joiner.on(", ").join(arr));
                        sb.append("]");
                        s.add(sb.toString());
                    } else
                        s.add(grp.getValueToString(n, 0));
                } catch (RuntimeException e) {
                    // SimpleGroup signals an absent optional field with a
                    // "not found" message; represent it as an empty string.
                    if (e.getMessage() != null && e.getMessage().startsWith("not found")
                            && field.getRepetition() == Type.Repetition.OPTIONAL)
                        s.add("");
                    else
                        throw e;
                }
            }

            return s;
        }

        @Override
        public Text createKey() {
            return valueContainer == null ? new Text() : fetchKey();
        }

        @Override
        public Text createValue() {
            return valueContainer == null ? new Text() : fetchValue();
        }

        protected Text fetchKey() {
            return new Text(ls.get(0));
        }

        protected Text fetchValue() {
            return new Text(Joiner.on("\t").join(ls.subList(1, ls.size())));
        }

        @Override
        public long getPos() throws IOException {
            return (long) (splitLen * getProgress());
        }

        @Override
        public float getProgress() throws IOException {
            try {
                return realReader.getProgress();
            } catch (InterruptedException e) {
                Thread.interrupted();
                throw new IOException(e);
            }
        }

        @Override
        public boolean next(Text key, Text value) throws IOException {
            if (eof) {
                return false;
            }

            try {
                if (firstRecord) {
                    // The first record was already read in the constructor.
                    firstRecord = false;
                } else if (realReader.nextKeyValue()) {
                    ls = groupToStrings(realReader.getCurrentValue());
                } else {
                    // Reader exhausted; without setting eof here the last
                    // record would be emitted again on every call.
                    eof = true;
                    return false;
                }

                if (key != null) key.set(fetchKey());
                if (value != null) value.set(fetchValue());
                return true;
            } catch (InterruptedException e) {
                Thread.interrupted();
                throw new IOException(e);
            }
        }
    }

    private static class StreamingParquetInputSplitWrapper implements InputSplit {

        FileSplit realSplit;

        @SuppressWarnings("unused") // MapReduce instantiates this.
        public StreamingParquetInputSplitWrapper() {}

        public StreamingParquetInputSplitWrapper(org.apache.hadoop.mapreduce.InputSplit split)
                throws IOException {
            this.realSplit = (FileSplit) split;
        }

        @Override
        public long getLength() throws IOException {
            return realSplit.getLength();
        }

        @Override
        public String[] getLocations() throws IOException {
            return realSplit.getLocations();
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            realSplit = new ParquetInputSplit();
            realSplit.readFields(in);
        }

        @Override
        public void write(DataOutput out) throws IOException {
            realSplit.write(out);
        }
    }
}
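
/*
 * Minimal driver sketch (not part of the original file): one way to wire
 * ParquetAsTextInputFormat into an old-API MapReduce job. The class name,
 * the args-based paths, and the choice of parquet-mr's example
 * GroupReadSupport (whose materialized records are SimpleGroup instances at
 * runtime) are assumptions for illustration, not part of this library.
 */
class ParquetAsTextUsageSketch {

    public static void main(String[] args) throws Exception {
        org.apache.hadoop.mapred.JobConf job =
                new org.apache.hadoop.mapred.JobConf(ParquetAsTextUsageSketch.class);
        job.setJobName("parquet-as-text-sketch");

        // Read Parquet records as Text key/value pairs via the wrapper above.
        job.setInputFormat(ParquetAsTextInputFormat.class);
        // Tell ParquetInputFormat how to materialize records (assumption:
        // the example GroupReadSupport is adequate for the schema at hand).
        job.set(ParquetInputFormat.READ_SUPPORT_CLASS,
                org.apache.parquet.hadoop.example.GroupReadSupport.class.getName());

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        org.apache.hadoop.mapred.FileInputFormat.setInputPaths(
                job, new org.apache.hadoop.fs.Path(args[0]));
        org.apache.hadoop.mapred.FileOutputFormat.setOutputPath(
                job, new org.apache.hadoop.fs.Path(args[1]));

        // With no mapper/reducer set, the old API's identity defaults apply,
        // so the flattened records pass straight through to text output.
        org.apache.hadoop.mapred.JobClient.runJob(job);
    }
}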