/**
* Copyright 2014 IPONWEB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.iponweb.hadoop.streaming.parquet;
import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetRecordWriter;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Stack;
/**
 * {@link RecordWriter} that converts tab-separated text records into Parquet
 * {@link SimpleGroup}s and forwards them to a wrapped {@link ParquetRecordWriter}.
 *
 * <p>The constructor flattens the schema into a linear list of {@link PathAction}s
 * (open group / write field / close group). {@link #write(Text, Text)} then replays
 * that list for every record, consuming one tab-separated column per primitive
 * field, in schema order.
 *
 * <p>Columns of {@code REPEATED} primitive fields are expected to hold a JSON array
 * of scalars; arrays of objects are not supported.
 */
public class TextRecordWriterWrapper implements RecordWriter<Text, Text> {

    private static final String TAB = "\t";

    // Shared mapper: avoids constructing a new ObjectMapper for every repeated field of
    // every record. Jackson documents ObjectMapper as safe to share once configured.
    private static final ObjectMapper JSON_MAPPER = new ObjectMapper();

    protected ParquetRecordWriter<SimpleGroup> realWriter;
    protected MessageType schema;
    protected SimpleGroupFactory factory;
    /** Linearised schema walk replayed by {@link #write(Text, Text)} for each record. */
    protected ArrayList<PathAction> recorder;

    /**
     * Builds the wrapper and pre-computes the action list for the schema found in {@code conf}.
     *
     * @param w        underlying Parquet writer that receives the converted records
     * @param fs       not used by this constructor
     * @param conf     job configuration the schema is read from (via {@code GroupWriteSupport.getSchema})
     * @param name     not used by this constructor
     * @param progress not used by this constructor
     * @throws IOException declared for callers; kept for interface compatibility
     */
    TextRecordWriterWrapper(ParquetRecordWriter<SimpleGroup> w, FileSystem fs, JobConf conf, String name, Progressable progress)
            throws IOException {
        realWriter = w;
        schema = GroupWriteSupport.getSchema(conf);
        factory = new SimpleGroupFactory(schema);
        recorder = new ArrayList<>();

        String[] prevPath = {};
        int openGroups = 0;
        // Iterate the paths directly; no need to downcast the returned list to ArrayList.
        for (String[] path : schema.getPaths()) {
            int common = commonPrefixLength(prevPath, path);

            // Leave every group of the previous path that is not shared with the current one.
            // The last element of a path is the field itself, hence "length - 1".
            for (int n = common; n < prevPath.length - 1; n++) {
                recorder.add(new PathAction(PathAction.ActionType.GROUPEND));
                openGroups--;
            }

            // Enter every group of the current path that lies below the shared prefix.
            for (int n = common; n < path.length - 1; n++) {
                PathAction start = new PathAction(PathAction.ActionType.GROUPSTART);
                start.setName(path[n]);
                recorder.add(start);
                openGroups++;
            }

            prevPath = path;

            PathAction field = new PathAction(PathAction.ActionType.FIELD);
            Type colType = schema.getType(path);
            field.setType(colType.asPrimitiveType().getPrimitiveTypeName());
            field.setRepetition(colType.getRepetition());
            field.setName(path[path.length - 1]);
            recorder.add(field);
        }

        // Close whatever groups are still open after the last field.
        while (openGroups-- > 0) {
            recorder.add(new PathAction(PathAction.ActionType.GROUPEND));
        }
    }

    /** Returns the number of leading path elements that {@code a} and {@code b} share. */
    private static int commonPrefixLength(String[] a, String[] b) {
        int n = 0;
        while (n < a.length && n < b.length && b[n].equals(a[n])) {
            n++;
        }
        return n;
    }

    /**
     * Closes the wrapped Parquet writer.
     *
     * @param reporter ignored
     * @throws IOException if the underlying close fails or the thread is interrupted
     */
    @Override
    public void close(Reporter reporter) throws IOException {
        try {
            realWriter.close(null);
        } catch (InterruptedException e) {
            // Restore the interrupt flag (Thread.interrupted() would clear it instead).
            Thread.currentThread().interrupt();
            throw new IOException(e);
        }
    }

    /**
     * Converts one text record to a Parquet group and writes it.
     *
     * <p>Key and value are each split on TAB (keeping trailing empty columns) and
     * concatenated; the resulting columns are assigned to the schema's primitive
     * fields in order. When the columns run out, optional and repeated fields are
     * left unset and required fields receive an empty string (BINARY) or zero.
     * Numeric columns that cannot be parsed are written as zero.
     *
     * @param key   first part of the record; must not be null
     * @param value second part of the record; null is treated as "no columns"
     * @throws IOException      if the thread is interrupted while writing
     * @throws RuntimeException for any other failure (including I/O errors of the wrapped
     *                          writer); the message describes the record and schema and the
     *                          original exception is attached as cause
     */
    @Override
    public void write(Text key, Text value) throws IOException {
        Group grp = factory.newGroup();
        String[] strK = key.toString().split(TAB, -1);
        String[] strV = value == null ? new String[0] : value.toString().split(TAB, -1);
        String[] columns = (String[]) ArrayUtils.addAll(strK, strV);

        // Parents of the group currently being filled; the root stays at the bottom.
        Stack<Group> groupStack = new Stack<>();
        groupStack.push(grp);
        int next = 0;

        try {
            for (PathAction a : recorder) {
                switch (a.getAction()) {
                    case GROUPEND:
                        grp = groupStack.pop();
                        break;
                    case GROUPSTART:
                        groupStack.push(grp);
                        grp = grp.addGroup(a.getName());
                        break;
                    case FIELD:
                        String raw = next < columns.length ? columns[next++] : null;
                        appendField(grp, a, raw);
                        break;
                    default:
                        break;
                }
            }
            realWriter.write(null, (SimpleGroup) grp);
        } catch (InterruptedException e) {
            // Restore the interrupt flag (Thread.interrupted() would clear it instead).
            Thread.currentThread().interrupt();
            throw new IOException(e);
        } catch (Exception e) {
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            e.printStackTrace(new PrintStream(out));
            // Message text is unchanged; the cause is now attached as well so it is not lost.
            throw new RuntimeException("Failed on record " + grp + ", schema=" + schema + ", path action=" + recorder +
                    " exception = " + e.getClass() + ", msg=" + e.getMessage() + ", cause=" + e.getCause() + ", trace=" + out.toString(), e);
        }
    }

    /**
     * Writes one input column into {@code grp} according to the field described by {@code a}.
     *
     * @param raw the column text, or null when the record has no column left for this field
     */
    private static void appendField(Group grp, PathAction a, String raw) throws IOException {
        PrimitiveType.PrimitiveTypeName primType = a.getType();
        String colName = a.getName();
        boolean repeated = a.getRepetition() == Type.Repetition.REPEATED;

        if (raw == null) {
            // No column left: optional fields stay unset and repeated fields get zero values.
            // (Previously a missing repeated column had the placeholder default pushed through
            // the JSON parser, which is not a JSON array.)
            if (repeated || a.getRepetition() == Type.Repetition.OPTIONAL) {
                return;
            }
            raw = primType == PrimitiveType.PrimitiveTypeName.BINARY ? "" : "0";
        }

        if (repeated) {
            // A repeated field carries a JSON-encoded array; append each element in turn.
            for (String element : parseJsonArray(raw, primType)) {
                appendValue(grp, colName, primType, element);
            }
        } else {
            appendValue(grp, colName, primType, raw);
        }
    }

    /**
     * Parses {@code json} and returns its child elements rendered as strings suitable
     * for {@link #appendValue}. Arrays of objects are not supported.
     */
    private static ArrayList<String> parseJsonArray(String json, PrimitiveType.PrimitiveTypeName primType)
            throws IOException {
        ArrayList<String> values = new ArrayList<>();
        JsonNode node = JSON_MAPPER.readTree(json);
        for (JsonNode element : node) {
            String text;
            switch (primType) {
                case BINARY:
                    text = element.getTextValue(); // No array-of-objects!
                    break;
                case BOOLEAN:
                    text = String.valueOf(element.getBooleanValue());
                    break;
                default:
                    text = String.valueOf(element.getNumberValue());
            }
            values.add(text);
        }
        return values;
    }

    /**
     * Parses {@code s} as {@code primType} and appends it to field {@code colName} of {@code grp}.
     * Integer types are parsed as doubles and truncated, so inputs such as "1.0" are accepted.
     * Unparseable numbers are written as a zero of the column's own type.
     */
    private static void appendValue(Group grp, String colName, PrimitiveType.PrimitiveTypeName primType, String s) {
        try {
            switch (primType) {
                case INT32:
                    grp.append(colName, (int) Double.parseDouble(s));
                    break;
                case INT64:
                case INT96:
                    // NOTE(review): INT96 is appended as a plain long here, as before — confirm
                    // this is what the downstream writer expects.
                    grp.append(colName, (long) Double.parseDouble(s));
                    break;
                case DOUBLE:
                    grp.append(colName, Double.parseDouble(s));
                    break;
                case FLOAT:
                    grp.append(colName, Float.parseFloat(s));
                    break;
                case BOOLEAN:
                    grp.append(colName, s.equalsIgnoreCase("true") || s.equals("1"));
                    break;
                case BINARY:
                    grp.append(colName, Binary.fromString(s));
                    break;
                default:
                    throw new RuntimeException("Can't handle type " + primType);
            }
        } catch (NumberFormatException e) {
            // Only the numeric branches can get here, and parsing fails before anything is
            // appended. The fallback zero must match the column type; an int 0 is only
            // correct for INT32.
            switch (primType) {
                case INT64:
                case INT96:
                    grp.append(colName, 0L);
                    break;
                case DOUBLE:
                    grp.append(colName, 0.0d);
                    break;
                case FLOAT:
                    grp.append(colName, 0.0f);
                    break;
                default:
                    grp.append(colName, 0);
            }
        }
    }
}