package net.iponweb.hadoop.streaming.avro;
import com.google.common.base.Joiner;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.JsonProcessingException;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
 * {@link GenericData} variant that converts between tab-separated text lines
 * and Avro generic records.
 *
 * <p>Parsing ({@code getDatum}) walks the fields of a record schema in order and
 * consumes one TSV column per non-record field; nested records consume the
 * columns of their own fields from the same column stream. Arrays are expected
 * to be JSON-encoded inside a single column.
 *
 * <p>Formatting ({@code toString}) emits the fields of a record separated by
 * tabs, with arrays rendered as JSON-style lists.
 */
public class GenericDataTSV extends GenericData {

    /** JSON parser reused across calls instead of being rebuilt for every array column. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Parses one TSV line into a record of the given schema.
     *
     * @param tsv tab-separated line; trailing empty columns are preserved
     * @param s   record schema describing the expected columns
     * @return the populated record
     * @throws IOException if an array column cannot be parsed as JSON
     */
    GenericData.Record getDatum(String tsv, Schema s) throws IOException, JsonProcessingException {
        // Limit -1 keeps trailing empty strings so that empty last columns are not dropped.
        List<String> tsvStrings = Arrays.asList(tsv.split("\t", -1));
        return getDatum(tsvStrings.iterator(), s);
    }

    /**
     * Fills a record of the given schema from a stream of TSV column values.
     *
     * <p>Missing columns (iterator exhausted) are treated as empty strings, or as
     * {@code "[]"} for array fields.
     *
     * @param tsvi iterator over the remaining column values; advanced as columns are consumed
     * @param s    record schema to populate
     * @return the populated record
     * @throws IOException if an array column cannot be parsed as JSON
     */
    GenericData.Record getDatum(Iterator<String> tsvi, Schema s) throws IOException, JsonProcessingException {
        List<Schema.Field> fields = s.getFields();
        GenericData.Record innerDatum = new Record(s);
        int m = 0;
        for (Schema.Field f : fields) {
            Schema type = f.schema();
            String t;
            switch (type.getType()) {
                case UNION:
                    t = tsvi.hasNext() ? tsvi.next() : "";
                    putUnion(innerDatum, m, type, t);
                    break;
                case RECORD:
                    // Nested record: its fields are read from the same column stream.
                    innerDatum.put(m, getDatum(tsvi, type));
                    break;
                case ARRAY:
                    t = tsvi.hasNext() ? tsvi.next() : "[]";
                    innerDatum.put(m, createArray(type.getElementType(), t.isEmpty() ? "[]" : t));
                    break;
                default:
                    t = tsvi.hasNext() ? tsvi.next() : "";
                    putPrimitive(innerDatum, m, type, t);
            }
            m++;
        }
        return innerDatum;
    }

    /**
     * Stores a column value into a union-typed field by inspecting the union branches.
     *
     * <ul>
     *   <li>array branch and non-empty text: the text is parsed as a JSON array;</li>
     *   <li>otherwise, string branch: the text is stored as is;</li>
     *   <li>otherwise, null branch and empty text: {@code null} is stored;</li>
     *   <li>otherwise the text is parsed as the last non-null, non-array branch seen.</li>
     * </ul>
     */
    private void putUnion(GenericData.Record datum, int pos, Schema union, String t)
            throws IOException, JsonProcessingException {
        boolean hasNull = false;
        boolean hasString = false;
        boolean hasArray = false;
        Schema payload = null;
        for (Schema bs : union.getTypes()) {
            switch (bs.getType()) {
                case ARRAY:
                    hasArray = true; // Currently we only support [array,null] case
                    payload = bs.getElementType();
                    break;
                case NULL:
                    hasNull = true;
                    break;
                case STRING:
                    hasString = true;
                    // intentional fall through: a string branch is also a payload candidate
                default:
                    payload = bs;
            }
        }
        if (hasArray && !t.isEmpty()) { // assuming that array is already json-encoded
            datum.put(pos, createArray(payload, t));
        } else if (hasString) {
            // A string branch can hold any text, so store the column unchanged.
            datum.put(pos, t);
        } else if (hasNull && t.equals("")) {
            datum.put(pos, null);
        } else if (payload != null) {
            putPrimitive(datum, pos, payload, t);
        }
    }

    /**
     * Parses a JSON array literal into an Avro array whose elements match the given element type.
     *
     * <p>Only arrays of primitives and strings are supported (no array-of-objects).
     *
     * @param type element schema of the array
     * @param t    JSON text of the array
     * @return the populated array
     * @throws IOException if {@code t} is not valid JSON
     */
    private Array<java.io.Serializable> createArray(Schema type, String t)
            throws IOException, JsonProcessingException {
        JsonNode node = MAPPER.readTree(t);
        Array<java.io.Serializable> arr =
                new GenericData.Array<java.io.Serializable>(node.size(), Schema.createArray(type));
        Iterator<JsonNode> i = node.iterator();
        while (i.hasNext()) {
            JsonNode element = i.next();
            switch (type.getType()) {
                case INT:
                    arr.add(element.getIntValue());
                    break;
                case LONG:
                    // getTextValue() yields null for numeric nodes, so longs need their own accessor.
                    arr.add(element.getLongValue());
                    break;
                case FLOAT:
                    // Store a Float (not a Double) so the element matches the FLOAT schema.
                    arr.add((float) element.getDoubleValue());
                    break;
                case DOUBLE:
                    arr.add(element.getDoubleValue());
                    break;
                case BOOLEAN:
                    // Same reason as LONG: boolean nodes have no text value.
                    arr.add(element.getBooleanValue());
                    break;
                default:
                    arr.add(element.getTextValue()); // No array-of-objects!
            }
        }
        return arr;
    }

    /**
     * Parses a column value according to a primitive (or enum) schema and stores it in the record.
     *
     * <p>Booleans are true for {@code "true"} or {@code "1"}. Floating-point columns also accept
     * {@code NaN}, {@code Inf}, {@code +Inf} and {@code -Inf} (case-insensitive). Text that cannot
     * be parsed as a number is stored as {@code null} rather than failing the whole record.
     */
    private void putPrimitive(GenericData.Record datum, int pos, Schema type, String t) {
        try {
            switch (type.getType()) {
                case BOOLEAN:
                    datum.put(pos, t.equals("true") || t.equals("1"));
                    break;
                case INT:
                    datum.put(pos, Integer.parseInt(t));
                    break;
                case LONG:
                    datum.put(pos, Long.parseLong(t));
                    break;
                case FLOAT: {
                    float fv;
                    if (t.equalsIgnoreCase("NaN")) {
                        fv = Float.NaN;
                    } else if (t.equalsIgnoreCase("-Inf")) {
                        fv = Float.NEGATIVE_INFINITY;
                    } else if (t.equalsIgnoreCase("+Inf") || t.equalsIgnoreCase("Inf")) {
                        fv = Float.POSITIVE_INFINITY;
                    } else {
                        fv = Float.parseFloat(t);
                    }
                    datum.put(pos, fv);
                    break;
                }
                case DOUBLE: {
                    double dv;
                    if (t.equalsIgnoreCase("NaN")) {
                        dv = Double.NaN;
                    } else if (t.equalsIgnoreCase("-Inf")) {
                        dv = Double.NEGATIVE_INFINITY;
                    } else if (t.equalsIgnoreCase("+Inf") || t.equalsIgnoreCase("Inf")) {
                        dv = Double.POSITIVE_INFINITY;
                    } else {
                        dv = Double.parseDouble(t);
                    }
                    datum.put(pos, dv);
                    break;
                }
                case ENUM:
                    datum.put(pos, new EnumSymbol(type, t));
                    break;
                default:
                    datum.put(pos, t);
            }
        } catch (NumberFormatException e) {
            // Best effort: an unparsable number becomes null instead of aborting the record.
            datum.put(pos, null);
        }
    }

    /**
     * Renders a record as a single tab-separated line.
     *
     * @param datum the record to render; must be a {@link GenericData.Record}
     * @return the field values joined by tabs
     */
    @Override
    public String toString(Object datum) {
        StringBuilder sb = new StringBuilder();
        toString(datum, sb);
        return sb.toString();
    }

    /**
     * Appends the tab-separated rendering of a record to the given buffer.
     *
     * @param datum the record to render; must be a {@link GenericData.Record}
     * @param sb    buffer receiving the output
     */
    @Override
    public void toString(Object datum, StringBuilder sb) {
        GenericData.Record record = (GenericData.Record) datum;
        List<Schema.Field> fields = record.getSchema().getFields();
        for (int n = 0; n < fields.size(); n++) {
            if (n > 0) {
                sb.append("\t");
            }
            fieldToString(fields.get(n), record.get(n), sb);
        }
    }

    /**
     * Renders a contiguous range of a record's fields as a tab-separated string.
     *
     * @param datum the record to render; must be a {@link GenericData.Record}
     * @param start index of the first field to include
     * @param stop  index of the last field to include (inclusive); {@code -1} means the last field
     * @return the selected field values joined by tabs
     */
    public String toString(Object datum, int start, int stop) {
        StringBuilder sb = new StringBuilder();
        GenericData.Record record = (GenericData.Record) datum;
        // Use the List interface: the concrete list class returned by the schema is not guaranteed.
        List<Schema.Field> fields = record.getSchema().getFields();
        if (stop == -1) {
            stop += fields.size();
        }
        for (int i = start; i <= stop; i++) {
            fieldToString(fields.get(i), record.get(i), sb);
            if (i != stop) {
                sb.append("\t");
            }
        }
        return sb.toString();
    }

    /**
     * Appends an array value as a JSON-style list ({@code [a, b, c]}).
     *
     * <p>String elements are quoted and escaped so that the output can be parsed back by
     * {@code createArray}; {@code null} elements are written as {@code null}.
     *
     * @param val    the array value; must be iterable
     * @param schema the array schema (used to decide whether elements are strings)
     * @param sb     buffer receiving the output
     */
    private void arrayToString(Object val, Schema schema, StringBuilder sb) {
        boolean quoted = schema.getElementType().getType() == Schema.Type.STRING;
        List<String> parts = new ArrayList<String>();
        for (Object element : (Iterable<?>) val) {
            if (element == null) {
                parts.add("null");
            } else if (quoted) {
                parts.add(quoteJson(element.toString()));
            } else {
                parts.add(element.toString());
            }
        }
        sb.append("[");
        sb.append(Joiner.on(", ").join(parts));
        sb.append("]");
    }

    /** Wraps a string in double quotes, escaping characters that are not allowed raw in JSON strings. */
    private static String quoteJson(String s) {
        StringBuilder out = new StringBuilder(s.length() + 2);
        out.append('"');
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            switch (c) {
                case '"':
                    out.append("\\\"");
                    break;
                case '\\':
                    out.append("\\\\");
                    break;
                case '\n':
                    out.append("\\n");
                    break;
                case '\r':
                    out.append("\\r");
                    break;
                case '\t':
                    out.append("\\t");
                    break;
                case '\b':
                    out.append("\\b");
                    break;
                case '\f':
                    out.append("\\f");
                    break;
                default:
                    if (c < 0x20) {
                        out.append(String.format("\\u%04x", (int) c));
                    } else {
                        out.append(c);
                    }
            }
        }
        out.append('"');
        return out.toString();
    }

    /**
     * Appends the textual form of a single field value.
     *
     * <p>Records are rendered recursively, arrays as JSON-style lists, null-typed fields as
     * nothing, and everything else via {@code toString()} (a {@code null} value produces an
     * empty column).
     */
    private void fieldToString(Schema.Field f, Object val, StringBuilder sb) {
        switch (f.schema().getType()) {
            case RECORD:
                sb.append(toString(val));
                break;
            case NULL:
                break;
            case ARRAY:
                arrayToString(val, f.schema(), sb);
                break;
            case UNION:
                // all fields are wrapped into unions...
                Schema arraySchema = null;
                for (Schema branch : f.schema().getTypes()) {
                    if (branch.getType() == Schema.Type.ARRAY) {
                        arraySchema = branch;
                        break;
                    }
                }
                // Only render as an array when the value actually is one; a null or scalar
                // value of an [array, ...] union is handled by the default branch below.
                if (arraySchema != null && val instanceof Iterable) {
                    arrayToString(val, arraySchema, sb);
                    break;
                }
                // intentional fall through to the generic rendering
            default:
                if (val != null) {
                    sb.append(val.toString());
                }
        }
    }
}