/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Modified in IPONWEB, 2014 * */ package net.iponweb.hadoop.streaming.avro; import org.apache.avro.AvroTypeException; import org.apache.avro.Schema; import org.apache.avro.io.ParsingDecoder; import org.apache.avro.io.parsing.JsonGrammarGenerator; import org.apache.avro.io.parsing.Parser; import org.apache.avro.io.parsing.Symbol; import org.apache.avro.util.Utf8; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.codehaus.jackson.*; import java.io.EOFException; import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; import java.nio.ByteBuffer; import java.util.*; /* This class is based on original JsonDecoder from Avro with attempt to * handle complex schemas. With somewhat limited success it can. Known limitation * is schema with union of two or more named records or other named types. * Simpler unions seem to work well based on experience of running this on prod servers * And we can convert "NaN" back to NaN if it is placed in double or float fields */ public class IOWJsonDecoder extends ParsingDecoder implements Parser.ActionHandler { private JsonParser in; private static JsonFactory jsonFactory = new JsonFactory(); static final String CHARSET = "ISO-8859-1"; ReorderBuffer currentReorderBuffer; Stack<ReorderBuffer> reorderBuffers = new Stack<ReorderBuffer>(); private static final Log LOG = LogFactory.getLog(IOWJsonDecoder.class); private static class ReorderBuffer { public Map<String, List<JsonElement>> savedFields = new HashMap<String, List<JsonElement>>(); public JsonParser origParser = null; } public IOWJsonDecoder(Symbol s, String str) throws IOException { super(s); in = jsonFactory.createJsonParser(str); in.nextToken(); } public IOWJsonDecoder(Schema s, String str) throws IOException { this(getSymbol(s), str); } private static Symbol getSymbol(Schema schema) { if (null == schema) { throw new NullPointerException("Schema cannot be null!"); } return new JsonGrammarGenerator().generate(schema); } @Override public ByteBuffer readBytes(ByteBuffer old) throws IOException { advance(Symbol.BYTES); if (in.getCurrentToken() == JsonToken.VALUE_STRING) { byte[] result = readByteArray(); in.nextToken(); return ByteBuffer.wrap(result); } else { throw error("bytes"); } } private void advance(Symbol symbol) throws IOException { this.parser.processTrailingImplicitActions(); if (in.getCurrentToken() == null && this.parser.depth() == 1) throw new EOFException(); parser.advance(symbol); } @Override public void readNull() throws IOException { advance(Symbol.NULL); if (in.getCurrentToken() == JsonToken.VALUE_NULL) { in.nextToken(); } else { throw error("null"); } } @Override public boolean readBoolean() throws IOException { advance(Symbol.BOOLEAN); JsonToken t = in.getCurrentToken(); if (t == JsonToken.VALUE_TRUE || t == JsonToken.VALUE_FALSE) { in.nextToken(); return t == JsonToken.VALUE_TRUE; } else { String s = in.getText(); if(s.equals("false") || s.equals("FALSE") || s.equals("0")) { in.nextToken(); return false; } else if(s.equals("true") || s.equals("TRUE") || s.equals("1")) { in.nextToken(); return true; } throw error("boolean"); } } @Override public int readInt() throws IOException { advance(Symbol.INT); if (in.getCurrentToken().isNumeric()) { int result = in.getIntValue(); in.nextToken(); return result; } else { try { String s = in.getText(); in.nextToken(); return Integer.parseInt(s); } catch(Exception e) { throw error("int (" + e.getMessage() + ")"); } } } @Override public long readLong() throws IOException { advance(Symbol.LONG); if (in.getCurrentToken().isNumeric()) { long result = in.getLongValue(); in.nextToken(); return result; } else { try { String s = in.getText(); in.nextToken(); return Long.parseLong(s); } catch(Exception e) { throw error("long (" + e.getMessage() + ")"); } } } @Override public float readFloat() throws IOException { advance(Symbol.FLOAT); if (in.getCurrentToken().isNumeric()) { float result = in.getFloatValue(); in.nextToken(); return result; } else { try { String s = in.getText(); in.nextToken(); if (s.equals("NaN")) { return Float.NaN; } else if (s.equals("-Inf")) { return Float.NEGATIVE_INFINITY; } else if (s.equals("+Inf")) { return Float.POSITIVE_INFINITY; } else { return Float.parseFloat(s); } } catch (Exception e) { throw error("float (" + e.getMessage() + ")"); } } } @Override public double readDouble() throws IOException { advance(Symbol.DOUBLE); if (in.getCurrentToken().isNumeric()) { double result = in.getDoubleValue(); in.nextToken(); return result; } else { try { String s = in.getText(); in.nextToken(); if (s.equals("NaN")) { return Double.NaN; } else if (s.equals("-Inf")) { return Double.NEGATIVE_INFINITY; } else if (s.equals("+Inf")) { return Double.POSITIVE_INFINITY; } else { return Double.parseDouble(s); } } catch (Exception e) { throw error("double (" + e.getMessage() + ")"); } } } @Override public Utf8 readString(Utf8 old) throws IOException { return new Utf8(readString()); } @Override public String readString() throws IOException { advance(Symbol.STRING); if (parser.topSymbol() == Symbol.MAP_KEY_MARKER) { parser.advance(Symbol.MAP_KEY_MARKER); if (in.getCurrentToken() != JsonToken.FIELD_NAME) { throw error("map-key"); } } else { if (in.getCurrentToken() != JsonToken.VALUE_STRING) { throw error("string"); } } String result = in.getText(); in.nextToken(); return result; } @Override public void skipString() throws IOException { advance(Symbol.STRING); if (parser.topSymbol() == Symbol.MAP_KEY_MARKER) { parser.advance(Symbol.MAP_KEY_MARKER); if (in.getCurrentToken() != JsonToken.FIELD_NAME) { throw error("map-key"); } } else { if (in.getCurrentToken() != JsonToken.VALUE_STRING) { throw error("string"); } } in.nextToken(); } private byte[] readByteArray() throws IOException { byte[] result = in.getText().getBytes(CHARSET); return result; } @Override public void skipBytes() throws IOException { advance(Symbol.BYTES); if (in.getCurrentToken() == JsonToken.VALUE_STRING) { in.nextToken(); } else { throw error("bytes"); } } private void checkFixed(int size) throws IOException { advance(Symbol.FIXED); Symbol.IntCheckAction top = (Symbol.IntCheckAction) parser.popSymbol(); if (size != top.size) { throw new AvroTypeException( "Incorrect length for fixed binary: expected " + top.size + " but received " + size + " bytes."); } } @Override public void readFixed(byte[] bytes, int start, int len) throws IOException { checkFixed(len); if (in.getCurrentToken() == JsonToken.VALUE_STRING) { byte[] result = readByteArray(); in.nextToken(); if (result.length != len) { throw new AvroTypeException("Expected fixed length " + len + ", but got" + result.length); } System.arraycopy(result, 0, bytes, start, len); } else { throw error("fixed"); } } @Override public void skipFixed(int length) throws IOException { checkFixed(length); doSkipFixed(length); } private void doSkipFixed(int length) throws IOException { if (in.getCurrentToken() == JsonToken.VALUE_STRING) { byte[] result = readByteArray(); in.nextToken(); if (result.length != length) { throw new AvroTypeException("Expected fixed length " + length + ", but got" + result.length); } } else { throw error("fixed"); } } @Override protected void skipFixed() throws IOException { advance(Symbol.FIXED); Symbol.IntCheckAction top = (Symbol.IntCheckAction) parser.popSymbol(); doSkipFixed(top.size); } @Override public int readEnum() throws IOException { advance(Symbol.ENUM); Symbol.EnumLabelsAction top = (Symbol.EnumLabelsAction) parser.popSymbol(); if (in.getCurrentToken() == JsonToken.VALUE_STRING) { in.getText(); int n = top.findLabel(in.getText()); if (n >= 0) { in.nextToken(); return n; } throw new AvroTypeException("Unknown symbol in enum " + in.getText()); } else { throw error("fixed"); } } @Override public long readArrayStart() throws IOException { advance(Symbol.ARRAY_START); if (in.getCurrentToken() == JsonToken.START_ARRAY) { in.nextToken(); return doArrayNext(); } else { throw error("array-start"); } } @Override public long arrayNext() throws IOException { advance(Symbol.ITEM_END); return doArrayNext(); } private long doArrayNext() throws IOException { if (in.getCurrentToken() == JsonToken.END_ARRAY) { parser.advance(Symbol.ARRAY_END); in.nextToken(); return 0; } else { return 1; } } @Override public long skipArray() throws IOException { advance(Symbol.ARRAY_START); if (in.getCurrentToken() == JsonToken.START_ARRAY) { in.skipChildren(); in.nextToken(); advance(Symbol.ARRAY_END); } else { throw error("array-start"); } return 0; } @Override public long readMapStart() throws IOException { advance(Symbol.MAP_START); if (in.getCurrentToken() == JsonToken.START_OBJECT) { in.nextToken(); return doMapNext(); } else { throw error("map-start"); } } @Override public long mapNext() throws IOException { advance(Symbol.ITEM_END); return doMapNext(); } private long doMapNext() throws IOException { if (in.getCurrentToken() == JsonToken.END_OBJECT) { in.nextToken(); advance(Symbol.MAP_END); return 0; } else { return 1; } } @Override public long skipMap() throws IOException { advance(Symbol.MAP_START); if (in.getCurrentToken() == JsonToken.START_OBJECT) { in.skipChildren(); in.nextToken(); advance(Symbol.MAP_END); } else { throw error("map-start"); } return 0; } @Override public int readIndex() throws IOException { advance(Symbol.UNION); Symbol.Alternative a = (Symbol.Alternative) parser.popSymbol(); int n; JsonToken token = in.getCurrentToken(); if (token == JsonToken.VALUE_NULL) { n = a.findLabel("null"); if (n < 0) throw error("null"); parser.pushSymbol(a.getSymbol(n)); return n; } if (token == JsonToken.START_OBJECT) { // Iterate over union branches. Assume that first SEQUENCE symbol is our guessed branch n = 0; for (Symbol s : a.symbols) { if (s.kind == Symbol.Kind.SEQUENCE) break; n ++; } if (n == a.symbols.length) throw new AvroTypeException("Union has no branch of type 'record' or 'array'"); parser.pushSymbol(a.getSymbol(n)); return n; } else { // Token is not an object // Check if we have enum somewhere in our symbols. If so, check our value against enum values // If found, push that enum into parser n = 0; for (Symbol s : a.symbols) { if(s.kind == Symbol.Kind.SEQUENCE) for(Symbol seq : s.production) { if (seq.kind == Symbol.Kind.EXPLICIT_ACTION && seq instanceof org.apache.avro.io.parsing.Symbol.EnumLabelsAction) { Symbol.EnumLabelsAction en = (Symbol.EnumLabelsAction)seq; if (en.findLabel(in.getText()) >= 0) { parser.pushSymbol(a.getSymbol(n)); return n; } } } n ++; } // Trying to guess correct branch using token type GUESSING: switch(token) { case START_ARRAY: n = a.findLabel("array"); if (n < 0) throw error("array"); parser.pushSymbol(a.getSymbol(n)); break; case VALUE_NUMBER_FLOAT: final String floats[] = { "float", "double" }; for(String b : floats) { n = a.findLabel(b); if (n >= 0) { parser.pushSymbol(a.getSymbol(n)); break GUESSING; } } throw error("float or double"); case VALUE_NUMBER_INT: final String integers[] = { "int","long","float","double" }; for(String b : integers) { n = a.findLabel(b); if (n >= 0) { parser.pushSymbol(a.getSymbol(n)); break GUESSING; } } throw error("int, long, float or double"); case VALUE_FALSE: case VALUE_TRUE: n = a.findLabel("boolean"); if (n < 0) throw error("boolean"); parser.pushSymbol(a.getSymbol(n)); break; case VALUE_STRING: final String nanCandidates[] = { "float", "double" }; n = a.findLabel("string"); if (n < 0 && (in.getText().equals("NaN") || in.getText().equals("Inf") || in.getText().equals("-Inf") || in.getText().equals("+Inf"))) { // Try to substitute NaN for(String nn : nanCandidates) { if ((n = a.findLabel(nn)) >= 0) { parser.pushSymbol(a.getSymbol(n)); break GUESSING; } } throw error("string (is NaN/Inf, but no string, double or float branches found)"); } else if (n < 0) { // Try to make double of this string String s = in.getText(); try { Double.parseDouble(s); } catch (NumberFormatException e) { throw error("string (not looks like number) and no string branch found"); } // String could be converted to double; Try integer as well try { Integer.parseInt(s); if ((n = a.findLabel("int")) >= 0) { parser.pushSymbol(a.getSymbol(n)); break; } } catch (NumberFormatException e) {} // Now try to find float or double (same as nanCandidates just by coincident for(String nn : nanCandidates) { if ((n = a.findLabel(nn)) >= 0) { parser.pushSymbol(a.getSymbol(n)); break GUESSING; } } throw error("string and also looks like number but no string nor numeric branches found"); } else parser.pushSymbol(a.getSymbol(n)); break; default: throw error("start-union"); } return n; } } @Override public Symbol doAction(Symbol input, Symbol top) throws IOException { if (top instanceof Symbol.FieldAdjustAction) { Symbol.FieldAdjustAction fa = (Symbol.FieldAdjustAction) top; String name = fa.fname; if (currentReorderBuffer != null) { List<JsonElement> node = currentReorderBuffer.savedFields.get(name); if (node != null) { currentReorderBuffer.savedFields.remove(name); currentReorderBuffer.origParser = in; in = makeParser(node); return null; } } if (in.getCurrentToken() == JsonToken.FIELD_NAME) { do { String fn = in.getText(); in.nextToken(); if (name.equals(fn)) { return null; } else { if (currentReorderBuffer == null) { currentReorderBuffer = new ReorderBuffer(); } currentReorderBuffer.savedFields.put(fn, getVaueAsTree(in)); } } while (in.getCurrentToken() == JsonToken.FIELD_NAME); throw new AvroTypeException("Expected field name not found: " + fa.fname); } } else if (top == Symbol.FIELD_END) { if (currentReorderBuffer != null && currentReorderBuffer.origParser != null) { in = currentReorderBuffer.origParser; currentReorderBuffer.origParser = null; } } else if (top == Symbol.RECORD_START) { if (in.getCurrentToken() == JsonToken.START_OBJECT) { in.nextToken(); reorderBuffers.push(currentReorderBuffer); currentReorderBuffer = null; } else { throw error("record-start"); } } else if (top == Symbol.RECORD_END || top == Symbol.UNION_END) { if (in.getCurrentToken() == JsonToken.END_OBJECT) { in.nextToken(); if (top == Symbol.RECORD_END) { if (currentReorderBuffer != null && !currentReorderBuffer.savedFields.isEmpty()) { throw error("Unknown fields: " + currentReorderBuffer.savedFields.keySet()); } currentReorderBuffer = reorderBuffers.pop(); } } else { throw error(top == Symbol.RECORD_END ? "record-end" : "union-end"); } } else { throw new AvroTypeException("Unknown action symbol " + top); } return null; } private static class JsonElement { public final JsonToken token; public final String value; public JsonElement(JsonToken t, String value) { this.token = t; this.value = value; } public JsonElement(JsonToken t) { this(t, null); } } private static List<JsonElement> getVaueAsTree(JsonParser in) throws IOException { int level = 0; List<JsonElement> result = new ArrayList<JsonElement>(); do { JsonToken t = in.getCurrentToken(); switch (t) { case START_OBJECT: case START_ARRAY: level++; result.add(new JsonElement(t)); break; case END_OBJECT: case END_ARRAY: level--; result.add(new JsonElement(t)); break; case FIELD_NAME: case VALUE_STRING: case VALUE_NUMBER_INT: case VALUE_NUMBER_FLOAT: case VALUE_TRUE: case VALUE_FALSE: case VALUE_NULL: result.add(new JsonElement(t, in.getText())); break; } in.nextToken(); } while (level != 0); result.add(new JsonElement(null)); return result; } private JsonParser makeParser(final List<JsonElement> elements) throws IOException { return new JsonParser() { int pos = 0; @Override public ObjectCodec getCodec() { throw new UnsupportedOperationException(); } @Override public void setCodec(ObjectCodec c) { throw new UnsupportedOperationException(); } @Override public void close() throws IOException { throw new UnsupportedOperationException(); } @Override public JsonToken nextToken() throws IOException { pos++; return elements.get(pos).token; } @Override public JsonParser skipChildren() throws IOException { int level = 0; do { switch(elements.get(pos++).token) { case START_ARRAY: case START_OBJECT: level++; break; case END_ARRAY: case END_OBJECT: level--; break; } } while (level > 0); return this; } @Override public boolean isClosed() { throw new UnsupportedOperationException(); } @Override public String getCurrentName() throws IOException { throw new UnsupportedOperationException(); } @Override public JsonStreamContext getParsingContext() { throw new UnsupportedOperationException(); } @Override public JsonLocation getTokenLocation() { throw new UnsupportedOperationException(); } @Override public JsonLocation getCurrentLocation() { throw new UnsupportedOperationException(); } @Override public String getText() throws IOException { return elements.get(pos).value; } @Override public char[] getTextCharacters() throws IOException { throw new UnsupportedOperationException(); } @Override public int getTextLength() throws IOException { throw new UnsupportedOperationException(); } @Override public int getTextOffset() throws IOException { throw new UnsupportedOperationException(); } @Override public Number getNumberValue() throws IOException { throw new UnsupportedOperationException(); } @Override public NumberType getNumberType() throws IOException { throw new UnsupportedOperationException(); } @Override public int getIntValue() throws IOException { return Integer.parseInt(getText()); } @Override public long getLongValue() throws IOException { return Long.parseLong(getText()); } @Override public BigInteger getBigIntegerValue() throws IOException { throw new UnsupportedOperationException(); } @Override public float getFloatValue() throws IOException { return Float.parseFloat(getText()); } @Override public double getDoubleValue() throws IOException { return Double.parseDouble(getText()); } @Override public BigDecimal getDecimalValue() throws IOException { throw new UnsupportedOperationException(); } @Override public byte[] getBinaryValue(Base64Variant b64variant) throws IOException { throw new UnsupportedOperationException(); } @Override public JsonToken getCurrentToken() { return elements.get(pos).token; } }; } private AvroTypeException error(String type) { String val; String fld; String loc; String cntx; String lastToken; try { val = in.getText(); } catch (Exception e) { val = "*UNKNOWN*"; } try { fld = in.getCurrentName(); } catch (Exception e) { fld = "*UNKNOWN*"; } try { loc = in.getCurrentLocation().toString(); } catch (Exception e) { loc = "*UNKNOWN*"; } try { lastToken = in.getLastClearedToken().toString(); } catch (Exception e) { lastToken = "*UNKNOWN*"; } try { cntx = in.getParsingContext().getCurrentName(); } catch (Exception e) { cntx = "*UNKNOWN*"; } return new AvroTypeException("Expected " + type + ". Got " + in.getCurrentToken() + " (" + val + ") at field '" + fld + "'" + ". Location: '" + loc + "', last token = '" + lastToken + "', context: '" + cntx + "'"); } }