ResolvingGrammarGenerator.java example

Explorer
avro-master
- doc
  - examples
    - java-example
      - src
        main
        java
        example
        GenericMain.java
        SpecificMain.java
    - mr-example
      - src
        main
        java
        example
        AvroWordCount.java
        GenerateData.java
        MapReduceAvroWordCount.java
        MapReduceColorCount.java
        MapredColorCount.java
- lang
  - java
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.avro.io.parsing;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.avro.AvroTypeException;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.io.Encoder;
import org.apache.avro.io.EncoderFactory;
import org.codehaus.jackson.JsonNode;

/**
 * The class that generates a resolving grammar to resolve between two
 * schemas.
 */
public class ResolvingGrammarGenerator extends ValidatingGrammarGenerator {
  /**
   * Builds the resolving grammar for a writer/reader schema pair and wraps
   * it in a root symbol.
   * <p>
   * A fresh, empty map of already-generated symbols is created for each call
   * and handed to {@link #generate(Schema, Schema, Map)}.
   *
   * @param writer    The schema used by the writer
   * @param reader    The schema used by the reader
   * @return          The start symbol for the resolving grammar
   * @throws IOException propagated from the three-argument
   *                     <tt>generate</tt>
   */
  public final Symbol generate(Schema writer, Schema reader)
    throws IOException {
    final Map<LitS, Symbol> seen = new HashMap<LitS, Symbol>();
    final Symbol start = generate(writer, reader, seen);
    return Symbol.root(start);
  }

  /**
   * Resolves the writer schema <tt>writer</tt> against the reader schema
   * <tt>reader</tt> and returns the start symbol for the grammar generated.
   * <p>
   * When both schemas have the same type the matching symbol is produced
   * directly (records and unions are delegated to helper methods, which is
   * where <tt>seen</tt> is consulted). When the types differ, a writer union
   * is resolved branch by branch, the promotions int&rarr;long,
   * int/long&rarr;float, int/long/float&rarr;double, string&rarr;bytes and
   * bytes&rarr;string are accepted, and a reader union is resolved through
   * its best-matching branch. Every other combination yields an error symbol
   * rather than an exception.
   *
   * @param writer    The schema used by the writer
   * @param reader    The schema used by the reader
   * @param seen      Map from (writer-schema, reader-schema) pairs to the
   * start symbols of the resolving grammars generated so far.
   * @return          The start symbol for the resolving grammar
   * @throws IOException propagated from the helper methods that build the
   *                     nested grammars
   */
  public Symbol generate(Schema writer, Schema reader,
                                Map<LitS, Symbol> seen) throws IOException
  {
    final Schema.Type writerType = writer.getType();
    final Schema.Type readerType = reader.getType();

    if (writerType == readerType) {
      switch (writerType) {
      case NULL:
        return Symbol.NULL;
      case BOOLEAN:
        return Symbol.BOOLEAN;
      case INT:
        return Symbol.INT;
      case LONG:
        return Symbol.LONG;
      case FLOAT:
        return Symbol.FLOAT;
      case DOUBLE:
        return Symbol.DOUBLE;
      case STRING:
        return Symbol.STRING;
      case BYTES:
        return Symbol.BYTES;
      case FIXED:
        // Fixed types resolve only when both the full name and size agree.
        if (writer.getFullName().equals(reader.getFullName())
            && writer.getFixedSize() == reader.getFixedSize()) {
          return Symbol.seq(Symbol.intCheckAction(writer.getFixedSize()),
              Symbol.FIXED);
        }
        break;

      case ENUM:
        // A writer enum without a full name is accepted against any reader
        // enum; otherwise the full names must be equal.
        if (writer.getFullName() == null
                || writer.getFullName().equals(reader.getFullName())) {
          return Symbol.seq(mkEnumAdjust(writer.getEnumSymbols(),
                  reader.getEnumSymbols()), Symbol.ENUM);
        }
        break;

      case ARRAY:
        return Symbol.seq(Symbol.repeat(Symbol.ARRAY_END,
                generate(writer.getElementType(),
                reader.getElementType(), seen)),
            Symbol.ARRAY_START);

      case MAP:
        return Symbol.seq(Symbol.repeat(Symbol.MAP_END,
                generate(writer.getValueType(),
                reader.getValueType(), seen), Symbol.STRING),
            Symbol.MAP_START);
      case RECORD:
        return resolveRecords(writer, reader, seen);
      case UNION:
        return resolveUnion(writer, reader, seen);
      default:
        throw new AvroTypeException("Unknown type for schema: " + writerType);
      }
    } else {  // writer and reader are of different types
      if (writerType == Schema.Type.UNION) {
        return resolveUnion(writer, reader, seen);
      }

      switch (readerType) {
      case LONG:
        if (writerType == Schema.Type.INT) {
          return Symbol.resolve(super.generate(writer, seen), Symbol.LONG);
        }
        break;

      case FLOAT:
        if (writerType == Schema.Type.INT
            || writerType == Schema.Type.LONG) {
          return Symbol.resolve(super.generate(writer, seen), Symbol.FLOAT);
        }
        break;

      case DOUBLE:
        if (writerType == Schema.Type.INT
            || writerType == Schema.Type.LONG
            || writerType == Schema.Type.FLOAT) {
          return Symbol.resolve(super.generate(writer, seen), Symbol.DOUBLE);
        }
        break;

      case BYTES:
        if (writerType == Schema.Type.STRING) {
          return Symbol.resolve(super.generate(writer, seen), Symbol.BYTES);
        }
        break;

      case STRING:
        if (writerType == Schema.Type.BYTES) {
          return Symbol.resolve(super.generate(writer, seen), Symbol.STRING);
        }
        break;

      case UNION:
        // Non-union writer against a reader union: resolve against the
        // branch chosen by bestBranch, if there is one.
        int j = bestBranch(reader, writer, seen);
        if (j >= 0) {
          Symbol s = generate(writer, reader.getTypes().get(j), seen);
          return Symbol.seq(Symbol.unionAdjustAction(j, s), Symbol.UNION);
        }
        break;
      case NULL:
      case BOOLEAN:
      case INT:
      case ENUM:
      case ARRAY:
      case MAP:
      case RECORD:
      case FIXED:
        // No promotion into these reader types; fall through to the error.
        break;
      default:
        throw new RuntimeException("Unexpected schema type: " + readerType);
      }
    }
    // Incompatible schemas are reported as an error symbol in the grammar.
    return Symbol.error("Found " + writer.getFullName()
                        + ", expecting " + reader.getFullName());
  }

  /**
   * Resolves a writer union against <tt>reader</tt> by resolving each of the
   * writer's branches against the reader schema in turn.
   *
   * @param writer    The writer schema, whose branches are enumerated
   * @param reader    The schema used by the reader
   * @param seen      Map of resolving symbols generated so far
   * @return          A sequence made of the alternatives symbol and the
   *                  writer-union action
   * @throws IOException propagated from resolving the individual branches
   */
  private Symbol resolveUnion(Schema writer, Schema reader,
      Map<LitS, Symbol> seen) throws IOException {
    final List<Schema> branches = writer.getTypes();
    final int branchCount = branches.size();
    final Symbol[] branchSymbols = new Symbol[branchCount];
    final String[] branchLabels = new String[branchCount];

    // One resolved symbol and one label (the branch's full name) per
    // writer branch, in the writer's branch order.
    for (int k = 0; k < branchCount; k++) {
      final Schema branch = branches.get(k);
      branchSymbols[k] = generate(branch, reader, seen);
      branchLabels[k] = branch.getFullName();
    }

    final Symbol alternatives = Symbol.alt(branchSymbols, branchLabels);
    return Symbol.seq(alternatives, Symbol.writerUnionAction());
  }

  /**
   * Resolves a writer record against a reader record.
   * <p>
   * The result is looked up in, and stored into, <tt>seen</tt> under a
   * {@link LitS2} key built from <tt>(writer, reader)</tt>, so each pair is
   * resolved at most once per map. If the reader has a field that the
   * writer lacks and that field has no default value, an error symbol is
   * cached and returned instead of a record production.
   *
   * @param writer    The record schema used by the writer
   * @param reader    The record schema used by the reader
   * @param seen      Map of resolving symbols generated so far
   * @return          The cached or newly built symbol for this pair
   * @throws IOException propagated from nested resolution or from encoding
   *                     default values
   */
  private Symbol resolveRecords(Schema writer, Schema reader,
      Map<LitS, Symbol> seen) throws IOException {
    LitS wsc = new LitS2(writer, reader);
    Symbol result = seen.get(wsc);
    if (result == null) {
      List<Field> wfields = writer.getFields();
      List<Field> rfields = reader.getFields();

      // First, compute reordering of reader fields, plus
      // number elements in the result's production
      Field[] reordered = new Field[rfields.size()];
      int ridx = 0;
      // One slot for the field-order action plus one per writer field;
      // three more are added below for each defaulted reader field.
      int count = 1 + wfields.size();

      // Reader fields that also exist in the writer come first, in the
      // writer's field order.
      for (Field f : wfields) {
        Field rdrField = reader.getField(f.name());
        if (rdrField != null) {
          reordered[ridx++] = rdrField;
        }
      }

      // Reader-only fields follow, in reader order. Each needs a default
      // value; a missing default makes the whole pair unresolvable.
      for (Field rf : rfields) {
        String fname = rf.name();
        if (writer.getField(fname) == null) {
          if (rf.defaultValue() == null) {
            result = Symbol.error("Found " + writer.getFullName()
                                  + ", expecting " + reader.getFullName()
                                  + ", missing required field " + fname);
            seen.put(wsc, result);
            return result;
          } else {
            reordered[ridx++] = rf;
            count += 3;
          }
        }
      }

      // The production array is filled from its last index toward index 0;
      // the field-order action takes the last slot.
      Symbol[] production = new Symbol[count];
      production[--count] = Symbol.fieldOrderAction(reordered);

      /**
       * We construct a symbol without filling the array. Please see
       * {@link Symbol#production} for the reason.
       */
      result = Symbol.seq(production);
      // Registered before the fields are resolved, so a nested call for the
      // same (writer, reader) pair finds this symbol in seen.
      seen.put(wsc, result);

      /*
       * For now every field in read-record with no default value
       * must be in write-record.
       * Write record may have additional fields, which will be
       * skipped during read.
       */

      // Handle all the writer's fields
      for (Field wf : wfields) {
        String fname = wf.name();
        Field rf = reader.getField(fname);
        if (rf == null) {
          // Writer-only field: resolved against its own schema and wrapped
          // in a skip action.
          production[--count] =
            Symbol.skipAction(generate(wf.schema(), wf.schema(), seen));
        } else {
          production[--count] =
            generate(wf.schema(), rf.schema(), seen);
        }
      }

      // Add default values for fields missing from Writer
      for (Field rf : rfields) {
        String fname = rf.name();
        Field wf = writer.getField(fname);
        if (wf == null) {
          // Three slots per defaulted field: a start action carrying the
          // binary-encoded default, the field's own grammar, and the end
          // action.
          byte[] bb = getBinary(rf.schema(), rf.defaultValue());
          production[--count] = Symbol.defaultStartAction(bb);
          production[--count] = generate(rf.schema(), rf.schema(), seen);
          production[--count] = Symbol.DEFAULT_END_ACTION;
        }
      }
    }
    return result;
  }

  /**
   * Encoder factory shared by all calls to {@link #getBinary}; configured
   * with a buffer size of 32. It is assigned once and never replaced, hence
   * <tt>final</tt>.
   */
  private static final EncoderFactory factory = new EncoderFactory().configureBufferSize(32);

  /**
   * Returns the Avro binary encoded version of <tt>n</tt> according to
   * the schema <tt>s</tt>.
   * @param s The schema for encoding
   * @param n The Json node that has the value to be encoded.
   * @return  The binary encoded version of <tt>n</tt>.
   * @throws IOException if encoding or flushing the encoder fails
   */
  private static byte[] getBinary(Schema s, JsonNode n) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    Encoder e = factory.binaryEncoder(out, null);
    encode(e, s, n);
    // Flush so that everything written reaches the byte array before it
    // is copied out.
    e.flush();
    return out.toByteArray();
  }

  /**
   * Encodes the given Json node <tt>n</tt> on to the encoder <tt>e</tt>
   * according to the schema <tt>s</tt>.
   * <p>
   * Scalar, enum, string, bytes and fixed values are checked against the
   * JSON node kind and rejected with an {@link AvroTypeException} when they
   * do not match. For records, a field absent from <tt>n</tt> falls back to
   * the field's own default value. For unions, the value is always encoded
   * using the first branch of the union.
   *
   * @param e The encoder to encode into.
   * @param s The schema for the object being encoded.
   * @param n The Json node to encode.
   * @throws IOException propagated from the encoder
   * @throws AvroTypeException if <tt>n</tt> has the wrong JSON kind for
   *         <tt>s</tt>, or a record field has neither a value nor a default
   * @deprecated internal method
   */
  @Deprecated
  public static void encode(Encoder e, Schema s, JsonNode n)
    throws IOException {
    switch (s.getType()) {
    case RECORD:
      for (Field f : s.getFields()) {
        String name = f.name();
        JsonNode fieldValue = n.get(name);
        if (fieldValue == null) {
          fieldValue = f.defaultValue();
        }
        if (fieldValue == null) {
          throw new AvroTypeException("No default value for: " + name);
        }
        encode(e, f.schema(), fieldValue);
      }
      break;
    case ENUM:
      // Validate the node kind like the other textual cases do, instead of
      // passing a null symbol name on to the ordinal lookup.
      if (!n.isTextual()) {
        throw new AvroTypeException("Non-string default value for enum: "+n);
      }
      e.writeEnum(s.getEnumOrdinal(n.getTextValue()));
      break;
    case ARRAY:
      e.writeArrayStart();
      e.setItemCount(n.size());
      Schema elementSchema = s.getElementType();
      for (JsonNode node : n) {
        e.startItem();
        encode(e, elementSchema, node);
      }
      e.writeArrayEnd();
      break;
    case MAP:
      e.writeMapStart();
      e.setItemCount(n.size());
      Schema valueSchema = s.getValueType();
      for (Iterator<String> it = n.getFieldNames(); it.hasNext();) {
        e.startItem();
        String key = it.next();
        e.writeString(key);
        encode(e, valueSchema, n.get(key));
      }
      e.writeMapEnd();
      break;
    case UNION:
      // The value is encoded as the union's first branch.
      e.writeIndex(0);
      encode(e, s.getTypes().get(0), n);
      break;
    case FIXED:
      if (!n.isTextual()) {
        throw new AvroTypeException("Non-string default value for fixed: "+n);
      }
      byte[] bb = n.getTextValue().getBytes("ISO-8859-1");
      if (bb.length != s.getFixedSize()) {
        // A value of the wrong length is truncated or zero-padded to the
        // declared fixed size rather than rejected.
        bb = Arrays.copyOf(bb, s.getFixedSize());
      }
      e.writeFixed(bb);
      break;
    case STRING:
      if (!n.isTextual()) {
        throw new AvroTypeException("Non-string default value for string: "+n);
      }
      e.writeString(n.getTextValue());
      break;
    case BYTES:
      if (!n.isTextual()) {
        throw new AvroTypeException("Non-string default value for bytes: "+n);
      }
      e.writeBytes(n.getTextValue().getBytes("ISO-8859-1"));
      break;
    case INT:
      if (!n.isNumber()) {
        throw new AvroTypeException("Non-numeric default value for int: "+n);
      }
      e.writeInt(n.getIntValue());
      break;
    case LONG:
      if (!n.isNumber()) {
        throw new AvroTypeException("Non-numeric default value for long: "+n);
      }
      e.writeLong(n.getLongValue());
      break;
    case FLOAT:
      if (!n.isNumber()) {
        throw new AvroTypeException("Non-numeric default value for float: "+n);
      }
      e.writeFloat((float) n.getDoubleValue());
      break;
    case DOUBLE:
      if (!n.isNumber()) {
        throw new AvroTypeException("Non-numeric default value for double: "+n);
      }
      e.writeDouble(n.getDoubleValue());
      break;
    case BOOLEAN:
      if (!n.isBoolean()) {
        throw new AvroTypeException("Non-boolean default for boolean: "+n);
      }
      e.writeBoolean(n.getBooleanValue());
      break;
    case NULL:
      if (!n.isNull()) {
        throw new AvroTypeException("Non-null default value for null type: "+n);
      }
      e.writeNull();
      break;
    }
  }

  /**
   * Builds the enum-adjust action that maps writer enum symbols onto the
   * reader's symbols.
   * <p>
   * For every writer symbol, the adjustment is the index of the same symbol
   * in <tt>rsymbols</tt> (as an <tt>Integer</tt>), or, when the reader does
   * not have that symbol, a <tt>String</tt> message naming it.
   *
   * @param wsymbols  The writer's enum symbols, in writer order
   * @param rsymbols  The reader's enum symbols, in reader order
   * @return          The action created from the reader symbol count and
   *                  the per-writer-symbol adjustments
   */
  private static Symbol mkEnumAdjust(List<String> wsymbols,
      List<String> rsymbols){
    Object[] adjustments = new Object[wsymbols.size()];
    for (int i = 0; i < adjustments.length; i++) {
      String wsymbol = wsymbols.get(i);
      int j = rsymbols.indexOf(wsymbol);
      if (j == -1) {
        adjustments[i] = "No match for " + wsymbol;
      } else {
        // valueOf instead of the deprecated boxing constructor.
        adjustments[i] = Integer.valueOf(j);
      }
    }
    return Symbol.enumAdjustAction(rsymbols.size(), adjustments);
  }

  /**
   * Tells whether <tt>sym</tt> is an error action, or directly contains one
   * in its production.
   * <p>
   * Only the entries of <tt>sym.production</tt> themselves are inspected;
   * the productions of those entries are not searched.
   *
   * @param sym the symbol to inspect
   * @return true if the symbol is an error or if its production has an error
   */
  private boolean hasMatchError(Symbol sym) {
    if (sym instanceof Symbol.ErrorAction) {
      return true;
    }
    for (Symbol element : sym.production) {
      if (element instanceof Symbol.ErrorAction) {
        return true;
      }
    }
    return false;
  }

  /**
   * Chooses the branch of the reader union <tt>r</tt> against which the
   * non-union writer schema <tt>w</tt> should be resolved.
   * <p>
   * Selection happens in three steps:
   * <ol>
   * <li>The first branch with the same type as the writer wins outright,
   * except for record, enum and fixed, where the full names must also be
   * equal.</li>
   * <li>Otherwise, for records, a branch that resolves against the writer
   * without a direct error is used; the first such branch is taken unless a
   * later one has the same short name as the writer.</li>
   * <li>Otherwise the first branch the writer type can be promoted to is
   * used. The promotions mirror the ones <tt>generate</tt> accepts:
   * int to long, float or double; long to float or double; float to double;
   * string to bytes; bytes to string.</li>
   * </ol>
   *
   * @param r     The reader schema, a union
   * @param w     The writer schema
   * @param seen  Map of resolving symbols generated so far
   * @return      The index of the selected branch, or -1 if none matches
   * @throws IOException propagated from resolving candidate record branches
   */
  private int bestBranch(Schema r, Schema w, Map<LitS, Symbol> seen) throws IOException {
    final Schema.Type vt = w.getType();

    // first scan for exact match
    int j = 0;
    int structureMatch = -1;
    for (Schema b : r.getTypes()) {
      if (vt == b.getType()) {
        if (vt == Schema.Type.RECORD || vt == Schema.Type.ENUM ||
            vt == Schema.Type.FIXED) {
          String vname = w.getFullName();
          String bname = b.getFullName();
          // return immediately if the name matches exactly according to spec
          if (vname != null && vname.equals(bname)) {
            return j;
          }

          if (vt == Schema.Type.RECORD &&
              !hasMatchError(resolveRecords(w, b, seen))) {
            String vShortName = w.getName();
            String bShortName = b.getName();
            // use the first structure match or one where the name matches
            if ((structureMatch < 0) ||
                (vShortName != null && vShortName.equals(bShortName))) {
              structureMatch = j;
            }
          }
        } else {
          // Unnamed types match on type alone.
          return j;
        }
      }
      j++;
    }

    // if there is a record structure match, return it
    if (structureMatch >= 0) {
      return structureMatch;
    }

    // then scan match via promotion; the accepted targets are kept in step
    // with the promotions that generate() resolves directly
    j = 0;
    for (Schema b : r.getTypes()) {
      switch (vt) {
      case INT:
        switch (b.getType()) {
        case LONG: case FLOAT: case DOUBLE:
          return j;
        }
        break;
      case LONG:
        switch (b.getType()) {
        case FLOAT: case DOUBLE:
          return j;
        }
        break;
      case FLOAT:
        switch (b.getType()) {
        case DOUBLE:
          return j;
        }
        break;
      case STRING:
        switch (b.getType()) {
        case BYTES:
          return j;
        }
        break;
      case BYTES:
        switch (b.getType()) {
        case STRING:
          return j;
        }
        break;
      }
      j++;
    }
    return -1;
  }

  /**
   * Clever trick which differentiates items put into
   * <code>seen</code> by {@link ValidatingGrammarGenerator#validating validating()}
   * from those put in by {@link ValidatingGrammarGenerator#resolving resolving()}.
   */
   static class LitS2 extends ValidatingGrammarGenerator.LitS {
     public Schema expected;
     public LitS2(Schema actual, Schema expected) {
       super(actual);
       this.expected = expected;
     }
     public boolean equals(Object o) {
       if (! (o instanceof LitS2)) return false;
       LitS2 other = (LitS2) o;
       return actual == other.actual && expected == other.expected;
     }
     public int hashCode() {
       return super.hashCode() + expected.hashCode();
     }
   }
}