/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.converter.filter; import java.io.IOException; import java.util.Iterator; import java.util.List; import java.util.Map; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; import org.apache.avro.generic.GenericRecord; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Preconditions; import com.google.common.base.Splitter; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import gobblin.configuration.ConfigurationKeys; import gobblin.configuration.WorkUnitState; import gobblin.converter.AvroToAvroConverterBase; import gobblin.converter.DataConversionException; import gobblin.converter.SchemaConversionException; import gobblin.converter.SingleRecordIterable; import gobblin.util.AvroUtils; /** * Converts schema and data by choosing only selected fields provided by user. */ public class AvroFieldsPickConverter extends AvroToAvroConverterBase { private static final Logger LOG = LoggerFactory.getLogger(AvroFieldsPickConverter.class); private static final Splitter SPLITTER_ON_COMMA = Splitter.on(',').trimResults().omitEmptyStrings(); private static final Splitter SPLITTER_ON_DOT = Splitter.on('.').trimResults().omitEmptyStrings(); /** * Convert the schema to contain only specified field. This will reuse AvroSchemaFieldRemover by listing fields not specified and remove it * from the schema * 1. Retrieve list of fields from property * 2. Traverse schema and get list of fields to be removed * 3. While traversing also confirm specified fields from property also exist * 4. Convert schema by using AvroSchemaFieldRemover * * Each Avro Record type increments depth and from input depth is represented by '.'. Avro schema is always expected to start with Record type * and first record type is depth 0 and won't be represented by '.'. As it's always expected to start with Record type, it's not necessary to disambiguate. * After first record type, if it reaches another record type, the prefix of the field name will be * "[Record name].". * * Example: * { "namespace": "example.avro", "type": "record", "name": "user", "fields": [ { "name": "name", "type": "string" }, { "name": "favorite_number", "type": [ "int", "null" ] }, { "type": "record", "name": "address", "fields": [ { "name": "city", "type": "string" } ] } ] } * If user wants to only choose name and city, the input parameter should be "name,address.city". Note that it is not user.name as first record is depth zero. * {@inheritDoc} * @see gobblin.converter.AvroToAvroConverterBase#convertSchema(org.apache.avro.Schema, gobblin.configuration.WorkUnitState) */ @Override public Schema convertSchema(Schema inputSchema, WorkUnitState workUnit) throws SchemaConversionException { LOG.info("Converting schema " + inputSchema); String fieldsStr = workUnit.getProp(ConfigurationKeys.CONVERTER_AVRO_FIELD_PICK_FIELDS); Preconditions.checkNotNull(fieldsStr, ConfigurationKeys.CONVERTER_AVRO_FIELD_PICK_FIELDS + " is required for converter " + this.getClass().getSimpleName()); LOG.info("Converting schema to selected fields: " + fieldsStr); try { return createSchema(inputSchema, fieldsStr); } catch (Exception e) { throw new SchemaConversionException(e); } } /** * Creates Schema containing only specified fields. * * Traversing via either fully qualified names or input Schema is quite inefficient as it's hard to align each other. * Also, as Schema's fields is immutable, all the fields need to be collected before updating field in Schema. Figuring out all * required field in just input Schema and fully qualified names is also not efficient as well. * * This is where Trie comes into picture. Having fully qualified names in Trie means, it is aligned with input schema and also it can * provides all children on specific prefix. This solves two problems mentioned above. * * 1. Based on fully qualified field name, build a Trie to present dependencies. * 2. Traverse the Trie. If it's leaf, add field. If it's not a leaf, recurse with child schema. * * @param schema * @param fieldsStr * @return */ private static Schema createSchema(Schema schema, String fieldsStr) { List<String> fields = SPLITTER_ON_COMMA.splitToList(fieldsStr); TrieNode root = buildTrie(fields); return createSchemaHelper(schema, root); } private static Schema createSchemaHelper(Schema inputSchema, TrieNode node) { Schema newRecord = Schema.createRecord(inputSchema.getName(), inputSchema.getDoc(), inputSchema.getNamespace(), inputSchema.isError()); List<Field> newFields = Lists.newArrayList(); for (TrieNode child : node.children.values()) { Field innerSrcField = inputSchema.getField(child.val); Preconditions.checkNotNull(innerSrcField, child.val + " does not exist under " + inputSchema); if (child.children.isEmpty()) { //Leaf newFields.add( new Field(innerSrcField.name(), innerSrcField.schema(), innerSrcField.doc(), innerSrcField.defaultValue())); } else { Schema innerSrcSchema = innerSrcField.schema(); Schema innerDestSchema = createSchemaHelper(innerSrcSchema, child); //Recurse of schema Field innerDestField = new Field(innerSrcField.name(), innerDestSchema, innerSrcField.doc(), innerSrcField.defaultValue()); newFields.add(innerDestField); } } newRecord.setFields(newFields); return newRecord; } private static TrieNode buildTrie(List<String> fqns) { TrieNode root = new TrieNode(null); for (String fqn : fqns) { root.add(fqn); } return root; } private static class TrieNode { private String val; private Map<String, TrieNode> children; TrieNode(String val) { this.val = val; this.children = Maps.newLinkedHashMap(); } void add(String fqn) { addHelper(this, SPLITTER_ON_DOT.splitToList(fqn).iterator(), fqn); } void addHelper(TrieNode node, Iterator<String> fqnIterator, String fqn) { if (!fqnIterator.hasNext()) { return; } String val = fqnIterator.next(); TrieNode child = node.children.get(val); if (child == null) { child = new TrieNode(val); node.children.put(val, child); } else if (!fqnIterator.hasNext()) { //Leaf but there's existing record throw new IllegalArgumentException("Duplicate record detected: " + fqn); } addHelper(child, fqnIterator, fqn); } @Override public String toString() { return "[val: " + this.val + " , children: " + this.children.values() + " ]"; } } @Override public Iterable<GenericRecord> convertRecord(Schema outputSchema, GenericRecord inputRecord, WorkUnitState workUnit) throws DataConversionException { try { return new SingleRecordIterable<>(AvroUtils.convertRecordSchema(inputRecord, outputSchema)); } catch (IOException e) { throw new DataConversionException(e); } } }