/**
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.cdk.data.hbase.avro;

import com.cloudera.cdk.data.DatasetException;
import com.cloudera.cdk.data.PartitionKey;
import com.cloudera.cdk.data.SchemaNotFoundException;
import com.cloudera.cdk.data.SchemaValidationException;
import com.cloudera.cdk.data.hbase.impl.BaseEntityMapper;
import com.cloudera.cdk.data.hbase.impl.EntityMapper;
import com.cloudera.cdk.data.hbase.impl.EntitySchema;
import com.cloudera.cdk.data.hbase.impl.EntitySerDe;
import com.cloudera.cdk.data.hbase.impl.HBaseUtils;
import com.cloudera.cdk.data.hbase.impl.KeySchema;
import com.cloudera.cdk.data.hbase.impl.KeySerDe;
import com.cloudera.cdk.data.hbase.impl.PutAction;
import com.cloudera.cdk.data.hbase.impl.SchemaManager;
import com.cloudera.cdk.data.hbase.manager.generated.ManagedSchemaEntityVersion;

import java.io.IOException;
import java.util.Arrays;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.hbase.client.Increment;
import org.apache.hadoop.hbase.client.Result;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.node.ObjectNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An entity mapper that is able to support multiple Avro schema versions in a
 * table. It uses a special set of columns in the table to determine the schema
 * version of each row, and uses that schema to properly deserialize the row.
 * Writing to tables is tied to a single schema.
 *
 * @param <ENTITY> the entity type this mapper reads and writes
 */
public class VersionedAvroEntityMapper<ENTITY extends IndexedRecord> implements
    EntityMapper<ENTITY> {

  private static final Logger LOG = LoggerFactory
      .getLogger(VersionedAvroEntityMapper.class);

  /**
   * The schema parser we'll use to parse managed schemas.
   */
  private static final AvroKeyEntitySchemaParser schemaParser = new AvroKeyEntitySchemaParser();

  private final SchemaManager schemaManager;
  private final String tableName;
  private final String entityName;
  private final Class<ENTITY> entityClass;
  private final AvroKeySchema keySchema;
  private final AvroEntitySchema entitySchema;
  private final int version;
  private final boolean specific;
  private final ConcurrentHashMap<Integer, EntityMapper<ENTITY>> entityMappers =
      new ConcurrentHashMap<Integer, EntityMapper<ENTITY>>();
  private final String managedSchemaEntityVersionSchema;

  private EntityMapper<ManagedSchemaEntityVersion> managedSchemaEntityVersionEntityMapper;

  /**
   * Builder for the VersionedAvroEntityMapper. This is the only way to
   * construct one.
   *
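   * <p>
   * A minimal usage sketch, assuming an already-constructed
   * {@code SchemaManager}; the table and entity names here are hypothetical:
   *
   * <pre>{@code
   * VersionedAvroEntityMapper<GenericRecord> mapper =
   *     new VersionedAvroEntityMapper.Builder()
   *         .setSchemaManager(schemaManager)
   *         .setTableName("users")
   *         .setEntityName("User")
   *         .setSpecific(false)
   *         .build();
   * }</pre>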
   */
  public static class Builder {

    private SchemaManager schemaManager;
    private String tableName;
    private String entityName;
    private boolean specific;
    private String genericSchemaString;

    public Builder setSchemaManager(SchemaManager schemaManager) {
      this.schemaManager = schemaManager;
      return this;
    }

    public Builder setTableName(String tableName) {
      this.tableName = tableName;
      return this;
    }

    public Builder setEntityName(String entityName) {
      this.entityName = entityName;
      return this;
    }

    public Builder setSpecific(boolean specific) {
      this.specific = specific;
      return this;
    }

    public Builder setGenericSchemaString(String genericSchemaString) {
      this.genericSchemaString = genericSchemaString;
      return this;
    }

    public <E extends IndexedRecord> VersionedAvroEntityMapper<E> build() {
      return new VersionedAvroEntityMapper<E>(this);
    }
  }

  @SuppressWarnings("unchecked")
  private VersionedAvroEntityMapper(Builder builder) {
    this.schemaManager = builder.schemaManager;
    this.tableName = builder.tableName;
    this.entityName = builder.entityName;
    this.specific = builder.specific;
    this.managedSchemaEntityVersionSchema = getManagedSchemaEntityVersionSchema(entityName);

    if (!specific) {
      // Must be a generic Avro record. Tie the entity class to GenericRecord.
      entityClass = (Class<ENTITY>) GenericRecord.class;

      // Get the key schema to use from the schema manager.
      keySchema = (AvroKeySchema) schemaManager.getKeySchema(tableName,
          entityName);

      // The entity schema to use for writes may have been passed into the
      // Builder. If not, use the highest version schema registered in the
      // schema manager.
      if (builder.genericSchemaString != null) {
        entitySchema = schemaParser
            .parseEntitySchema(builder.genericSchemaString);
      } else {
        entitySchema = (AvroEntitySchema) schemaManager.getEntitySchema(
            tableName, entityName);
      }

      // Verify that this schema exists in the managed schema table, and get
      // its version. This method will throw a SchemaNotFoundException if it
      // can't find it.
      version = schemaManager.getEntityVersion(tableName, entityName,
          entitySchema);
    } else {
      // Specific record, so initialize the schema, class, and version member
      // variables appropriately.
      try {
        keySchema = (AvroKeySchema) schemaManager.getKeySchema(tableName,
            entityName);

        // Get the highest version schema to determine the entity record class
        // name, and initialize the entity class member from that name.
        AvroEntitySchema mostRecentEntitySchema = (AvroEntitySchema) schemaManager
            .getEntitySchema(tableName, entityName);
        String entityClassName = mostRecentEntitySchema.getAvroSchema()
            .getFullName();
        entityClass = (Class<ENTITY>) Class.forName(entityClassName);

        // Initialize the entitySchema from the SCHEMA$ field on the class.
        // This will be the schema we'll use to write with.
        String entitySchemaString = entityClass.getField("SCHEMA$").get(null)
            .toString();
        entitySchema = schemaParser.parseEntitySchema(entitySchemaString);

        // Verify that this schema exists in the managed schema table, and get
        // its version. This method will throw a SchemaNotFoundException if it
        // can't find it.
        version = schemaManager.getEntityVersion(tableName, entityName,
            entitySchema);
      } catch (ClassNotFoundException e) {
        String msg = "StorageKey or entity class not found. Make sure the "
            + "specific record instances are on the classpath.";
        LOG.error(msg, e);
        throw new DatasetException(msg, e);
      } catch (SecurityException e) {
        String msg = "Cannot access key or entity class.";
        LOG.error(msg, e);
        throw new DatasetException(msg, e);
      } catch (NoSuchFieldException e) {
        String msg = "SCHEMA$ field not found in the entity class.";
        LOG.error(msg, e);
        throw new DatasetException(msg, e);
      } catch (IllegalAccessException e) {
        String msg = "Not allowed to access the SCHEMA$ field in the entity class.";
        LOG.error(msg, e);
        throw new DatasetException(msg, e);
      }
    }

    // Initialize the entity mappers this object wraps. There will be one
    // entity mapper per version of the schema. When deserializing a row,
    // we'll use the entity mapper configured for that row's schema version.
    updateEntityMappers();

    // Initialize the entity mapper used to deserialize the special
    // ManagedSchemaEntityVersion record from each row.
    initializeEntityVersionEntityMapper();
  }
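  /**
   * {@inheritDoc}
   *
   * <p>
   * Resolves the schema version stored in the row's metadata column, then
   * delegates to the wrapped mapper registered for that version. A sketch of
   * the read path, with hypothetical {@code table}, {@code rowKey}, and
   * {@code MyEntity} names:
   *
   * <pre>{@code
   * Result result = table.get(new Get(rowKey));
   * // Deserialized with the writer schema recorded for this row, even if
   * // that schema is older than this mapper's current write schema:
   * MyEntity entity = mapper.mapToEntity(result);
   * }</pre>
   */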
  @Override
  public ENTITY mapToEntity(Result result) {
    ManagedSchemaEntityVersion versionRecord = managedSchemaEntityVersionEntityMapper
        .mapToEntity(result);
    int resultVersion = 0;
    if (versionRecord != null) {
      resultVersion = versionRecord.getSchemaVersion();
    }
    if (entityMappers.containsKey(resultVersion)) {
      return entityMappers.get(resultVersion).mapToEntity(result);
    } else {
      // The row's version may have been registered since we last refreshed
      // the managed schema cache. Refresh and retry once before failing.
      schemaManager.refreshManagedSchemaCache(tableName, entityName);
      updateEntityMappers();
      if (entityMappers.containsKey(resultVersion)) {
        return entityMappers.get(resultVersion).mapToEntity(result);
      } else {
        String msg = "Could not find schema for " + tableName + ", "
            + entityName + ", with version " + resultVersion;
        LOG.error(msg);
        throw new SchemaNotFoundException(msg);
      }
    }
  }
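  /**
   * {@inheritDoc}
   *
   * <p>
   * Along with the entity's own columns, every put also writes this mapper's
   * schema version to the row's metadata column, so readers can later pick
   * the right schema. A sketch of the effect, assuming an entity named
   * "User" and a mapper writing at schema version 2:
   *
   * <pre>{@code
   * PutAction putAction = mapper.mapFromEntity(entity);
   * // putAction's Put now carries the entity's columns plus the metadata
   * // column _s:sv_User holding the schema version (2).
   * }</pre>
   */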
  @Override
  public PutAction mapFromEntity(ENTITY entity) {
    EntityMapper<ENTITY> entityMapper = entityMappers.get(version);
    PutAction entityPut = entityMapper.mapFromEntity(entity);
    ManagedSchemaEntityVersion versionRecord = ManagedSchemaEntityVersion
        .newBuilder().setSchemaVersion(version).build();
    PutAction versionPut = managedSchemaEntityVersionEntityMapper
        .mapFromEntity(versionRecord);
    byte[] keyBytes = entityPut.getPut().getRow();
    versionPut = HBaseUtils.mergePutActions(keyBytes,
        Arrays.asList(versionPut));
    return HBaseUtils.mergePutActions(keyBytes,
        Arrays.asList(entityPut, versionPut));
  }

  @Override
  public Increment mapToIncrement(PartitionKey key, String fieldName,
      long incrementValue) {
    return entityMappers.get(version).mapToIncrement(key, fieldName,
        incrementValue);
  }

  @Override
  public long mapFromIncrementResult(Result result, String fieldName) {
    return entityMappers.get(version).mapFromIncrementResult(result, fieldName);
  }

  @Override
  public Set<String> getRequiredColumns() {
    Set<String> requiredColumns = entityMappers.get(version)
        .getRequiredColumns();
    requiredColumns.addAll(managedSchemaEntityVersionEntityMapper
        .getRequiredColumns());
    return requiredColumns;
  }

  @Override
  public Set<String> getRequiredColumnFamilies() {
    Set<String> requiredColumnFamilies = entityMappers.get(version)
        .getRequiredColumnFamilies();
    requiredColumnFamilies.addAll(managedSchemaEntityVersionEntityMapper
        .getRequiredColumnFamilies());
    return requiredColumnFamilies;
  }

  @Override
  public KeySchema getKeySchema() {
    return keySchema;
  }

  @Override
  public EntitySchema getEntitySchema() {
    return entitySchema;
  }

  @Override
  public KeySerDe getKeySerDe() {
    return entityMappers.get(version).getKeySerDe();
  }

  @Override
  public EntitySerDe<ENTITY> getEntitySerDe() {
    return entityMappers.get(version).getEntitySerDe();
  }

  /**
   * Initialize the entity mapper we'll use to convert the schema version
   * metadata in each row to a ManagedSchemaEntityVersion record.
   */
  private void initializeEntityVersionEntityMapper() {
    // Create a special key serde that doesn't try to serialize/deserialize
    // the key for the ManagedSchemaEntityVersion record. This schema doesn't
    // have any key mapping types since it's added to every table.
    KeySerDe keySerDe = new KeySerDe() {

      @Override
      public byte[] serialize(PartitionKey partitionKey) {
        return new byte[0];
      }

      @Override
      public byte[] serialize(Object... keyPartValues) {
        return new byte[0];
      }

      @Override
      public PartitionKey deserialize(byte[] keyBytes) {
        return null;
      }
    };

    AvroEntitySchema avroEntitySchema = schemaParser
        .parseEntitySchema(managedSchemaEntityVersionSchema);
    avroEntitySchema = AvroUtils.mergeSpecificStringTypes(
        ManagedSchemaEntityVersion.class, avroEntitySchema);
    AvroEntityComposer<ManagedSchemaEntityVersion> entityComposer =
        new AvroEntityComposer<ManagedSchemaEntityVersion>(avroEntitySchema,
            true);
    AvroEntitySerDe<ManagedSchemaEntityVersion> entitySerDe =
        new AvroEntitySerDe<ManagedSchemaEntityVersion>(entityComposer,
            avroEntitySchema, avroEntitySchema, true);
    this.managedSchemaEntityVersionEntityMapper =
        new BaseEntityMapper<ManagedSchemaEntityVersion>(keySchema,
            avroEntitySchema, keySerDe, entitySerDe);
  }

  /**
   * Update the map of wrapped entity mappers to reflect the most recent
   * entity schema metadata returned by the schemaManager.
   */
  private void updateEntityMappers() {
    for (Entry<Integer, EntitySchema> entry : schemaManager.getEntitySchemas(
        tableName, entityName).entrySet()) {
      if (!entityMappers.containsKey(entry.getKey())) {
        AvroEntitySchema writtenSchema = (AvroEntitySchema) entry.getValue();
        EntityMapper<ENTITY> entityMapper = constructWrappedEntityMapper(
            keySchema, entitySchema, writtenSchema, entityClass);
        entityMappers.put(entry.getKey(), entityMapper);
      }
    }
  }

  /**
   * Construct a wrapped entity mapper that reads rows written with
   * writeSchema and resolves them into readSchema.
   *
   * @param keySchema the key schema for the table
   * @param readSchema the schema entities will be deserialized into
   * @param writeSchema the schema the rows were written with
   * @param entityClass the entity class, used when mapping specific records
   * @return the wrapped entity mapper
   */
  @SuppressWarnings({ "rawtypes", "unchecked" })
  private EntityMapper<ENTITY> constructWrappedEntityMapper(
      AvroKeySchema keySchema, AvroEntitySchema readSchema,
      AvroEntitySchema writeSchema, Class entityClass) {
    if (specific) {
      keySchema = AvroUtils.mergeSpecificStringTypes(entityClass, keySchema);
      readSchema = AvroUtils.mergeSpecificStringTypes(entityClass, readSchema);
      AvroEntityComposer entityComposer = new AvroEntityComposer(readSchema,
          true);
      AvroEntitySerDe entitySerDe = new AvroEntitySerDe(entityComposer,
          readSchema, writeSchema, true);
      KeySerDe keySerDe = new AvroKeySerDe(keySchema.getAvroSchema(),
          keySchema.getPartitionStrategy());
      return new BaseEntityMapper(keySchema, readSchema, keySerDe, entitySerDe);
    } else {
      KeySerDe keySerDe = new AvroKeySerDe(keySchema.getAvroSchema(),
          keySchema.getPartitionStrategy());
      AvroEntityComposer entityComposer = new AvroEntityComposer(readSchema,
          false);
      AvroEntitySerDe entitySerDe = new AvroEntitySerDe(entityComposer,
          readSchema, writeSchema, false);
      return new BaseEntityMapper(keySchema, readSchema, keySerDe, entitySerDe);
    }
  }
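  /**
   * Build the Avro schema string for this entity's ManagedSchemaEntityVersion
   * records. The base schema is loaded from the classpath, and a column
   * mapping is added to its first field (the schema version) so the version
   * is stored in a per-entity metadata column. For example, for an entity
   * named "User", the field's mapping would be (a sketch of the generated
   * JSON):
   *
   * <pre>{@code
   * "mapping": { "type": "column", "value": "_s:sv_User" }
   * }</pre>
   *
   * @param entityName the name of the entity whose version column to map
   * @return the ManagedSchemaEntityVersion schema string for the entity
   */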
"_s:sv_" + entityName); ((ObjectNode)jsonNode.get("fields").get(0)).put("mapping", mappingNode); return jsonNode.toString(); } private static JsonNode rawSchemaAsJsonNode(String rawSchema) { ObjectMapper mapper = new ObjectMapper(); JsonNode avroRecordSchemaJson; try { avroRecordSchemaJson = mapper.readValue(rawSchema, JsonNode.class); } catch (IOException e) { throw new SchemaValidationException( "Could not parse the avro record as JSON.", e); } return avroRecordSchemaJson; } }