/**
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.data.hbase.tool;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import org.apache.commons.collections.MultiHashMap;
import org.apache.hadoop.hbase.util.Bytes;
import org.kitesdk.data.DatasetException;
import org.kitesdk.data.ValidationException;
import org.kitesdk.data.hbase.avro.AvroEntitySchema;
import org.kitesdk.data.hbase.avro.AvroKeyEntitySchemaParser;
import org.kitesdk.data.hbase.avro.AvroKeySchema;
import org.kitesdk.data.hbase.avro.AvroUtils;
import org.kitesdk.data.hbase.impl.Constants;
import org.kitesdk.data.hbase.impl.KeySchema;
import org.kitesdk.data.hbase.impl.SchemaManager;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.SuffixFileFilter;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Utility class for managing Managed Schemas in HBase Common.
*/
public class SchemaTool {
// Upper bound on the number of one-second polls performed while waiting for
// asynchronously created tables to become available: 600 seconds (10 minutes)
private static final int MAX_SECOND_WAIT_FOR_TABLE_CREATION = 600;
private static final Logger LOG = LoggerFactory.getLogger(SchemaTool.class);
// Prefix marking a schema directory that must be resolved through the class
// loader instead of the local file system
private static final String CLASSPATH_PREFIX = "classpath:";
// Parses the entity schema and the key schema out of a schema string
private static final AvroKeyEntitySchemaParser parser = new AvroKeyEntitySchemaParser();
// Jackson mapper and factory used to read the "tables" and "name" attributes
// of a schema string; factory must be declared after mapper because it is
// initialized from it
private static final ObjectMapper mapper = new ObjectMapper();
private static final JsonFactory factory = mapper.getJsonFactory();
// Collaborators supplied through the constructor
private final SchemaManager schemaManager;
private final HBaseAdmin hbaseAdmin;
/**
 * Creates a schema tool that works against the given HBase admin client and
 * schema manager.
 *
 * @param hbaseAdmin
 *          The admin client used to create and modify tables.
 * @param entityManager
 *          The schema manager used to create and migrate managed schemas.
 */
public SchemaTool(HBaseAdmin hbaseAdmin, SchemaManager entityManager) {
  this.schemaManager = entityManager;
  this.hbaseAdmin = hbaseAdmin;
}
/**
 * Scans the schemaDirectory for avro schemas, and creates or migrates the
 * HBase Common managed schemas handled by this instance's schema manager.
 *
 * @param schemaDirectory
 *          The directory to recursively scan for avro schema files. This
 *          directory can be a directory on the classpath, including a
 *          directory that is embedded in a jar on the classpath. In both of
 *          those cases, the schemaDirectory should be prefixed with
 *          classpath:
 * @param createTableAndFamilies
 *          If true, will create the table for each schema if it doesn't
 *          exist, and will create families if they don't exist.
 * @throws InterruptedException
 *           If the thread is interrupted while waiting for table creation.
 */
public void createOrMigrateSchemaDirectory(String schemaDirectory,
    boolean createTableAndFamilies) throws InterruptedException {
  // Step 1: load every schema string from the file system or the classpath
  List<String> schemaStrings;
  if (!schemaDirectory.startsWith(CLASSPATH_PREFIX)) {
    schemaStrings = getSchemaStringsFromDir(new File(schemaDirectory));
  } else {
    String resourceDir = schemaDirectory.substring(CLASSPATH_PREFIX.length());
    URL dirURL = getClass().getClassLoader().getResource(resourceDir);
    String protocol = (dirURL == null) ? null : dirURL.getProtocol();
    if ("file".equals(protocol)) {
      try {
        schemaStrings = getSchemaStringsFromDir(new File(dirURL.toURI()));
      } catch (URISyntaxException e) {
        throw new DatasetException(e);
      }
    } else if ("jar".equals(protocol)) {
      // Keep what lies between the first five characters (presumably the
      // "file:" scheme of the nested URL) and the "!" separator
      String urlPath = dirURL.getPath();
      String jarPath = urlPath.substring(5, urlPath.indexOf("!"));
      schemaStrings = getSchemaStringsFromJar(jarPath, resourceDir);
    } else {
      String msg = "Could not find classpath resource: " + schemaDirectory;
      LOG.error(msg);
      throw new DatasetException(msg);
    }
  }

  // Step 2: group the schema strings by each table they declare
  Map<String, List<String>> tableEntitySchemaMap = new HashMap<String, List<String>>();
  for (String schemaString : schemaStrings) {
    for (String table : getTablesFromSchemaString(schemaString)) {
      List<String> entityList = tableEntitySchemaMap.get(table);
      if (entityList == null) {
        entityList = new ArrayList<String>();
        tableEntitySchemaMap.put(table, entityList);
      }
      entityList.add(schemaString);
    }
  }

  // Step 3: validate that every table has at least one entity schema before
  // anything is migrated
  for (Entry<String, List<String>> entry : tableEntitySchemaMap.entrySet()) {
    if (entry.getValue().isEmpty()) {
      String msg =
          "Table requested, but no entity schemas for Table: " + entry.getKey();
      LOG.error(msg);
      throw new ValidationException(msg);
    }
  }

  // Step 4: migrate the schemas in a batch, collecting a table descriptor
  // only for the schemas that actually required a migration
  Collection<HTableDescriptor> tableDescriptors = Lists.newArrayList();
  for (Entry<String, List<String>> entry : tableEntitySchemaMap.entrySet()) {
    String table = entry.getKey();
    for (String entitySchemaString : entry.getValue()) {
      if (!prepareManagedSchema(table, entitySchemaString)) {
        // Optimization: no migration means no change to the table
        continue;
      }
      tableDescriptors.add(prepareTableDescriptor(table, entitySchemaString));
    }
  }

  if (createTableAndFamilies) {
    createTables(tableDescriptors);
  }
}
/**
 * Creates a new managed schema from a schema file path, or migrates the
 * existing one if one exists for the table name, entity name pair.
 *
 * @param tableName
 *          The name of the table we'll be creating or migrating a schema for.
 * @param entitySchemaFilePath
 *          The absolute file path to the entity schema file.
 * @param createTableAndFamilies
 *          If true, will create the table for this schema if it doesn't
 *          exist, and will create families if they don't exist.
 * @throws InterruptedException
 *           If the thread is interrupted while waiting for table creation.
 */
public void createOrMigrateSchemaFile(String tableName,
    String entitySchemaFilePath, boolean createTableAndFamilies)
    throws InterruptedException {
  File entitySchemaFile = new File(entitySchemaFilePath);
  createOrMigrateSchemaFile(tableName, entitySchemaFile,
      createTableAndFamilies);
}
/**
 * Creates a new managed schema from a schema file, or migrates the existing
 * one if one exists for the table name, entity name pair.
 *
 * @param tableName
 *          The name of the table we'll be creating or migrating a schema for.
 * @param entitySchemaFile
 *          The entity schema file whose contents are read as the schema.
 * @param createTableAndFamilies
 *          If true, will create the table for this schema if it doesn't
 *          exist, and will create families if they don't exist.
 * @throws InterruptedException
 *           If the thread is interrupted while waiting for table creation.
 */
public void createOrMigrateSchemaFile(String tableName,
    File entitySchemaFile, boolean createTableAndFamilies)
    throws InterruptedException {
  String entitySchemaString = getSchemaStringFromFile(entitySchemaFile);
  createOrMigrateSchema(tableName, entitySchemaString,
      createTableAndFamilies);
}
/**
 * Creates a new managed schema, or migrates an existing one if one exists for
 * the table name, entity name pair. When the schema was created or migrated
 * and table management is requested, the table is modified if it is already
 * available and created otherwise.
 *
 * @param tableName
 *          The name of the table we'll be creating or migrating a schema for.
 * @param entitySchemaString
 *          The entity schema
 * @param createTableAndFamilies
 *          If true, will create the table for this schema if it doesn't
 *          exist, and will create families if they don't exist.
 * @throws InterruptedException
 *           If the thread is interrupted while waiting for table creation.
 */
public void createOrMigrateSchema(String tableName, String entitySchemaString,
    boolean createTableAndFamilies) throws InterruptedException {
  // The managed schema is always prepared, whether or not tables are touched
  boolean schemaChanged = prepareManagedSchema(tableName, entitySchemaString);
  if (!schemaChanged || !createTableAndFamilies) {
    return;
  }
  try {
    HTableDescriptor tableDescriptor = prepareTableDescriptor(tableName,
        entitySchemaString);
    boolean tableAvailable = hbaseAdmin.isTableAvailable(tableName);
    if (!tableAvailable) {
      createTable(tableDescriptor);
    } else {
      modifyTable(tableName, tableDescriptor);
    }
  } catch (IOException e) {
    throw new DatasetException(e);
  }
}
/**
 * Creates or migrates the managed schema for the given entity schema.
 *
 * @param tableName
 *          The table the managed schema belongs to.
 * @param entitySchemaString
 *          The entity schema; its "name" attribute is used as the entity
 *          name.
 * @return true if a schema was created or migrated, false if the schema
 *         manager already has this schema version.
 * @throws ValidationException
 *           If a managed schema exists whose key schema differs from the key
 *           schema parsed from entitySchemaString.
 */
private boolean prepareManagedSchema(String tableName,
    String entitySchemaString) {
  String entityName = getEntityNameFromSchemaString(entitySchemaString);
  AvroEntitySchema entitySchema = parser
      .parseEntitySchema(entitySchemaString);
  AvroKeySchema keySchema = parser.parseKeySchema(entitySchemaString);

  // No managed schema yet: create one and report that work was done
  if (!schemaManager.hasManagedSchema(tableName, entityName)) {
    LOG.info("Creating Schema: (" + tableName + ", " + entityName + ")");
    // NOTE(review): the result of this call chain is discarded; presumably
    // it is kept for validation side effects - confirm before removing
    parser.parseEntitySchema(entitySchemaString).getColumnMappingDescriptor()
        .getRequiredColumnFamilies();
    schemaManager.createSchema(tableName, entityName, entitySchemaString,
        "org.kitesdk.data.hbase.avro.AvroKeyEntitySchemaParser",
        "org.kitesdk.data.hbase.avro.AvroKeySerDe",
        "org.kitesdk.data.hbase.avro.AvroEntitySerDe");
    return true;
  }

  // Verify there are no ambiguities with the managed schemas: the key schema
  // must not change across migrations
  KeySchema currentKeySchema = schemaManager
      .getKeySchema(tableName, entityName);
  if (!keySchema.equals(currentKeySchema)) {
    String msg =
        "Migrating schema with different keys. Current: " + currentKeySchema
            .getRawSchema() + " New: " + keySchema.getRawSchema();
    LOG.error(msg);
    throw new ValidationException(msg);
  }

  if (schemaManager.hasSchemaVersion(tableName, entityName, entitySchema)) {
    LOG.info("Schema hasn't changed, not migrating: (" + tableName + ", "
        + entityName + ")");
    return false;
  }

  LOG.info("Migrating Schema: (" + tableName + ", " + entityName + ")");
  schemaManager.migrateSchema(tableName, entityName, entitySchemaString);
  return true;
}
/**
 * Prepares the table descriptor for the given entity schema. The descriptor
 * carries every column family required by the schema's column mapping plus
 * the system and observable column families.
 *
 * @param tableName
 *          The name of the table the descriptor is built for.
 * @param entitySchemaString
 *          The entity schema whose required column families are added.
 * @return A new descriptor for tableName holding the required families.
 */
private HTableDescriptor prepareTableDescriptor(String tableName,
    String entitySchemaString) {
  HTableDescriptor descriptor = new HTableDescriptor(
      Bytes.toBytes(tableName));
  AvroEntitySchema entitySchema = parser
      .parseEntitySchema(entitySchemaString);
  // Work on a copy so that adding the system families never mutates (or
  // fails on) the set handed out by the column mapping descriptor
  Set<String> familiesToAdd = Sets.newHashSet(entitySchema
      .getColumnMappingDescriptor().getRequiredColumnFamilies());
  familiesToAdd.add(new String(Constants.SYS_COL_FAMILY));
  familiesToAdd.add(new String(Constants.OBSERVABLE_COL_FAMILY));
  for (String familyToAdd : familiesToAdd) {
    // Bytes.toBytes encodes the name the same way as the table name above,
    // instead of relying on the platform default charset of getBytes()
    if (!descriptor.hasFamily(Bytes.toBytes(familyToAdd))) {
      descriptor.addFamily(new HColumnDescriptor(familyToAdd));
    }
  }
  return descriptor;
}
/**
 * Creates the tables described by tableDescriptors asynchronously, then
 * waits for them to become available.
 *
 * A table that is already available is not created again; the column
 * families it is missing are added instead. When several descriptors name the
 * same table, the first one creates the table and the remaining ones are
 * applied as column family additions once the table is available.
 *
 * @param tableDescriptors
 *          The descriptors of the tables to create or extend.
 * @throws InterruptedException
 *           If the thread is interrupted while waiting for the tables.
 * @throws DatasetException
 *           If an HBase admin call fails, or if a table is still unavailable
 *           after MAX_SECOND_WAIT_FOR_TABLE_CREATION seconds.
 */
private void createTables(Collection<HTableDescriptor> tableDescriptors)
    throws InterruptedException {
  try {
    Set<String> tablesCreated = Sets.newHashSet();
    Multimap<String, HTableDescriptor> pendingTableUpdates = ArrayListMultimap
        .create();
    for (HTableDescriptor tableDescriptor : tableDescriptors) {
      String tableName = Bytes.toString(tableDescriptor.getName());
      if (tablesCreated.contains(tableName)) {
        // We have to wait for the table async creation to modify
        // Just add the required columns to be added
        pendingTableUpdates.put(tableName, tableDescriptor);
      } else if (hbaseAdmin.isTableAvailable(tableName)) {
        // The table already exists: creating it again would fail, so only
        // add the column families it is missing
        modifyTable(tableName, tableDescriptor);
      } else {
        LOG.info("Creating table " + tableName);
        hbaseAdmin.createTableAsync(tableDescriptor, new byte[][] {});
        tablesCreated.add(tableName);
      }
    }

    // Wait for the tables to be online, polling once per second
    for (int waitCount = 0;
        waitCount < MAX_SECOND_WAIT_FOR_TABLE_CREATION; waitCount++) {
      Iterator<String> iterator = tablesCreated.iterator();
      while (iterator.hasNext()) {
        String table = iterator.next();
        if (hbaseAdmin.isTableAvailable(table)) {
          // Perform any updates scheduled on the table
          if (pendingTableUpdates.containsKey(table)) {
            for (HTableDescriptor tableDescriptor : pendingTableUpdates
                .get(table)) {
              // Add the new columns - synchronous calls
              modifyTable(table, tableDescriptor);
            }
          }
          iterator.remove();
        }
      }
      // If all tables are available, then break
      if (tablesCreated.isEmpty()) {
        break;
      }
      // Sleep for a second before checking again
      Thread.sleep(1000);
    }

    // Anything left never came online; fail loudly instead of silently
    // dropping the pending column family updates
    if (!tablesCreated.isEmpty()) {
      String msg = "Timed out after " + MAX_SECOND_WAIT_FOR_TABLE_CREATION
          + " seconds waiting for tables to become available: "
          + tablesCreated;
      LOG.error(msg);
      throw new DatasetException(msg);
    }
  } catch (IOException e) {
    throw new DatasetException(e);
  }
}
/**
 * Adds to the given table the column families of newDescriptor that the
 * table does not have yet. The table is disabled while families are added
 * and re-enabled afterwards, even if adding a family fails. Nothing is
 * disabled when no family is missing.
 *
 * @param tableName
 *          The name of the table to modify.
 * @param newDescriptor
 *          The descriptor listing the column families the table should have.
 * @throws DatasetException
 *           If an HBase admin call fails.
 */
private void modifyTable(String tableName, HTableDescriptor newDescriptor) {
  LOG.info("Modifying table " + tableName);
  HColumnDescriptor[] requestedFamilies = newDescriptor.getColumnFamilies();
  try {
    HTableDescriptor existingTable = hbaseAdmin
        .getTableDescriptor(Bytes.toBytes(tableName));

    // Collect the families that are requested but not present
    List<HColumnDescriptor> missingFamilies = Lists.newArrayList();
    for (int i = 0; i < requestedFamilies.length; i++) {
      byte[] familyName = requestedFamilies[i].getName();
      if (!existingTable.hasFamily(familyName)) {
        missingFamilies.add(new HColumnDescriptor(familyName));
      }
    }
    if (missingFamilies.isEmpty()) {
      return;
    }

    // Add all the necessary column families
    hbaseAdmin.disableTable(tableName);
    try {
      for (HColumnDescriptor missingFamily : missingFamilies) {
        hbaseAdmin.addColumn(tableName, missingFamily);
      }
    } finally {
      hbaseAdmin.enableTable(tableName);
    }
  } catch (IOException e) {
    throw new DatasetException(e);
  }
}
/**
 * Creates a single table by delegating to the batch table creation.
 *
 * @param tableDescriptor
 *          The descriptor of the table to create.
 * @throws InterruptedException
 *           If the thread is interrupted while waiting for the table.
 */
private void createTable(HTableDescriptor tableDescriptor)
    throws InterruptedException {
  Collection<HTableDescriptor> singleTable = ImmutableList.of(tableDescriptor);
  createTables(singleTable);
}
/**
 * Will return the contents of schemaFile as a string
 *
 * @param schemaFile
 *          The file whose contents should be returned.
 * @return The contents of schemaFile
 * @throws DatasetException
 *           If the file cannot be opened or read.
 */
private String getSchemaStringFromFile(File schemaFile) {
  FileInputStream fis = null;
  try {
    fis = new FileInputStream(schemaFile);
    return AvroUtils.inputStreamToString(fis);
  } catch (IOException e) {
    throw new DatasetException(e);
  } finally {
    if (fis != null) {
      try {
        fis.close();
      } catch (IOException e) {
        // The contents were already read (or the read already failed), so a
        // close failure is reported but must not mask the real outcome
        LOG.warn("Failed to close schema file: " + schemaFile, e);
      }
    }
  }
}
/**
 * Reads the table names listed in the top-level "tables" attribute of the
 * schema JSON.
 *
 * @param schema
 *          The schema as a JSON string.
 * @return The text value of each element of "tables", in order; an empty
 *         list if the attribute is absent.
 * @throws ValidationException
 *           If the schema cannot be parsed as JSON.
 */
private List<String> getTablesFromSchemaString(String schema) {
  try {
    JsonNode root = mapper.readTree(factory.createJsonParser(schema));
    JsonNode tablesNode = root.get("tables");
    if (tablesNode == null) {
      return new ArrayList<String>();
    }
    List<String> tableNames = new ArrayList<String>(tablesNode.size());
    Iterator<JsonNode> elements = tablesNode.elements();
    while (elements.hasNext()) {
      tableNames.add(elements.next().textValue());
    }
    return tableNames;
  } catch (IOException e) {
    // Also covers JsonParseException, which is an IOException
    throw new ValidationException(e);
  }
}
/**
 * Reads the top-level "name" attribute of the schema JSON.
 *
 * @param schema
 *          The schema as a JSON string.
 * @return The text value of "name", or null if the attribute is absent.
 * @throws ValidationException
 *           If the schema cannot be parsed as JSON.
 */
private String getEntityNameFromSchemaString(String schema) {
  try {
    JsonNode root = mapper.readTree(factory.createJsonParser(schema));
    JsonNode nameNode = root.get("name");
    return (nameNode == null) ? null : nameNode.textValue();
  } catch (IOException e) {
    // Also covers JsonParseException, which is an IOException
    throw new ValidationException(e);
  }
}
/**
 * Gets the list of HBase Common Avro schema strings from dir. It recursively
 * searches dir for files that end in .avsc and reads each of them.
 *
 * @param dir
 *          The dir to recursively search for schema strings
 * @return The list of schema strings, one per schema file found
 */
private List<String> getSchemaStringsFromDir(File dir) {
  // TrueFileFilter as the directory filter makes the search descend into
  // every sub-directory
  Collection<File> avscFiles = FileUtils.listFiles(dir,
      new SuffixFileFilter(".avsc"), TrueFileFilter.INSTANCE);
  List<String> contents = new ArrayList<String>();
  Iterator<File> files = avscFiles.iterator();
  while (files.hasNext()) {
    contents.add(getSchemaStringFromFile(files.next()));
  }
  return contents;
}
/**
 * Gets the list of HBase Common Avro schema strings from a directory in the
 * Jar. It scans the jar entries whose name starts with the directory path
 * and ends in .avsc to locate those strings. The jar and every entry stream
 * opened from it are closed before returning.
 *
 * @param jarPath
 *          The path to the jar to search; it is URL-decoded as UTF-8 before
 *          the jar is opened.
 * @param directoryPath
 *          The directory in the jar to find avro schema strings
 * @return The list of schema strings.
 * @throws DatasetException
 *           If the jar or one of its entries cannot be opened.
 */
private List<String> getSchemaStringsFromJar(String jarPath,
    String directoryPath) {
  LOG.info("Getting schema strings in: " + directoryPath + ", from jar: "
      + jarPath);
  JarFile jar;
  try {
    jar = new JarFile(URLDecoder.decode(jarPath, "UTF-8"));
  } catch (UnsupportedEncodingException e) {
    throw new DatasetException(e);
  } catch (IOException e) {
    throw new DatasetException(e);
  }
  try {
    Enumeration<JarEntry> entries = jar.entries();
    List<String> schemaStrings = new ArrayList<String>();
    while (entries.hasMoreElements()) {
      JarEntry jarEntry = entries.nextElement();
      if (jarEntry.getName().startsWith(directoryPath)
          && jarEntry.getName().endsWith(".avsc")) {
        LOG.info("Found schema: " + jarEntry.getName());
        InputStream inputStream;
        try {
          inputStream = jar.getInputStream(jarEntry);
        } catch (IOException e) {
          throw new DatasetException(e);
        }
        try {
          schemaStrings.add(AvroUtils.inputStreamToString(inputStream));
        } finally {
          // Release the entry stream as soon as it has been read
          try {
            inputStream.close();
          } catch (IOException e) {
            LOG.warn("Failed to close jar entry: " + jarEntry.getName(), e);
          }
        }
      }
    }
    return schemaStrings;
  } finally {
    // Always release the jar's file handle, whether or not reading succeeded
    try {
      jar.close();
    } catch (IOException e) {
      LOG.warn("Failed to close jar: " + jarPath, e);
    }
  }
}
}