/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.data.spi;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import org.apache.avro.Schema;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.IncompatibleSchemaException;
import org.kitesdk.data.PartitionStrategy;
import org.kitesdk.data.ValidationException;
import org.kitesdk.data.impl.Accessor;
import org.kitesdk.data.spi.partition.ProvidedFieldPartitioner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Methods for checking compatibility.
*/
public abstract class Compatibility {

  private static final Logger LOG = LoggerFactory.getLogger(Compatibility.class);

  // https://github.com/apache/hive/blob/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g#L431
  // The above reference includes a comment that states column names can be
  // quoted strings with any character, but this is not true in general. However,
  // a leading underscore (_) is allowed if the name is quoted.
  private static final Pattern HIVE_COMPATIBLE = Pattern.compile("[a-zA-Z0-9_]+");

  // As per the Avro spec (http://avro.apache.org/docs/1.7.5/spec.html), a name
  // must start with [A-Za-z_] and subsequently contain only [A-Za-z0-9_].
  private static final Pattern AVRO_COMPATIBLE =
      Pattern.compile("^[A-Za-z_][A-Za-z\\d_]*$");

  /**
   * Checks the name and descriptor for known compatibility issues and throws an
   * exception if an incompatibility is found.
   *
   * If the column names are not compatible across components or if any
   * partition name duplicates its source field name, this will cause an error.
   *
   * This simply runs {@link #checkDatasetName(String, String)} followed by
   * {@link #checkDescriptor(DatasetDescriptor)}.
   *
   * @param namespace a String namespace
   * @param name a String dataset name
   * @param descriptor a {@link DatasetDescriptor}
   */
  public static void check(String namespace, String name, DatasetDescriptor descriptor) {
    checkDatasetName(namespace, name);
    checkDescriptor(descriptor);
  }

  /**
   * Checks the name and schema for known compatibility issues and warns.
   *
   * If the namespace, dataset name, or schema field names are not compatible
   * across components, the failure message is logged at WARN level instead of
   * being propagated to the caller.
   *
   * Null arguments are not downgraded to warnings: the
   * {@code Preconditions.checkNotNull} calls made by the underlying checks
   * are not caught here.
   *
   * @param namespace a String namespace
   * @param datasetName a String dataset name
   * @param schema a {@link Schema}
   */
  public static void checkAndWarn(String namespace, String datasetName, Schema schema) {
    try {
      checkDatasetName(namespace, datasetName);
      checkSchema(schema);
    } catch (ValidationException e) {
      // Both checks above report problems through ValidationException.check,
      // so this type must be handled for the method to warn rather than fail.
      // It is listed first so the catch order stays valid regardless of how
      // ValidationException relates to the exception types below.
      LOG.warn(e.getMessage());
    } catch (IllegalArgumentException e) {
      LOG.warn(e.getMessage());
    } catch (IllegalStateException e) {
      LOG.warn(e.getMessage());
    }
  }

  /**
   * Precondition-style validation that a dataset name is compatible.
   *
   * Both the namespace and the name must be non-null and must satisfy
   * {@link #isCompatibleName(String)}; violations are reported through
   * {@link ValidationException#check}.
   *
   * @param namespace a String namespace
   * @param name a String name
   * @throws NullPointerException if {@code namespace} or {@code name} is null
   */
  public static void checkDatasetName(String namespace, String name) {
    Preconditions.checkNotNull(namespace, "Namespace cannot be null");
    Preconditions.checkNotNull(name, "Dataset name cannot be null");
    ValidationException.check(Compatibility.isCompatibleName(namespace),
        "Namespace %s is not alphanumeric (plus '_')",
        namespace);
    ValidationException.check(Compatibility.isCompatibleName(name),
        "Dataset name %s is not alphanumeric (plus '_')",
        name);
  }

  /**
   * Precondition-style validation that a {@link Schema} is compatible.
   *
   * Every record field name visited in the schema must satisfy
   * {@link #isCompatibleName(String)}; all offending names are reported
   * together in a single message.
   *
   * @param schema an avro {@code Schema}
   * @throws NullPointerException if {@code schema} is null
   */
  public static void checkSchema(Schema schema) {
    Preconditions.checkNotNull(schema, "Schema cannot be null");
    List<String> incompatible = getIncompatibleNames(schema);
    ValidationException.check(incompatible.isEmpty(),
        "Field names are not alphanumeric (plus '_'): %s",
        Joiner.on(", ").join(incompatible));
  }

  /**
   * Precondition-style validation that the DatasetDescriptor is compatible.
   *
   * The descriptor's schema is validated with {@link #checkSchema(Schema)}.
   * For partitioned descriptors, each partition name must additionally be a
   * compatible name and must not collide with a top-level schema field name
   * or with a previously seen partition name.
   *
   * @param descriptor a {@link DatasetDescriptor}
   * @throws NullPointerException if {@code descriptor} is null
   */
  public static void checkDescriptor(DatasetDescriptor descriptor) {
    Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");

    Schema schema = descriptor.getSchema();
    checkSchema(schema);

    if (descriptor.isPartitioned()) {
      // marked as [BUG] because this is checked in DatasetDescriptor
      Preconditions.checkArgument(schema.getType() == Schema.Type.RECORD,
          "[BUG] Partitioned datasets must have record schemas");

      // Seed the set of used names with the top-level record fields.
      Set<String> names = Sets.newHashSet();
      for (Schema.Field field : schema.getFields()) {
        names.add(field.name());
      }

      List<String> incompatible = Lists.newArrayList();
      List<String> duplicates = Lists.newArrayList();

      for (FieldPartitioner fp : Accessor.getDefault().getFieldPartitioners(descriptor
          .getPartitionStrategy())) {
        String name = fp.getName();
        if (!isCompatibleName(name)) {
          incompatible.add(name);
        } else if (names.contains(name)) {
          duplicates.add(name);
        } else {
          // Remember the name so later partitioners cannot reuse it either.
          names.add(name);
        }
      }

      // Incompatible names are reported before duplicates.
      ValidationException.check(incompatible.isEmpty(),
          "Partition names are not alphanumeric (plus '_'): %s",
          Joiner.on(", ").join(incompatible));
      ValidationException.check(duplicates.isEmpty(),
          "Partition names duplicate data fields: %s",
          Joiner.on(", ").join(duplicates));
    }
  }

  /**
   * Returns true if the name does not contain characters that are known to be
   * incompatible with some projects, such as Hive or HBase.
   *
   * A compatible name is non-empty and consists only of ASCII letters, ASCII
   * digits, and underscores. A {@code null} name is reported as incompatible.
   *
   * @param name a String name to check
   * @return true if the name is compatible, false if known to be incompatible
   */
  public static boolean isCompatibleName(String name) {
    return name != null && HIVE_COMPATIBLE.matcher(name).matches();
  }

  /**
   * Returns a list of field names from the schema that contain characters that
   * are known to be incompatible with some projects, such as Hive or HBase.
   *
   * @param schema a {@link org.apache.avro.Schema} to check
   * @return a {@link java.util.List} of incompatible field names
   */
  private static List<String> getIncompatibleNames(Schema schema) {
    NameValidation validation = new NameValidation();
    SchemaUtil.visit(schema, validation);
    return validation.getIncompatibleNames();
  }

  /**
   * Schema visitor used to check compatibility of nested record field names.
   *
   * Offending fields are collected as {@code <record name>.<field name>}.
   */
  private static class NameValidation extends SchemaUtil.SchemaVisitor<Void> {
    private final List<String> incompatible = Lists.newArrayList();

    @Override
    public Void record(Schema record, List<String> names, List<Void> fields) {
      String recordName = record.getName();
      for (String name : names) {
        if (!isCompatibleName(name)) {
          incompatible.add(recordName + "." + name);
        }
      }
      return null;
    }

    /**
     * @return the incompatible field names collected so far
     */
    public List<String> getIncompatibleNames() {
      return incompatible;
    }
  }

  /**
   * Returns true if the name does not contain characters that are known to be
   * incompatible with the specs defined in Avro schema.
   *
   * A compatible name starts with an ASCII letter or underscore and continues
   * with ASCII letters, digits, or underscores. A {@code null} name is
   * reported as incompatible.
   *
   * @param name a String field name to check
   * @return true if the name is Avro compatible, false if not
   */
  public static boolean isAvroCompatibleName(String name) {
    return name != null && AVRO_COMPATIBLE.matcher(name).matches();
  }

  /**
   * Checks that the {@code existing} {@link DatasetDescriptor} can be replaced
   * by {@code updated}.
   *
   * The location must be unchanged and the descriptors must pass
   * {@link #checkCompatible(DatasetDescriptor, DatasetDescriptor)}.
   *
   * @param existing the current {@code DatasetDescriptor} for a dataset
   * @param updated a new {@code DatasetDescriptor} for the same dataset
   */
  public static void checkUpdate(DatasetDescriptor existing,
                                 DatasetDescriptor updated) {
    checkNotChanged("location", existing.getLocation(), updated.getLocation());
    checkCompatible(existing, updated);
  }

  /**
   * Checks that the {@code existing} {@link DatasetDescriptor} is compatible
   * with {@code test}.
   *
   * The format and the partitioned/unpartitioned status must be unchanged,
   * the partition strategy (if any) must pass
   * {@link #checkStrategyUpdate(PartitionStrategy, PartitionStrategy, Schema)},
   * and {@code SchemaValidationUtil.canRead} must accept the existing schema
   * paired with the schema of {@code test}.
   *
   * @param existing the current {@code DatasetDescriptor} for a dataset
   * @param test a new {@code DatasetDescriptor} for the same dataset
   * @throws IncompatibleSchemaException if the schema check fails
   */
  public static void checkCompatible(DatasetDescriptor existing,
                                     DatasetDescriptor test) {
    checkNotChanged("format", existing.getFormat(), test.getFormat());

    checkNotChanged("partitioning",
        existing.isPartitioned(), test.isPartitioned());

    if (existing.isPartitioned()) {
      checkStrategyUpdate(
          existing.getPartitionStrategy(),
          test.getPartitionStrategy(),
          test.getSchema());
    }

    // check can read records written with old schema using new schema
    Schema oldSchema = existing.getSchema();
    Schema testSchema = test.getSchema();
    if (!SchemaValidationUtil.canRead(oldSchema, testSchema)) {
      throw new IncompatibleSchemaException("Schema cannot read data " +
          "written using existing schema. Schema: " + testSchema.toString(true) +
          "\nExisting schema: " + oldSchema.toString(true));
    }
  }

  /**
   * Validates that a descriptor property has the same value in both
   * descriptors, treating two {@code null} values as equal.
   *
   * @param what the property name used in the failure message
   * @param existing the current value, possibly null
   * @param test the proposed value, possibly null
   */
  private static void checkNotChanged(String what,
                                      @Nullable Object existing,
                                      @Nullable Object test) {
    ValidationException.check(
        (existing == test) || (existing != null && existing.equals(test)),
        "Dataset %s is not compatible with existing: %s != %s",
        what, String.valueOf(existing), String.valueOf(test));
  }

  /**
   * Checks that the partition strategy {@code other} can replace
   * {@code existing}.
   *
   * Both strategies must have the same number of field partitioners. At each
   * position the partitioners must either be equal, or the existing one must
   * be a {@link ProvidedFieldPartitioner} with the same name as its
   * replacement and a type that is compatible with the replacement's
   * partition type, as computed by {@code SchemaUtil.getPartitionType} for
   * {@code schema}.
   *
   * @param existing the current {@link PartitionStrategy}
   * @param other the proposed replacement {@link PartitionStrategy}
   * @param schema the {@link Schema} used to resolve the replacement
   *               partitioners' types
   */
  public static void checkStrategyUpdate(PartitionStrategy existing,
                                         PartitionStrategy other,
                                         Schema schema) {
    List<FieldPartitioner> existingFields = Accessor.getDefault()
        .getFieldPartitioners(existing);
    List<FieldPartitioner> otherFields = Accessor.getDefault()
        .getFieldPartitioners(other);
    ValidationException.check(existingFields.size() == otherFields.size(),
        "Not compatible: cannot replace %s partitioners with %s partitioners",
        existingFields.size(), otherFields.size());

    for (int i = 0; i < existingFields.size(); i += 1) {
      FieldPartitioner fp = existingFields.get(i);
      FieldPartitioner replacement = otherFields.get(i);
      if (fp.equals(replacement)) {
        // Unchanged partitioner, nothing further to validate.
        continue;
      }

      ValidationException.check(fp instanceof ProvidedFieldPartitioner,
          "Cannot replace partition %s: not a provided partitioner",
          fp.getName());

      ValidationException.check(fp.getName().equals(replacement.getName()),
          "Cannot change the name of partition %s (to %s)",
          fp.getName(), replacement.getName());

      Class<?> outputType = SchemaUtil.getPartitionType(replacement, schema);
      ValidationException.check(
          isCompatibleWithProvidedType(fp.getType(), outputType),
          "Cannot change the data type of partition %s", fp.getName());
    }
  }

  /**
   * Check whether data already written to a provided partition can be read
   * using the new partition type.
   * <p>
   * For example, existing ints can be read as strings, but existing strings
   * can't be read as ints.
   *
   * @param providedClass the class from a provided partitioner
   * @param replacementClass the partition class of the replacement partitioner
   * @return {@code true} iff replacement class can be used with existing data
   */
  private static boolean isCompatibleWithProvidedType(Class<?> providedClass,
                                                      Class<?> replacementClass) {
    if (Integer.class.isAssignableFrom(providedClass)) {
      return (replacementClass == String.class ||
          replacementClass == Integer.class ||
          replacementClass == Long.class);
    } else if (Long.class.isAssignableFrom(providedClass)) {
      return (replacementClass == String.class ||
          replacementClass == Long.class);
    } else if (String.class.isAssignableFrom(providedClass)) {
      return replacementClass == String.class;
    }
    // Any other provided type has no known safe replacement.
    return false;
  }
}