/* * Copyright 2013 Cloudera Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.kitesdk.data.spi.filesystem; import au.com.bytecode.opencsv.CSVParser; import au.com.bytecode.opencsv.CSVReader; import au.com.bytecode.opencsv.CSVWriter; import com.google.common.collect.ImmutableSet; import java.util.Set; import javax.annotation.Nullable; import org.apache.avro.SchemaBuilder; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.CharMatcher; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.nio.charset.Charset; import java.util.regex.Pattern; import org.apache.avro.Schema; import org.kitesdk.data.DatasetException; import org.kitesdk.data.spi.Compatibility; import static java.lang.Math.min; public class CSVUtil { public static CSVParser newParser(CSVProperties props) { return new CSVParser( props.delimiter.charAt(0), props.quote.charAt(0), props.escape.charAt(0), false /* strict quotes off: don't ignore unquoted strings */, true /* ignore leading white-space */ ); } public static CSVReader newReader(InputStream incoming, CSVProperties props) { return new CSVReader( new InputStreamReader(incoming, Charset.forName(props.charset)), props.delimiter.charAt(0), props.quote.charAt(0), props.escape.charAt(0), props.linesToSkip, false /* strict quotes off: don't ignore unquoted strings */, true /* ignore leading white-space */ ); } public static CSVWriter newWriter(OutputStream outgoing, CSVProperties props) { return new CSVWriter(new OutputStreamWriter( outgoing, Charset.forName(props.charset)), props.delimiter.charAt(0), props.quote.charAt(0), props.escape.charAt(0)); } private static final Pattern LONG = Pattern.compile("\\d+"); private static final Pattern DOUBLE = Pattern.compile("\\d*\\.\\d*[dD]?"); private static final Pattern FLOAT = Pattern.compile("\\d*\\.\\d*[fF]?"); private static final int DEFAULT_INFER_LINES = 25; private static final Set<String> NO_REQUIRED_FIELDS = ImmutableSet.of(); public static Schema inferNullableSchema(String name, InputStream incoming, CSVProperties props) throws IOException { return inferSchemaInternal(name, incoming, props, NO_REQUIRED_FIELDS, true); } public static Schema inferNullableSchema(String name, InputStream incoming, CSVProperties props, Set<String> requiredFields) throws IOException { return inferSchemaInternal(name, incoming, props, requiredFields, true); } public static Schema inferSchema(String name, InputStream incoming, CSVProperties props) throws IOException { return inferSchemaInternal(name, incoming, props, NO_REQUIRED_FIELDS, false); } public static Schema inferSchema(String name, InputStream incoming, CSVProperties props, Set<String> requiredFields) throws IOException { return inferSchemaInternal(name, incoming, props, requiredFields, false); } private static Schema inferSchemaInternal(String name, InputStream incoming, CSVProperties props, Set<String> requiredFields, boolean makeNullable) throws IOException { CSVReader reader = newReader(incoming, props); String[] header; String[] line; if (props.useHeader) { // read the header and then the first line header = reader.readNext(); line = reader.readNext(); Preconditions.checkNotNull(line, "No content to infer schema"); } else if (props.header != null) { header = newParser(props).parseLine(props.header); line = reader.readNext(); Preconditions.checkNotNull(line, "No content to infer schema"); } else { // use the first line to create a header line = reader.readNext(); Preconditions.checkNotNull(line, "No content to infer schema"); header = new String[line.length]; for (int i = 0; i < line.length; i += 1) { header[i] = "field_" + String.valueOf(i); } } Schema.Type[] types = new Schema.Type[header.length]; String[] values = new String[header.length]; boolean[] nullable = new boolean[header.length]; boolean[] empty = new boolean[header.length]; for (int processed = 0; processed < DEFAULT_INFER_LINES; processed += 1) { if (line == null) { break; } for (int i = 0; i < header.length; i += 1) { if (i < line.length) { if (types[i] == null) { types[i] = inferFieldType(line[i]); if (types[i] != null) { // keep track of the value used values[i] = line[i]; } } if (line[i] == null) { nullable[i] = true; } else if (line[i].isEmpty()) { empty[i] = true; } } else { // no value results in null nullable[i] = true; } } line = reader.readNext(); } SchemaBuilder.FieldAssembler<Schema> fieldAssembler = SchemaBuilder.record(name) .doc("Schema generated by Kite").fields(); // types may be missing, but fieldSchema will return a nullable string for (int i = 0; i < header.length; i += 1) { if (header[i] == null) { throw new DatasetException("Bad header for field " + i + ": null"); } String fieldName = header[i].trim(); if (fieldName.isEmpty()) { throw new DatasetException( "Bad header for field " + i + ": \"" + fieldName + "\""); } else if(!Compatibility.isAvroCompatibleName(fieldName)) { throw new DatasetException( "Bad header for field, should start with a character " + "or _ and can contain only alphanumerics and _ " + i + ": \"" + fieldName + "\""); } // the empty string is not considered null for string fields boolean foundNull = (nullable[i] || (empty[i] && types[i] != Schema.Type.STRING)); if (requiredFields.contains(fieldName)) { if (foundNull) { throw new DatasetException("Found null value for required field: " + fieldName + " (" + types[i] + ")"); } fieldAssembler = fieldAssembler.name(fieldName) .doc("Type inferred from '" + sample(values[i]) + "'") .type(schema(types[i], false)).noDefault(); } else { SchemaBuilder.GenericDefault<Schema> defaultBuilder = fieldAssembler.name(fieldName) .doc("Type inferred from '" + sample(values[i]) + "'") .type(schema(types[i], makeNullable || foundNull)); if (makeNullable || foundNull) { fieldAssembler = defaultBuilder.withDefault(null); } else { fieldAssembler = defaultBuilder.noDefault(); } } } return fieldAssembler.endRecord(); } private static final CharMatcher NON_PRINTABLE = CharMatcher .inRange('\u0020', '\u007e').negate(); @VisibleForTesting static String sample(@Nullable String value) { if (value != null) { return NON_PRINTABLE.replaceFrom( value.subSequence(0, min(50, value.length())), '.'); } else { return "null"; } } /** * Create a {@link Schema} for the given type. If the type is null, * the schema will be a nullable String. If isNullable is true, the returned * schema will be nullable. * * @param type a {@link Schema.Type} compatible with {@code Schema.create} * @param makeNullable If {@code true}, the return type will be nullable * @return a {@code Schema} for the given {@code Schema.Type} * @see Schema#create(org.apache.avro.Schema.Type) */ private static Schema schema(Schema.Type type, boolean makeNullable) { Schema schema = Schema.create(type == null ? Schema.Type.STRING : type); if (makeNullable || type == null) { schema = Schema.createUnion(Lists.newArrayList( Schema.create(Schema.Type.NULL), schema)); } return schema; } private static Schema.Type inferFieldType(String example) { if (example == null || example.isEmpty()) { return null; // not enough information } else if (LONG.matcher(example).matches()) { return Schema.Type.LONG; } else if (DOUBLE.matcher(example).matches()) { return Schema.Type.DOUBLE; } else if (FLOAT.matcher(example).matches()) { return Schema.Type.FLOAT; } return Schema.Type.STRING; } }