/*********************************************************************************************************************** * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu) * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License. **********************************************************************************************************************/ package eu.stratosphere.api.common.io; import java.io.IOException; import java.util.ArrayList; import com.google.common.base.Preconditions; import com.google.common.primitives.Ints; import eu.stratosphere.core.fs.FileInputSplit; import eu.stratosphere.core.fs.Path; import eu.stratosphere.types.parser.FieldParser; import eu.stratosphere.util.InstantiationUtil; public abstract class GenericCsvInputFormat<OT> extends DelimitedInputFormat<OT> { private static final long serialVersionUID = 1L; private static final Class<?>[] EMPTY_TYPES = new Class[0]; private static final boolean[] EMPTY_INCLUDED = new boolean[0]; private static final char DEFAULT_FIELD_DELIMITER = ','; // -------------------------------------------------------------------------------------------- // Variables for internal operation. // They are all transient, because we do not want them so be serialized // -------------------------------------------------------------------------------------------- private transient FieldParser<Object>[] fieldParsers; // -------------------------------------------------------------------------------------------- // The configuration parameters. Configured on the instance and serialized to be shipped. // -------------------------------------------------------------------------------------------- private Class<?>[] fieldTypes = EMPTY_TYPES; private boolean[] fieldIncluded = EMPTY_INCLUDED; private char fieldDelim = DEFAULT_FIELD_DELIMITER; private boolean lenient; private boolean skipFirstLineAsHeader; // -------------------------------------------------------------------------------------------- // Constructors and getters/setters for the configurable parameters // -------------------------------------------------------------------------------------------- protected GenericCsvInputFormat() { super(); } protected GenericCsvInputFormat(Path filePath) { super(filePath); } // -------------------------------------------------------------------------------------------- public int getNumberOfFieldsTotal() { return this.fieldIncluded.length; } public int getNumberOfNonNullFields() { return this.fieldTypes.length; } public char getFieldDelimiter() { return fieldDelim; } public void setFieldDelimiter(char fieldDelim) { if (fieldDelim > Byte.MAX_VALUE) { throw new IllegalArgumentException("The field delimiter must be an ASCII character."); } this.fieldDelim = fieldDelim; } public boolean isLenient() { return lenient; } public void setLenient(boolean lenient) { this.lenient = lenient; } public boolean isSkippingFirstLineAsHeader() { return skipFirstLineAsHeader; } public void setSkipFirstLineAsHeader(boolean skipFirstLine) { this.skipFirstLineAsHeader = skipFirstLine; } // -------------------------------------------------------------------------------------------- protected FieldParser<?>[] getFieldParsers() { return this.fieldParsers; } protected Class<?>[] getGenericFieldTypes() { // check if we are dense, i.e., we read all fields if (this.fieldIncluded.length == this.fieldTypes.length) { return this.fieldTypes; } else { // sparse type array which we made dense for internal book keeping. // create a sparse copy to return Class<?>[] types = new Class<?>[this.fieldIncluded.length]; for (int i = 0, k = 0; i < this.fieldIncluded.length; i++) { if (this.fieldIncluded[i]) { types[i] = this.fieldTypes[k++]; } } return types; } } protected void setFieldTypesGeneric(Class<?> ... fieldTypes) { if (fieldTypes == null) { throw new IllegalArgumentException("Field types must not be null."); } this.fieldIncluded = new boolean[fieldTypes.length]; ArrayList<Class<?>> types = new ArrayList<Class<?>>(); // check if we support parsers for these types for (int i = 0; i < fieldTypes.length; i++) { Class<?> type = fieldTypes[i]; if (type != null) { if (FieldParser.getParserForType(type) == null) { throw new IllegalArgumentException("The type '" + type.getName() + "' is not supported for the CSV input format."); } types.add(type); fieldIncluded[i] = true; } } Class<?>[] denseTypeArray = (Class<?>[]) types.toArray(new Class[types.size()]); this.fieldTypes = denseTypeArray; } protected void setFieldsGeneric(int[] sourceFieldIndices, Class<?>[] fieldTypes) { Preconditions.checkNotNull(sourceFieldIndices); Preconditions.checkNotNull(fieldTypes); Preconditions.checkArgument(sourceFieldIndices.length == fieldTypes.length, "Number of field indices and field types must match."); for (int i : sourceFieldIndices) { if (i < 0) { throw new IllegalArgumentException("Field indices must not be smaller than zero."); } } int largestFieldIndex = Ints.max(sourceFieldIndices); this.fieldIncluded = new boolean[largestFieldIndex + 1]; ArrayList<Class<?>> types = new ArrayList<Class<?>>(); // check if we support parsers for these types for (int i = 0; i < fieldTypes.length; i++) { Class<?> type = fieldTypes[i]; if (type != null) { if (FieldParser.getParserForType(type) == null) { throw new IllegalArgumentException("The type '" + type.getName() + "' is not supported for the CSV input format."); } types.add(type); fieldIncluded[sourceFieldIndices[i]] = true; } } Class<?>[] denseTypeArray = (Class<?>[]) types.toArray(new Class[types.size()]); this.fieldTypes = denseTypeArray; } protected void setFieldsGeneric(boolean[] includedMask, Class<?>[] fieldTypes) { Preconditions.checkNotNull(includedMask); Preconditions.checkNotNull(fieldTypes); ArrayList<Class<?>> types = new ArrayList<Class<?>>(); // check if types are valid for included fields int typeIndex = 0; for (int i = 0; i < includedMask.length; i++) { if (includedMask[i]) { if (typeIndex > fieldTypes.length - 1) { throw new IllegalArgumentException("Missing type for included field " + i + "."); } Class<?> type = fieldTypes[typeIndex++]; if (type == null) { throw new IllegalArgumentException("Type for included field " + i + " should not be null."); } else { // check if we support parsers for this type if (FieldParser.getParserForType(type) == null) { throw new IllegalArgumentException("The type '" + type.getName() + "' is not supported for the CSV input format."); } types.add(type); } } } Class<?>[] denseTypeArray = (Class<?>[]) types.toArray(new Class[types.size()]); this.fieldTypes = denseTypeArray; this.fieldIncluded = includedMask; } // -------------------------------------------------------------------------------------------- // Runtime methods // -------------------------------------------------------------------------------------------- @Override public void open(FileInputSplit split) throws IOException { super.open(split); // instantiate the parsers @SuppressWarnings("unchecked") FieldParser<Object>[] parsers = new FieldParser[fieldTypes.length]; for (int i = 0; i < fieldTypes.length; i++) { if (fieldTypes[i] != null) { Class<? extends FieldParser<?>> parserType = FieldParser.getParserForType(fieldTypes[i]); if (parserType == null) { throw new RuntimeException("No parser available for type '" + fieldTypes[i].getName() + "'."); } @SuppressWarnings("unchecked") FieldParser<Object> p = (FieldParser<Object>) InstantiationUtil.instantiate(parserType, FieldParser.class); parsers[i] = p; } } this.fieldParsers = parsers; // skip the first line, if we are at the beginning of a file and have the option set if (this.skipFirstLineAsHeader && this.splitStart == 0) { readLine(); // read and ignore } } protected boolean parseRecord(Object[] holders, byte[] bytes, int offset, int numBytes) throws ParseException { boolean[] fieldIncluded = this.fieldIncluded; int startPos = offset; final int limit = offset + numBytes; for (int field = 0, output = 0; field < fieldIncluded.length; field++) { // check valid start position if (startPos >= limit) { if (lenient) { return false; } else { throw new ParseException("Row too short: " + new String(bytes, offset, numBytes)); } } if (fieldIncluded[field]) { // parse field FieldParser<Object> parser = (FieldParser<Object>) this.fieldParsers[output]; Object reuse = holders[output]; startPos = parser.parseField(bytes, startPos, limit, this.fieldDelim, reuse); holders[output] = parser.getLastResult(); // check parse result if (startPos < 0) { // no good if (lenient) { return false; } else { String lineAsString = new String(bytes, offset, numBytes); throw new ParseException("Line could not be parsed: '" + lineAsString+"'\n" + "Expect field types: "+fieldTypesToString()+" \n" + "in file: "+filePath); } } output++; } else { // skip field startPos = skipFields(bytes, startPos, limit, fieldDelim); if (startPos < 0) { if (!lenient) { String lineAsString = new String(bytes, offset, numBytes); throw new ParseException("Line could not be parsed: '" + lineAsString+"'\n" + "Expect field types: "+fieldTypesToString()+" \n" + "in file: "+filePath); } } } } return true; } private String fieldTypesToString() { StringBuilder string = new StringBuilder(); string.append(this.fieldTypes[0].toString()); for (int i = 1; i < this.fieldTypes.length; i++) { string.append(", ").append(this.fieldTypes[i]); } return string.toString(); } protected int skipFields(byte[] bytes, int startPos, int limit, char delim) { int i = startPos; final byte delByte = (byte) delim; byte current; // skip over initial whitespace lines while (i < limit && ((current = bytes[i]) == ' ' || current == '\t')) { i++; } // first none whitespace character if (i < limit && bytes[i] == '"') { // quoted string i++; // the quote while (i < limit && bytes[i] != '"') { i++; } if (i < limit) { // end of the quoted field i++; // the quote // skip trailing whitespace characters while (i < limit && (current = bytes[i]) != delByte) { if (current == ' ' || current == '\t') { i++; } else { return -1; // illegal case of non-whitespace characters trailing } } return (i == limit ? limit : i+1); } else { // exited due to line end without quote termination return -1; } } else { // unquoted field while (i < limit && bytes[i] != delByte) { i++; } return (i == limit ? limit : i+1); } } }