/***********************************************************************************************************************
 *
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 *
 **********************************************************************************************************************/

package eu.stratosphere.api.java.io;

import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;

import com.google.common.base.Preconditions;

import eu.stratosphere.api.common.io.GenericCsvInputFormat;
import eu.stratosphere.api.java.tuple.Tuple;
import eu.stratosphere.core.fs.FileInputSplit;
import eu.stratosphere.core.fs.Path;
import eu.stratosphere.types.parser.FieldParser;
import eu.stratosphere.util.StringUtils;

/**
 * CSV input format that produces {@link Tuple}s.
 * <p>
 * Record parsing is delegated to {@code parseRecord} of {@link GenericCsvInputFormat}; this class copies the
 * parsed values into the tuple that is handed to {@link #readRecord(Tuple, byte[], int, int)}. When the line
 * delimiter is exactly {@code "\n"}, a carriage return directly before the end of a record is dropped so that
 * files with windows line endings ({@code "\r\n"}) can be read with the default setup.
 *
 * @param <OUT> The tuple type produced by this format.
 */
public class CsvInputFormat<OUT extends Tuple> extends GenericCsvInputFormat<OUT> {

	private static final long serialVersionUID = 1L;

	/** Line delimiter used by the constructor that takes no explicit delimiters. */
	public static final String DEFAULT_LINE_DELIMITER = "\n";

	/** Field delimiter used by the constructor that takes no explicit delimiters. */
	public static final char DEFAULT_FIELD_DELIMITER = ',';

	/** Holders for the parsed field values; created in {@link #open(FileInputSplit)}. */
	private transient Object[] parsedValues;

	// To speed up readRecord processing. Used to find windows line endings.
	// It is (re)computed in open() so that readRecord does not have to evaluate it for every record.
	private boolean lineDelimiterIsLinebreak = false;

	/**
	 * Creates a format for the given path without configuring delimiters or field types.
	 *
	 * @param filePath The path of the file to read.
	 */
	public CsvInputFormat(Path filePath) {
		super(filePath);
	}

	/**
	 * Creates a format for the given path that uses {@link #DEFAULT_LINE_DELIMITER} and
	 * {@link #DEFAULT_FIELD_DELIMITER}.
	 *
	 * @param filePath The path of the file to read.
	 * @param types The types of the fields; must not be null or empty.
	 */
	public CsvInputFormat(Path filePath, Class<?> ... types) {
		this(filePath, DEFAULT_LINE_DELIMITER, DEFAULT_FIELD_DELIMITER, types);
	}

	/**
	 * Creates a format for the given path with explicit delimiters and field types.
	 *
	 * @param filePath The path of the file to read.
	 * @param lineDelimiter The delimiter that separates records.
	 * @param fieldDelimiter The delimiter that separates fields.
	 * @param types The types of the fields; must not be null or empty.
	 * @throws IllegalArgumentException Thrown, if {@code types} is null or empty.
	 */
	public CsvInputFormat(Path filePath, String lineDelimiter, char fieldDelimiter, Class<?>... types) {
		super(filePath);

		setDelimiter(lineDelimiter);
		setFieldDelimiter(fieldDelimiter);
		setFieldTypes(types);
	}

	/**
	 * Sets the types of the fields to read.
	 *
	 * @param fieldTypes The field types; must not be null or empty.
	 * @throws IllegalArgumentException Thrown, if {@code fieldTypes} is null or empty.
	 */
	public void setFieldTypes(Class<?> ... fieldTypes) {
		if (fieldTypes == null || fieldTypes.length == 0) {
			throw new IllegalArgumentException("Field types must not be null or empty.");
		}

		setFieldTypesGeneric(fieldTypes);
	}

	/**
	 * Sets the fields to read by their position in the source and their type.
	 *
	 * @param sourceFieldIndices The positions of the fields in the source; must be non-negative and strictly
	 *                           increasing.
	 * @param fieldTypes The types of the fields; must have the same length as {@code sourceFieldIndices} and
	 *                   must not contain null.
	 * @throws NullPointerException Thrown, if one of the arrays is null.
	 * @throws IllegalArgumentException Thrown, if the arrays violate the constraints above.
	 */
	public void setFields(int[] sourceFieldIndices, Class<?>[] fieldTypes) {
		Preconditions.checkNotNull(sourceFieldIndices);
		Preconditions.checkNotNull(fieldTypes);

		checkForMonotonousOrder(sourceFieldIndices, fieldTypes);

		setFieldsGeneric(sourceFieldIndices, fieldTypes);
	}

	/**
	 * Sets the fields to read via a mask over the source fields and the types of the fields.
	 *
	 * @param sourceFieldMask The mask over the source fields.
	 * @param fieldTypes The types of the fields.
	 * @throws NullPointerException Thrown, if one of the arrays is null.
	 */
	public void setFields(boolean[] sourceFieldMask, Class<?>[] fieldTypes) {
		Preconditions.checkNotNull(sourceFieldMask);
		Preconditions.checkNotNull(fieldTypes);

		setFieldsGeneric(sourceFieldMask, fieldTypes);
	}

	/**
	 * Gets the field types, as returned by {@code getGenericFieldTypes()} of the super class.
	 *
	 * @return The field types.
	 */
	public Class<?>[] getFieldTypes() {
		return super.getGenericFieldTypes();
	}

	/**
	 * Opens the given split, creates one value holder per field parser and determines whether the line
	 * delimiter is a single {@code '\n'}.
	 *
	 * @param split The split to open.
	 * @throws IOException Thrown, if no field parsers are available, or if the super class fails to open the split.
	 */
	@Override
	public void open(FileInputSplit split) throws IOException {
		super.open(split);

		@SuppressWarnings("unchecked")
		FieldParser<Object>[] fieldParsers = (FieldParser<Object>[]) getFieldParsers();

		// throw exception if no field parsers are available
		if (fieldParsers.length == 0) {
			throw new IOException("CsvInputFormat.open(FileInputSplit split) - no field parsers to parse input");
		}

		// create the value holders
		this.parsedValues = new Object[fieldParsers.length];
		for (int i = 0; i < fieldParsers.length; i++) {
			this.parsedValues[i] = fieldParsers[i].createValue();
		}

		// This marker speeds up readRecord, so that it does not have to check on each call whether the line
		// delimiter is the default. It is recomputed on every open (rather than only ever set to true) so that
		// it never carries a stale value from an earlier open with a different delimiter.
		// Left-to-right evaluation makes the access to [0] safe.
		this.lineDelimiterIsLinebreak = getDelimiter().length == 1 && getDelimiter()[0] == '\n';
	}

	/**
	 * Parses one record from the given bytes and writes the parsed values into the given tuple.
	 *
	 * @param reuse The tuple whose fields are set to the parsed values.
	 * @param bytes The buffer that contains the record.
	 * @param offset The position of the first byte of the record in the buffer.
	 * @param numBytes The length of the record in bytes.
	 * @return The given tuple, or {@code null}, if {@code parseRecord} reports that the record could not be parsed.
	 */
	@Override
	public OUT readRecord(OUT reuse, byte[] bytes, int offset, int numBytes) {
		// Support windows line endings with the standard delimiter setup (\n): if the record ends with a
		// carriage return, shrink the record so that the carriage return is not taken as data.
		if (this.lineDelimiterIsLinebreak && numBytes > 0 && bytes[offset + numBytes - 1] == '\r') {
			numBytes--;
		}

		if (!parseRecord(parsedValues, bytes, offset, numBytes)) {
			return null;
		}

		// valid parse, copy the values into the tuple
		for (int i = 0; i < parsedValues.length; i++) {
			reuse.setField(parsedValues[i], i);
		}
		return reuse;
	}

	@Override
	public String toString() {
		return "CSV Input (" + StringUtils.showControlCharacters(String.valueOf(getFieldDelimiter())) + ") " + getFilePath();
	}

	// --------------------------------------------------------------------------------------------

	/**
	 * Validates the given positions and types and sorts both arrays in place by ascending position.
	 * Currently not called from within this class.
	 *
	 * @param positions The field positions; must be non-negative and free of duplicates.
	 * @param types The field types; must have the same length as {@code positions} and must not contain null.
	 * @throws IllegalArgumentException Thrown, if the arrays violate the constraints above.
	 */
	@SuppressWarnings("unused")
	private static void checkAndCoSort(int[] positions, Class<?>[] types) {
		if (positions.length != types.length) {
			throw new IllegalArgumentException("The positions and types must be of the same length");
		}

		// the sorted map orders the types by their position
		TreeMap<Integer, Class<?>> map = new TreeMap<Integer, Class<?>>();

		for (int i = 0; i < positions.length; i++) {
			if (positions[i] < 0) {
				throw new IllegalArgumentException("The field " + i + " (" + positions[i] + ") is invalid.");
			}
			if (types[i] == null) {
				throw new IllegalArgumentException("The type " + i + " is invalid (null)");
			}

			if (map.containsKey(positions[i])) {
				throw new IllegalArgumentException("The position " + positions[i] + " occurs multiple times.");
			}

			map.put(positions[i], types[i]);
		}

		// write the entries back in ascending order of the positions
		int i = 0;
		for (Map.Entry<Integer, Class<?>> entry : map.entrySet()) {
			positions[i] = entry.getKey();
			types[i] = entry.getValue();
			i++;
		}
	}

	/**
	 * Checks that the given positions are non-negative and strictly increasing, and that a non-null type is
	 * given for each position.
	 *
	 * @param positions The field positions.
	 * @param types The field types; must have the same length as {@code positions}.
	 * @throws IllegalArgumentException Thrown, if the arrays violate the constraints above.
	 */
	private static void checkForMonotonousOrder(int[] positions, Class<?>[] types) {
		if (positions.length != types.length) {
			throw new IllegalArgumentException("The positions and types must be of the same length");
		}

		int lastPos = -1;

		for (int i = 0; i < positions.length; i++) {
			if (positions[i] < 0) {
				throw new IllegalArgumentException("The field " + i + " (" + positions[i] + ") is invalid.");
			}
			if (types[i] == null) {
				throw new IllegalArgumentException("The type " + i + " is invalid (null)");
			}

			if (positions[i] <= lastPos) {
				throw new IllegalArgumentException("The positions must be strictly increasing (no permutations are supported).");
			}

			lastPos = positions[i];
		}
	}
}