/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.api.java.io;
import org.apache.flink.annotation.Internal;
import org.apache.flink.api.common.io.GenericCsvInputFormat;
import org.apache.flink.core.fs.FileInputSplit;
import org.apache.flink.types.parser.FieldParser;
import org.apache.flink.util.Preconditions;
import java.io.IOException;
import org.apache.flink.core.fs.Path;
import org.apache.flink.util.StringUtils;
/**
 * Base class for CSV input formats.
 *
 * <p>Each record handed to {@link #readRecord(Object, byte[], int, int)} is checked against the
 * configured comment prefix, parsed into {@link #parsedValues} via {@code parseRecord(...)}, and
 * finally converted into the output type by {@link #fillRecord(Object, Object[])}, which
 * subclasses implement.
 *
 * @param <OUT> the type of the records produced by this format
 */
@Internal
public abstract class CsvInputFormat<OUT> extends GenericCsvInputFormat<OUT> {

    private static final long serialVersionUID = 1L;

    /** Default line delimiter: a single newline character. */
    public static final String DEFAULT_LINE_DELIMITER = "\n";

    /** Default field delimiter: a comma. */
    public static final String DEFAULT_FIELD_DELIMITER = ",";

    /**
     * Value holders for the fields of the record currently being parsed, one per field parser.
     * Created anew in {@link #open(FileInputSplit)}; transient, so not part of the serialized
     * form.
     */
    protected transient Object[] parsedValues;

    /**
     * Creates a CSV input format for the given path.
     *
     * @param filePath the path passed on to the super class constructor
     */
    protected CsvInputFormat(Path filePath) {
        super(filePath);
    }

    /**
     * Opens the given split and (re-)initializes all per-split state: the value holders, the
     * line-break marker and the comment / invalid line counters.
     *
     * @param split the split to open
     * @throws IOException if no field parsers are available, or if the super class fails to
     *     open the split
     */
    @Override
    public void open(FileInputSplit split) throws IOException {
        super.open(split);

        @SuppressWarnings("unchecked")
        FieldParser<Object>[] fieldParsers = (FieldParser<Object>[]) getFieldParsers();

        // throw exception if no field parsers are available
        if (fieldParsers.length == 0) {
            throw new IOException("CsvInputFormat.open(FileInputSplit split) - no field parsers to parse input");
        }

        // create the value holders
        this.parsedValues = new Object[fieldParsers.length];
        for (int i = 0; i < fieldParsers.length; i++) {
            this.parsedValues[i] = fieldParsers[i].createValue();
        }

        // This marker speeds up readRecord, so that it does not have to inspect the delimiter on
        // every call. It is assigned unconditionally (not only set to true) so that a value left
        // over from a previous open() cannot survive a changed delimiter.
        // Left-to-right evaluation makes the access to [0] safe.
        this.lineDelimiterIsLinebreak =
                this.getDelimiter().length == 1 && this.getDelimiter()[0] == '\n';

        this.commentCount = 0;
        this.invalidLineCount = 0;
    }

    /**
     * Returns the next record, skipping over every {@code null} result of the super class
     * implementation until a record is produced or the end of the input is reached.
     *
     * <p>NOTE(review): the {@code null} results presumably stem from {@link #readRecord}
     * returning {@code null} for comment and invalid lines - confirm against the super class.
     *
     * @param record the object that may be reused for the result
     * @return the next record, or {@code null} if the end was reached without finding one
     * @throws IOException if reading from the input fails
     */
    @Override
    public OUT nextRecord(OUT record) throws IOException {
        OUT returnRecord;
        do {
            returnRecord = super.nextRecord(record);
        } while (returnRecord == null && !reachedEnd());

        return returnRecord;
    }

    /**
     * Converts the bytes of a single line into a record.
     *
     * <p>If the line delimiter is a plain newline, a trailing carriage return is dropped so that
     * Windows line endings are not treated as data. Lines starting with the comment prefix and
     * lines that cannot be parsed yield {@code null} and increment the respective counter.
     *
     * @param reuse the object that may be reused for the result
     * @param bytes the buffer containing the line
     * @param offset the position of the first byte of the line in {@code bytes}
     * @param numBytes the number of bytes belonging to the line
     * @return the filled record, or {@code null} for comment lines and unparsable lines
     * @throws IOException declared by the overridden method
     */
    @Override
    public OUT readRecord(OUT reuse, byte[] bytes, int offset, int numBytes) throws IOException {
        // Support Windows line endings with the standard "\n" delimiter: the carriage return in
        // front of the newline must not be taken as data.
        if (this.lineDelimiterIsLinebreak && numBytes > 0 && bytes[offset + numBytes - 1] == '\r') {
            numBytes--;
        }

        if (startsWithCommentPrefix(bytes, offset, numBytes)) {
            this.commentCount++;
            return null;
        }

        if (parseRecord(parsedValues, bytes, offset, numBytes)) {
            return fillRecord(reuse, parsedValues);
        }

        this.invalidLineCount++;
        return null;
    }

    /**
     * Checks whether the line begins with the configured comment prefix. Returns {@code false} if
     * no prefix is set or if the line is shorter than the prefix.
     */
    private boolean startsWithCommentPrefix(byte[] bytes, int offset, int numBytes) {
        if (commentPrefix == null || commentPrefix.length > numBytes) {
            return false;
        }
        for (int i = 0; i < commentPrefix.length; i++) {
            if (commentPrefix[i] != bytes[offset + i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Transfers the parsed field values into a record of the output type.
     *
     * @param reuse the object that may be reused for the result
     * @param parsedValues the parsed field values of the current line
     * @return the record holding the parsed values
     */
    protected abstract OUT fillRecord(OUT reuse, Object[] parsedValues);

    /**
     * Returns the field types of this format, as reported by {@code getGenericFieldTypes()}.
     *
     * @return the field types
     */
    public Class<?>[] getFieldTypes() {
        return super.getGenericFieldTypes();
    }

    /**
     * Creates a mask of the given size in which every field is included.
     *
     * @param size the number of fields
     * @return a boolean array of length {@code size} with all entries {@code true}
     */
    protected static boolean[] createDefaultMask(int size) {
        boolean[] includedMask = new boolean[size];
        for (int x = 0; x < includedMask.length; x++) {
            includedMask[x] = true;
        }
        return includedMask;
    }

    /**
     * Converts a list of field indices into an inclusion mask.
     *
     * <p>The mask has the length {@code max(index) + 1} and is {@code true} exactly at the given
     * indices. For an empty index array the result is a single-element mask containing
     * {@code false}.
     *
     * @param sourceFieldIndices the indices of the fields to include; checked with
     *     {@code Preconditions.checkNotNull}
     * @return the inclusion mask
     * @throws IllegalArgumentException if any index is negative
     */
    protected static boolean[] toBooleanMask(int[] sourceFieldIndices) {
        Preconditions.checkNotNull(sourceFieldIndices);

        // validate the indices and determine the largest one, which fixes the mask length
        int max = 0;
        for (int i : sourceFieldIndices) {
            if (i < 0) {
                throw new IllegalArgumentException("Field indices must not be smaller than zero.");
            }
            max = Math.max(i, max);
        }

        // mark every requested field as included
        boolean[] includedMask = new boolean[max + 1];
        for (int index : sourceFieldIndices) {
            includedMask[index] = true;
        }

        return includedMask;
    }

    /**
     * Returns a description consisting of the field delimiter (with control characters made
     * visible) and the file path.
     */
    @Override
    public String toString() {
        // NOTE(review): if getFieldDelimiter() returns an array type other than char[],
        // String.valueOf(...) prints the array identity instead of its content - confirm the
        // return type in the super class.
        return "CSV Input (" + StringUtils.showControlCharacters(String.valueOf(getFieldDelimiter())) + ") " + getFilePath();
    }
}