/*******************************************************************************
* Copyright 2017 Capital One Services, LLC and Bitwise, Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License
*******************************************************************************/
package hydrograph.engine.hadoop.recordreader;
import cascading.tap.TapException;
import hydrograph.engine.cascading.scheme.DelimitedAndFixedWidthHelper;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
/**
 * {@link RecordReader} for input whose records are a fixed sequence of
 * fields, each of which is either fixed-width or terminated by a delimiter.
 *
 * <p>
 * The layout is read from the job configuration:
 * <ul>
 * <li>{@code lengthsAndDelimiters} - per field, either a character count or
 * a delimiter string (post-processed by
 * {@link DelimitedAndFixedWidthHelper});</li>
 * <li>{@code lengthsAndDelimitersType} - per field, a type name; a field is
 * treated as fixed-width when its type name contains {@code "Integer"},
 * otherwise as delimited;</li>
 * <li>{@code quote} - optional quote string; only its first character is
 * used, and delimiters are not matched while a quote is open;</li>
 * <li>{@code charsetName} - optional charset name, {@code UTF-8} when
 * absent.</li>
 * </ul>
 *
 * <p>
 * The emitted value is the raw text of one record, delimiters included. The
 * key passed to {@link #next(LongWritable, Text)} is never modified.
 *
 * <p>
 * NOTE(review): {@code pos} is advanced by re-encoding the decoded characters,
 * so it is only an approximation of the byte offset when the input contains
 * malformed bytes or characters that are decoded one UTF-16 unit at a time
 * (supplementary characters in delimited fields).
 */
@SuppressWarnings("unused")
public class DelimitedAndFixedWidthRecordReader implements
        RecordReader<LongWritable, Text> {

    private static final int DEFAULT_BUFFER_SIZE = 64 * 1024;

    String charsetName = "UTF-8";
    String quote;
    /** Accumulates the text of the record currently being read. */
    StringBuilder stringBuilder;
    /** Byte offset of the first byte of the split. */
    long start;
    /** Byte offset just past the last byte of the split. */
    long end;
    /** Estimated current byte offset within the file. */
    long pos;
    FileSystem fs;
    FSDataInputStream fileIn;
    private String[] lengthsAndDelimiters, lengthsAndDelimitersType;
    final Path file;
    InputStreamReader inputStreamReader;
    char[] singleChar, multipleChars;
    boolean isQuotePresent = false;

    /**
     * Opens the file behind {@code split}, seeks to the split start and
     * loads the record layout from {@code conf}.
     *
     * @param conf
     *            job configuration carrying the layout keys described on the
     *            class
     * @param split
     *            the portion of the file this reader is responsible for
     * @throws IOException
     *             if the file cannot be opened or positioned, or the
     *             configured charset is not supported
     */
    public DelimitedAndFixedWidthRecordReader(JobConf conf, FileSplit split)
            throws IOException {
        lengthsAndDelimiters = DelimitedAndFixedWidthHelper
                .modifyIdentifier(DelimitedAndFixedWidthHelper
                        .stringToArray(conf.get("lengthsAndDelimiters")));
        lengthsAndDelimitersType = conf.getStrings("lengthsAndDelimitersType");
        quote = conf.get("quote");
        // Keep the UTF-8 default when the job does not configure a charset;
        // passing null to InputStreamReader would throw a NullPointerException.
        String configuredCharset = conf.get("charsetName");
        if (configuredCharset != null) {
            charsetName = configuredCharset;
        }
        start = split.getStart();
        pos = start;
        end = start + split.getLength();
        file = split.getPath();
        fs = file.getFileSystem(conf);
        fileIn = fs.open(file);
        boolean initialised = false;
        try {
            fileIn.seek(start);
            inputStreamReader = new InputStreamReader(fileIn, charsetName);
            initialised = true;
        } finally {
            // Do not leak the open stream when seek or charset lookup fails.
            if (!initialised) {
                fileIn.close();
            }
        }
        singleChar = new char[1];
        stringBuilder = new StringBuilder();
        isQuotePresent = isQuotePresent(quote);
    }

    /**
     * @return {@code true} when {@code string} is neither {@code null} nor
     *         empty
     */
    private boolean isQuotePresent(String string) {
        return string != null && !string.equals("");
    }

    /**
     * Closes the character reader and the underlying file stream.
     *
     * @throws IOException
     *             if either close operation fails
     */
    @Override
    public void close() throws IOException {
        try {
            inputStreamReader.close();
        } finally {
            // Always release the file stream, even if closing the reader fails.
            fileIn.close();
        }
    }

    @Override
    public LongWritable createKey() {
        return new LongWritable();
    }

    @Override
    public Text createValue() {
        return new Text("");
    }

    /**
     * @return the fraction of the split consumed so far, in {@code [0, 1]};
     *         {@code 0} for an empty split
     */
    @Override
    public synchronized float getProgress() throws IOException {
        // Guard the division for an empty split. A fully consumed, non-empty
        // split must report 1.0, not 0.0.
        if (start == end) {
            return 0.0f;
        }
        return Math.min(1.0f, (pos - start) / (float) (end - start));
    }

    /**
     * Reads the next record into {@code value}.
     *
     * @param key
     *            unused; left unmodified
     * @param value
     *            receives the raw record text, delimiters included
     * @return {@code true} if a record was read; {@code false} when the split
     *         is exhausted or only a trailing line terminator remained
     * @throws IOException
     *             if reading from the underlying stream fails
     * @throws TapException
     *             if the remaining data does not match the configured layout
     */
    @Override
    public synchronized boolean next(LongWritable key, Text value)
            throws IOException {
        stringBuilder.setLength(0);
        if (isEOFEncountered()) {
            return false;
        }
        // Set when all that was left in the split was a line terminator.
        boolean onlyTrailingNewlineLeft = false;
        // Quote state is shared by all delimited fields of the record.
        boolean quoteCharFound = false;
        for (int i = 0; i < lengthsAndDelimiters.length
                && !onlyTrailingNewlineLeft; i++) {
            if (lengthsAndDelimitersType[i].contains("Integer")) {
                onlyTrailingNewlineLeft = readFixedWidthField(Integer
                        .parseInt(lengthsAndDelimiters[i]));
            } else {
                String delimiter = lengthsAndDelimiters[i];
                int matchedDelimiterChars = 0;
                boolean fieldNotFound = true;
                do {
                    if (!isEOFEncountered() && readSingleChar()) {
                        if (isQuotePresent && isQuoteChar(singleChar[0])) {
                            quoteCharFound = !quoteCharFound;
                        }
                        // Delimiters inside an open quote are plain data.
                        if (!quoteCharFound) {
                            matchedDelimiterChars = advanceDelimiterMatch(
                                    delimiter, matchedDelimiterChars,
                                    singleChar[0]);
                            if (matchedDelimiterChars == delimiter.length()) {
                                fieldNotFound = false;
                            }
                        }
                        stringBuilder.append(singleChar);
                    } else {
                        String readSoFar = stringBuilder.toString();
                        if (readSoFar.contentEquals("\r\n")
                                || readSoFar.contentEquals("\n")) {
                            // Nothing but a final line terminator: not a
                            // record.
                            fieldNotFound = false;
                            stringBuilder.setLength(0);
                            onlyTrailingNewlineLeft = true;
                        } else {
                            String message = "The input data is not according to specified schema. Expected data with delimiters or lengths as: "
                                    + Arrays.toString(lengthsAndDelimiters)
                                    + ", gotW: " + stringBuilder.toString();
                            throw new TapException(message);
                        }
                    }
                } while (fieldNotFound);
            }
        }
        if (onlyTrailingNewlineLeft) {
            return false;
        }
        value.set(stringBuilder.toString());
        return true;
    }

    /**
     * Reads one fixed-width field of {@code fieldLength} characters and
     * appends it to the current record.
     *
     * @return {@code true} if, instead of a field, only a trailing line
     *         terminator remained in the split (the record buffer is cleared
     *         in that case); {@code false} if the field was read
     * @throws TapException
     *             if fewer than {@code fieldLength} characters are available
     */
    private boolean readFixedWidthField(int fieldLength) throws IOException {
        if (pos + fieldLength <= end) {
            multipleChars = new char[fieldLength];
            int charsRead = readFully(multipleChars);
            if (charsRead < fieldLength) {
                // The stream ended inside the field; report it instead of
                // emitting a record padded with NUL characters.
                stringBuilder.append(multipleChars, 0, charsRead);
                throw new TapException(fixedWidthSchemaMismatchMessage());
            }
            pos += encodedLength(multipleChars);
            stringBuilder.append(multipleChars);
            return false;
        }
        if ((isSecondLastChar() && isSecondLastCharNewline())
                || (isThirdLastChar() && isThirdLastCharNewline())) {
            stringBuilder.setLength(0);
            return true;
        }
        throw new TapException(fixedWidthSchemaMismatchMessage());
    }

    /** Builds the error message used when a fixed-width field cannot be read. */
    private String fixedWidthSchemaMismatchMessage() {
        return "The input data is not according to specified schema. Expected data with delimiters or lengths as "
                + Arrays.toString(lengthsAndDelimiters)
                + ", got: " + stringBuilder.toString();
    }

    /**
     * Fills {@code buffer} from the reader, looping because a single
     * {@code Reader.read} call may return fewer characters than requested.
     *
     * @return the number of characters stored; less than
     *         {@code buffer.length} only if the stream ended first
     */
    private int readFully(char[] buffer) throws IOException {
        int filled = 0;
        while (filled < buffer.length) {
            int count = inputStreamReader.read(buffer, filled, buffer.length
                    - filled);
            if (count < 0) {
                break;
            }
            filled += count;
        }
        return filled;
    }

    /**
     * Reads one character into {@code singleChar} and advances {@code pos}.
     *
     * @return {@code false} if the stream is exhausted; {@code pos} is then
     *         moved to {@code end} so the reader reports end of split
     */
    private boolean readSingleChar() throws IOException {
        if (inputStreamReader.read(singleChar) < 0) {
            pos = end;
            return false;
        }
        pos += encodedLength(singleChar);
        return true;
    }

    /** @return the number of bytes {@code chars} occupy in the configured charset */
    private int encodedLength(char[] chars) throws IOException {
        return new String(chars).getBytes(charsetName).length;
    }

    /**
     * Advances an incremental delimiter match by one character.
     *
     * @param delimiter
     *            the delimiter being searched for
     * @param matched
     *            number of leading delimiter characters matched so far
     * @param c
     *            the character just read
     * @return the number of leading delimiter characters matched after
     *         consuming {@code c}
     */
    private static int advanceDelimiterMatch(String delimiter, int matched,
            char c) {
        if (delimiter.charAt(matched) == c) {
            return matched + 1;
        }
        // Mismatch: the text seen so far may still end with a shorter prefix
        // of the delimiter (e.g. delimiter "ab" while reading "aab"), so fall
        // back to the longest such prefix instead of restarting from zero.
        String seen = delimiter.substring(0, matched) + c;
        for (int length = matched; length > 0; length--) {
            if (seen.endsWith(delimiter.substring(0, length))) {
                return length;
            }
        }
        return 0;
    }

    /** @return {@code true} if {@code c} is the first character of the quote string */
    private boolean isQuoteChar(char c) {
        return quote.charAt(0) == c;
    }

    /**
     * Consumes one character and reports whether the record buffer now holds
     * exactly a carriage return.
     */
    private boolean isThirdLastCharNewline() throws IOException {
        if (readSingleChar()) {
            stringBuilder.append(singleChar);
        }
        return stringBuilder.toString().contentEquals("\r");
    }

    /** @return {@code true} if exactly two bytes of the split remain */
    private boolean isThirdLastChar() {
        return pos == end - 2;
    }

    /**
     * Consumes one character and reports whether the record buffer now holds
     * exactly a line feed.
     */
    private boolean isSecondLastCharNewline() throws IOException {
        if (readSingleChar()) {
            stringBuilder.append(singleChar);
        }
        return stringBuilder.toString().contentEquals("\n");
    }

    /** @return {@code true} if exactly one byte of the split remains */
    private boolean isSecondLastChar() {
        return pos == end - 1;
    }

    @Override
    public synchronized long getPos() throws IOException {
        return pos;
    }

    /** @return {@code true} once {@code pos} has reached the end of the split */
    private boolean isEOFEncountered() {
        return pos >= end;
    }
}