// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataprofiler.core.process;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import org.talend.dataprofiler.core.i18n.internal.DefaultMessagesImpl;
/**
 * Reads a delimited text file record by record, using caller-supplied field and record delimiters (each of which
 * may be longer than one character).
 * <p>
 * The file is consumed through a fixed-size character buffer ({@link StreamBuffer}). When a column straddles two
 * buffer fills, the part already scanned is parked in a {@link ColumnBuffer4Joiner} and joined with the remainder
 * once the next chunk has been read.
 * </p>
 * <p>
 * Typical use: construct, optionally call {@link #skipHeaders(int)}, loop on {@link #readRecord()} reading values
 * with {@link #get(int)}, then {@link #close()}.
 * </p>
 *
 * @author xtan
 */
public class DelimitedFileReader {

    // when true, the parsing loop dumps the stream buffer state to System.out on every step
    private boolean debug = false;

    /*--------------1------------------- */
    private BufferedReader inputStream = null;

    private StreamBuffer streamBuffer = null;

    private ColumnBuffer4Joiner columnBuffer = null;

    /*--------------2------------------- */
    // column values of the current record; only the first columnsCount entries are meaningful
    private String[] values = new String[StaticSettings.INITIAL_COLUMN_COUNT];

    private int columnsCount = 0;

    private long currentRecord = 0;

    private boolean skipEmptyRecord = false;

    /*--------------3------------------- */
    // loop exit flag of the parser: set by endRecord() once a record has been completed
    private boolean hasReadRecord = false;

    // when false, the MAX_CHARS_IN_ONE_COLUMN / MAX_COLUMNS_IN_ONE_RECORD limits are enforced
    private boolean autoReallocateForHuge = true;

    // true as soon as inputStream has been opened, i.e. there is something to close
    private boolean initialized = false;

    private boolean closed = false;

    /*--------------4------------------- */
    // kept public for backward compatibility; prefer setSplitRecord(boolean)
    public boolean splitRecord = false;

    /*--------------5------------------- */
    /**
     * Opens the file and reads the first buffer of data.
     *
     * @param fileName path of the file to read, must not be null
     * @param encoding name of the charset used to decode the file
     * @param fieldDelimiter the field separator, must not be null (may be empty)
     * @param recordDelimiter the record separator, must not be null
     * @param needSkipEmptyRecord if true, records having no column or a single empty column are skipped
     * @throws IllegalArgumentException if fileName, fieldDelimiter or recordDelimiter is null
     * @throws IOException if the file cannot be opened, the encoding is not supported or the first read fails
     */
    public DelimitedFileReader(String fileName, String encoding, String fieldDelimiter, String recordDelimiter,
            boolean needSkipEmptyRecord) throws IOException {
        if (fileName == null || fieldDelimiter == null || recordDelimiter == null) {
            throw new IllegalArgumentException(DefaultMessagesImpl.getString("DelimitedFileReader_NullPara")); //$NON-NLS-1$
        }
        skipEmptyRecord = needSkipEmptyRecord;

        FileInputStream fileStream = new FileInputStream(fileName);
        try {
            inputStream = new BufferedReader(new InputStreamReader(fileStream, encoding));
        } finally {
            if (inputStream == null) {
                // wrapping failed (e.g. unsupported encoding): do not leak the file handle
                try {
                    fileStream.close();
                } catch (IOException ignored) {
                    // best effort, the original failure is the one that is propagated
                }
            }
        }
        // from here on close(boolean) is able to release the stream, even if the first read below fails
        initialized = true;

        columnBuffer = new ColumnBuffer4Joiner();
        streamBuffer = new StreamBuffer(fieldDelimiter, recordDelimiter);
    }

    /**
     * Reads the next record and makes its columns available through {@link #get(int)}.
     * <p>
     * At each position the delimiter selected by {@link #setSplitRecord(boolean)} is tested first, the other one
     * second.
     * </p>
     *
     * @return true if a record has been read, false when there is no more data
     * @throws IOException if the reader is closed, a read fails, or a size limit is exceeded while
     * autoReallocateForHuge is false
     */
    public boolean readRecord() throws IOException {
        checkClosed();
        // the mode is fixed for the whole record
        final boolean recordDelimiterFirst = splitRecord;
        boolean enteredLoop = false;
        hasReadRecord = false;
        columnsCount = 0;
        streamBuffer.columnStart = streamBuffer.currentPosition;

        while (streamBuffer.hasMoreData() && !hasReadRecord) {
            if (debug) {
                System.out.println(streamBuffer);
            }
            enteredLoop = true;
            if (streamBuffer.needJoinReadNextBuffer()) {
                joinAndRead();
            }

            boolean atRecordDelimiter;
            boolean atFieldDelimiter;
            if (recordDelimiterFirst) {
                atRecordDelimiter = streamBuffer.isStartRecordDelimited();
                atFieldDelimiter = !atRecordDelimiter && streamBuffer.isStartFieldDelimited();
            } else {
                atFieldDelimiter = streamBuffer.isStartFieldDelimited();
                atRecordDelimiter = !atFieldDelimiter && streamBuffer.isStartRecordDelimited();
            }

            if (atRecordDelimiter) {
                endColumn();
                endRecord();
            } else if (atFieldDelimiter) {
                endColumn();
                streamBuffer.skipFieldDelimiter();
            } else {
                streamBuffer.currentPosition++;
                checkColumnLength();
            }
        }

        // the data ended without a trailing record delimiter, e.g. aaa;bbb#111;222 : flush the last record
        if (!hasReadRecord && enteredLoop) {
            endColumn();
            endRecord();
        }
        return hasReadRecord;
    }

    /**
     * Enforces the maximum number of chars in one column when autoReallocateForHuge is disabled.
     *
     * @throws IOException if the column being scanned is longer than the limit; the reader is closed first
     */
    private void checkColumnLength() throws IOException {
        if (autoReallocateForHuge) {
            return;
        }
        int pendingChars = streamBuffer.currentPosition - streamBuffer.columnStart + columnBuffer.position;
        if (pendingChars > StaticSettings.MAX_CHARS_IN_ONE_COLUMN) {
            close();
            throw new IOException(DefaultMessagesImpl.getString(
                    "DelimitedFileReader_OverLimitColumn", StaticSettings.MAX_CHARS_IN_ONE_COLUMN)); //$NON-NLS-1$
        }
    }

    /**
     * Parks the chars of the column being scanned in the column buffer, then refills the stream buffer.
     */
    private void joinAndRead() throws IOException {
        columnBuffer.saveCharInJoiner();
        streamBuffer.joinReadNextBuffer();
    }

    /**
     * Finishes the current record: skips the record delimiter and, unless the record is empty and empty records
     * are skipped, flags the record as read and increments the record counter.
     */
    private void endRecord() throws IOException {
        streamBuffer.skipRecordDelimiter();
        if (skipEmptyRecord) {
            if (columnsCount == 0 || (columnsCount == 1 && values[0].equals(""))) { //$NON-NLS-1$
                columnsCount = 0;// reset the columnsCount = 0 is a must
                streamBuffer.columnStart = streamBuffer.currentPosition;
                return;
            }
        }
        // this flag is used as a loop exit condition during parsing
        hasReadRecord = true;
        currentRecord++;
    }

    /**
     * Finishes the current column and stores its value in the values array, growing the array when it is full.
     *
     * @exception IOException Thrown if a very rare extreme exception occurs during parsing, normally resulting from
     * improper data format.
     */
    private void endColumn() throws IOException {
        String currentValue;
        if (columnBuffer.position == 0) {
            // the whole column lies inside the current stream buffer
            currentValue = new String(streamBuffer.buffer, streamBuffer.columnStart, streamBuffer.currentPosition
                    - streamBuffer.columnStart);
        } else {
            // part of the column was parked earlier: append the chars still in the stream buffer
            columnBuffer.saveCharInJoiner();
            currentValue = new String(columnBuffer.buffer, 0, columnBuffer.position);
        }
        columnBuffer.position = 0;

        // for checking one record can support the max number of columns
        if (!autoReallocateForHuge && columnsCount >= StaticSettings.MAX_COLUMNS_IN_ONE_RECORD) {
            close();
            throw new IOException(DefaultMessagesImpl.getString(
                    "DelimitedFileReader_OverLimitRecord", StaticSettings.MAX_COLUMNS_IN_ONE_RECORD)); //$NON-NLS-1$
        }

        if (columnsCount == values.length) {
            String[] holder = new String[values.length * 2];
            System.arraycopy(values, 0, holder, 0, values.length);
            values = holder;
        }
        values[columnsCount] = currentValue;
        columnsCount++;
    }

    /**
     * Closes the underlying stream once; later calls do nothing.
     *
     * @param closing if true, the internal char buffers are released as well
     */
    private void close(boolean closing) {
        if (closed) {
            return;
        }
        if (closing) {
            // both buffers may still be null when the constructor fails half way
            if (streamBuffer != null) {
                streamBuffer.buffer = null;
            }
            if (columnBuffer != null) {
                columnBuffer.buffer = null;
            }
        }
        try {
            if (initialized && inputStream != null) {
                inputStream.close();
            }
        } catch (Exception ignored) {
            // deliberately ignored: closing is best effort and must not fail (it is also called by finalize())
        }
        inputStream = null;
        closed = true;
    }

    /**
     * when read a new record or get content of the column, there should check the stream first.
     *
     * @throws IOException if the reader has already been closed
     */
    private void checkClosed() throws IOException {
        if (closed) {
            throw new IOException(DefaultMessagesImpl.getString("DelimitedFileReader_StreamClosed")); //$NON-NLS-1$
        }
    }

    /**
     * get one column value with a given column index for the current record.
     *
     * @param columnIndex zero-based index of the column
     * @return the column value, or an empty string when the index is outside the current record
     * @throws IOException if the reader has already been closed
     */
    public String get(int columnIndex) throws IOException {
        checkClosed();
        if (columnIndex < 0 || columnIndex >= columnsCount) {
            return ""; //$NON-NLS-1$
        }
        return values[columnIndex];
    }

    /**
     * Rebuilds the current record as one string by joining its columns with the field delimiter.
     *
     * @return the joined record; an empty string when the current record has no column
     */
    public String getRowRecord() {
        // here can fast for the common record separator function with fieldDelimiter=""
        // because there is only one field.
        if (columnsCount == 1) {
            return values[0];
        }
        StringBuilder row = new StringBuilder();
        for (int i = 0; i < columnsCount; i++) {
            if (i > 0) {
                row.append(streamBuffer.fieldDelimiter);// add the fieldDelimiter
            }
            row.append(values[i]);
        }
        return row.toString();
    }

    /**
     * get the result how many records have been read.
     *
     * @return the number of records read since construction or since the last {@link #skipHeaders(int)}
     */
    public long getProcessedRecordCount() {
        return currentRecord;
    }

    /**
     * <p>
     * when skip the some records of the beginning and some records of the end in the file, then compute how many
     * records are there.
     * </p>
     * <p>
     * it can use after the skipHeaders();
     * </p>
     * <p>
     * Note: this consumes all the remaining records of the file.
     * </p>
     *
     * @param footer number of trailing records to subtract from the count
     * @return the number of processed records minus footer
     * @throws IOException if the reader is closed or reading fails
     */
    public long getAvailableRowCount(int footer) throws IOException {
        checkClosed();
        while (readRecord()) {
            // nothing to do: the records are only counted
        }
        return currentRecord - footer;
    }

    /**
     * @return the number of columns of the current record
     * @throws IOException never thrown by this implementation; declared for backward compatibility
     */
    public int getAvailableColumnsCount() throws IOException {
        return columnsCount;
    }

    /**
     * skip the some records of the beginning in the file, and set the "currentRecord = 0"
     *
     * @param header number of records to skip; nothing happens when it is not positive
     * @throws IOException if the reader is closed or reading fails
     */
    public void skipHeaders(int header) throws IOException {
        checkClosed();
        if (header <= 0) {
            return;
        }
        for (int skipped = 0; skipped < header; skipped++) {
            readRecord();
        }
        currentRecord = 0;
    }

    /**
     * <p>
     * When set to false, the following limits are enforced and an IOException is thrown when one is exceeded:
     * </p>
     * <li>public static final int MAX_CHARS_IN_ONE_COLUMN = 100000;</li> <li>public static final int
     * MAX_COLUMNS_IN_ONE_RECORD = 100000;</li>
     * <p>
     * The default value is true (no limit check).
     * </p>
     */
    public void setAutoReallocateForHuge(boolean autoReallocateForHuge) {
        this.autoReallocateForHuge = autoReallocateForHuge;
    }

    /**
     * <p>
     * if the input datas like this: 111;222;#aaa;bbb;# (row separator: ;# field separator: ; )
     * </p>
     * <li>if FirstSplit_RecordSeparator, there will get (2 records, 2 columns)</li> <li>if FirstSplit_FieldSeparator,
     * there will get (2 records, 3 columns)</li>
     * <p>
     * The default value is false, it means split the field first.
     * </p>
     */
    public void setSplitRecord(boolean splitRecord) {
        this.splitRecord = splitRecord;
    }

    /**
     * Closes and releases all related resources. Calling it more than once has no further effect.
     */
    public void close() {
        close(true);
    }

    /**
     * Safety net: closes the stream if the caller forgot to, without touching the buffers.
     */
    @Override
    protected void finalize() {
        close(false);
    }

    /**
     * a buffer: save the end chars of the last buffer and begin chars of the current buffer in the memory
     *
     * @author xtan
     *
     */
    private class ColumnBuffer4Joiner {

        char[] buffer;

        // number of chars currently parked in buffer
        int position;

        public ColumnBuffer4Joiner() {
            buffer = new char[StaticSettings.INITIAL_COLUMN_BUFFER_SIZE];
            position = 0;
        }

        /**
         * join TWO buffer together, save the end chars of the last buffer and begin chars of the current buffer in one
         * place
         */
        public void saveCharInJoiner() {
            int freeSpace = buffer.length - position;
            int pendingChars = 0;
            if (streamBuffer.count > 0) {// a must
                pendingChars = streamBuffer.currentPosition - streamBuffer.columnStart;
            }
            // check and expand more memory for buffer
            if (freeSpace < pendingChars) {
                char[] holder = new char[buffer.length + Math.max(pendingChars, buffer.length)];
                System.arraycopy(buffer, 0, holder, 0, position);
                buffer = holder;
            }
            // copy datas from streamBuffer to columnBuffer for save it temporarily
            System.arraycopy(streamBuffer.buffer, streamBuffer.columnStart, buffer, position, pendingChars);
            position += pendingChars;
        }
    }

    /**
     * <b> a buffer with function: join the last data and read the next buffer, for supporting the multi-separator</b>
     * Notice: "count, currentPosition, lastIndexToRead, columnStart", they are important here
     *
     * @author xtan
     */
    private class StreamBuffer {

        /*--------------1------------------- */
        private char[] buffer;

        private char[] fieldDelimiter;

        private char[] recordDelimiter;

        /*--------------2------------------- */
        private boolean lineFeedAll = false;

        // if LineMode.LINEFEED_ALL, if "\r\n", variableLineFeed = 2, if "\r" or "\n", variableLineFeed = 1
        // only used to adjust the skipRecordDelimiter()
        private int variableLineFeed = 0;

        /*--------------3------------------- */
        private int maxLimit;// maxLimit = Math.max(fieldDelimiter.length, recordDelimiter.length);

        private int currentPosition;

        // for join read
        private int lastIndexToRead;// lastIndexToRead = count - maxLimit, special: 5 = 5 - 0

        // number of valid chars in buffer
        private int count;

        private int columnStart;

        /*--------------4------------------- */
        // end of the file
        private boolean streamEndMeet = false;

        /**
         * Resolves the delimiters and reads the first chunk of data.
         *
         * @throws IOException if the first read fails; the reader is closed before the exception is rethrown
         */
        public StreamBuffer(String fieldDelimiterPara, String recordDelimiterPara) throws IOException {
            buffer = new char[StaticSettings.MAX_BUFFER_SIZE];

            if (recordDelimiterPara.equals("\\n") && StaticSettings.LINEMODE == LineMode.LINEFEED_ALL) { //$NON-NLS-1$
                // notice: here we set it "\r\n", neither "\n" nor "\r", because there want max length
                recordDelimiter = "\r\n".toCharArray(); //$NON-NLS-1$
                lineFeedAll = true;
            } else {
                recordDelimiter = recordDelimiterPara.toCharArray();
            }
            fieldDelimiter = fieldDelimiterPara.toCharArray();
            maxLimit = Math.max(fieldDelimiter.length, recordDelimiter.length);

            // read one buffer datas first when initial
            int firstRead;
            try {
                firstRead = inputStream.read(buffer, 0, buffer.length);
            } catch (IOException ex) {
                close();
                throw ex;
            }
            // Only -1 means end of stream. A read shorter than the buffer does not (same rule as in
            // joinReadNextBuffer(), see bug 4554); the end is then detected by the next join read.
            streamEndMeet = firstRead == -1;
            count = streamEndMeet ? 0 : firstRead;
            currentPosition = 0;
            columnStart = 0;
            lastIndexToRead = count - maxLimit;
        }

        /**
         * @return true when the position is so close to the end of the valid data that a delimiter could be cut in
         * two, and the stream has not ended yet
         */
        public boolean needJoinReadNextBuffer() {
            // 5-2=3, when currentPostion=3; or 5-0=5, when currentPosition=5, and file not end, so read next buffer
            return currentPosition >= lastIndexToRead && !streamEndMeet;
            /* notice: here is >=, not > */
        }

        // when count=5, currentPosition=4 ===> hasMoreData()=true, it mean buffer[4] still need process
        public boolean hasMoreData() {
            return !streamEndMeet || currentPosition < count;
        }

        /**
         * Moves the unprocessed chars to the start of the buffer and resets the positions accordingly.
         */
        private void moveTailToHead() {
            count = count - currentPosition;
            // System.arraycopy handles overlapping source and destination ranges correctly
            System.arraycopy(buffer, currentPosition, buffer, 0, count);
            lastIndexToRead = count - maxLimit;
            currentPosition = 0;
            columnStart = 0;
        }

        /**
         * Keeps the unprocessed tail and fills the rest of the buffer from the stream.
         *
         * @throws IOException if the read fails; the reader is closed before the exception is rethrown
         */
        public void joinReadNextBuffer() throws IOException {
            moveTailToHead();
            int maxReadLength = buffer.length - count;
            int readCount;
            try {
                readCount = inputStream.read(buffer, count, maxReadLength);
            } catch (IOException ex) {
                close();
                throw ex;
            }
            if (debug) {
                System.out.println("##maxReadLength=" + maxReadLength + " newReadCount=" + readCount); //$NON-NLS-1$ //$NON-NLS-2$
                System.out.println(this);
            }
            /* @see bug:http://talendforge.org/bugs/view.php?id=4554 */
            // a short read is not the end of the stream, only -1 is
            if (readCount == -1) {
                streamEndMeet = true;
            } else {
                count += readCount;
                lastIndexToRead = count - maxLimit;
            }
        }

        /**
         * @return true if the given non-empty delimiter is entirely present at the current position
         */
        private boolean startsWithDelimiter(char[] delimiter) {
            int maxLengthCanCheck = count - currentPosition;
            if (delimiter.length == 0 || delimiter.length > maxLengthCanCheck) {// test here is a must
                return false;
            }
            for (int i = 0; i < delimiter.length; i++) {
                if (buffer[currentPosition + i] != delimiter[i]) {
                    return false;
                }
            }
            return true;
        }

        public boolean isStartFieldDelimited() {
            return startsWithDelimiter(fieldDelimiter);
        }

        /**
         * Tests for a record delimiter at the current position. In "all line feeds" mode "\n", "\r" and "\r\n" all
         * match, and the matched length is remembered for skipRecordDelimiter().
         */
        public boolean isStartRecordDelimited() {
            if (!lineFeedAll) {
                return startsWithDelimiter(recordDelimiter);
            }
            int maxLengthCanCheck = count - currentPosition;
            if (maxLengthCanCheck <= 0) {// maxLengthCanCheck > 0 is must
                return false;
            }
            char current = buffer[currentPosition];
            if (current == '\n') {
                variableLineFeed = 1;
                return true;
            }
            if (current == '\r') {
                // check the bound BEFORE looking at the next char: it may be stale or outside the array
                boolean followedByLineFeed = maxLengthCanCheck > 1 && buffer[currentPosition + 1] == '\n';
                variableLineFeed = followedByLineFeed ? 2 : 1;
                return true;
            }
            return false;
        }

        public void skipFieldDelimiter() {
            currentPosition += fieldDelimiter.length;
            columnStart = currentPosition;// means: start a new Column
        }

        public void skipRecordDelimiter() {
            currentPosition += lineFeedAll ? variableLineFeed : recordDelimiter.length;
        }

        /**
         * Debug dump of the buffer state. It never throws, even when the buffer has been released or the position
         * is outside the buffer.
         */
        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append("count=").append(count).append("\n"); //$NON-NLS-1$ //$NON-NLS-2$
            sb.append("maxLimit=").append(maxLimit).append("\n"); //$NON-NLS-1$ //$NON-NLS-2$
            sb.append("lastIndexToRead=").append(lastIndexToRead).append("\n"); //$NON-NLS-1$ //$NON-NLS-2$
            sb.append("currentPosition=").append(currentPosition).append("\n"); //$NON-NLS-1$ //$NON-NLS-2$
            sb.append("columnStart=").append(columnStart).append("\n"); //$NON-NLS-1$ //$NON-NLS-2$
            sb.append("streamEndMeet=").append(streamEndMeet).append("\n"); //$NON-NLS-1$ //$NON-NLS-2$
            sb.append("overMaxLimit()=").append(needJoinReadNextBuffer()).append("\n"); //$NON-NLS-1$ //$NON-NLS-2$
            sb.append("hasMoreData()=").append(hasMoreData()).append("\n"); //$NON-NLS-1$ //$NON-NLS-2$
            sb.append("char[]="); //$NON-NLS-1$
            if (buffer != null) {
                sb.append(buffer);
            }
            sb.append("\n"); //$NON-NLS-1$
            sb.append("char[").append(currentPosition).append("]="); //$NON-NLS-1$ //$NON-NLS-2$
            if (buffer != null && currentPosition >= 0 && currentPosition < buffer.length) {
                sb.append(buffer[currentPosition]);
            }
            sb.append("\n"); //$NON-NLS-1$
            sb.append("fieldDelimiterLength=").append(fieldDelimiter.length).append("\n"); //$NON-NLS-1$ //$NON-NLS-2$
            sb.append("recordDelimiterLength=").append(recordDelimiter.length).append("\n"); //$NON-NLS-1$ //$NON-NLS-2$
            return sb.toString();
        }
    }

    /**
     * How a "\n" record delimiter given by the caller is interpreted.
     */
    public enum LineMode {
        // it means, if input recordDelimiter is "\n",
        // if JRE on windows, it will be "\r\n",
        // if JRE on linux, it will be "\n",
        // if JRE on Mac, it will be "\r"
        LINEFEED_JRE,
        // it means, if input recordDelimiter is "\n", it will treat all the "\r", "\n", "\r\n" as recordDelimiter
        LINEFEED_ALL,
        // it means, if input recordDelimiter is "\n", just treat it as "\n"
        LINEFEED_NORMAL;
    }

    /**
     * Compile-time settings of the reader: buffer sizes, size limits and the line feed mode.
     */
    private static class StaticSettings {

        public static final int MAX_BUFFER_SIZE = 1024;

        public static final int INITIAL_COLUMN_COUNT = 10;

        public static final int INITIAL_COLUMN_BUFFER_SIZE = 50;

        public static final int MAX_CHARS_IN_ONE_COLUMN = 100000;

        public static final int MAX_COLUMNS_IN_ONE_RECORD = 100000;

        // how do we process the "\n" as recordDelimiter
        public static final LineMode LINEMODE = LineMode.LINEFEED_ALL;

        // public static final boolean autoReallocateForHuge = true;

        private StaticSettings() {
            // constants holder, not meant to be instantiated
        }
    }

    /**
     * Manual smoke test: dumps every record and column of a hard-coded sample file to System.out.
     */
    public static void main(String[] args) throws IOException {
        DelimitedFileReader fid = new DelimitedFileReader("D:\\talend\\talendFID\\in.csv", "ISO-8859-15", "", "\n", false); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
        try {
            int rowNum = 0;
            while (fid.readRecord()) {
                System.out.println("*********Row" + rowNum + "***********"); //$NON-NLS-1$ //$NON-NLS-2$
                System.out.println("------Row------\n" + fid.getRowRecord()); //$NON-NLS-1$
                int fieldNum = fid.getAvailableColumnsCount();
                for (int k = 0; k < fieldNum; k++) {
                    System.out.println("------" + k + "------\n" + fid.get(k)); //$NON-NLS-1$ //$NON-NLS-2$
                }
                rowNum++;
                System.out.println("\n\n"); //$NON-NLS-1$
            }
        } finally {
            fid.close();
        }
    }
}