/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2017 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.csvinput;

import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.List;

import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.trans.step.BaseStepData;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.steps.textfileinput.EncodingType;

/**
 * Runtime data for the CsvInput step: an NIO-based read buffer over the input file
 * plus the scan pointers (startBuffer/endBuffer) used to tokenize fields.
 *
 * @author Matt
 * @since 24-jan-2005
 */
public class CsvInputData extends BaseStepData implements StepDataInterface {

  public FileChannel fc;
  public ByteBuffer bb;
  public RowMetaInterface convertRowMeta;
  public RowMetaInterface outputRowMeta;

  // Heap-side copy of the data read through the NIO direct buffer 'bb'.
  private byte[] byteBuffer;

  // Index of the first byte of the field currently being scanned.
  private int startBuffer;

  // Index one past the last byte examined so far (the scan cursor).
  private int endBuffer;

  // Number of valid data bytes currently held in byteBuffer.
  private int bufferSize;

  public byte[] delimiter;
  public byte[] enclosure;

  public int preferredBufferSize;
  public String[] filenames;
  public int filenr;
  public int startFilenr;
  public byte[] binaryFilename;
  public FileInputStream fis;

  public boolean isAddingRowNumber;
  public long rowNumber;

  public boolean stopReading;
  public int stepNumber;
  public int totalNumberOfSteps;
  public List<Long> fileSizes;
  public long totalFileSize;
  public long blockToRead;
  public long startPosition;
  public long endPosition;
  public long bytesToSkipInFirstFile;

  public long totalBytesRead;

  public boolean parallel;
  public int filenameFieldIndex;
  public int rownumFieldIndex;
  public EncodingType encodingType;
  public PatternMatcherInterface delimiterMatcher;
  public PatternMatcherInterface enclosureMatcher;
  public CrLfMatcherInterface crLfMatcher;
  public FieldsMapping fieldsMapping;

  /**
   * Data class for CsvInput step
   *
   * @see CsvInput
   */
  public CsvInputData() {
    super();
    byteBuffer = new byte[] {};
    startBuffer = 0;
    endBuffer = 0;
    totalBytesRead = 0;
  }

  /**
   * Grows the heap byte array so it can hold the unconsumed tail of the current
   * buffer plus one more full preferred-size read, then shifts the unconsumed
   * bytes [startBuffer..endBuffer) to the front of the new array.
   */
  private void resizeByteBufferArray() {
    // What's the new size?
    // It's (endBuffer-startBuffer)+size !!
    // That way we can at least read one full block of data using NIO
    //
    bufferSize = endBuffer - startBuffer;
    int newSize = bufferSize + preferredBufferSize;
    byte[] newByteBuffer = new byte[newSize + 100];

    // copy over the old data...
    System.arraycopy( byteBuffer, startBuffer, newByteBuffer, 0, bufferSize );

    // replace the old byte buffer...
    byteBuffer = newByteBuffer;

    // Adjust start and end point of data in the byte buffer
    //
    startBuffer = 0;
    endBuffer = bufferSize;
  }

  /**
   * Reads the next chunk from the file channel into the NIO buffer at position
   * endBuffer and mirrors the newly read bytes into the heap array.
   *
   * @return the number of bytes read, or a negative value at end-of-file.
   * @throws IOException in case of a read error.
   */
  private int readBufferFromFile() throws IOException {
    // See if the line is not longer than the buffer.
    // In that case we need to increase the size of the byte buffer.
    // Since this method doesn't get called every other character, I'm sure we can spend a bit of time here without
    // major performance loss.
    //
    if ( endBuffer >= bb.capacity() ) {
      resizeByteBuffer( (int) ( bb.capacity() * 1.5 ) );
    }
    bb.position( endBuffer );
    int n = fc.read( bb );
    if ( n >= 0 ) {
      // adjust the highest used position...
      //
      bufferSize = endBuffer + n;

      // Make sure we have room in the target byte buffer array
      //
      if ( byteBuffer.length < bufferSize ) {
        byte[] newByteBuffer = new byte[bufferSize];
        System.arraycopy( byteBuffer, 0, newByteBuffer, 0, byteBuffer.length );
        byteBuffer = newByteBuffer;
      }

      // Store the data in our byte array
      //
      bb.position( endBuffer );
      bb.get( byteBuffer, endBuffer, n );
    }
    return n;
  }

  /**
   * Replaces the NIO direct buffer with a larger one, copying over the remaining
   * readable content of the old buffer.
   *
   * @param newSize the capacity of the replacement buffer.
   */
  private void resizeByteBuffer( int newSize ) {
    ByteBuffer newBuffer = ByteBuffer.allocateDirect( newSize ); // Increase by 50%
    newBuffer.position( 0 );
    newBuffer.put( bb );
    bb = newBuffer;
  }

  /**
   * Check to see if the buffer size is large enough given the data.endBuffer pointer.<br>
   * Resize the buffer if there is not enough room.
   *
   * @return false if everything is OK, true if there is a problem and we should stop.
   * @throws IOException
   *           in case there is a I/O problem (read error)
   */
  boolean resizeBufferIfNeeded() throws IOException {
    if ( endOfBuffer() ) {
      // Oops, we need to read more data...
      // Better resize this before we read other things in it...
      //
      resizeByteBufferArray();

      // Also read another chunk of data, now that we have the space for it...
      //
      int n = readBufferFromFile();

      // If we didn't manage to read something, we return true to indicate we're done
      //
      return n < 0;
    }
    return false;
  }

  /**
   * Moves the endBuffer pointer by one.<br>
   * If there is not enough room in the buffer to go there, resize the byte buffer and read more data.<br>
   * if there is no more data to read and if the endBuffer pointer has reached the end of the byte buffer, we return
   * true.<br>
   *
   * @return true if we reached the end of the byte buffer.
   * @throws IOException
   *           In case we get an error reading from the input file.
   */
  boolean moveEndBufferPointer() throws IOException {
    return moveEndBufferPointer( true );
  }

  /**
   * This method should be used very carefully. Moving pointer without increasing number of written bytes
   * can lead to data corruption.
   *
   * @param increaseTotalBytes whether the advance also counts toward totalBytesRead.
   * @return true if we reached the end of the byte buffer.
   * @throws IOException In case we get an error reading from the input file.
   */
  boolean moveEndBufferPointer( boolean increaseTotalBytes ) throws IOException {
    endBuffer++;
    if ( increaseTotalBytes ) {
      totalBytesRead++;
    }
    return resizeBufferIfNeeded();
  }

  /**
   * Collapses doubled (escaped) enclosure bytes into single ones.
   *
   * <pre>
   * [abcd "" defg] --> [abcd " defg]
   * [""""] --> [""]
   * [""] --> ["]
   * </pre>
   *
   * @param field the raw field bytes.
   * @param nrEnclosuresFound the number of escaped enclosures to remove (determines result length).
   * @return the byte array with escaped enclosures escaped.
   */
  byte[] removeEscapedEnclosures( byte[] field, int nrEnclosuresFound ) {
    byte[] result = new byte[field.length - nrEnclosuresFound];
    int resultIndex = 0;
    for ( int i = 0; i < field.length; i++ ) {
      result[resultIndex++] = field[i];
      if ( field[i] == enclosure[0] && i + 1 < field.length && field[i + 1] == enclosure[0] ) {
        // Skip the escaped enclosure after adding the first one
        i++;
      }
    }
    return result;
  }

  /**
   * Extracts the bytes of the field between startBuffer and endBuffer, trimming the
   * trailing newline and/or surrounding enclosures when present.
   *
   * @param delimiterFound true if the scan stopped on a delimiter (unused here; kept for the caller's contract).
   * @param enclosureFound true if the field is wrapped in enclosure characters to strip.
   * @param newLineFound true if the scan stopped on a newline to strip.
   * @param endOfBuffer true if the scan stopped because the data ran out (no newline bytes to strip).
   * @return the extracted field bytes (possibly empty, never null).
   */
  byte[] getField( boolean delimiterFound, boolean enclosureFound, boolean newLineFound, boolean endOfBuffer ) {
    int fieldStart = startBuffer;
    int fieldEnd = endBuffer;

    if ( newLineFound && !endOfBuffer ) {
      fieldEnd -= encodingType.getLength();
    }
    if ( enclosureFound ) {
      fieldStart += enclosure.length;
      fieldEnd -= enclosure.length;
    }

    // Guard against a negative span (e.g. an empty enclosed field).
    int length = Math.max( 0, fieldEnd - fieldStart );
    byte[] field = new byte[length];
    System.arraycopy( byteBuffer, fieldStart, field, 0, length );
    return field;
  }

  /**
   * Closes the file channel and the underlying input stream.
   * FIX: the stream is now closed even when closing the channel throws, and the
   * error message's quote around the filename is properly terminated.
   *
   * @throws KettleException if closing either resource fails.
   */
  void closeFile() throws KettleException {
    try {
      try {
        if ( fc != null ) {
          fc.close();
        }
      } finally {
        // Always attempt to close the stream, even if the channel close failed.
        if ( fis != null ) {
          fis.close();
        }
      }
    } catch ( IOException e ) {
      throw new KettleException( "Unable to close file channel for file '" + filenames[filenr - 1] + "'", e );
    }
  }

  int getStartBuffer() {
    return startBuffer;
  }

  void setStartBuffer( int startBuffer ) {
    this.startBuffer = startBuffer;
  }

  int getEndBuffer() {
    return endBuffer;
  }

  /** @return true if the byte at the scan cursor is a CR or LF for the current encoding. */
  boolean newLineFound() {
    return crLfMatcher.isReturn( byteBuffer, endBuffer ) || crLfMatcher.isLineFeed( byteBuffer, endBuffer );
  }

  /** @return true if the delimiter pattern starts at the scan cursor. */
  boolean delimiterFound() {
    return delimiterMatcher.matchesPattern( byteBuffer, endBuffer, delimiter );
  }

  /** @return true if the enclosure pattern starts at the scan cursor. */
  boolean enclosureFound() {
    return enclosureMatcher.matchesPattern( byteBuffer, endBuffer, enclosure );
  }

  /** @return true if the scan cursor has consumed all buffered data. */
  boolean endOfBuffer() {
    return endBuffer >= bufferSize;
  }
}