/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2017 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.trans.steps.csvinput;
import org.junit.Test;
import org.pentaho.di.core.exception.KettleStepException;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.trans.step.RowAdapter;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepMetaDataCombi;
import org.pentaho.di.trans.steps.StepMockUtil;
import org.pentaho.di.trans.steps.mock.StepMockHelper;
import org.pentaho.di.trans.steps.textfileinput.TextFileInputField;
import java.io.File;
import static org.junit.Assert.assertEquals;
/**
* We take file with content
* and run it parallel with several steps.
* see docs for {@link CsvInput#prepareToRunInParallel} to understand how running file in parallel works
*
* We measure the correctness of work by counting the number of lines, written on each step.
* As a result, we should come to this pseudo formula: numberOfLines = sum of number of lines written by each step.
*
* Just a simple example:
* Assume, we have file with this content:
*
* a,b\r\n
* c,d\r\n
*
* If we will run it with 2 steps, we expect the first step to read 1st line, and the second step to read second line.
*
* Every test is built in this pattern.
*
* We actually play with 4 things:
* - file content
* - number of threads (it's actually same as number of steps)
* - representation of new line (it can be 2 bytes: '\r\n' (windows) or 1 byte: '\r' or '\n' (Mac, Linux) .
* Representation can differ. So, if we have different types of new lines in one file - it's ok.
* - file ends with new line or not
*/
public class CsvProcessRowInParallelTest extends CsvInputUnitTestBase {
@Test
public void oneByteNewLineIndicator_NewLineAtTheEnd_2Threads() throws Exception {
final int totalNumberOfSteps = 2;
final String fileContent =
"a;1\r"
+ "b;2\r";
File sharedFile = createTestFile( "UTF-8", fileContent );
assertEquals( createAndRunOneStep( sharedFile, 0, totalNumberOfSteps ), 1 );
assertEquals( createAndRunOneStep( sharedFile, 1, totalNumberOfSteps ), 1 );
}
@Test
public void oneByteNewLineIndicator_NoNewLineAtTheEnd_2Threads() throws Exception {
final int totalNumberOfSteps = 2;
final String fileContent =
"a;1\r"
+ "b;2\r"
+ "c;3";
File sharedFile = createTestFile( "UTF-8", fileContent );
assertEquals( createAndRunOneStep( sharedFile, 0, totalNumberOfSteps ), 2 );
assertEquals( createAndRunOneStep( sharedFile, 1, totalNumberOfSteps ), 1 );
}
@Test
public void PDI_15162_mixedByteNewLineIndicator_NewLineAtTheEnd_2Threads() throws Exception {
final int totalNumberOfSteps = 2;
final String fileContent =
"ab;111\r\n"
+ "bc;222\r\n"
+ "cd;333\r\n"
+ "de;444\r\n"
+ "ef;555\r"
+ "fg;666\r\n"
+ "gh;777\r\n"
+ "hi;888\r\n"
+ "ij;999\r"
+ "jk;000\r";
File sharedFile = createTestFile( "UTF-8", fileContent );
assertEquals( createAndRunOneStep( sharedFile, 0, totalNumberOfSteps ), 5 );
assertEquals( createAndRunOneStep( sharedFile, 1, totalNumberOfSteps ), 5 );
}
@Test
public void PDI_15162_mixedByteNewLineIndicator_NoNewLineAtTheEnd_2Threads() throws Exception {
final int totalNumberOfSteps = 2;
final String fileContent =
"ab;111\r\n"
+ "bc;222\r\n"
+ "cd;333\r\n"
+ "de;444\r\n"
+ "ef;555\r"
+ "fg;666\r\n"
+ "gh;777\r\n"
+ "hi;888\r\n"
+ "ij;999\r"
+ "jk;000";
File sharedFile = createTestFile( "UTF-8", fileContent );
assertEquals( createAndRunOneStep( sharedFile, 0, totalNumberOfSteps ), 5 );
assertEquals( createAndRunOneStep( sharedFile, 1, totalNumberOfSteps ), 5 );
}
@Test
public void twoByteNewLineIndicator_NewLineAtTheEnd_2Threads() throws Exception {
final String fileContent =
"a;1\r\n"
+ "b;2\r\n";
final int totalNumberOfSteps = 2;
File sharedFile = createTestFile( "UTF-8", fileContent );
assertEquals( createAndRunOneStep( sharedFile, 0, totalNumberOfSteps ), 1 );
assertEquals( createAndRunOneStep( sharedFile, 1, totalNumberOfSteps ), 1 );
}
@Test
public void twoByteNewLineIndicator_NoNewLineAtTheEnd_2Threads() throws Exception {
final String fileContent =
"a;1\r\n"
+ "b;2";
final int totalNumberOfSteps = 2;
File sharedFile = createTestFile( "UTF-8", fileContent );
assertEquals( createAndRunOneStep( sharedFile, 0, totalNumberOfSteps ), 1 );
assertEquals( createAndRunOneStep( sharedFile, 1, totalNumberOfSteps ), 1 );
}
@Test
public void twoByteNewLineIndicator_NewLineAtTheEnd_3Threads() throws Exception {
final String fileContent =
"a;1\r\n"
+ "b;2\r\n"
// thread 1 should read until this line
+ "c;3\r\n"
+ "d;4\r\n"
// thread 2 should read until this line
+ "e;5\r\n"
+ "f;6\r\n";
// thread 3 should read until this line
final int totalNumberOfSteps = 3;
File sharedFile = createTestFile( "UTF-8", fileContent );
assertEquals( createAndRunOneStep( sharedFile, 0, totalNumberOfSteps ), 2 );
assertEquals( createAndRunOneStep( sharedFile, 1, totalNumberOfSteps ), 2 );
assertEquals( createAndRunOneStep( sharedFile, 2, totalNumberOfSteps ), 2 );
}
/**
* Here files content is 16 bytes summary, where 8 of this bytes is the first line, 5 is the second one, 3 is the
* last.
* <p>
* As we are running this with 2 threads, we expect: 1st thread to read 1st line 2nd thread to read 2nd and 3d line.
*/
@Test
public void mixedBytesNewLineIndicator_NoNewLineAtTheEnd_2Threads() throws Exception {
final String fileContent =
"abcd;1\r\n"
+ "b;2\r\n"
+ "d;3";
final int totalNumberOfSteps = 2;
File sharedFile = createTestFile( "UTF-8", fileContent );
assertEquals( createAndRunOneStep( sharedFile, 0, totalNumberOfSteps ), 1 );
assertEquals( createAndRunOneStep( sharedFile, 1, totalNumberOfSteps ), 2 );
}
@Test
public void mixedBytesNewLineIndicator_NewLineAtTheEnd_2Threads() throws Exception {
final String fileContent =
"abcd;1\r\n"
+ "b;2\r"
+ "d;3\r";
final int totalNumberOfSteps = 2;
File sharedFile = createTestFile( "UTF-8", fileContent );
assertEquals( createAndRunOneStep( sharedFile, 0, totalNumberOfSteps ), 1 );
assertEquals( createAndRunOneStep( sharedFile, 1, totalNumberOfSteps ), 2 );
}
/**
* So as not to heap up list of taken parameters, we are passing combi, but we expect to see CsvInput class instances
* in it's content.
*/
private int processRows( StepMetaDataCombi combi ) throws Exception {
CsvInput csvInput = (CsvInput) combi.step;
CsvInputData stepData = (CsvInputData) combi.data;
CsvInputMeta stepMeta = (CsvInputMeta) combi.meta;
final int[] writtenRows = { 0 };
csvInput.addRowListener( new RowAdapter() {
@Override
public void rowWrittenEvent( RowMetaInterface rowMeta, Object[] row ) throws KettleStepException {
writtenRows[ 0 ]++;
}
} );
boolean haveRowsToRead;
do {
haveRowsToRead = !csvInput.processRow( stepMeta, stepData );
} while ( !haveRowsToRead );
csvInput.dispose( stepMeta, stepData );
return writtenRows[ 0 ];
}
private CsvInput createCsvInput() {
StepMockHelper<CsvInputMeta, StepDataInterface> stepMockHelper =
StepMockUtil.getStepMockHelper( CsvInputMeta.class, "CsvInputEnclosureTest" );
return new CsvInput( stepMockHelper.stepMeta, stepMockHelper.stepDataInterface, 0,
stepMockHelper.transMeta, stepMockHelper.trans );
}
private int createAndRunOneStep( File sharedFile, int stepNr, int totalNumberOfSteps )
throws Exception {
StepMetaDataCombi combiStep1 = createBaseCombi( sharedFile );
configureData( (CsvInputData) combiStep1.data, stepNr, totalNumberOfSteps );
return processRows( combiStep1 );
}
private StepMetaDataCombi createBaseCombi( File sharedFile ) {
StepMetaDataCombi combi = new StepMetaDataCombi();
CsvInputData data = new CsvInputData();
CsvInputMeta meta = createMeta( sharedFile, createInputFileFields( "Field_000", "Field_001" ) );
CsvInput csvInput = createCsvInput();
csvInput.init( meta, data );
combi.step = csvInput;
combi.data = data;
combi.meta = meta;
return combi;
}
private CsvInputMeta createMeta( File file, TextFileInputField[] fields ) {
CsvInputMeta meta = new CsvInputMeta();
meta.setFilename( file.getAbsolutePath() );
meta.setDelimiter( ";" );
meta.setEncoding( "utf-8" );
meta.setEnclosure( "\"" );
meta.setBufferSize( "1024" );
meta.setInputFields( fields );
meta.setHeaderPresent( false );
meta.setRunningInParallel( true );
return meta;
}
private void configureData( CsvInputData data, int currentStepNr, int totalNumberOfSteps ) {
data.parallel = true;
data.stepNumber = currentStepNr;
data.totalNumberOfSteps = totalNumberOfSteps;
}
}