/*******************************************************************************
* Copyright 2017 Capital One Services, LLC and Bitwise, Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License
*******************************************************************************/
package hydrograph.engine.cascading.assembly;
import cascading.pipe.Pipe;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import com.hotels.plunger.Bucket;
import com.hotels.plunger.Data;
import com.hotels.plunger.DataBuilder;
import com.hotels.plunger.Plunger;
import hydrograph.engine.cascading.assembly.RemoveDupsAssembly;
import hydrograph.engine.cascading.assembly.infra.ComponentParameters;
import hydrograph.engine.core.component.entity.RemoveDupsEntity;
import hydrograph.engine.core.component.entity.elements.KeyField;
import hydrograph.engine.core.component.entity.elements.OutSocket;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.assertThat;
/**
* Test dedup sub assembly. The tests are written using plunger framework
*
* @author Prabodh
*/
public class RemoveDupsAssemblyTest {
@Before
public void setup() {
// TODO: setup related things go here
}
/**
* Test dedup component's out port with 'keep first' option
*/
@Test
public void RemoveDupsKeepFirstWithOutPort() {
Plunger plunger = new Plunger();
Data file1 = new DataBuilder(new Fields("col1", "col2", "col3")).addTuple("C1R1", "C2R1", "C3R1")
.addTuple("C1R1", "C2R2", "C3R2").addTuple("C1R1", "C2R3", "C3R3").build();
Pipe pipe1 = plunger.newNamedPipe("pipe1", file1); // pipe corresponding
// to an input of
// dedup component
ArrayList<Boolean> list = new ArrayList<Boolean>();
list.add(false);
RemoveDupsEntity removeDupsEntity = new RemoveDupsEntity();
removeDupsEntity.setComponentId("dedupTest");
KeyField keyField = new KeyField();
keyField.setName("col1");
removeDupsEntity.setKeyFields(new KeyField[] { keyField });
// keep just the first record. Discard remaining duplicate records
removeDupsEntity.setKeep("first");
List<OutSocket> outSocketList = new ArrayList<OutSocket>();
outSocketList.add(new OutSocket("out0", "out"));
removeDupsEntity.setOutSocketList(outSocketList);
ComponentParameters parameters = new ComponentParameters();
parameters.addInputPipe(pipe1);
parameters.addInputFields(new Fields("col1", "col2", "col3"));
RemoveDupsAssembly dedup = new RemoveDupsAssembly(removeDupsEntity, parameters);
Bucket bucket = plunger.newBucket(new Fields("col1", "col2", "col3"), dedup);
List<Tuple> actual = bucket.result().asTupleList(); // get results from
// bucket
// assert the actual results with expected results
assertThat(actual.size(), is(1));
assertThat(actual.get(0), is(new Tuple("C1R1", "C2R1", "C3R1")));
}
/**
* Test dedup component's out port with 'keep last' option
*/
@Test
public void RemoveDupsKeepLastWithOutPort() {
Plunger plunger = new Plunger();
Data file1 = new DataBuilder(new Fields("col1", "col2", "col3")).addTuple("C1R1", "C2R1", "C3R1")
.addTuple("C1R1", "C2R2", "C3R2").addTuple("C1R1", "C2R3", "C3R3").build();
Pipe pipe1 = plunger.newNamedPipe("pipe1", file1); // pipe corresponding
// to an input of
// dedup component
ArrayList<Boolean> list = new ArrayList<Boolean>();
list.add(false);
RemoveDupsEntity removeDupsEntity = new RemoveDupsEntity();
removeDupsEntity.setComponentId("dedupTest");
KeyField keyField = new KeyField();
keyField.setName("col1");
removeDupsEntity.setKeyFields(new KeyField[] { keyField });
// keep just the first record. Discard remaining duplicate records
removeDupsEntity.setKeep("last");
List<OutSocket> outSocketList = new ArrayList<OutSocket>();
outSocketList.add(new OutSocket("out0", "out"));
removeDupsEntity.setOutSocketList(outSocketList);
ComponentParameters parameters = new ComponentParameters();
parameters.addInputPipe(pipe1);
parameters.addInputFields(new Fields("col1", "col2", "col3"));
RemoveDupsAssembly dedup = new RemoveDupsAssembly(removeDupsEntity, parameters);
Bucket bucket = plunger.newBucket(new Fields("col1", "col2", "col3"), dedup); // create
// bucket
// for
// the
// dedup
// sub
// assembly
List<Tuple> actual = bucket.result().asTupleList(); // get results from
// bucket
// assert the actual results with expected results
assertThat(actual.size(), is(1));
assertThat(actual.get(0), is(new Tuple("C1R1", "C2R3", "C3R3")));
}
/**
* Test dedup component's out port with 'keep unique' option
*/
@Test
public void RemoveDupsKeepUniqueOnlyWithOutPort() {
Plunger plunger = new Plunger();
Data file1 = new DataBuilder(new Fields("col1", "col2", "col3")).addTuple("C1R1", "C2R1", "C3R1")
.addTuple("C1R1", "C2R2", "C3R2").addTuple("C1R3", "C2R3", "C3R3").build();
Pipe pipe1 = plunger.newNamedPipe("pipe1", file1); // pipe corresponding
// to an input of
// dedup component
ArrayList<Boolean> list = new ArrayList<Boolean>();
list.add(false);
RemoveDupsEntity removeDupsEntity = new RemoveDupsEntity();
removeDupsEntity.setComponentId("dedupTest");
KeyField keyField = new KeyField();
keyField.setName("col1");
removeDupsEntity.setKeyFields(new KeyField[] { keyField });
// keep just the first record. Discard remaining duplicate records
removeDupsEntity.setKeep("unique");
List<OutSocket> outSocketList = new ArrayList<OutSocket>();
outSocketList.add(new OutSocket("out0", "out"));
removeDupsEntity.setOutSocketList(outSocketList);
ComponentParameters parameters = new ComponentParameters();
parameters.addInputPipe(pipe1);
parameters.addInputFields(new Fields("col1", "col2", "col3"));
RemoveDupsAssembly dedup = new RemoveDupsAssembly(removeDupsEntity, parameters);
Bucket bucket = plunger.newBucket(new Fields("col1", "col2", "col3"), dedup); // create
// bucket
// for
// the
// dedup
// sub
// assembly
List<Tuple> actual = bucket.result().asTupleList(); // get results from
// bucket
// assert the actual results with expected results
assertThat(actual.size(), is(1));
assertThat(actual.get(0), is(new Tuple("C1R3", "C2R3", "C3R3")));
}
/**
* Test dedup component's out port with 'keep first' option with more than
* one key
*/
@Test
public void RemoveDupsKeepFirstWithMultipleKeysWithOutPort() {
Plunger plunger = new Plunger();
Data file1 = new DataBuilder(new Fields("col1", "col2", "col3")).addTuple("C1R1", "C2R1", "C3R1")
.addTuple("C1R1", "C2R2", "C3R1").addTuple("C1R1", "C2R2", "C3R2").addTuple("C1R1", "C2R1", "C3R2")
.addTuple("C1R1", "C2R3", "C3R3").build();
Pipe pipe1 = plunger.newNamedPipe("pipe1", file1); // pipe corresponding
// to an input of
// dedup component
ArrayList<Boolean> list = new ArrayList<Boolean>();
list.add(false);
RemoveDupsEntity removeDupsEntity = new RemoveDupsEntity();
removeDupsEntity.setComponentId("dedupTest");
KeyField keyField1 = new KeyField();
keyField1.setName("col1");
KeyField keyField2 = new KeyField();
keyField2.setName("col2");
removeDupsEntity.setKeyFields(new KeyField[] { keyField1, keyField2 });
// keep just the first record. Discard remaining duplicate records
removeDupsEntity.setKeep("first");
List<OutSocket> outSocketList = new ArrayList<OutSocket>();
outSocketList.add(new OutSocket("out0", "out"));
removeDupsEntity.setOutSocketList(outSocketList);
ComponentParameters parameters = new ComponentParameters();
parameters.addInputPipe(pipe1);
parameters.addInputFields(new Fields("col1", "col2", "col3"));
RemoveDupsAssembly dedup = new RemoveDupsAssembly(removeDupsEntity, parameters);
Bucket bucket = plunger.newBucket(new Fields("col1", "col2", "col3"), dedup); // create
// bucket
// for
// the
// dedup
// sub
// assembly
List<Tuple> actual = bucket.result().asTupleList(); // get results from
// bucket
// assert the actual results with expected results
assertThat(actual.size(), is(3));
// assertThat(actual.get(0), is(new Tuple("C1R1", "C2R1", "C3R1")));
// Use HashSet so that order of fields does not matter while comparison
Set<Tuple> output = new HashSet<Tuple>(actual);
Set<Tuple> expectedOutput = new HashSet<Tuple>();
expectedOutput.add(new Tuple("C1R1", "C2R1", "C3R1"));
expectedOutput.add(new Tuple("C1R1", "C2R2", "C3R1"));
expectedOutput.add(new Tuple("C1R1", "C2R3", "C3R3"));
Assert.assertEquals(expectedOutput, output);
}
/**
* Test dedup component's out port with 'keep last' option with more than
* one key
*/
@Test
public void RemoveDupsKeepLastWithMultipleKeysWithOutPort() {
Plunger plunger = new Plunger();
Data file1 = new DataBuilder(new Fields("col1", "col2", "col3")).addTuple("C1R1", "C2R1", "C3R1")
.addTuple("C1R1", "C2R2", "C3R1").addTuple("C1R1", "C2R2", "C3R2").addTuple("C1R1", "C2R1", "C3R2")
.addTuple("C1R1", "C2R3", "C3R3").build();
Pipe pipe1 = plunger.newNamedPipe("pipe1", file1); // pipe corresponding
// to an input of
// dedup component
ArrayList<Boolean> list = new ArrayList<Boolean>();
list.add(false);
RemoveDupsEntity removeDupsEntity = new RemoveDupsEntity();
removeDupsEntity.setComponentId("dedupTest");
KeyField keyField1 = new KeyField();
keyField1.setName("col1");
KeyField keyField2 = new KeyField();
keyField2.setName("col2");
removeDupsEntity.setKeyFields(new KeyField[] { keyField1, keyField2 });
// keep just the first record. Discard remaining duplicate records
removeDupsEntity.setKeep("last");
List<OutSocket> outSocketList = new ArrayList<OutSocket>();
outSocketList.add(new OutSocket("out0", "out"));
removeDupsEntity.setOutSocketList(outSocketList);
ComponentParameters parameters = new ComponentParameters();
parameters.addInputPipe(pipe1);
parameters.addInputFields(new Fields("col1", "col2", "col3"));
RemoveDupsAssembly dedup = new RemoveDupsAssembly(removeDupsEntity, parameters);
Bucket bucket = plunger.newBucket(new Fields("col1", "col2", "col3"), dedup); // create
// bucket
// for
// the
// dedup
// sub
// assembly
List<Tuple> actual = bucket.result().asTupleList(); // get results from
// bucket
// assert the actual results with expected results
assertThat(actual.size(), is(3));
// assertThat(actual.get(0), is(new Tuple("C1R1", "C2R1", "C3R1")));
// Use HashSet so that order of fields does not matter while comparison
Set<Tuple> output = new HashSet<Tuple>(actual);
Set<Tuple> expectedOutput = new HashSet<Tuple>();
expectedOutput.add(new Tuple("C1R1", "C2R1", "C3R2"));
expectedOutput.add(new Tuple("C1R1", "C2R2", "C3R2"));
expectedOutput.add(new Tuple("C1R1", "C2R3", "C3R3"));
Assert.assertEquals(expectedOutput, output);
}
/**
* Test dedup component's unused port with 'keep first' option
*/
@Test
public void RemoveDupsKeepFirstWithUnusedPort() {
Plunger plunger = new Plunger();
Data file1 = new DataBuilder(new Fields("col1", "col2", "col3")).addTuple("C1R1", "C2R1", "C3R1")
.addTuple("C1R1", "C2R2", "C3R2").addTuple("C1R1", "C2R3", "C3R3").build();
Pipe pipe1 = plunger.newNamedPipe("pipe1", file1); // pipe corresponding
// to an input of
// dedup component
ArrayList<Boolean> list = new ArrayList<Boolean>();
list.add(true);
RemoveDupsEntity removeDupsEntity = new RemoveDupsEntity();
removeDupsEntity.setComponentId("dedupTest");
KeyField keyField1 = new KeyField();
keyField1.setName("col1");
removeDupsEntity.setKeyFields(new KeyField[] { keyField1 });
// keep just the first record. Discard remaining duplicate records
removeDupsEntity.setKeep("first");
List<OutSocket> outSocketList = new ArrayList<OutSocket>();
outSocketList.add(new OutSocket("unused0", "unused"));
outSocketList.add(new OutSocket("out0", "out"));
removeDupsEntity.setOutSocketList(outSocketList);
ComponentParameters parameters = new ComponentParameters();
parameters.addInputPipe(pipe1);
parameters.addInputFields(new Fields("col1", "col2", "col3"));
RemoveDupsAssembly dedup = new RemoveDupsAssembly(removeDupsEntity, parameters);
// create bucket for the dedup sub assembly
Bucket bucket = plunger.newBucket(new Fields("col1", "col2", "col3"),
dedup.getOutLink("unused", "unused0", removeDupsEntity.getComponentId())); // explicitly
// set
// the
// unused
// port.
// Default
// is
// out
// port
List<Tuple> actual = bucket.result().asTupleList(); // get results from
// bucket
// assert the actual results with expected results
assertThat(actual.size(), is(2));
// assertThat(actual.get(0), is(new Tuple("C1R1", "C2R1", "C3R1")));
// Use HashSet so that order of fields does not matter while comparison
Set<Tuple> output = new HashSet<Tuple>(actual);
Set<Tuple> expectedOutput = new HashSet<Tuple>();
expectedOutput.add(new Tuple("C1R1", "C2R2", "C3R2"));
expectedOutput.add(new Tuple("C1R1", "C2R3", "C3R3"));
Assert.assertEquals(expectedOutput, output);
}
/**
* Test dedup component's unused port with 'keep last' option
*/
@Test
public void RemoveDupsKeepLastWithUnusedPort() {
Plunger plunger = new Plunger();
Data file1 = new DataBuilder(new Fields("col1", "col2", "col3")).addTuple("C1R1", "C2R1", "C3R1")
.addTuple("C1R1", "C2R2", "C3R2").addTuple("C1R1", "C2R3", "C3R3").build();
Pipe pipe1 = plunger.newNamedPipe("pipe1", file1); // pipe corresponding
// to an input of
// dedup component
ArrayList<Boolean> list = new ArrayList<Boolean>();
list.add(true);
RemoveDupsEntity removeDupsEntity = new RemoveDupsEntity();
removeDupsEntity.setComponentId("dedupTest");
KeyField keyField1 = new KeyField();
keyField1.setName("col1");
removeDupsEntity.setKeyFields(new KeyField[] { keyField1 });
// keep just the first record. Discard remaining duplicate records
removeDupsEntity.setKeep("last");
List<OutSocket> outSocketList = new ArrayList<OutSocket>();
outSocketList.add(new OutSocket("out0", "out"));
outSocketList.add(new OutSocket("unused0", "unused"));
removeDupsEntity.setOutSocketList(outSocketList);
ComponentParameters parameters = new ComponentParameters();
parameters.addInputPipe(pipe1);
parameters.addInputFields(new Fields("col1", "col2", "col3"));
RemoveDupsAssembly dedup = new RemoveDupsAssembly(removeDupsEntity, parameters);
// create bucket for the dedup sub assembly
Bucket bucket = plunger.newBucket(new Fields("col1", "col2", "col3"),
dedup.getOutLink("unused", "unused0", removeDupsEntity.getComponentId())); // explicitly
// set
// the
// unused
// port.
// Default
// is
// out
// port
List<Tuple> actual = bucket.result().asTupleList(); // get results from
// bucket
// assert the actual results with expected results
assertThat(actual.size(), is(2));
// Use HashSet so that order of fields does not matter while comparison
Set<Tuple> output = new HashSet<Tuple>(actual);
Set<Tuple> expectedOutput = new HashSet<Tuple>();
expectedOutput.add(new Tuple("C1R1", "C2R1", "C3R1"));
expectedOutput.add(new Tuple("C1R1", "C2R2", "C3R2"));
Assert.assertEquals(expectedOutput, output);
}
/**
* Test dedup component's unused port with 'keep unique' option
*/
@Test
public void RemoveDupsKeepUniqueOnlyWithUnusedPort() {
Plunger plunger = new Plunger();
Data file1 = new DataBuilder(new Fields("col1", "col2", "col3")).addTuple("C1R1", "C2R1", "C3R1")
.addTuple("C1R1", "C2R2", "C3R2").addTuple("C1R3", "C2R3", "C3R3").build();
Pipe pipe1 = plunger.newNamedPipe("pipe1", file1); // pipe corresponding
// to an input of
// dedup component
ArrayList<Boolean> list = new ArrayList<Boolean>();
list.add(true);
RemoveDupsEntity removeDupsEntity = new RemoveDupsEntity();
removeDupsEntity.setComponentId("dedupTest");
KeyField keyField1 = new KeyField();
keyField1.setName("col1");
removeDupsEntity.setKeyFields(new KeyField[] { keyField1 });
// keep just the first record. Discard remaining duplicate records
removeDupsEntity.setKeep("unique");
List<OutSocket> outSocketList = new ArrayList<OutSocket>();
outSocketList.add(new OutSocket("unused0", "unused"));
outSocketList.add(new OutSocket("out0", "out"));
removeDupsEntity.setOutSocketList(outSocketList);
ComponentParameters parameters = new ComponentParameters();
parameters.addInputPipe(pipe1);
parameters.addInputFields(new Fields("col1", "col2", "col3"));
RemoveDupsAssembly dedup = new RemoveDupsAssembly(removeDupsEntity, parameters);
// create bucket for the dedup sub assembly
Bucket bucket = plunger.newBucket(new Fields("col1", "col2", "col3"),
dedup.getOutLink("unused", "unused0", removeDupsEntity.getComponentId())); // explicitly
// set
// the
// unused
// port.
// Default
// is
// out
// port
List<Tuple> actual = bucket.result().asTupleList(); // get results from
// bucket
// assert the actual results with expected results
assertThat(actual.size(), is(2));
// Use HashSet so that order of fields does not matter while comparison
Set<Tuple> output = new HashSet<Tuple>(actual);
Set<Tuple> expectedOutput = new HashSet<Tuple>();
expectedOutput.add(new Tuple("C1R1", "C2R1", "C3R1"));
expectedOutput.add(new Tuple("C1R1", "C2R2", "C3R2"));
Assert.assertEquals(expectedOutput, output);
}
/**
* Test dedup component's unused port with 'keep first' option with more
* than one key
*/
@Test
public void RemoveDupsKeepFirstWithMultipleKeysWithUnusedPort() {
Plunger plunger = new Plunger();
Data file1 = new DataBuilder(new Fields("col1", "col2", "col3")).addTuple("C1R1", "C2R1", "C3R1")
.addTuple("C1R1", "C2R2", "C3R1").addTuple("C1R1", "C2R2", "C3R2").addTuple("C1R1", "C2R1", "C3R2")
.addTuple("C1R1", "C2R3", "C3R3").build();
Pipe pipe1 = plunger.newNamedPipe("pipe1", file1); // pipe corresponding
// to an input of
// dedup component
ArrayList<Boolean> list = new ArrayList<Boolean>();
list.add(true);
RemoveDupsEntity removeDupsEntity = new RemoveDupsEntity();
removeDupsEntity.setComponentId("dedupTest");
KeyField keyField1 = new KeyField();
keyField1.setName("col1");
KeyField keyField2 = new KeyField();
keyField2.setName("col2");
removeDupsEntity.setKeyFields(new KeyField[] { keyField1, keyField2 });
// keep just the first record. Discard remaining duplicate records
removeDupsEntity.setKeep("first");
List<OutSocket> outSocketList = new ArrayList<OutSocket>();
outSocketList.add(new OutSocket("unused0", "unused"));
outSocketList.add(new OutSocket("out0", "out"));
removeDupsEntity.setOutSocketList(outSocketList);
ComponentParameters parameters = new ComponentParameters();
parameters.addInputPipe(pipe1);
parameters.addInputFields(new Fields("col1", "col2", "col3"));
RemoveDupsAssembly dedup = new RemoveDupsAssembly(removeDupsEntity, parameters);
// create bucket for the dedup sub assembly
Bucket bucket = plunger.newBucket(new Fields("col1", "col2", "col3"),
dedup.getOutLink("unused", "unused0", removeDupsEntity.getComponentId())); // explicitly
// set
// the
// unused
// port.
// Default
// is
// out
// port
List<Tuple> actual = bucket.result().asTupleList(); // get results from
// bucket
// assert the actual results with expected results
assertThat(actual.size(), is(2));
// assertThat(actual.get(0), is(new Tuple("C1R1", "C2R1", "C3R1")));
// Use HashSet so that order of fields does not matter while comparison
Set<Tuple> output = new HashSet<Tuple>(actual);
Set<Tuple> expectedOutput = new HashSet<Tuple>();
expectedOutput.add(new Tuple("C1R1", "C2R1", "C3R2"));
expectedOutput.add(new Tuple("C1R1", "C2R2", "C3R2"));
Assert.assertEquals(expectedOutput, output);
}
/**
* Test dedup component's unused port with 'keep last' option with more than
* one key
*/
@Test
public void RemoveDupsKeepLastWithMultipleKeysWithUnusedPort() {
Plunger plunger = new Plunger();
Data file1 = new DataBuilder(new Fields("col1", "col2", "col3")).addTuple("C1R1", "C2R1", "C3R1")
.addTuple("C1R1", "C2R2", "C3R1").addTuple("C1R1", "C2R2", "C3R2").addTuple("C1R1", "C2R1", "C3R2")
.addTuple("C1R1", "C2R3", "C3R3").build();
Pipe pipe1 = plunger.newNamedPipe("pipe1", file1); // pipe corresponding
// to an input of
// dedup component
ArrayList<Boolean> list = new ArrayList<Boolean>();
list.add(true);
RemoveDupsEntity removeDupsEntity = new RemoveDupsEntity();
removeDupsEntity.setComponentId("dedupTest");
KeyField keyField1 = new KeyField();
keyField1.setName("col1");
KeyField keyField2 = new KeyField();
keyField2.setName("col2");
removeDupsEntity.setKeyFields(new KeyField[] { keyField1, keyField2 });
// keep just the first record. Discard remaining duplicate records
removeDupsEntity.setKeep("last");
List<OutSocket> outSocketList = new ArrayList<OutSocket>();
outSocketList.add(new OutSocket("unused0", "unused"));
outSocketList.add(new OutSocket("out0", "out"));
removeDupsEntity.setOutSocketList(outSocketList);
ComponentParameters parameters = new ComponentParameters();
parameters.addInputPipe(pipe1);
parameters.addInputFields(new Fields("col1", "col2", "col3"));
RemoveDupsAssembly dedup = new RemoveDupsAssembly(removeDupsEntity, parameters);
// create bucket for the dedup sub assembly
// explicitly set the unused port. Default is out port
Bucket bucket = plunger.newBucket(new Fields("col1", "col2", "col3"),
dedup.getOutLink("unused", "unused0", removeDupsEntity.getComponentId()));
List<Tuple> actual = bucket.result().asTupleList(); // get results from
// bucket
// assert the actual results with expected results
assertThat(actual.size(), is(2));
// assertThat(actual.get(0), is(new Tuple("C1R1", "C2R1", "C3R1")));
// Use HashSet so that order of fields does not matter while comparison
Set<Tuple> output = new HashSet<Tuple>(actual);
Set<Tuple> expectedOutput = new HashSet<Tuple>();
expectedOutput.add(new Tuple("C1R1", "C2R1", "C3R1"));
expectedOutput.add(new Tuple("C1R1", "C2R2", "C3R1"));
Assert.assertEquals(expectedOutput, output);
}
@Test
public void RemoveDupsKeepLastWithNullKeyWithOutPort() {
Plunger plunger = new Plunger();
Data file1 = new DataBuilder(new Fields("col1", "col2", "col3")).addTuple("C1R1", "C2R1", "C3R1")
.addTuple("C1R1", "C2R2", "C3R1").addTuple("C1R1", "C2R2", "C3R2").addTuple("C1R1", "C2R1", "C3R2")
.addTuple("C1R1", "C2R3", "C3R3").build();
Pipe pipe1 = plunger.newNamedPipe("pipe1", file1); // pipe corresponding
// to an input of
// dedup component
ArrayList<Boolean> list = new ArrayList<Boolean>();
list.add(true);
RemoveDupsEntity removeDupsEntity = new RemoveDupsEntity();
removeDupsEntity.setComponentId("dedupTest");
removeDupsEntity.setKeyFields(null);
// keep just the first record. Discard remaining duplicate records
removeDupsEntity.setKeep("last");
List<OutSocket> outSocketList = new ArrayList<OutSocket>();
outSocketList.add(new OutSocket("unused0", "unused"));
outSocketList.add(new OutSocket("out0", "out"));
removeDupsEntity.setOutSocketList(outSocketList);
ComponentParameters parameters = new ComponentParameters();
parameters.addInputPipe(pipe1);
parameters.addInputFields(new Fields("col1", "col2", "col3"));
RemoveDupsAssembly dedup = new RemoveDupsAssembly(removeDupsEntity, parameters);
// create bucket for the dedup sub assembly
// explicitly set the unused port. Default is out port
Bucket bucket = plunger.newBucket(new Fields("col1", "col2", "col3"),
dedup.getOutLink("out", "out0", removeDupsEntity.getComponentId()));
List<Tuple> actual = bucket.result().asTupleList(); // get results from
// bucket
// assert the actual results with expected results
assertThat(actual.size(), is(1));
// Use HashSet so that order of fields does not matter while comparison
Set<Tuple> output = new HashSet<Tuple>(actual);
Set<Tuple> expectedOutput = new HashSet<Tuple>();
expectedOutput.add(new Tuple("C1R1", "C2R3", "C3R3"));
Assert.assertEquals(expectedOutput, output);
}
}