/*
 * Copyright © 2015-2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.data2.dataset2.lib.partitioned;

import co.cask.cdap.api.Predicate;
import co.cask.cdap.api.dataset.lib.Partition;
import co.cask.cdap.api.dataset.lib.PartitionDetail;
import co.cask.cdap.api.dataset.lib.PartitionFilter;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties;
import co.cask.cdap.api.dataset.lib.Partitioning;
import co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer;
import co.cask.cdap.api.dataset.lib.partitioned.ConsumablePartition;
import co.cask.cdap.api.dataset.lib.partitioned.ConsumerConfiguration;
import co.cask.cdap.api.dataset.lib.partitioned.ConsumerWorkingSet;
import co.cask.cdap.api.dataset.lib.partitioned.PartitionAcceptor;
import co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer;
import co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumerResult;
import co.cask.cdap.api.dataset.lib.partitioned.ProcessState;
import co.cask.cdap.api.dataset.lib.partitioned.StatePersistor;
import co.cask.cdap.data2.dataset2.DatasetFrameworkTestUtil;
import co.cask.cdap.proto.Id;
import co.cask.tephra.TransactionAware;
import co.cask.tephra.TransactionContext;
import co.cask.tephra.TransactionExecutor;
import co.cask.tephra.TransactionManager;
import co.cask.tephra.inmemory.InMemoryTxSystemClient;
import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import org.apache.twill.filesystem.Location;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.UUID;
import javax.annotation.Nullable;

/**
 * Tests PartitionConsumer.
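 * <p>
 * Covers basic consumption across transactions, consumption with filters and limits, custom
 * {@link PartitionAcceptor}s, multiple consumers sharing state, retry handling, and a custom
 * {@link ConcurrentPartitionConsumer} implementation.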
 */
public class PartitionConsumerTest {

  @ClassRule
  public static TemporaryFolder tmpFolder = new TemporaryFolder();

  @ClassRule
  public static DatasetFrameworkTestUtil dsFrameworkUtil = new DatasetFrameworkTestUtil();

  private static final Partitioning PARTITIONING_1 = Partitioning.builder()
    .addStringField("s")
    .addIntField("i")
    .addLongField("l")
    .build();

  private static final Id.DatasetInstance pfsInstance =
    Id.DatasetInstance.from(DatasetFrameworkTestUtil.NAMESPACE_ID, "pfs");
  private static final Id.DatasetInstance pfsExternalInstance =
    Id.DatasetInstance.from(DatasetFrameworkTestUtil.NAMESPACE_ID, "ext");
  private static Location pfsBaseLocation;

  @Before
  public void before() throws Exception {
    dsFrameworkUtil.createInstance("partitionedFileSet", pfsInstance, PartitionedFileSetProperties.builder()
      .setPartitioning(PARTITIONING_1)
      .setBasePath("testDir")
      .build());
    pfsBaseLocation = ((PartitionedFileSet) dsFrameworkUtil.getInstance(pfsInstance))
      .getEmbeddedFileSet().getBaseLocation();
    Assert.assertTrue(pfsBaseLocation.exists());
  }

  @After
  public void after() throws Exception {
    if (dsFrameworkUtil.getInstance(pfsInstance) != null) {
      dsFrameworkUtil.deleteInstance(pfsInstance);
    }
    if (dsFrameworkUtil.getInstance(pfsExternalInstance) != null) {
      dsFrameworkUtil.deleteInstance(pfsExternalInstance);
    }
    Assert.assertFalse(pfsBaseLocation.exists());
  }

  @Test
  public void testPartitionConsumer() throws Exception {
    // exercises the edge case of partition consumption: while partitions are being consumed, another in-progress
    // transaction has added a partition but has not yet committed, so that partition is not available to the
    // consumer
    PartitionedFileSet dataset1 = dsFrameworkUtil.getInstance(pfsInstance);
    PartitionedFileSet dataset2 = dsFrameworkUtil.getInstance(pfsInstance);
    TransactionManager txManager = dsFrameworkUtil.getTxManager();
    InMemoryTxSystemClient txClient = new InMemoryTxSystemClient(txManager);

    // producer simply adds initial partition
    TransactionContext txContext1 = new TransactionContext(txClient, (TransactionAware) dataset1);
    txContext1.start();
    PartitionKey partitionKey1 = generateUniqueKey();
    dataset1.getPartitionOutput(partitionKey1).addPartition();
    txContext1.finish();

    // consumer simply consumes initial partition
    TransactionContext txContext2 = new TransactionContext(txClient, (TransactionAware) dataset2);
    txContext2.start();
    PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset2, new InMemoryStatePersistor());
    List<? extends PartitionDetail> partitionDetails = partitionConsumer.consumePartitions().getPartitions();
    Assert.assertEquals(1, partitionDetails.size());
    Assert.assertEquals(partitionKey1, partitionDetails.get(0).getPartitionKey());
    txContext2.finish();

    // producer adds a second partition, but does not yet commit the transaction
    txContext1.start();
    PartitionKey partitionKey2 = generateUniqueKey();
    dataset1.getPartitionOutput(partitionKey2).addPartition();

    // consumer attempts to consume at a time after the partition was added, but before it was committed.
    // Because of this, the partition is not visible and will not be consumed
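    // (Tephra transactions read from a snapshot of the state committed before they started, so the
    // uncommitted partition added in txContext1 is invisible here)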
    txContext2.start();
    Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
    txContext2.finish();

    // producer commits the transaction in which the second partition was added
    txContext1.finish();

    // the next time the consumer runs, it processes the second partition
    txContext2.start();
    partitionDetails = partitionConsumer.consumePartitions().getPartitions();
    Assert.assertEquals(1, partitionDetails.size());
    Assert.assertEquals(partitionKey2, partitionDetails.get(0).getPartitionKey());
    txContext2.finish();
  }

  @Test
  public void testSimplePartitionConsuming() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;

    final Set<PartitionKey> partitionKeys1 = new HashSet<>();
    for (int i = 0; i < 10; i++) {
      partitionKeys1.add(generateUniqueKey());
    }

    final Set<PartitionKey> partitionKeys2 = new HashSet<>();
    for (int i = 0; i < 15; i++) {
      partitionKeys2.add(generateUniqueKey());
    }

    final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        for (PartitionKey partitionKey : partitionKeys1) {
          dataset.getPartitionOutput(partitionKey).addPartition();
        }
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // initial consumption returns the partitions corresponding to partitionKeys1, because only those
        // partitions have been added to the dataset at this point
        List<? extends Partition> consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
        Assert.assertEquals(partitionKeys1, toKeys(consumedPartitions));
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        for (PartitionKey partitionKey : partitionKeys2) {
          dataset.getPartitionOutput(partitionKey).addPartition();
        }
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // using the same PartitionConsumer (which remembers the PartitionConsumerState) to consume additional
        // partitions returns only the newly added partitions (corresponding to partitionKeys2)
        Assert.assertEquals(partitionKeys2, toKeys(partitionConsumer.consumePartitions().getPartitions()));
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // consuming the partitions again, without adding any new partitions, returns an empty list
        Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // creating a new PartitionConsumer resets the consumption state; consuming from it then returns
        // all the partition keys
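        // (the consumption state lives in the StatePersistor, so constructing a consumer with a fresh
        // InMemoryStatePersistor starts consumption from the beginning)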
        List<? extends Partition> consumedPartitions =
          new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor()).consumePartitions().getPartitions();
        Set<PartitionKey> allKeys = new HashSet<>();
        allKeys.addAll(partitionKeys1);
        allKeys.addAll(partitionKeys2);
        Assert.assertEquals(allKeys, toKeys(consumedPartitions));
      }
    });
  }

  @Test
  public void testConsumeAfterDelete() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;

    final Set<PartitionKey> partitionKeys1 = new HashSet<>();
    for (int i = 0; i < 3; i++) {
      partitionKeys1.add(generateUniqueKey());
    }

    // need to ensure that our maxWorkingSetSize is larger than the number of partitions we consume initially,
    // so that additional partitions (which will be deleted afterwards) are brought into the working set
    ConsumerConfiguration consumerConfiguration = ConsumerConfiguration.builder().setMaxWorkingSetSize(100).build();
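    // (the working set is the consumer's cache of partitions still to be processed; maxWorkingSetSize
    // caps how many partitions it holds at a time)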
    final PartitionConsumer partitionConsumer =
      new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(), consumerConfiguration);

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        for (PartitionKey partitionKey : partitionKeys1) {
          dataset.getPartitionOutput(partitionKey).addPartition();
        }
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // add 2 more partitions after the first 3. We do not need to keep track of these, because they will be
        // dropped and not consumed
        for (int i = 0; i < 2; i++) {
          dataset.getPartitionOutput(generateUniqueKey()).addPartition();
        }
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // consume 3 of the 5 initial partitions
        Assert.assertEquals(partitionKeys1, toKeys(partitionConsumer.consumePartitions(3).getPartitions()));
      }
    });

    final Set<PartitionKey> partitionKeys2 = new HashSet<>();
    for (int i = 0; i < 5; i++) {
      partitionKeys2.add(generateUniqueKey());
    }

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // drop all existing partitions (2 of which are not consumed)
        for (PartitionDetail partitionDetail : dataset.getPartitions(PartitionFilter.ALWAYS_MATCH)) {
          dataset.dropPartition(partitionDetail.getPartitionKey());
        }
        // add 5 new ones
        for (PartitionKey partitionKey : partitionKeys2) {
          dataset.getPartitionOutput(partitionKey).addPartition();
        }
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // the consumed partition keys should correspond to partitionKeys2, and not include the dropped, but
        // unconsumed, partitions added before them
        Assert.assertEquals(partitionKeys2, toKeys(partitionConsumer.consumePartitions().getPartitions()));
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // consuming the partitions again, without adding any new partitions, returns an empty list
        Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // creating a new PartitionConsumer resets the consumption state; consuming from it then returns
        // all the partition keys added after the deletions
        ConcurrentPartitionConsumer partitionConsumer2 =
          new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());
        Assert.assertEquals(partitionKeys2, toKeys(partitionConsumer2.consumePartitions().getPartitions()));
      }
    });
  }

  @Test
  public void testPartitionConsumingWithFilterAndLimit() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;

    final Set<PartitionKey> partitionKeys1 = new HashSet<>();
    for (int i = 0; i < 10; i++) {
      partitionKeys1.add(generateUniqueKey());
    }

    final Set<PartitionKey> partitionKeys2 = new HashSet<>();
    for (int i = 0; i < 15; i++) {
      partitionKeys2.add(generateUniqueKey());
    }

    final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());

    // add each of partitionKeys1 in a separate transaction, so the limit can be applied at arbitrary values
    // (consumption only happens at transaction borders)
    for (final PartitionKey partitionKey : partitionKeys1) {
      dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
        @Override
        public void apply() throws Exception {
          dataset.getPartitionOutput(partitionKey).addPartition();
        }
      });
    }

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // initial consumption returns the partitions corresponding to partitionKeys1, because only those
        // partitions have been added to the dataset at this point
        List<Partition> consumedPartitions = new ArrayList<>();

        // with limit = 1, the returned list is only size 1, even though there are more unconsumed partitions
        Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(1).getPartitions());
        Assert.assertEquals(1, consumedPartitions.size());

        // ask for 5 more
        Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(5).getPartitions());
        Assert.assertEquals(6, consumedPartitions.size());

        // ask for 5 more, but there are only 4 more unconsumed partitions (size of partitionKeys1 is 10)
        Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(5).getPartitions());
        Assert.assertEquals(10, consumedPartitions.size());

        Assert.assertEquals(partitionKeys1, toKeys(consumedPartitions));
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        for (PartitionKey partitionKey : partitionKeys2) {
          dataset.getPartitionOutput(partitionKey).addPartition();
        }
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // using the same PartitionConsumer (which remembers the PartitionConsumerState) to consume additional
        // partitions returns only the newly added partitions (corresponding to partitionKeys2)
        Assert.assertEquals(partitionKeys2, toKeys(partitionConsumer.consumePartitions().getPartitions()));
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // consuming the partitions again, without adding any new partitions, returns an empty list
        Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // creating a new PartitionConsumer resets the consumption state.
        // test the combination of filter and limit:
        // the partitionFilter will match partitionKeys [1, 7), of which there are 6
        final PartitionFilter partitionFilter = PartitionFilter.builder().addRangeCondition("i", 1, 7).build();
        final Predicate<PartitionDetail> predicate = new Predicate<PartitionDetail>() {
          @Override
          public boolean apply(PartitionDetail partitionDetail) {
            return partitionFilter.match(partitionDetail.getPartitionKey());
          }
        };
        ConsumerConfiguration configuration =
          ConsumerConfiguration.builder().setPartitionPredicate(predicate).build();
        PartitionConsumer newPartitionConsumer =
          new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(), configuration);
        List<Partition> consumedPartitions = new ArrayList<>();

        // applying the filter (which narrows it down to 6 elements) with a limit of 4 results in 4 consumed
        // partitions
        Iterables.addAll(consumedPartitions, newPartitionConsumer.consumePartitions(4).getPartitions());
        Assert.assertEquals(4, consumedPartitions.size());

        // applying a limit of 3 with the same filter returns the remaining 2 elements that match the filter
        Iterables.addAll(consumedPartitions, newPartitionConsumer.consumePartitions(3).getPartitions());
        Assert.assertEquals(6, consumedPartitions.size());

        // assert that the partitions returned have partition keys whose 'i' values range over [1, 7)
        Set<Integer> expectedIFields = new HashSet<>();
        for (int i = 1; i < 7; i++) {
          expectedIFields.add(i);
        }
        Set<Integer> actualIFields = new HashSet<>();
        for (Partition consumedPartition : consumedPartitions) {
          actualIFields.add((Integer) consumedPartition.getPartitionKey().getField("i"));
        }
        Assert.assertEquals(expectedIFields, actualIFields);
      }
    });
  }

  /**
   * A custom {@link PartitionAcceptor} which skips any partitions that have partition key field 's' not equal to
   * an 'allowedSField'. It also stops iterating if the 'i' key field is equal to a specified 'stopOnI' value.
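   * <p>
   * For example, {@code new CustomAcceptor("x", 8)} accepts only partitions whose 's' field is "x",
   * and stops the iteration upon encountering a partition whose 'i' field is 8.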
   */
  public static final class CustomAcceptor implements PartitionAcceptor {

    private final String allowedSField;
    private final Integer stopOnI;

    public CustomAcceptor(String allowedSField) {
      this(allowedSField, null);
    }

    public CustomAcceptor(String allowedSField, @Nullable Integer stopOnI) {
      this.allowedSField = allowedSField;
      this.stopOnI = stopOnI;
    }

    @Override
    public Return accept(PartitionDetail partitionDetail) {
      String sField = (String) partitionDetail.getPartitionKey().getField("s");
      if (!allowedSField.equals(sField)) {
        return Return.SKIP;
      }

      int iField = (int) partitionDetail.getPartitionKey().getField("i");
      if (stopOnI != null && stopOnI.equals(iField)) {
        return Return.STOP;
      }
      return Return.ACCEPT;
    }
  }

  @Test
  public void testPartitionConsumingWithPartitionAcceptor() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;

    // i will range over [0,10), s will always be 'partitionKeys1'
    final Set<PartitionKey> partitionKeys1 = new HashSet<>();
    for (int i = 0; i < 10; i++) {
      PartitionKey key = PartitionKey.builder()
        .addIntField("i", i)
        .addLongField("l", 17L)
        .addStringField("s", "partitionKeys1")
        .build();
      partitionKeys1.add(key);
    }

    // i will range over [0,15), s will always be 'partitionKeys2'
    final Set<PartitionKey> partitionKeys2 = new HashSet<>();
    for (int i = 0; i < 15; i++) {
      PartitionKey key = PartitionKey.builder()
        .addIntField("i", i)
        .addLongField("l", 17L)
        .addStringField("s", "partitionKeys2")
        .build();
      partitionKeys2.add(key);
    }

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        for (final PartitionKey partitionKey : partitionKeys1) {
          dataset.getPartitionOutput(partitionKey).addPartition();
        }
        for (final PartitionKey partitionKey : partitionKeys2) {
          dataset.getPartitionOutput(partitionKey).addPartition();
        }
      }
    });

    final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        List<Partition> consumedPartitions = new ArrayList<>();
        // specify a PartitionAcceptor that restricts consumption to partitions whose 's' field is equal to
        // 'partitionKeys1', so it will get all the partitions in partitionKeys1
        Iterables.addAll(consumedPartitions,
                         partitionConsumer.consumePartitions(new CustomAcceptor("partitionKeys1")).getPartitions());
        // assert that we consumed all the partitions represented by partitionKeys1
        Assert.assertEquals(partitionKeys1, toKeys(consumedPartitions));
        consumedPartitions.clear();

        // ask for partitions whose 's' field is equal to 'partitionKeys2', but stop iterating upon 'i' field == 8
        Iterables.addAll(consumedPartitions,
                         partitionConsumer.consumePartitions(new CustomAcceptor("partitionKeys2", 8)).getPartitions());
        // this will give us 8 of partitionKeys2
        Assert.assertEquals(8, consumedPartitions.size());

        // ask for the remainder of the partitions - i ranging over [8,15). Then, we will have all of 'partitionKeys2'
        Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions().getPartitions());
        Assert.assertEquals(partitionKeys2, toKeys(consumedPartitions));
      }
    });
  }
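
  /**
   * Returns the set of {@link PartitionKey}s of the given partitions.
   */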
  private Set<PartitionKey> toKeys(List<? extends Partition> partitions) {
    Set<PartitionKey> partitionKeys = new HashSet<>(partitions.size());
    for (Partition partition : partitions) {
      partitionKeys.add(partition.getPartitionKey());
    }
    return partitionKeys;
  }

  /**
   * A simple {@link StatePersistor} that keeps the consumer state in memory; sufficient for these tests.
   */
  private static final class InMemoryStatePersistor implements StatePersistor {
    private byte[] state;

    @Override
    public void persistState(byte[] state) {
      this.state = state;
    }

    @Nullable
    @Override
    public byte[] readState() {
      return state;
    }
  }

  @Test
  public void testSimpleConcurrency() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;

    final Set<PartitionKey> partitionKeys = new HashSet<>();
    for (int i = 0; i < 10; i++) {
      partitionKeys.add(generateUniqueKey());
    }

    // create three ConcurrentPartitionConsumers that share the same state
    InMemoryStatePersistor persistor = new InMemoryStatePersistor();
    ConsumerConfiguration configuration = ConsumerConfiguration.builder().setMaxRetries(3).build();
    final PartitionConsumer partitionConsumer1 = new ConcurrentPartitionConsumer(dataset, persistor, configuration);
    final PartitionConsumer partitionConsumer2 = new ConcurrentPartitionConsumer(dataset, persistor, configuration);
    final PartitionConsumer partitionConsumer3 = new ConcurrentPartitionConsumer(dataset, persistor, configuration);

    // add all ten keys to the partitioned fileset
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        for (final PartitionKey partitionKey : partitionKeys) {
          dataset.getPartitionOutput(partitionKey).addPartition();
        }
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // with limit = 1, the returned list is only size 1, even though there are more unconsumed partitions
        List<PartitionDetail> consumedBy1 = partitionConsumer1.consumePartitions(1).getPartitions();
        Assert.assertEquals(1, consumedBy1.size());

        // partitionConsumer2 asks for 10 partitions, but 1 is currently in progress by partitionConsumer1, so it
        // only gets the remaining 9 partitions
        List<PartitionDetail> consumedBy2 = partitionConsumer2.consumePartitions(10).getPartitions();
        Assert.assertEquals(9, consumedBy2.size());

        // partitionConsumer3 tries to consume partitions, but all are marked in-progress by partitionConsumers 1
        // and 2
        Assert.assertEquals(0, partitionConsumer3.consumePartitions().getPartitions().size());

        // partitionConsumer1 aborts its partition, so it then becomes available for partitionConsumer3
        partitionConsumer1.onFinish(consumedBy1, false);
        consumedBy1.clear();

        // query with a limit of 2, but only the 1 partition that partitionConsumer1 released is available
        List<PartitionDetail> consumedBy3 = partitionConsumer3.consumePartitions(2).getPartitions();
        Assert.assertEquals(1, consumedBy3.size());

        // partitionConsumers 2 and 3 mark the partitions they consumed as successfully processed
        partitionConsumer3.onFinish(consumedBy3, true);

        // test the onFinishWithKeys API
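        // (onFinishWithKeys is equivalent to onFinish, but identifies the finished partitions by
        // PartitionKey rather than by PartitionDetail)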
        List<PartitionKey> keysConsumedBy2 =
          Lists.transform(consumedBy2, new Function<PartitionDetail, PartitionKey>() {
            @Override
            public PartitionKey apply(PartitionDetail input) {
              return input.getPartitionKey();
            }
          });
        partitionConsumer2.onFinishWithKeys(keysConsumedBy2, true);

        // at this point, all partitions are processed, so no additional partitions are available for consumption
        Assert.assertEquals(0, partitionConsumer3.consumePartitions().getPartitions().size());

        List<PartitionDetail> allProcessedPartitions = new ArrayList<>();
        allProcessedPartitions.addAll(consumedBy1);
        allProcessedPartitions.addAll(consumedBy2);
        allProcessedPartitions.addAll(consumedBy3);

        // ordering may be different, since all the partitions were added in the same transaction
        Assert.assertEquals(partitionKeys, toKeys(allProcessedPartitions));
      }
    });
  }

  @Test
  public void testOnFinishWithInvalidPartition() throws Exception {
    // tests:
    //   - attempting to abort a Partition that is not IN_PROGRESS
    //   - attempting to commit a Partition that is already committed
    // both of these fail, because the partition does not have the expected state
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;
    ConsumerConfiguration configuration = ConsumerConfiguration.builder().setMaxRetries(3).build();
    final PartitionConsumer partitionConsumer =
      new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(), configuration);

    final PartitionKey partitionKey = generateUniqueKey();

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        dataset.getPartitionOutput(partitionKey).addPartition();
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        List<PartitionDetail> partitionDetails = partitionConsumer.consumePartitions(1).getPartitions();
        Assert.assertEquals(1, partitionDetails.size());

        // abort the processing of the partition
        partitionConsumer.onFinish(partitionDetails, false);

        // calling abort on the partition again throws IllegalStateException, because the partitions passed in to
        // abort were not found to have IN_PROGRESS state
        try {
          partitionConsumer.onFinish(partitionDetails, false);
          Assert.fail("Expected not to be able to abort a partition that is not IN_PROGRESS");
        } catch (IllegalStateException expected) {
        }

        // process the partition again, this time marking it as complete (by passing in true)
        partitionDetails = partitionConsumer.consumePartitions(1).getPartitions();
        Assert.assertEquals(1, partitionDetails.size());
        partitionConsumer.onFinish(partitionDetails, true);

        // attempting to mark it as complete a second time throws IllegalArgumentException, because the partition
        // is not found to have an IN_PROGRESS state
        try {
          partitionConsumer.onFinish(partitionDetails, true);
          Assert.fail("Expected not to be able to call onFinish on a partition that is not IN_PROGRESS");
        } catch (IllegalArgumentException expected) {
        }
      }
    });
  }

  @Test
  public void testNumRetries() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;

    final int numRetries = 1;
    ConsumerConfiguration configuration = ConsumerConfiguration.builder().setMaxRetries(numRetries).build();
    final PartitionConsumer partitionConsumer =
      new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(), configuration);

    final PartitionKey partitionKey = generateUniqueKey();

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        dataset.getPartitionOutput(partitionKey).addPartition();
      }
    });
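
    // (with maxRetries = N, a partition may be attempted N + 1 times in total: the initial attempt plus
    // N retries; after that it is discarded from the working set and reported as a failed partition)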
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // consuming and aborting the partition numRetries times plus one (for the first attempt) causes it to be
        // removed from the working set
        for (int i = 0; i < numRetries + 1; i++) {
          List<PartitionDetail> partitionDetails = partitionConsumer.consumePartitions(1).getPartitions();
          Assert.assertEquals(1, partitionDetails.size());
          Assert.assertEquals(partitionKey, partitionDetails.get(0).getPartitionKey());

          // abort the processing of the partition
          partitionConsumer.onFinish(partitionDetails, false);
        }

        // after the 2nd abort, the partition is discarded entirely, and so no partitions are available for
        // consuming
        PartitionConsumerResult result = partitionConsumer.consumePartitions(1);
        Assert.assertEquals(0, result.getPartitions().size());
        Assert.assertEquals(1, result.getFailedPartitions().size());
        Assert.assertEquals(partitionKey, result.getFailedPartitions().get(0).getPartitionKey());
      }
    });
  }

  /**
   * Custom implementation of {@link ConcurrentPartitionConsumer} that returns only a single partition if it is that
   * partition's last attempt at processing.
   */
  public static final class CustomConsumer extends ConcurrentPartitionConsumer {

    public CustomConsumer(PartitionedFileSet partitionedFileSet, StatePersistor statePersistor,
                          ConsumerConfiguration configuration) {
      super(partitionedFileSet, statePersistor, configuration);
    }

    @Override
    public PartitionConsumerResult doConsume(ConsumerWorkingSet workingSet, PartitionAcceptor acceptor) {
      doExpiry(workingSet);
      workingSet.populate(getPartitionedFileSet(), getConfiguration());
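      // (doExpiry handles in-progress partitions whose processing timed out, and populate pulls newly
      // added partitions from the dataset into the working set)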

      long now = System.currentTimeMillis();
      List<PartitionDetail> toConsume = new ArrayList<>();

      // check if the first available partition is on its last attempt. If so, return it as a single element
      List<? extends ConsumablePartition> partitions = workingSet.getPartitions();
      if (partitions.size() >= 1) {
        ConsumablePartition firstPartition = partitions.get(0);
        if (isLastAttempt(firstPartition)) {
          firstPartition.take();
          firstPartition.setTimestamp(now);
          toConsume.add(getPartitionedFileSet().getPartition(firstPartition.getPartitionKey()));
          return new PartitionConsumerResult(toConsume, removeDiscardedPartitions(workingSet));
        }
      }

      PARTITIONS:
      for (ConsumablePartition consumablePartition : partitions) {
        if (ProcessState.AVAILABLE != consumablePartition.getProcessState()) {
          continue;
        }
        // if the first available partition is not on its last attempt, perform the regular partition consuming,
        // but skip any partitions that are on their last attempt
        if (isLastAttempt(consumablePartition)) {
          continue;
        }
        PartitionDetail partition = getPartitionedFileSet().getPartition(consumablePartition.getPartitionKey());
        PartitionAcceptor.Return accept = acceptor.accept(partition);
        switch (accept) {
          case ACCEPT:
            consumablePartition.take();
            consumablePartition.setTimestamp(now);
            toConsume.add(partition);
            continue;
          case SKIP:
            continue;
          case STOP:
            // a plain 'break' would only exit the switch; break out of the loop to stop consuming
            break PARTITIONS;
        }
      }
      return new PartitionConsumerResult(toConsume, removeDiscardedPartitions(workingSet));
    }

    // returns true if the given partition only has one more attempt at processing before it is discarded
    private boolean isLastAttempt(ConsumablePartition partition) {
      return partition.getNumFailures() == getConfiguration().getMaxRetries() - 1;
    }
  }

  @Test
  public void testCustomOperations() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;
    ConsumerConfiguration configuration = ConsumerConfiguration.builder()
      .setMaxRetries(3)
      .build();
    final PartitionConsumer partitionConsumer =
      new CustomConsumer(dataset, new InMemoryStatePersistor(), configuration);

    final int numPartitions = 3;
    final List<PartitionKey> partitionKeys = new ArrayList<>(numPartitions);
    for (int i = 0; i < numPartitions; i++) {
      partitionKeys.add(generateUniqueKey());
    }

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        for (PartitionKey partitionKey : partitionKeys) {
          dataset.getPartitionOutput(partitionKey).addPartition();
        }
      }
    });

    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        List<PartitionDetail> partitions = partitionConsumer.consumePartitions().getPartitions();
        Assert.assertEquals(numPartitions, partitions.size());
        partitionConsumer.onFinish(partitions, false);

        partitions = partitionConsumer.consumePartitions().getPartitions();
        Assert.assertEquals(numPartitions, partitions.size());
        partitionConsumer.onFinish(partitions, false);

        // after two failed attempts, the partitions are now returned individually
        partitions = partitionConsumer.consumePartitions().getPartitions();
        Assert.assertEquals(1, partitions.size());
        partitionConsumer.onFinish(partitions, true);

        partitions = partitionConsumer.consumePartitions().getPartitions();
        Assert.assertEquals(1, partitions.size());
        partitionConsumer.onFinish(partitions, true);

        partitions = partitionConsumer.consumePartitions().getPartitions();
        Assert.assertEquals(1, partitions.size());
        partitionConsumer.onFinish(partitions, true);
      }
    });
  }

  private int counter = 0;

  // generates unique partition keys, where the 'i' field is incrementing from 0 upwards on each returned key
  private PartitionKey generateUniqueKey() {
    return PartitionKey.builder()
      .addIntField("i", counter++)
      .addLongField("l", 17L)
      .addStringField("s", UUID.randomUUID().toString())
      .build();
  }
}