/*
* Copyright © 2015-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.examples.datacleansing;
import co.cask.cdap.api.dataset.lib.PartitionDetail;
import co.cask.cdap.api.dataset.lib.PartitionFilter;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.cube.AggregationFunction;
import co.cask.cdap.api.dataset.lib.cube.TimeValue;
import co.cask.cdap.api.metrics.MetricDataQuery;
import co.cask.cdap.api.metrics.MetricTimeSeries;
import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.proto.Id;
import co.cask.cdap.test.ApplicationManager;
import co.cask.cdap.test.DataSetManager;
import co.cask.cdap.test.MapReduceManager;
import co.cask.cdap.test.ServiceManager;
import co.cask.cdap.test.TestBase;
import co.cask.common.http.HttpRequest;
import co.cask.common.http.HttpRequests;
import co.cask.common.http.HttpResponse;
import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import org.apache.twill.filesystem.Location;
import org.junit.Assert;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
/**
* Tests that a MapReduce job can incrementally process the partitions of a PartitionedFileSet, using a small sample of
* data with the DataCleansing MapReduce job.
*/
public class DataCleansingMapReduceTest extends TestBase {
  // Raw input records. RECORD2's zip ("84125q") does not match the schema (see the partition
  // assertions at the end of the test, where RECORD2 is absent from the clean output); every
  // other record is clean.
  private static final String RECORD1 =
    "{\"pid\":223986723,\"name\":\"bob\",\"dob\":\"02-12-1983\",\"zip\":\"84125\"}";
  private static final String RECORD2 =
    "{\"pid\":198637201,\"name\":\"timothy\",\"dob\":\"06-21-1995\",\"zip\":\"84125q\"}";
  private static final Set<String> RECORD_SET1 = ImmutableSet.of(RECORD1, RECORD2);
  private static final String RECORD3 =
    "{\"pid\":001058370,\"name\":\"jill\",\"dob\":\"12-12-1963\",\"zip\":\"84126\"}";
  private static final String RECORD4 =
    "{\"pid\":000150018,\"name\":\"wendy\",\"dob\":\"06-19-1987\",\"zip\":\"84125\"}";
  private static final Set<String> RECORD_SET2 = ImmutableSet.of(RECORD3, RECORD4);
  private static final String RECORD5 =
    "{\"pid\":013587810,\"name\":\"john\",\"dob\":\"10-10-1991\",\"zip\":\"84126\"}";
  private static final String RECORD6 =
    "{\"pid\":811638015,\"name\":\"samantha\",\"dob\":\"04-20-1965\",\"zip\":\"84125\"}";
  private static final Set<String> RECORD_SET3 = ImmutableSet.of(RECORD5, RECORD6);

  // Schema driving the MapReduce's validity decision; the matcher applies the same schema
  // in-test so that expected clean/invalid sets can be computed the same way the job does.
  private static final String SCHEMA_JSON =
    DataCleansingMapReduce.SchemaMatchingFilter.DEFAULT_SCHEMA.toString();
  private static final SimpleSchemaMatcher SCHEMA_MATCHER =
    new SimpleSchemaMatcher(DataCleansingMapReduce.SchemaMatchingFilter.DEFAULT_SCHEMA);

  @Test
  public void testPartitionConsuming() throws Exception {
    ApplicationManager applicationManager = deployApplication(DataCleansing.class);
    ServiceManager serviceManager = applicationManager.getServiceManager(DataCleansingService.NAME).start();
    serviceManager.waitForStatus(true);
    URL serviceURL = serviceManager.getServiceURL();

    // Write a set of records to one partition and run the DataCleansingMapReduce job on it.
    createPartition(serviceURL, RECORD_SET1);

    // Before starting the MR, metrics report 0 invalid and 0 valid records.
    Assert.assertEquals(0, getValidityMetrics(true));
    Assert.assertEquals(0, getValidityMetrics(false));

    Long now = System.currentTimeMillis();
    runDataCleansingMapReduce(applicationManager, now);
    compareData(now, DataCleansing.CLEAN_RECORDS, filterRecords(RECORD_SET1, true));
    compareData(now, DataCleansing.INVALID_RECORDS, filterRecords(RECORD_SET1, false));

    // Assert that some of the records have indeed been filtered...
    Assert.assertNotEquals(filterRecords(RECORD_SET1, true), RECORD_SET1);
    Assert.assertNotEquals(filterRecords(RECORD_SET1, false), Collections.<String>emptySet());
    // ...and verify this via metrics: 1 invalid and 1 valid record so far.
    Assert.assertEquals(1, getValidityMetrics(true));
    Assert.assertEquals(1, getValidityMetrics(false));

    // Create two additional partitions.
    createPartition(serviceURL, RECORD_SET2);
    createPartition(serviceURL, RECORD_SET3);

    // Running the MapReduce job now processes only the two new partitions (RECORD_SET2 and
    // RECORD_SET3) and creates a new partition with the output.
    now = System.currentTimeMillis();
    runDataCleansingMapReduce(applicationManager, now);
    Set<String> recordSets2and3 =
      ImmutableSet.<String>builder().addAll(RECORD_SET2).addAll(RECORD_SET3).build();
    compareData(now, DataCleansing.CLEAN_RECORDS, filterRecords(recordSets2and3, true));
    compareData(now, DataCleansing.INVALID_RECORDS, filterRecords(recordSets2and3, false));
    // Metrics are cumulative across runs: still 1 invalid, now 5 valid in total.
    Assert.assertEquals(1, getValidityMetrics(true));
    Assert.assertEquals(5, getValidityMetrics(false));

    // Running the MapReduce job without adding new partitions creates no additional output.
    now = System.currentTimeMillis();
    runDataCleansingMapReduce(applicationManager, now);
    compareData(now, DataCleansing.CLEAN_RECORDS, Collections.<String>emptySet());
    compareData(now, DataCleansing.INVALID_RECORDS, Collections.<String>emptySet());

    // Verify that the clean records were properly partitioned on their zip.
    DataSetManager<PartitionedFileSet> cleanRecords = getDataset(DataCleansing.CLEAN_RECORDS);
    PartitionFilter filter = PartitionFilter.builder().addValueCondition("zip", 84125).build();
    Assert.assertEquals(ImmutableSet.of(RECORD1, RECORD4, RECORD6), getDataFromFilter(cleanRecords.get(), filter));
    filter = PartitionFilter.builder().addValueCondition("zip", 84126).build();
    Assert.assertEquals(ImmutableSet.of(RECORD3, RECORD5), getDataFromFilter(cleanRecords.get(), filter));
  }

  /**
   * Starts the DataCleansingMapReduce with the given output partition time and waits for it to
   * finish (up to 5 minutes). Extracted because the test runs the job three times.
   */
  private void runDataCleansingMapReduce(ApplicationManager applicationManager, Long outputTime) throws Exception {
    Map<String, String> args = ImmutableMap.of(DataCleansingMapReduce.OUTPUT_PARTITION_KEY, outputTime.toString(),
                                               DataCleansingMapReduce.SCHEMA_KEY, SCHEMA_JSON);
    MapReduceManager mapReduceManager =
      applicationManager.getMapReduceManager(DataCleansingMapReduce.NAME).start(args);
    mapReduceManager.waitForFinish(5, TimeUnit.MINUTES);
  }

  /**
   * POSTs the given records, newline-separated, to the service's raw-records endpoint, which
   * creates a new input partition. Asserts the request succeeded.
   */
  private void createPartition(URL serviceUrl, Set<String> records) throws IOException {
    URL url = new URL(serviceUrl, "v1/records/raw");
    String body = Joiner.on("\n").join(records) + "\n";
    HttpRequest request = HttpRequest.post(url).withBody(body).build();
    HttpResponse response = HttpRequests.execute(request);
    Assert.assertEquals(200, response.getResponseCode());
  }

  /**
   * Asserts that the output partition written at {@code time} in dataset {@code dsName} contains
   * exactly {@code expectedRecords}, both when read directly from the files and via Explore (SQL).
   */
  private void compareData(Long time, String dsName, Set<String> expectedRecords) throws Exception {
    Assert.assertEquals(expectedRecords, getDataFromFile(time, dsName));
    Assert.assertEquals(expectedRecords, getDataFromExplore(time, dsName));
  }

  /**
   * Reads the records of the output partition written at {@code time} via an Explore (SQL) query.
   * Note: {@code dsName} comes from trusted in-test constants, so string concatenation into the
   * query is acceptable here.
   */
  private Set<String> getDataFromExplore(Long time, String dsName) throws Exception {
    // Close statement and result set explicitly rather than relying on Connection.close().
    try (Connection connection = getQueryClient();
         PreparedStatement statement =
           connection.prepareStatement("SELECT * FROM dataset_" + dsName + " where TIME = " + time);
         ResultSet results = statement.executeQuery()) {
      Set<String> cleanRecords = new HashSet<>();
      while (results.next()) {
        cleanRecords.add(results.getString(1));
      }
      return cleanRecords;
    }
  }

  /**
   * Reads the records of the output partition written at {@code time} directly from the
   * partition's files.
   */
  private Set<String> getDataFromFile(Long time, String dsName) throws Exception {
    DataSetManager<PartitionedFileSet> cleanRecords = getDataset(dsName);
    PartitionFilter filter = PartitionFilter.builder().addValueCondition("time", time).build();
    return getDataFromFilter(cleanRecords.get(), filter);
  }

  /**
   * Collects all lines from the MapReduce output files ("part-*") of every partition matching the
   * given filter, also asserting that each partition carries the expected provenance metadata.
   */
  private Set<String> getDataFromFilter(PartitionedFileSet partitionedFileSet,
                                        PartitionFilter filter) throws IOException {
    Set<PartitionDetail> partitions = partitionedFileSet.getPartitions(filter);
    Set<String> cleanData = new HashSet<>();
    for (PartitionDetail partition : partitions) {
      Assert.assertEquals(ImmutableMap.of("source.program", "DataCleansingMapReduce"),
                          partition.getMetadata().asMap());
      Location partitionLocation = partition.getLocation();
      for (Location location : partitionLocation.list()) {
        if (location.getName().startsWith("part-")) {
          // Pin the charset instead of relying on the platform default.
          try (BufferedReader bufferedReader = new BufferedReader(
                 new InputStreamReader(location.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
              cleanData.add(line);
            }
          }
        }
      }
    }
    return cleanData;
  }

  /**
   * @param records the set of records to filter
   * @param filterInvalids if true, will filter out invalid records; else, will return only invalid records
   * @return the filtered set of records
   */
  private Set<String> filterRecords(Set<String> records, boolean filterInvalids) {
    Set<String> filteredSet = new HashSet<>();
    for (String record : records) {
      if (filterInvalids == SCHEMA_MATCHER.matches(record)) {
        filteredSet.add(record);
      }
    }
    return filteredSet;
  }

  /**
   * Queries the metrics system for the total record count emitted by the MapReduce.
   * Pass true to get the number of invalid records; pass false to get the number of valid
   * records processed.
   */
  private long getValidityMetrics(boolean invalid) throws Exception {
    String metric = "user.records." + (invalid ? "invalid" : "valid");
    Map<String, String> tags = ImmutableMap.of(Constants.Metrics.Tag.NAMESPACE, Id.Namespace.DEFAULT.getId(),
                                               Constants.Metrics.Tag.APP, DataCleansing.NAME,
                                               Constants.Metrics.Tag.MAPREDUCE, DataCleansingMapReduce.NAME);
    MetricDataQuery metricQuery = new MetricDataQuery(0, Integer.MAX_VALUE, Integer.MAX_VALUE, metric,
                                                      AggregationFunction.SUM, tags, ImmutableList.<String>of());
    Collection<MetricTimeSeries> result = getMetricsManager().query(metricQuery);
    if (result.isEmpty()) {
      return 0;
    }
    // Since it is a totals query with no groupBy specified, we know there's one time series...
    List<TimeValue> timeValues = result.iterator().next().getTimeValues();
    if (timeValues.isEmpty()) {
      return 0;
    }
    // ...and since it is totals, we know there's one value only.
    return timeValues.get(0).getValue();
  }
}