/*
* Copyright © 2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.dq.test;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.dataset.table.Row;
import co.cask.cdap.api.dataset.table.Scanner;
import co.cask.cdap.api.dataset.table.Table;
import co.cask.cdap.dq.AggregationTypeValue;
import co.cask.cdap.dq.DataQualityApp;
import co.cask.cdap.dq.DataQualityService;
import co.cask.cdap.dq.DataQualitySource;
import co.cask.cdap.dq.FieldDetail;
import co.cask.cdap.dq.TimestampValue;
import co.cask.cdap.dq.functions.DiscreteValuesHistogram;
import co.cask.cdap.dq.testclasses.StreamBatchSource;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.batch.BatchSource;
import co.cask.cdap.proto.Id;
import co.cask.cdap.proto.artifact.AppRequest;
import co.cask.cdap.proto.artifact.ArtifactSummary;
import co.cask.cdap.test.ApplicationManager;
import co.cask.cdap.test.MapReduceManager;
import co.cask.cdap.test.ServiceManager;
import co.cask.cdap.test.StreamManager;
import co.cask.cdap.test.TestBase;
import co.cask.common.http.HttpRequest;
import co.cask.common.http.HttpRequests;
import co.cask.common.http.HttpResponse;
import com.google.common.base.Strings;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import java.lang.reflect.Type;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import javax.annotation.Nullable;
/**
* Test for {@link DataQualityApp}.
*/
public class DataQualityAppTest extends TestBase {
private static final Gson GSON = new Gson();
private static final Type TOKEN_TYPE_LIST_TIMESTAMP_VALUE = new TypeToken<ArrayList<TimestampValue>>() { }.getType();
private static final Type TOKEN_TYPE_DOUBLE = new TypeToken<Double>() { }.getType();
private static final Type TOKEN_TYPE_MAP_STRING_INTEGER = new TypeToken<Map<String, Integer>>() { }.getType();
private static final Type TOKEN_TYPE_LIST_AGGREGATION_TYPE_VALUES =
new TypeToken<List<AggregationTypeValue>>() { }.getType();
private static final Type TOKEN_TYPE_SET_FIELD_DETAIL = new TypeToken<HashSet<FieldDetail>>() { }.getType();
private static final Integer WORKFLOW_SCHEDULE_MINUTES = 5;
private static Id.Artifact appArtifact;
private static boolean sentData = false;
@BeforeClass
public static void setup() throws Exception {
appArtifact = Id.Artifact.from(Id.Namespace.DEFAULT, "dqArtifact", "1.0");
addAppArtifact(appArtifact, DataQualityApp.class, BatchSource.class.getPackage().getName(),
PipelineConfigurer.class.getPackage().getName());
Id.Artifact pluginArtifactId = Id.Artifact.from(Id.Namespace.DEFAULT, "source-plugin", "1.0.0-SNAPSHOT");
addPluginArtifact(pluginArtifactId, appArtifact, StreamBatchSource.class, AvroKey.class, GenericRecord.class);
}
@Before
public void beforeTest() throws Exception {
StreamManager streamManager = getStreamManager("logStream");
streamManager.createStream();
String logData1 = "10.10.10.10 - - [01/Feb/2015:06:47:10 +0000] " +
"\"GET /browse/COOP-DBT-JOB1-238/artifact HTTP/1.1\"" +
" 301 256 \"-\" \"Mozilla/5.0 (compatible; AhrefsBot/5.0; +http://ahrefs.com/robot/)\"";
String logData2 = "10.10.12.10 - - [01/Feb/2015:06:47:10 +0000]" +
" \"GET /browse/COOP-DBT-JOB1-238/artifact HTTP/1.1\"" +
" 301 256 \"-\" \"Mozilla/5.0 (compatible; AhrefsBot/5.0; +http://ahrefs.com/robot/)\"";
String logData3 = "10.10.11.10 - - [01/Feb/2015:06:47:10 +0000]" +
" \"GET /browse/COOP-DBT-JOB1-238/artifact HTTP/1.1\"" +
" 301 256 \"-\" \"Mozilla/5.0 (compatible; AhrefsBot/5.0; +http://ahrefs.com/robot/)\"";
if (!sentData) {
streamManager.send(logData1);
streamManager.send(logData2);
streamManager.send(logData3);
sentData = true;
}
}
@Test(expected = IllegalStateException.class)
public void testInvalidConfig() throws Exception {
Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "badApp");
Map<String, Set<String>> testMap = new HashMap<>();
// Empty aggregation set - should throw an exception while creating an application
testMap.put("content_length", new HashSet<String>());
DataQualityApp.DataQualityConfig config = new DataQualityApp.DataQualityConfig(
50, getStreamSource(), "avg", null);
AppRequest<DataQualityApp.DataQualityConfig> appRequest = new AppRequest<>(
new ArtifactSummary(appArtifact.getName(), appArtifact.getVersion().getVersion()), config);
deployApplication(appId, appRequest);
}
@Test
public void testDefaultConfig() throws Exception {
Map<String, Set<String>> testMap = new HashMap<>();
Set<String> testSet = new HashSet<>();
testSet.add("DiscreteValuesHistogram");
testMap.put("content_length", testSet);
DataQualityApp.DataQualityConfig config = new DataQualityApp.DataQualityConfig(
WORKFLOW_SCHEDULE_MINUTES, getStreamSource(), "dataQuality", testMap);
Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "newApp");
AppRequest<DataQualityApp.DataQualityConfig> appRequest = new AppRequest<>(
new ArtifactSummary(appArtifact.getName(), appArtifact.getVersion().getVersion()), config);
ApplicationManager applicationManager = deployApplication(appId, appRequest);
MapReduceManager mrManager = applicationManager.getMapReduceManager("FieldAggregator").start();
mrManager.waitForFinish(180, TimeUnit.SECONDS);
Table logDataStore = (Table) getDataset("dataQuality").get();
Scanner scanner = logDataStore.scan(null, null);
DiscreteValuesHistogram discreteValuesHistogramAggregationFunction = new DiscreteValuesHistogram();
Row row;
try {
while ((row = scanner.next()) != null) {
if (Bytes.toString(row.getRow()).contains("content_length")) {
Map<byte[], byte[]> columnsMapBytes = row.getColumns();
byte[] output = columnsMapBytes.get(Bytes.toBytes("DiscreteValuesHistogram"));
if (output != null) {
discreteValuesHistogramAggregationFunction.combine(output);
}
}
}
} finally {
scanner.close();
}
Map<String, Integer> outputMap = discreteValuesHistogramAggregationFunction.retrieveAggregation();
Map<String, Integer> expectedMap = Maps.newHashMap();
expectedMap.put("256", 3);
Assert.assertEquals(expectedMap, outputMap);
}
@Test
public void testMeanContentLength() throws Exception {
Map<String, Set<String>> testMap = new HashMap<>();
Set<String> testSet = new HashSet<>();
testSet.add("Mean");
testMap.put("content_length", testSet);
DataQualityApp.DataQualityConfig config = new DataQualityApp.DataQualityConfig(
WORKFLOW_SCHEDULE_MINUTES, getStreamSource(), "avg", testMap);
Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "newApp2");
AppRequest<DataQualityApp.DataQualityConfig> appRequest = new AppRequest<>(
new ArtifactSummary(appArtifact.getName(), appArtifact.getVersion().getVersion()), config);
ApplicationManager applicationManager = deployApplication(appId, appRequest);
MapReduceManager mrManager = applicationManager.getMapReduceManager("FieldAggregator").start();
mrManager.waitForFinish(180, TimeUnit.SECONDS);
ServiceManager serviceManager = applicationManager.getServiceManager
(DataQualityService.SERVICE_NAME).start();
serviceManager.waitForStatus(true);
/* Test for aggregationsGetter handler */
URL url = new URL(serviceManager.getServiceURL(),
"v1/sources/logStream/fields/content_length/aggregations/Mean/timeseries");
HttpResponse httpResponse = HttpRequests.execute(HttpRequest.get(url).build());
Assert.assertEquals(HttpURLConnection.HTTP_OK, httpResponse.getResponseCode());
String response = httpResponse.getResponseBodyAsString();
List<TimestampValue> tsValueListActual = GSON.fromJson(response, TOKEN_TYPE_LIST_TIMESTAMP_VALUE);
TimestampValue firstTimestampValue = tsValueListActual.get(0);
Object objActual = firstTimestampValue.getValue();
String actualJSON = GSON.toJson(objActual);
Double actualDouble = GSON.fromJson(actualJSON, TOKEN_TYPE_DOUBLE);
Assert.assertEquals(actualDouble, new Double(256.0));
serviceManager.stop();
serviceManager.waitForFinish(180, TimeUnit.SECONDS);
}
@Test
public void testTotals() throws Exception {
Map<String, Set<String>> testMap = new HashMap<>();
Set<String> testSet = new HashSet<>();
testSet.add("DiscreteValuesHistogram");
testMap.put("content_length", testSet);
testMap.put("status", testSet);
testMap.put("request_time", testSet);
DataQualityApp.DataQualityConfig config = new DataQualityApp.DataQualityConfig(WORKFLOW_SCHEDULE_MINUTES,
getStreamSource(), "histogram", testMap);
Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "newApp3");
AppRequest<DataQualityApp.DataQualityConfig> appRequest = new AppRequest<>(
new ArtifactSummary(appArtifact.getName(), appArtifact.getVersion().getVersion()), config);
ApplicationManager applicationManager = deployApplication(appId, appRequest);
MapReduceManager mrManager = applicationManager.getMapReduceManager("FieldAggregator").start();
mrManager.waitForFinish(180, TimeUnit.SECONDS);
Map<String, Integer> expectedMap = new HashMap<>();
expectedMap.put("256", 3);
/* Test for the aggregationsGetter handler */
ServiceManager serviceManager = applicationManager.getServiceManager
(DataQualityService.SERVICE_NAME).start();
serviceManager.waitForStatus(true);
URL url = new URL(serviceManager.getServiceURL(),
"v1/sources/logStream/fields/content_length/aggregations/DiscreteValuesHistogram/totals");
HttpResponse httpResponse = HttpRequests.execute(HttpRequest.get(url).build());
Assert.assertEquals(HttpURLConnection.HTTP_OK, httpResponse.getResponseCode());
String response = httpResponse.getResponseBodyAsString();
Map<String, Integer> histogramMap = GSON.fromJson(response, TOKEN_TYPE_MAP_STRING_INTEGER);
Assert.assertEquals(expectedMap, histogramMap);
/* Test for the fieldsGetter handler */
url = new URL(serviceManager.getServiceURL(), "v1/sources/logStream/fields");
httpResponse = HttpRequests.execute(HttpRequest.get(url).build());
Assert.assertEquals(HttpURLConnection.HTTP_OK, httpResponse.getResponseCode());
response = httpResponse.getResponseBodyAsString();
Set<FieldDetail> outputSet = GSON.fromJson(response, TOKEN_TYPE_SET_FIELD_DETAIL);
Set<FieldDetail> expectedSet = new HashSet<>();
AggregationTypeValue aggregationTypeValue = new AggregationTypeValue("DiscreteValuesHistogram", true);
Set<AggregationTypeValue> aggregationTypeValuesList = Sets.newHashSet(aggregationTypeValue);
expectedSet.add(new FieldDetail("content_length", aggregationTypeValuesList));
expectedSet.add(new FieldDetail("request_time", aggregationTypeValuesList));
expectedSet.add(new FieldDetail("status", aggregationTypeValuesList));
Assert.assertEquals(expectedSet, outputSet);
/* Test for the aggregationTypesGetter handler */
url = new URL(serviceManager.getServiceURL(), "v1/sources/logStream/fields/content_length");
httpResponse = HttpRequests.execute(HttpRequest.get(url).build());
Assert.assertEquals(HttpURLConnection.HTTP_OK, httpResponse.getResponseCode());
response = httpResponse.getResponseBodyAsString();
List<AggregationTypeValue> expectedAggregationTypeValuesList = new ArrayList<>();
List<AggregationTypeValue> outputAggregationTypeValuesList =
GSON.fromJson(response, TOKEN_TYPE_LIST_AGGREGATION_TYPE_VALUES);
expectedAggregationTypeValuesList.add(new AggregationTypeValue("DiscreteValuesHistogram", true));
Assert.assertEquals(expectedAggregationTypeValuesList, outputAggregationTypeValuesList);
serviceManager.stop();
serviceManager.waitForFinish(180, TimeUnit.SECONDS);
}
private DataQualitySource getStreamSource() {
return getStreamSource("logStream", WORKFLOW_SCHEDULE_MINUTES, "clf");
}
private DataQualitySource getStreamSource(String streamName, int workflowMinutes, String format) {
return getStreamSource(streamName, workflowMinutes, format, null);
}
private DataQualitySource getStreamSource(String streamName, int workflowMinutes, @Nullable String format,
@Nullable String schema) {
Map<String, String> properties = new HashMap<>();
properties.put("name", streamName);
properties.put("duration", String.valueOf(workflowMinutes) + "m");
if (!Strings.isNullOrEmpty(format)) {
properties.put("format", format);
}
if (!Strings.isNullOrEmpty(schema)) {
properties.put("schema", schema);
}
return new DataQualitySource("Stream", streamName, properties);
}
}