/*
 * Copyright © 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.examples.wikipedia;

import co.cask.cdap.api.artifact.ArtifactVersion;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.api.workflow.WorkflowToken;
import co.cask.cdap.common.NotFoundException;
import co.cask.cdap.common.utils.Tasks;
import co.cask.cdap.proto.Id;
import co.cask.cdap.proto.RunRecord;
import co.cask.cdap.proto.WorkflowTokenNodeDetail;
import co.cask.cdap.proto.artifact.AppRequest;
import co.cask.cdap.proto.artifact.ArtifactSummary;
import co.cask.cdap.test.ApplicationManager;
import co.cask.cdap.test.StreamManager;
import co.cask.cdap.test.TestBase;
import co.cask.cdap.test.TestConfiguration;
import co.cask.cdap.test.WorkflowManager;
import co.cask.cdap.test.XSlowTests;
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;

import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.TimeUnit;
import javax.annotation.Nullable;

/**
 * Test for {@link WikipediaPipelineApp}.
 */
public class WikipediaPipelineAppTest extends TestBase {

  private static final Gson GSON = new Gson();

  @ClassRule
  public static final TestConfiguration CONFIG = new TestConfiguration("explore.enabled", false);

  private static final Id.Artifact ARTIFACT_ID =
    Id.Artifact.from(Id.Namespace.DEFAULT, "WikipediaPipelineArtifact", new ArtifactVersion("1.0"));
  private static final Id.Application APP_ID =
    Id.Application.from(Id.Namespace.DEFAULT, WikipediaPipelineApp.class.getSimpleName());
  private static final ArtifactSummary ARTIFACT_SUMMARY = new ArtifactSummary("WikipediaPipelineArtifact", "1.0");

  @BeforeClass
  public static void setup() throws Exception {
    addAppArtifact(ARTIFACT_ID, WikipediaPipelineApp.class);
  }

  @Test
  @Category(XSlowTests.class)
  public void test() throws Exception {
    WikipediaPipelineApp.WikipediaAppConfig appConfig = new WikipediaPipelineApp.WikipediaAppConfig();
    AppRequest<WikipediaPipelineApp.WikipediaAppConfig> appRequest = new AppRequest<>(ARTIFACT_SUMMARY, appConfig);
    ApplicationManager appManager = deployApplication(APP_ID, appRequest);

    // Set up input streams with test data
    createTestData();

    WorkflowManager workflowManager = appManager.getWorkflowManager(WikipediaPipelineWorkflow.NAME);

    // Test with the default threshold. The workflow should not proceed beyond the first condition.
    testWorkflow(workflowManager, appConfig);

    // Test with a reduced threshold, so that the workflow proceeds beyond the first predicate
    testWorkflow(workflowManager, appConfig, 1);

    // Test with K-Means clustering
    appConfig = new WikipediaPipelineApp.WikipediaAppConfig("kmeans");
    appRequest = new AppRequest<>(ARTIFACT_SUMMARY, appConfig);
    appManager = deployApplication(APP_ID, appRequest);
    workflowManager = appManager.getWorkflowManager(WikipediaPipelineWorkflow.NAME);
    testWorkflow(workflowManager, appConfig, 1);
  }

  private void createTestData() throws Exception {
    StreamManager likesStreamManager = getStreamManager(WikipediaPipelineApp.PAGE_TITLES_STREAM);
    String like1 = GSON.toJson(new StreamToDataset.PageTitleToDatasetMapper.Page(
      "Metallica", "103636093053996", "2012-02-11T22:41:49+0000"));
    String like2 = GSON.toJson(new StreamToDataset.PageTitleToDatasetMapper.Page(
      "grunge", "58417452552", "2012-02-07T21:29:53+0000"));
    likesStreamManager.send(like1);
    likesStreamManager.send(like2);

    StreamManager rawWikipediaStreamManager = getStreamManager(WikipediaPipelineApp.RAW_WIKIPEDIA_STREAM);
    String data1 = "{\"batchcomplete\":\"\",\"query\":{\"normalized\":[{\"from\":\"metallica\",\"to\":\"Metallica\"}]" +
      ",\"pages\":{\"18787\":{\"pageid\":18787,\"ns\":0,\"title\":\"Metallica\",\"revisions\":[{\"contentformat\":" +
      "\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"{{Other uses}}{{pp-semi|small=yes}}{{pp-move-indef|" +
      "small=yes}}{{Use mdy dates|date=April 2013}}{{Infobox musical artist|name = Metallica|image = Metallica at " +
      "The O2 Arena London 2008.jpg|caption = Metallica in [[London]] in 2008. From left to right: [[Kirk Hammett]], " +
      "[[Lars Ulrich]], [[James Hetfield]] and [[Robert Trujillo]]\"}]}}}}";
    String data2 = "{\"batchcomplete\":\"\",\"query\":{\"pages\":{\"51580\":{\"pageid\":51580,\"ns\":0," +
      "\"title\":\"Grunge\",\"revisions\":[{\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\"," +
      "\"*\":\"{{About|the music genre}}{{Infobox music genre| name = Grunge| bgcolor = crimson| color = white| " +
      "stylistic_origins = {{nowrap|[[Alternative rock]], [[hardcore punk]],}} [[Heavy metal music|heavy metal]], " +
      "[[punk rock]], [[hard rock]], [[noise rock]]| cultural_origins = Mid-1980s, [[Seattle|Seattle, Washington]], " +
      "[[United States]]| instruments = [[Electric guitar]], [[bass guitar]], [[Drum kit|drums]], " +
      "[[Singing|vocals]]| derivatives = [[Post-grunge]], [[nu metal]]| subgenrelist = | subgenres = | fusiongenres" +
      " = | regional_scenes = [[Music of Washington (state)|Washington state]]| other_topics = * " +
      "[[Alternative metal]]* [[Generation X]]* [[Grunge speak|grunge speak hoax]]* [[timeline of alternative " +
      "rock]]}}'''Grunge''' (sometimes referred to as the '''Seattle sound''') is a subgenre of [[alternative rock]]" +
      " that emerged during the mid-1980s in the American state of [[Washington (state)|Washington]], particularly " +
      "in [[Seattle]]. The early grunge movement revolved around Seattle's [[independent record label]] " +
      "[[Sub Pop]], but by the early 1990s its popularity had spread, with grunge acts in California and other " +
      "parts of the U.S. " +
      "building strong followings and signing major record deals.Grunge became commercially " +
      "successful in the first half of the 1990s, due mainly to the release of [[Nirvana (band)|Nirvana]]'s " +
      "''[[Nevermind]]'', [[Pearl Jam]]'s ''[[Ten (Pearl Jam album)|Ten]]'', [[Soundgarden]]'s " +
      "''[[Badmotorfinger]]'', [[Alice in Chains]]' ''[[Dirt (Alice in Chains album)|Dirt]]'', and " +
      "[[Stone Temple Pilots]]' ''[[Core (Stone Temple Pilots album)|Core]]''.\"}]}}}}";
    rawWikipediaStreamManager.send(data1);
    rawWikipediaStreamManager.send(data2);

    waitForStreamToBePopulated(likesStreamManager, 2);
    waitForStreamToBePopulated(rawWikipediaStreamManager, 2);
  }

  private void waitForStreamToBePopulated(final StreamManager streamManager, int numEvents) throws Exception {
    Tasks.waitFor(numEvents, new Callable<Integer>() {
      @Override
      public Integer call() throws Exception {
        List<StreamEvent> streamEvents = streamManager.getEvents(0, Long.MAX_VALUE, Integer.MAX_VALUE);
        return streamEvents.size();
      }
    }, 10, TimeUnit.SECONDS);
  }

  private void testWorkflow(WorkflowManager workflowManager,
                            WikipediaPipelineApp.WikipediaAppConfig config) throws Exception {
    testWorkflow(workflowManager, config, null);
  }

  private void testWorkflow(WorkflowManager workflowManager, WikipediaPipelineApp.WikipediaAppConfig config,
                            @Nullable Integer threshold) throws Exception {
    if (threshold == null) {
      workflowManager.start();
    } else {
      workflowManager.start(ImmutableMap.of(
        WikipediaPipelineWorkflow.MIN_PAGES_THRESHOLD_KEY, String.valueOf(threshold),
        WikipediaPipelineWorkflow.MODE_KEY, WikipediaPipelineWorkflow.ONLINE_MODE));
    }
    workflowManager.waitForFinish(5, TimeUnit.MINUTES);
    String pid = getLatestPid(workflowManager.getHistory());
    WorkflowTokenNodeDetail tokenAtCondition =
      workflowManager.getTokenAtNode(pid, WikipediaPipelineWorkflow.EnoughDataToProceed.class.getSimpleName(),
                                     WorkflowToken.Scope.USER, "result");
    boolean conditionResult = Boolean.parseBoolean(tokenAtCondition.getTokenDataAtNode().get("result"));
    if (threshold == null) {
      Assert.assertFalse(conditionResult);
      assertWorkflowToken(workflowManager, config, pid, false);
    } else {
      Assert.assertTrue(conditionResult);
      assertWorkflowToken(workflowManager, config, pid, true);
    }
  }

  @Nullable
  private String getLatestPid(List<RunRecord> history) {
    String pid = null;
    long latestStartTime = 0;
    for (RunRecord runRecord : history) {
      // OK to use start ts, since we ensure that the next run begins after the previous run finishes in the test
      if (runRecord.getStartTs() > latestStartTime) {
        latestStartTime = runRecord.getStartTs();
        pid = runRecord.getPid();
      }
    }
    return pid;
  }

  private void assertWorkflowToken(WorkflowManager workflowManager, WikipediaPipelineApp.WikipediaAppConfig config,
                                   String pid, boolean continueConditionSucceeded) throws NotFoundException {
    assertTokenAtPageTitlesMRNode(workflowManager, pid);
    assertTokenAtRawDataMRNode(workflowManager, pid, continueConditionSucceeded);
    assertTokenAtNormalizationMRNode(workflowManager, pid, continueConditionSucceeded);
    assertTokenAtSparkClusteringNode(workflowManager, config, pid, continueConditionSucceeded);
    assertTokenAtTopNMRNode(workflowManager, pid, continueConditionSucceeded);
  }

  private void assertTokenAtPageTitlesMRNode(WorkflowManager workflowManager, String pid) throws NotFoundException {
    WorkflowTokenNodeDetail pageTitlesUserTokens =
      workflowManager.getTokenAtNode(pid, WikipediaPipelineApp.LIKES_TO_DATASET_MR_NAME, null, null);
    Assert.assertTrue(Boolean.parseBoolean(pageTitlesUserTokens.getTokenDataAtNode().get("result")));
    WorkflowTokenNodeDetail pageTitlesSystemTokens =
      workflowManager.getTokenAtNode(pid, WikipediaPipelineApp.LIKES_TO_DATASET_MR_NAME,
                                     WorkflowToken.Scope.SYSTEM, null);
    Assert.assertEquals(2, Integer.parseInt(pageTitlesSystemTokens.getTokenDataAtNode().get("custom.num.records")));
  }

  private void assertTokenAtRawDataMRNode(WorkflowManager workflowManager, String pid,
                                          boolean continueConditionSucceeded) throws NotFoundException {
    if (!continueConditionSucceeded) {
      return;
    }
    WorkflowTokenNodeDetail rawWikiDataUserTokens =
      workflowManager.getTokenAtNode(pid, WikipediaDataDownloader.NAME, null, null);
    Assert.assertTrue(Boolean.parseBoolean(rawWikiDataUserTokens.getTokenDataAtNode().get("result")));
    WorkflowTokenNodeDetail rawWikiDataSystemTokens =
      workflowManager.getTokenAtNode(pid, WikipediaDataDownloader.NAME, WorkflowToken.Scope.SYSTEM, null);
    Assert.assertEquals(2, Integer.parseInt(rawWikiDataSystemTokens.getTokenDataAtNode().get("custom.num.records")));
  }

  private void assertTokenAtNormalizationMRNode(WorkflowManager workflowManager, String pid,
                                                boolean continueConditionSucceeded) throws NotFoundException {
    if (!continueConditionSucceeded) {
      return;
    }
    WorkflowTokenNodeDetail normalizedDataUserTokens =
      workflowManager.getTokenAtNode(pid, WikiContentValidatorAndNormalizer.NAME, null, null);
    Assert.assertTrue(Boolean.parseBoolean(normalizedDataUserTokens.getTokenDataAtNode().get("result")));
    WorkflowTokenNodeDetail normalizedDataSystemTokens =
      workflowManager.getTokenAtNode(pid, WikiContentValidatorAndNormalizer.NAME, WorkflowToken.Scope.SYSTEM, null);
    Assert.assertEquals(2,
                        Integer.parseInt(normalizedDataSystemTokens.getTokenDataAtNode().get("custom.num.records")));
  }

  private void assertTokenAtSparkClusteringNode(WorkflowManager workflowManager,
                                                WikipediaPipelineApp.WikipediaAppConfig config, String pid,
                                                boolean continueConditionSucceeded) throws NotFoundException {
    if (!continueConditionSucceeded) {
      return;
    }
    @SuppressWarnings("ConstantConditions")
    String sparkProgramName = SparkWikipediaClustering.NAME + "-" + config.clusteringAlgorithm.toUpperCase();
    WorkflowTokenNodeDetail clusteringUserTokens =
      workflowManager.getTokenAtNode(pid, sparkProgramName, null, null);
    Assert.assertEquals(10, Integer.parseInt(clusteringUserTokens.getTokenDataAtNode().get("num.records")));
    Assert.assertTrue(clusteringUserTokens.getTokenDataAtNode().containsKey("highest.score.term"));
    Assert.assertTrue(clusteringUserTokens.getTokenDataAtNode().containsKey("highest.score.value"));
    WorkflowTokenNodeDetail ldaSystemTokens =
      workflowManager.getTokenAtNode(pid, sparkProgramName, WorkflowToken.Scope.SYSTEM, null);
    Assert.assertTrue(ldaSystemTokens.getTokenDataAtNode().isEmpty());
  }

  private void assertTokenAtTopNMRNode(WorkflowManager workflowManager, String pid,
                                       boolean continueConditionSucceeded) throws NotFoundException {
    if (!continueConditionSucceeded) {
      return;
    }
    WorkflowTokenNodeDetail topNUserTokens = workflowManager.getTokenAtNode(pid, TopNMapReduce.NAME, null, null);
    Assert.assertTrue(Boolean.parseBoolean(topNUserTokens.getTokenDataAtNode().get("result")));
    WorkflowTokenNodeDetail topNSystemTokens =
      workflowManager.getTokenAtNode(pid, TopNMapReduce.NAME, WorkflowToken.Scope.SYSTEM, null);
    Assert.assertEquals(10, Integer.parseInt(topNSystemTokens.getTokenDataAtNode().get("custom.num.records")));
  }
}