/* * Copyright © 2015 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.cdap.examples.wikipedia; import co.cask.cdap.api.Config; import co.cask.cdap.api.app.AbstractApplication; import co.cask.cdap.api.data.stream.Stream; import co.cask.cdap.api.dataset.DatasetProperties; import co.cask.cdap.api.dataset.lib.KeyValueTable; import co.cask.cdap.api.dataset.table.Table; import javax.annotation.Nullable; /** * App to demonstrate a data pipeline that processes Wikipedia data using a CDAP Workflow. 
*/
public class WikipediaPipelineApp extends AbstractApplication<WikipediaPipelineApp.WikipediaAppConfig> {

  // Names of the streams, datasets, and MapReduce programs wired together by this pipeline.
  static final String PAGE_TITLES_STREAM = "pageTitleStream";
  static final String RAW_WIKIPEDIA_STREAM = "wikiStream";
  static final String PAGE_TITLES_DATASET = "pages";
  static final String RAW_WIKIPEDIA_DATASET = "wikidata";
  static final String NORMALIZED_WIKIPEDIA_DATASET = "normalized";
  static final String SPARK_CLUSTERING_OUTPUT_DATASET = "clustering";
  static final String MAPREDUCE_TOPN_OUTPUT = "topn";
  static final String LIKES_TO_DATASET_MR_NAME = "LikesToDataset";
  static final String WIKIPEDIA_TO_DATASET_MR_NAME = "WikiDataToDataset";

  /**
   * Registers every piece of the pipeline: the two input streams, the MapReduce and Spark
   * programs, the datasets they read and write, the workflow that ties them together, and
   * the service that exposes the results.
   */
  @Override
  public void configure() {
    addStream(new Stream(PAGE_TITLES_STREAM));
    addStream(new Stream(RAW_WIKIPEDIA_STREAM));
    addMapReduce(new StreamToDataset(LIKES_TO_DATASET_MR_NAME));
    addMapReduce(new StreamToDataset(WIKIPEDIA_TO_DATASET_MR_NAME));
    addMapReduce(new WikipediaDataDownloader());
    addMapReduce(new WikiContentValidatorAndNormalizer());
    addMapReduce(new TopNMapReduce());
    addSpark(new SparkWikipediaClustering(getConfig()));
    createKeyValueDataset(PAGE_TITLES_DATASET, "Page titles dataset");
    createKeyValueDataset(RAW_WIKIPEDIA_DATASET, "Raw Wikipedia dataset");
    createKeyValueDataset(NORMALIZED_WIKIPEDIA_DATASET, "Normalized Wikipedia dataset");
    createDataset(SPARK_CLUSTERING_OUTPUT_DATASET, Table.class,
                  DatasetProperties.builder().setDescription("Spark clustering output dataset").build());
    createKeyValueDataset(MAPREDUCE_TOPN_OUTPUT, "MapReduce top-'N'-words output dataset");
    addWorkflow(new WikipediaPipelineWorkflow(getConfig()));
    addService(new WikipediaService());
  }

  // Registers a KeyValueTable dataset with the given name and description.
  private void createKeyValueDataset(String name, String description) {
    createDataset(name, KeyValueTable.class,
                  DatasetProperties.builder().setDescription(description).build());
  }

  /**
   * Config for Wikipedia App.
   */
  public static class WikipediaAppConfig extends Config {

    // Clustering algorithm used by the Spark program; the constructor substitutes "lda"
    // for a null argument. NOTE(review): kept @Nullable because a Config instance may be
    // populated without going through this constructor — TODO confirm against CDAP's
    // Config deserialization behavior.
    @Nullable
    public final String clusteringAlgorithm;

    public WikipediaAppConfig() {
      this(null);
    }

    /**
     * @param clusteringAlgorithm the clustering algorithm to use, or null to default to "lda"
     */
    public WikipediaAppConfig(@Nullable String clusteringAlgorithm) {
      if (clusteringAlgorithm == null) {
        this.clusteringAlgorithm = "lda";
      } else {
        this.clusteringAlgorithm = clusteringAlgorithm;
      }
    }
  }
}