/*
* Copyright © 2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.examples.wikipedia;

import co.cask.cdap.api.Predicate;
import co.cask.cdap.api.workflow.AbstractWorkflow;
import co.cask.cdap.api.workflow.Value;
import co.cask.cdap.api.workflow.WorkflowContext;
import co.cask.cdap.api.workflow.WorkflowToken;

import java.util.Map;

/**
 * Workflow for the Wikipedia data pipeline. Chains MapReduce and Spark programs together,
 * using conditions and the workflow token to pick a path through the pipeline.
 *
 * Recognized runtime arguments:
 * - min.pages.threshold: minimum number of pages required to proceed (defaults to 10)
 * - mode: set to "online" to download Wikipedia data over the internet
 */
public class WikipediaPipelineWorkflow extends AbstractWorkflow {

  static final String NAME = WikipediaPipelineWorkflow.class.getSimpleName();
  static final String MIN_PAGES_THRESHOLD_KEY = "min.pages.threshold";
  static final String MODE_KEY = "mode";
  static final String ONLINE_MODE = "online";

  private final String sparkProgramName;

  @SuppressWarnings("ConstantConditions")
  public WikipediaPipelineWorkflow(WikipediaPipelineApp.WikipediaAppConfig config) {
    this.sparkProgramName = SparkWikipediaClustering.NAME + "-" + config.clusteringAlgorithm.toUpperCase();
  }

  @Override
  protected void configure() {
    setName(NAME);
    setDescription("A workflow that demonstrates a typical data pipeline to process Wikipedia data.");
    // This program records its result in the workflow token; the condition below inspects it.
    addMapReduce(WikipediaPipelineApp.LIKES_TO_DATASET_MR_NAME);
    condition(new EnoughDataToProceed())
      .condition(new IsWikipediaSourceOnline())
        // Online mode: download Wikipedia data over the internet.
        .addMapReduce(WikipediaDataDownloader.NAME)
      .otherwise()
        // Offline mode: use Wikipedia data that is already available.
        .addMapReduce(WikipediaPipelineApp.WIKIPEDIA_TO_DATASET_MR_NAME)
      .end()
      .addMapReduce(WikiContentValidatorAndNormalizer.NAME)
      // Run the clustering Spark program and the top-N MapReduce in parallel.
      .fork()
        .addSpark(sparkProgramName)
      .also()
        .addMapReduce(TopNMapReduce.NAME)
      .join()
    .otherwise()
    .end();
  }
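
  /**
   * Checks whether enough Wikipedia pages are available for the pipeline to continue. Reads the
   * result and the record count that earlier nodes put into the workflow token and compares the
   * count against the min.pages.threshold runtime argument (default 10).
   */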
  static class EnoughDataToProceed implements Predicate<WorkflowContext> {

    @Override
    public boolean apply(WorkflowContext context) {
      Map<String, String> runtimeArguments = context.getRuntimeArguments();
      // Default threshold, overridable through the min.pages.threshold runtime argument.
      int threshold = 10;
      if (runtimeArguments.containsKey(MIN_PAGES_THRESHOLD_KEY)) {
        threshold = Integer.parseInt(runtimeArguments.get(MIN_PAGES_THRESHOLD_KEY));
      }
      WorkflowToken token = context.getToken();
      Value result = token.get("result", WikipediaPipelineApp.LIKES_TO_DATASET_MR_NAME);
      Value numPages = token.get("custom.num.records", WorkflowToken.Scope.SYSTEM);
      boolean conditionResult = result != null && result.getAsBoolean() &&
        numPages != null && numPages.getAsLong() > threshold;
      token.put("result", Value.of(conditionResult));
      // Also record in the token whether Wikipedia data should be downloaded over the internet.
      // NOTE: The next predicate could read this information from the runtime arguments directly;
      // it is routed through the workflow token here to demonstrate passing data between nodes.
      if (runtimeArguments.containsKey(MODE_KEY)) {
        token.put(ONLINE_MODE, Value.of(runtimeArguments.get(MODE_KEY).equalsIgnoreCase(ONLINE_MODE)));
      }
      return conditionResult;
    }
  }
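
  /**
   * Checks the flag that {@link EnoughDataToProceed} put into the workflow token to decide
   * whether Wikipedia data should be fetched over the internet.
   */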
  static class IsWikipediaSourceOnline implements Predicate<WorkflowContext> {

    @Override
    public boolean apply(WorkflowContext context) {
      WorkflowToken token = context.getToken();
      Value online = token.get(ONLINE_MODE, EnoughDataToProceed.class.getSimpleName(), WorkflowToken.Scope.USER);
      return online != null && online.getAsBoolean();
    }
  }
}