package com.amazonaws.bigdatablog.indexcommoncrawl;
import cascading.flow.FlowDef;
import cascading.operation.regex.RegexGenerator;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.Tap;
import cascading.tap.local.FileTap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import org.elasticsearch.hadoop.cascading.EsTap;
import java.util.Properties;
public class CommonCrawlIndex {

    public static FlowDef buildFlowDef(Properties properties) {
        // Create the Cascading "source" (input) tap that reads the Common Crawl WAT file(s).
        Tap source;

        // Check whether we're running locally or distributed on Hadoop/HDFS.
        boolean isDistributed = "DISTRIBUTED".equals(properties.getProperty("platform"));

        String inPath = properties.getProperty("inPath");
        if (isDistributed) {
            source = new Hfs(new TextLine(new Fields("line")), inPath);
        } else {
            source = new FileTap(new cascading.scheme.local.TextLine(new Fields("line")), inPath);
        }

        // Create the "sink" (output) tap that exports the data to Elasticsearch.
        Tap sink = new EsTap(properties.getProperty("es.target.index"));

        // Build and return the Cascading flow definition.
        return createCommonCrawlFlowDef(source, sink);
    }
    public static FlowDef createCommonCrawlFlowDef(Tap source, Tap sink) {
        Pipe parsePipe = new Pipe("exportCommonCrawlWATPipe");

        // Use a regular expression to keep only the lines holding a WAT metadata record:
        // each record is a JSON object whose top-level key is "Envelope".
        RegexGenerator splitter = new RegexGenerator(new Fields("json"), "^\\{\"Envelope\".*$");
        parsePipe = new Each(parsePipe, new Fields("line"), splitter, Fields.RESULTS);

        // Connect the source and sink taps to the pipe assembly in a flow definition.
        return FlowDef.flowDef()
                .addSource(parsePipe, source)
                .addTailSink(parsePipe, sink);
    }
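
    /*
     * A minimal runner sketch, not part of the original listing: it assumes the
     * flow runs on the Hadoop platform and that settings such as "inPath",
     * "es.nodes", and "es.target.index" are passed in as system properties
     * (e.g. -DinPath=... -Des.target.index=...).
     */
    public static void main(String[] args) {
        Properties properties = new Properties();
        properties.putAll(System.getProperties()); // pick up -D... settings from the JVM command line
        properties.setProperty("platform", "DISTRIBUTED"); // assumption: run against HDFS, not the local filesystem

        // Hand the flow definition to the Hadoop planner and run it to completion.
        new cascading.flow.hadoop.HadoopFlowConnector(properties)
                .connect(buildFlowDef(properties))
                .complete();
    }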
}