/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/
package eu.stratosphere.test.recordJobs.relational;
import java.io.Serializable;
import java.util.Iterator;
import eu.stratosphere.api.common.Plan;
import eu.stratosphere.api.common.Program;
import eu.stratosphere.api.common.ProgramDescription;
import eu.stratosphere.api.java.record.operators.FileDataSink;
import eu.stratosphere.api.java.record.operators.FileDataSource;
import eu.stratosphere.api.java.record.functions.CoGroupFunction;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFieldsExcept;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFieldsFirstExcept;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFieldsSecondExcept;
import eu.stratosphere.api.java.record.functions.JoinFunction;
import eu.stratosphere.api.java.record.functions.MapFunction;
import eu.stratosphere.api.java.record.io.CsvInputFormat;
import eu.stratosphere.api.java.record.io.CsvOutputFormat;
import eu.stratosphere.api.java.record.operators.CoGroupOperator;
import eu.stratosphere.api.java.record.operators.JoinOperator;
import eu.stratosphere.api.java.record.operators.MapOperator;
import eu.stratosphere.types.IntValue;
import eu.stratosphere.types.Record;
import eu.stratosphere.types.StringValue;
import eu.stratosphere.util.Collector;
/**
* Implements the following relational OLAP query as PACT program:
*
* <code><pre>
* SELECT r.pageURL, r.pageRank, r.avgDuration
* FROM Documents d JOIN Rankings r
* ON d.url = r.pageURL
* WHERE CONTAINS(d.contents, [keywords])
* AND r.pageRank > [rank]
* AND NOT EXISTS (
* SELECT * FROM Visits v
* WHERE v.destURL = d.url
* AND YEAR(v.visitDate) = [year]);
* </pre></code>
*
* Table Schemas: <code><pre>
* CREATE TABLE Documents (
* url VARCHAR(100) PRIMARY KEY,
* contents TEXT );
*
* CREATE TABLE Rankings (
* pageRank INT,
* pageURL VARCHAR(100) PRIMARY KEY,
* avgDuration INT );
*
* CREATE TABLE Visits (
* sourceIP VARCHAR(16),
* destURL VARCHAR(100),
* visitDate DATE,
* adRevenue FLOAT,
* userAgent VARCHAR(64),
* countryCode VARCHAR(3),
* languageCode VARCHAR(6),
* searchWord VARCHAR(32),
* duration INT );
* </pre></code>
*
*/
public class WebLogAnalysis implements Program, ProgramDescription {
private static final long serialVersionUID = 1L;
/**
 * Map function that lets a document pass only when its text mentions every
 * entry of a fixed keyword list.
 */
@ConstantFieldsExcept(1)
public static class FilterDocs extends MapFunction implements Serializable {

	private static final long serialVersionUID = 1L;

	/** Keywords to look for; each one carries a leading and a trailing blank. */
	private static final String[] KEYWORDS = { " editors ", " oscillations ", " convection " };

	/**
	 * Emits the record with field 1 (the document text) set to null when the text
	 * contains all keywords. Documents missing at least one keyword are dropped.
	 *
	 * Output Format:
	 * 0: URL
	 *
	 * @param record the document record; field 1 is read as the document text
	 * @param out collector that receives the records passing the filter
	 */
	@Override
	public void map(Record record, Collector<Record> out) throws Exception {
		final String text = record.getField(1, StringValue.class).toString();

		// FILTER: a single missing keyword disqualifies the document
		for (int i = 0; i < KEYWORDS.length; i++) {
			if (!text.contains(KEYWORDS[i])) {
				return;
			}
		}

		// PROJECT: blank out the text field before forwarding the record
		record.setNull(1);
		out.collect(record);
	}
}
/**
 * Map function that forwards rank records whose rank is strictly greater than
 * a fixed threshold.
 */
@ConstantFieldsExcept({})
public static class FilterRanks extends MapFunction implements Serializable {

	private static final long serialVersionUID = 1L;

	/** Records with a rank less than or equal to this value are discarded. */
	private static final int RANKFILTER = 50;

	/**
	 * Forwards the record unchanged when the integer in field 1 exceeds
	 * {@code RANKFILTER}; otherwise nothing is emitted.
	 *
	 * Output Format:
	 * 0: URL
	 * 1: RANK
	 * 2: AVG_DURATION
	 *
	 * @param record the rank record; field 1 is read as the rank
	 * @param out collector that receives the records passing the filter
	 */
	@Override
	public void map(Record record, Collector<Record> out) throws Exception {
		final int rank = record.getField(1, IntValue.class).getValue();
		if (rank <= RANKFILTER) {
			return;
		}
		out.collect(record);
	}
}
/**
 * Map function that keeps visit records whose visit date falls into one fixed
 * year.
 */
@ConstantFieldsExcept(1)
public static class FilterVisits extends MapFunction implements Serializable {

	private static final long serialVersionUID = 1L;

	/** Only visits from this year pass the filter. */
	private static final int YEARFILTER = 2010;

	/**
	 * Reads the date string in field 1, takes its first four characters as the
	 * year, and emits the record with field 1 set to null when that year equals
	 * {@code YEARFILTER}.
	 *
	 * Output Format:
	 * 0: URL
	 *
	 * @param record the visit record; field 1 is read as a date string of the form YYYY-MM-DD
	 * @param out collector that receives the records passing the filter
	 */
	@Override
	public void map(Record record, Collector<Record> out) throws Exception {
		// The year is the leading four-character prefix of a YYYY-MM-DD date
		final String visitDate = record.getField(1, StringValue.class).getValue();
		final int visitYear = Integer.parseInt(visitDate.substring(0, 4));

		if (visitYear != YEARFILTER) {
			return;
		}

		// Blank out the date field before forwarding the record
		record.setNull(1);
		out.collect(record);
	}
}
/**
 * Join function applied to pairs of a filtered document record and a filtered
 * rank record.
 */
@ConstantFieldsSecondExcept({})
public static class JoinDocRanks extends JoinFunction implements Serializable {

	private static final long serialVersionUID = 1L;

	/**
	 * Forwards the rank record of each joined pair unchanged; the document
	 * record is not read.
	 *
	 * Output Format:
	 * 0: URL
	 * 1: RANK
	 * 2: AVG_DURATION
	 *
	 * @param document the record from the documents side (ignored)
	 * @param rank the record from the ranks side, emitted as the join result
	 * @param out collector that receives the join result
	 */
	@Override
	public void join(Record document, Record rank, Collector<Record> out) throws Exception {
		// The result of the join is exactly the rank-side record
		out.collect(rank);
	}
}
/**
 * CoGroupFunction that realizes an anti-join.
 * If the second input (visits) provides no records for a key, all records of
 * the first input (ranks) for that key are emitted. Otherwise, nothing is
 * emitted for that key.
 */
@ConstantFieldsFirstExcept({})
public static class AntiJoinVisits extends CoGroupFunction implements Serializable {

	private static final long serialVersionUID = 1L;

	/**
	 * Emits every record of the rank iterator when the visit iterator is empty,
	 * and emits nothing as soon as at least one visit exists.
	 *
	 * Output Format:
	 * 0: URL
	 * 1: RANK
	 * 2: AVG_DURATION
	 *
	 * @param ranks the records of the first input for the current key
	 * @param visits the records of the second input for the current key
	 * @param out collector that receives the surviving rank records
	 */
	@Override
	public void coGroup(Iterator<Record> ranks, Iterator<Record> visits, Collector<Record> out) {
		// Any visit for this key suppresses the whole group
		if (visits.hasNext()) {
			return;
		}

		// No visit found: forward every rank record of the group
		while (ranks.hasNext()) {
			final Record rank = ranks.next();
			out.collect(rank);
		}
	}
}
/**
 * Assembles the plan of the weblog analysis job: three file sources (documents,
 * ranks, visits), one filter per source, a join of the filtered documents and
 * ranks, an anti-join of that result against the filtered visits, and a file sink.
 *
 * @param args positional job parameters, all optional:
 *        <ul>
 *        <li>0: number of sub tasks, used as degree of parallelism (default 1)</li>
 *        <li>1: path of the documents input (default "")</li>
 *        <li>2: path of the ranks input (default "")</li>
 *        <li>3: path of the visits input (default "")</li>
 *        <li>4: path of the result output (default "")</li>
 *        </ul>
 * @return the plan with the result sink as its only data sink
 * @throws NumberFormatException if {@code args[0]} is present but is not a parsable integer
 */
@Override
public Plan getPlan(String... args) {
	// parse job parameters
	int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
	String docsInput = (args.length > 1 ? args[1] : "");
	String ranksInput = (args.length > 2 ? args[2] : "");
	String visitsInput = (args.length > 3 ? args[3] : "");
	String output = (args.length > 4 ? args[4] : "");

	/*
	 * Output Format:
	 * 0: URL
	 * 1: DOCUMENT_TEXT
	 */
	// Create data source for documents relation
	@SuppressWarnings("unchecked")
	CsvInputFormat docsFormat = new CsvInputFormat('|', StringValue.class, StringValue.class);
	FileDataSource docs = new FileDataSource(docsFormat, docsInput, "Docs Input");

	/*
	 * Output Format:
	 * 0: URL
	 * 1: RANK
	 * 2: AVG_DURATION
	 */
	// Create data source for ranks relation
	// NOTE(review): the position arguments presumably select the input column, so the
	// URL (column 1 of the file) ends up in record field 0 -- confirm against CsvInputFormat
	FileDataSource ranks = new FileDataSource(new CsvInputFormat(), ranksInput, "Ranks input");
	CsvInputFormat.configureRecordFormat(ranks)
		.recordDelimiter('\n')
		.fieldDelimiter('|')
		.field(StringValue.class, 1)
		.field(IntValue.class, 0)
		.field(IntValue.class, 2);

	/*
	 * Output Format:
	 * 0: URL
	 * 1: DATE
	 */
	// Create data source for visits relation
	// NOTE(review): the null type presumably makes the format skip the first input column -- confirm
	@SuppressWarnings("unchecked")
	CsvInputFormat visitsFormat = new CsvInputFormat('|', null, StringValue.class, StringValue.class);
	// The name previously ended in a stray ":q" (leaked editor command); removed
	FileDataSource visits = new FileDataSource(visitsFormat, visitsInput, "Visits input");

	// Create MapOperator for filtering the entries from the documents relation
	MapOperator filterDocs = MapOperator.builder(new FilterDocs())
		.input(docs)
		.name("Filter Docs")
		.build();
	filterDocs.getCompilerHints().setFilterFactor(0.15f);

	// Create MapOperator for filtering the entries from the ranks relation
	MapOperator filterRanks = MapOperator.builder(new FilterRanks())
		.input(ranks)
		.name("Filter Ranks")
		.build();
	filterRanks.getCompilerHints().setFilterFactor(0.25f);

	// Create MapOperator for filtering the entries from the visits relation
	MapOperator filterVisits = MapOperator.builder(new FilterVisits())
		.input(visits)
		.name("Filter Visits")
		.build();
	filterVisits.getCompilerHints().setFilterFactor(0.2f);

	// Create JoinOperator to join the filtered documents and ranks relation
	// on field 0 of both inputs
	JoinOperator joinDocsRanks = JoinOperator.builder(new JoinDocRanks(), StringValue.class, 0, 0)
		.input1(filterDocs)
		.input2(filterRanks)
		.name("Join Docs Ranks")
		.build();

	// Create CoGroupOperator to realize an anti join between the joined
	// documents and ranks relation and the filtered visits relation
	CoGroupOperator antiJoinVisits = CoGroupOperator.builder(new AntiJoinVisits(), StringValue.class, 0, 0)
		.input1(joinDocsRanks)
		.input2(filterVisits)
		.name("Antijoin DocsVisits")
		.build();

	// Create data sink for writing the result of the OLAP query
	FileDataSink result = new FileDataSink(new CsvOutputFormat(), output, antiJoinVisits, "Result");
	result.setDegreeOfParallelism(numSubTasks);
	CsvOutputFormat.configureRecordFormat(result)
		.recordDelimiter('\n')
		.fieldDelimiter('|')
		.lenient(true)
		.field(IntValue.class, 1)
		.field(StringValue.class, 0)
		.field(IntValue.class, 2);

	// Return the PACT plan
	Plan p = new Plan(result, "Weblog Analysis");
	p.setDefaultParallelism(numSubTasks);
	return p;
}
/**
 * Returns a one-line summary of the positional parameters read by
 * {@link #getPlan(String...)}.
 *
 * @return the parameter description string
 */
@Override
public String getDescription() {
	final String usage = "Parameters: [numSubTasks], [docs], [ranks], [visits], [output]";
	return usage;
}
}