/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/
package eu.stratosphere.test.recordJobTests;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import eu.stratosphere.api.common.Plan;
import eu.stratosphere.api.java.record.operators.FileDataSink;
import eu.stratosphere.api.java.record.operators.FileDataSource;
import eu.stratosphere.api.java.record.io.CsvOutputFormat;
import eu.stratosphere.api.java.record.io.TextInputFormat;
import eu.stratosphere.api.java.record.operators.MapOperator;
import eu.stratosphere.api.java.record.operators.ReduceOperator;
import eu.stratosphere.test.recordJobs.wordcount.WordCount.CountWords;
import eu.stratosphere.test.recordJobs.wordcount.WordCount.TokenizeLine;
import eu.stratosphere.test.testdata.WordCountData;
import eu.stratosphere.test.util.RecordAPITestBase;
import eu.stratosphere.types.IntValue;
import eu.stratosphere.types.StringValue;
/**
* WordCount with multiple inputs to the reducer.
* <p>
* This test case is an adaptation of issue #192 (and #124), which revealed problems with the union readers in Nephele.
* The problems were fixed with commit 1228a5e. Without that commit the test deadlocks.
*
* @see <a href="https://github.com/stratosphere/stratosphere/issues/192">Issue #192</a>
* @see <a href="https://github.com/stratosphere/stratosphere/issues/124">Issue #124</a>
*/
public class WordCountUnionReduceITCase extends RecordAPITestBase {

	/** Number of times the fixed input text is repeated; the expected counts are scaled by the same factor. */
	private static final int MULTIPLY = 1000;

	/** Default parallelism that is set on the test plan. */
	private static final int NUM_SUBTASKS = 4;

	/** Matches every run of decimal digits; compiled once because the expression never changes. */
	private static final Pattern INTEGER_PATTERN = Pattern.compile("(\\d+)");

	/** Path of the generated input file, set in {@link #preSubmit()}. */
	private String inputPath;

	/** Path the job writes its result to, set in {@link #preSubmit()}. */
	private String outputPath;

	/**
	 * Creates the (enlarged) input file and determines the output location.
	 */
	@Override
	protected void preSubmit() throws Exception {
		// the fixed input is repeated MULTIPLY times and the expected counts
		// are multiplied accordingly, because the problem only occurs with
		// inputs of a certain size
		String input = repeatString(WordCountData.TEXT, MULTIPLY);
		this.inputPath = createTempFile("input.txt", input);
		this.outputPath = getTempDirPath("output");
	}

	/**
	 * Builds the plan under test: one source, two mappers, one reducer over the union of both mappers.
	 */
	@Override
	protected Plan getTestJob() {
		WordCountUnionReduce wc = new WordCountUnionReduce();
		return wc.getPlan(this.inputPath, this.outputPath, NUM_SUBTASKS);
	}

	/**
	 * Compares the job output with the reference counts scaled to the enlarged input.
	 */
	@Override
	protected void postSubmit() throws Exception {
		// every word is counted MULTIPLY times because the input text is repeated,
		// and twice on top of that because two mappers read the same source (*2)
		String expectedCounts = multiplyIntegersInString(WordCountData.COUNTS, MULTIPLY * 2);
		compareResultsByLinesInMemory(expectedCounts, this.outputPath);
	}

	/**
	 * This is the adapted plan from issue #192.
	 *
	 * @see <a href="https://github.com/stratosphere/stratosphere/issues/192">Issue #192</a>
	 */
	private static class WordCountUnionReduce {

		/**
		 * Assembles the following plan:
		 *
		 * <pre>
		 *                   +-------------+
		 *              //=> | MapOperator | =\\
		 * +--------+  //    +-------------+   \\   +----------------+    +------+
		 * | Source | =|                        |=> | ReduceOperator | => | Sink |
		 * +--------+  \\    +-------------+   //   +----------------+    +------+
		 *              \\=> | MapOperator | =//
		 *                   +-------------+
		 * </pre>
		 *
		 * @param inputPath
		 *        path of the text file both mappers read from
		 * @param outputPath
		 *        path the sink writes the word counts to
		 * @param numSubtasks
		 *        default parallelism to set on the plan
		 * @return the assembled plan
		 */
		public Plan getPlan(String inputPath, String outputPath, int numSubtasks) {
			FileDataSource source = new FileDataSource(TextInputFormat.class, inputPath, "First Input");

			// both mappers consume the very same source, so every line is tokenized twice
			MapOperator wordsFirstInput = MapOperator.builder(TokenizeLine.class)
				.input(source)
				.name("Words (First Input)")
				.build();

			MapOperator wordsSecondInput = MapOperator.builder(TokenizeLine.class)
				.input(source)
				.name("Words (Second Input)")
				.build();

			// the reducer receives both mapper outputs as its input, keyed on field 0
			@SuppressWarnings("unchecked")
			ReduceOperator counts = ReduceOperator.builder(CountWords.class, StringValue.class, 0)
				.input(wordsFirstInput, wordsSecondInput)
				.name("Word Counts")
				.build();

			FileDataSink sink = new FileDataSink(CsvOutputFormat.class, outputPath, counts);
			CsvOutputFormat.configureRecordFormat(sink)
				.recordDelimiter('\n')
				.fieldDelimiter(' ')
				.field(StringValue.class, 0)
				.field(IntValue.class, 1);

			Plan plan = new Plan(sink, "WordCount Union Reduce");
			plan.setDefaultParallelism(numSubtasks);
			return plan;
		}
	}

	/**
	 * Repeats the given String and returns the resulting String.
	 *
	 * @param str
	 *        the string to repeat
	 * @param n
	 *        the number of times to repeat the string
	 * @return repeated string if n > 1, otherwise the input string
	 */
	private static String repeatString(String str, int n) {
		if (n <= 1) {
			return str;
		}

		StringBuilder sb = new StringBuilder(str.length() * n);
		for (int i = 0; i < n; i++) {
			sb.append(str);
		}
		return sb.toString();
	}

	/**
	 * Returns a new String with all occurring integers multiplied.
	 * <p>
	 * All text that is not part of an integer, including any text after the last integer, is copied unchanged.
	 * The multiplication is done with {@code int} arithmetic.
	 *
	 * @param str
	 *        the string which contains integers to multiply
	 * @param n
	 *        the factor to multiply each integer with
	 * @return new string with multiplied integers
	 */
	private static String multiplyIntegersInString(String str, int n) {
		Matcher matcher = INTEGER_PATTERN.matcher(str);

		// Matcher.appendReplacement takes a StringBuffer; the StringBuilder overload only exists since Java 9
		StringBuffer sb = new StringBuffer(str.length());

		while (matcher.find()) {
			int product = n * Integer.parseInt(matcher.group(1));
			matcher.appendReplacement(sb, String.valueOf(product));
		}

		// copy whatever follows the last match (or the whole input if nothing matched);
		// without this call the remainder of the string would be lost
		matcher.appendTail(sb);
		return sb.toString();
	}
}