/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package hip.ch6.joins.repartition.impl;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* This abstract class serves as the base class for the reducer class of a data
* join job. The reduce function will first group the values according to their
* input tags, and then compute the cross product over the groups. For each
* tuple in the cross product, it calls the following method, which is expected
* to be implemented in a subclass.
* <p/>
* protected abstract OutputValue combine(String key, OutputValue smallValue, OutputValue largeValue);
* <p/>
* The above method is expected to produce one output value from an array of
* records of different sources. The user code can also perform filtering here.
* It can return null if it decides that the records do not meet certain
* conditions.
*/
public abstract class OptimizedDataJoinReducerBase
    extends OptimizedJobBase {

  /** Reporter captured from the first reduce() call; used to publish the final status in close(). */
  protected Reporter reporter = null;

  /** Cap on how many values per key participate in the join; values past the cap are counted but skipped. */
  private long maxNumOfValuesPerGroup = 100;

  /** Largest value-group size observed so far (diagnostic only). */
  protected long largestNumOfValues = 0;

  /** Number of values seen for the key currently being reduced. */
  protected long numOfValues = 0;

  /** Total number of collect() invocations (including those where combine() returned null). */
  protected long collected = 0;

  protected JobConf job;

  /**
   * Reusable key object for emitted records. An instance field (not static) so
   * reducer instances never share mutable state.
   */
  private final Text outputKey = new Text();

  /** Publishes the accumulated counter report as the final task status. */
  public void close() throws IOException {
    if (this.reporter != null) {
      this.reporter.setStatus(super.getReport());
    }
  }

  public void configure(JobConf job) {
    super.configure(job);
    this.job = job;
    this.maxNumOfValuesPerGroup =
        job.getLong("datajoin.maxNumOfValuesPerGroup", 100);
  }

  /**
   * Streams over the values for a composite key. Values flagged as coming from
   * the smaller dataset are buffered; each value from the larger dataset is
   * immediately joined against everything buffered so far. This relies on the
   * secondary sort ordering smaller-dataset values ahead of larger-dataset
   * values within each key group.
   *
   * @param key      expected to be a {@link CompositeKey}
   * @param values   map-output values for this key, in secondary-sort order
   * @param output   collector for joined records
   * @param reporter used for progress/status updates
   * @throws IOException if emitting a joined record fails
   */
  public void reduce(Object key, Iterator values,
                     OutputCollector output, Reporter reporter)
      throws IOException {
    if (this.reporter == null) {
      this.reporter = reporter;
    }
    CompositeKey k = (CompositeKey) key;
    if (LOG.isDebugEnabled()) {
      LOG.debug("K[" + k + "]");
    }
    List<OutputValue> smaller =
        new ArrayList<OutputValue>();
    this.numOfValues = 0;
    while (values.hasNext()) {
      numOfValues++;
      Object value = values.next();
      if (LOG.isDebugEnabled()) {
        LOG.debug("  V[" + value + "]");
      }
      // Periodic heartbeat so long-running groups don't look stalled.
      if (this.numOfValues % 100 == 0) {
        reporter.setStatus("key: " + key.toString() + " numOfValues: "
            + this.numOfValues);
      }
      // Past the cap: keep counting the group size but skip the join work.
      if (this.numOfValues > this.maxNumOfValuesPerGroup) {
        continue;
      }
      // Clone before buffering: the framework reuses the value object
      // between iterations of this loop.
      OutputValue cloned =
          ((OutputValue) value).clone(job);
      if (cloned.isSmaller().get()) {
        smaller.add(cloned);
      } else {
        joinAndCollect(k, smaller, cloned, output, reporter);
      }
    }
    if (this.numOfValues > this.largestNumOfValues) {
      this.largestNumOfValues = numOfValues;
      LOG.info("key: " + key.toString() + " this.largestNumOfValues: "
          + this.largestNumOfValues);
    }
    addLongValue("groupCount", 1);
  }

  /**
   * Joins one larger-dataset value against every buffered smaller-dataset
   * value and collects each combined result. If nothing is buffered, the
   * value is still offered to combine() with a null partner (outer-join
   * style); the subclass decides whether to emit it.
   *
   * @param key      the group's composite key
   * @param smaller  buffered values from the smaller dataset
   * @param value    the current value from the larger dataset
   * @param output   collector for joined records
   * @param reporter used for status updates
   * @throws java.io.IOException if emitting a record fails
   */
  private void joinAndCollect(CompositeKey key,
                              List<OutputValue> smaller,
                              OutputValue value,
                              OutputCollector output,
                              Reporter reporter)
      throws IOException {
    if (smaller.isEmpty()) {
      OutputValue combined =
          combine(key.getKey(), null, value);
      collect(key, combined, output, reporter);
    } else {
      for (OutputValue small : smaller) {
        OutputValue combined =
            combine(key.getKey(), small, value);
        collect(key, combined, output, reporter);
      }
    }
  }

  /**
   * Emits a combined record. A null combined value counts as an attempted
   * collect but produces no output (the subclass filtered the pair out).
   */
  private void collect(CompositeKey key,
                       OutputValue combined,
                       OutputCollector output,
                       Reporter reporter) throws IOException {
    this.collected += 1;
    addLongValue("collectedCount", 1);
    if (combined != null) {
      outputKey.set(key.getKey());
      output.collect(outputKey, combined.getData());
      reporter.setStatus(
          "key: " + key.toString() + " collected: " + collected);
      addLongValue("actuallyCollectedCount", 1);
    }
  }

  /**
   * Produces one output value from a pair of records from the two sources.
   * Subclasses may filter here by returning null.
   *
   * @param key        the natural join key
   * @param smallValue a buffered value from the smaller dataset, or null when
   *                   the group contained none (outer-join case)
   * @param largeValue the current value from the larger dataset
   * @return combined value derived from values of the sources, or null to
   *         suppress output for this pair
   */
  protected abstract OutputValue combine(String key,
                                         OutputValue smallValue,
                                         OutputValue largeValue);

  /**
   * Not supported: this class is a reducer base; the map side lives elsewhere.
   */
  public void map(Object arg0, Object arg1, OutputCollector arg2,
                  Reporter arg3) throws IOException {
    throw new IOException("Unsupported operation");
  }
}