/**
* This software is licensed to you under the Apache License, Version 2.0 (the
* "Apache License").
*
* LinkedIn's contributions are made under the Apache License. If you contribute
* to the Software, the contributions will be deemed to have been made under the
* Apache License, unless you expressly indicate otherwise. Please do not make any
* contributions that would be inconsistent with the Apache License.
*
* You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, this software
* distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
* License for the specific language governing permissions and limitations for the
* software governed under the Apache License.
*
* © 2012 LinkedIn Corp. All Rights Reserved.
*/
package com.senseidb.indexing.hadoop.reduce;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.Logger;
import com.senseidb.indexing.hadoop.keyvalueformat.IntermediateForm;
import com.senseidb.indexing.hadoop.keyvalueformat.Shard;
import com.senseidb.indexing.hadoop.util.SenseiJobConfig;
/**
* This combiner combines multiple intermediate forms into one intermediate
* form. More specifically, the input intermediate forms are a single-document
* ram index and/or a single delete term. An output intermediate form contains
* a multi-document ram index and/or multiple delete terms.
*/
public class SenseiCombiner extends MapReduceBase implements
    Reducer<Shard, IntermediateForm, Shard, IntermediateForm> {

  private static final Logger logger = Logger.getLogger(SenseiCombiner.class);

  /** Fallback for {@link SenseiJobConfig#MAX_RAMSIZE_BYTES} when the job does not set it: 50 * 2^20 bytes. */
  private static final long DEFAULT_MAX_SIZE_IN_BYTES = 50L << 20;

  /** Copy of the job configuration; handed to every form this combiner creates. */
  Configuration iconf;

  /** Upper bound on (accumulated form size + incoming form size) before the accumulated form is emitted. */
  long maxSizeInBytes;

  /** Size at or above which an incoming form is emitted on its own when nothing is being accumulated. */
  long nearMaxSizeInBytes;

  /**
   * Combines the intermediate forms of one shard into as few output forms as
   * the size limits allow.
   *
   * <p>For every incoming form:
   * <ol>
   *   <li>if a form is being accumulated and adding the incoming one would push
   *       the total above {@code maxSizeInBytes}, the accumulated form is
   *       closed and emitted first;</li>
   *   <li>if nothing is being accumulated and the incoming form is already at
   *       least {@code nearMaxSizeInBytes}, it is emitted unchanged;</li>
   *   <li>otherwise it is passed to {@code process} on the accumulating form,
   *       which is created on demand.</li>
   * </ol>
   * Whatever is still being accumulated once the input is exhausted is closed
   * and emitted.
   *
   * @param key      the shard all {@code values} belong to; also used as the output key
   * @param values   the intermediate forms to combine
   * @param output   collector receiving the combined forms
   * @param reporter not used by this combiner
   * @throws IOException if sizing, processing, closing or collecting a form fails
   */
  public void reduce(Shard key, Iterator<IntermediateForm> values,
      OutputCollector<Shard, IntermediateForm> output, Reporter reporter)
      throws IOException {
    final String shardLabel = key.toString();
    IntermediateForm pending = null;

    while (values.hasNext()) {
      final IntermediateForm incoming = values.next();

      long pendingBytes = 0;
      if (pending != null) {
        pendingBytes = pending.totalSizeInBytes();
      }
      final long incomingBytes = incoming.totalSizeInBytes();

      // Emit what we have before it would grow past the limit.
      if (pending != null && pendingBytes + incomingBytes > maxSizeInBytes) {
        flush(key, pending, output, shardLabel);
        pending = null;
      }

      if (pending == null) {
        if (incomingBytes >= nearMaxSizeInBytes) {
          // Big enough on its own: forward as-is instead of starting a new form.
          output.collect(key, incoming);
          continue;
        }
        pending = createForm(shardLabel);
      }
      pending.process(incoming);
    }

    if (pending != null) {
      flush(key, pending, output, shardLabel);
    }
  }

  /**
   * Closes the given accumulated form and hands it to the collector.
   *
   * @param key        output key for the form
   * @param form       the accumulated form to emit; must not be {@code null}
   * @param output     collector receiving the form
   * @param shardLabel text identifying the shard in log output
   * @throws IOException if closing or collecting fails
   */
  private void flush(Shard key, IntermediateForm form,
      OutputCollector<Shard, IntermediateForm> output, String shardLabel)
      throws IOException {
    closeForm(form, shardLabel);
    output.collect(key, form);
  }

  /**
   * Creates a new form configured with this combiner's configuration.
   *
   * @param message text identifying the shard in log output
   * @return the newly created, configured form
   * @throws IOException if the form cannot be configured
   */
  private IntermediateForm createForm(String message) throws IOException {
    logger.info("Construct a form writer for " + message);
    final IntermediateForm created = new IntermediateForm();
    created.configure(iconf);
    return created;
  }

  /**
   * Closes the form's writer and logs the form afterwards.
   *
   * @param form    the form whose writer is closed
   * @param message text identifying the shard in log output
   * @throws IOException if closing the writer fails
   */
  private void closeForm(IntermediateForm form, String message)
      throws IOException {
    form.closeWriter();
    // Logged after closing, so the rendered form reflects its post-close state.
    logger.info("Closed the form writer for " + message + ", form = " + form);
  }

  /**
   * Captures the job configuration and derives the two size thresholds.
   *
   * @param job the job configuration; {@link SenseiJobConfig#MAX_RAMSIZE_BYTES}
   *            is read from it, defaulting to 50 * 2^20 bytes
   */
  public void configure(JobConf job) {
    iconf = new Configuration(job);
    maxSizeInBytes =
        iconf.getLong(SenseiJobConfig.MAX_RAMSIZE_BYTES, DEFAULT_MAX_SIZE_IN_BYTES);
    // Max minus one eighth of max, i.e. 7/8 of the limit.
    final long oneEighthOfMax = maxSizeInBytes >>> 3;
    nearMaxSizeInBytes = maxSizeInBytes - oneEighthOfMax;
  }

  /**
   * Does nothing: this combiner keeps no form across {@code reduce} calls.
   *
   * @throws IOException never thrown by this implementation
   */
  public void close() throws IOException {
    // Intentionally empty.
  }
}