package com.ontology2.bakemono.entityCentric; import com.google.common.collect.Lists; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.Reducer; import org.apache.log4j.Logger; import java.io.IOException; import java.util.List; abstract public class EntityMatchesRuleReducer<KEY,VALUE> extends Reducer<KEY,VALUE,NullWritable,VALUE> { static Logger log= Logger.getLogger(EntityMatchesRuleReducer.class); @Override protected void reduce(KEY key,Iterable<VALUE> values,Context context) throws IOException, InterruptedException { // I'm not trusting that Hadoop is honoring the contract for Iterable, that is, // I can't iterate on it twice safely. // // memory consumption and speed could get a factor of 2 by adapting this so that once match returns, // it dumps the content of the List and streams the rest of the facts // List<VALUE> rewindableValues= Lists.newArrayList(); for(VALUE value:values) { rewindableValues.add(copy(value)); } if(matches(key,rewindableValues)) for(VALUE value:rewindableValues) context.write(null,value); } protected abstract VALUE copy(VALUE value); protected abstract boolean matches(KEY key, Iterable<VALUE> values); }