package com.yahoo.glimmer.indexing.preprocessor;

/*
 * Copyright (c) 2012 Yahoo! Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 * See accompanying LICENSE file.
 */

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import com.yahoo.glimmer.indexing.preprocessor.ResourceRecordWriter.OUTPUT;
import com.yahoo.glimmer.indexing.preprocessor.ResourceRecordWriter.OutputCount;
import com.yahoo.glimmer.indexing.preprocessor.TuplesToResourcesMapper.TupleElementName;
import com.yahoo.glimmer.util.BySubjectRecord;

/**
 * Reducer.
 *
 * For a given subject resource key, concatenates all
 * {@code <predicate> <object> <context> .} relations for that key. It also
 * writes the PREDICATE, OBJECT and/or CONTEXT keywords when the corresponding
 * keyword occurs one or more times as a value for the key.
 */
public class ResourcesReducer extends Reducer<Text, Text, Text, Object> {
    private final OutputCount outputCount = new OutputCount();
    private final BySubjectRecord bySubjectRecord = new BySubjectRecord();
    // Given that there is only one reducer writing a sorted list of subjects,
    // we can use the order of the docs to deduce the document ID and add it to
    // the bysubjects output. The alternative would be to generate an MPH over
    // the list of subjects, but that would require more memory when building
    // the indices.
    private long docId;

    static enum Counters {
        TOO_MANY_RELATIONS, DUPLICATE_RELATIONS, KEYS, KEY_SUBJECT, KEY_PREDICATE, KEY_OBJECT, KEY_CONTEXT, VALUES;
    }

    private final static Text SUBJECT_TEXT = new Text(TupleElementName.SUBJECT.name());
    private final static Text PREDICATE_TEXT = new Text(TupleElementName.PREDICATE.name());
    private final static Text OBJECT_TEXT = new Text(TupleElementName.OBJECT.name());
    private final static Text CONTEXT_TEXT = new Text(TupleElementName.CONTEXT.name());

    @Override
    protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Object>.Context context) throws IOException, InterruptedException {
        context.getCounter(Counters.KEYS).increment(1);

        int keyPredicateCount = 0;
        int keyObjectCount = 0;
        int keyContextCount = 0;
        int relationsCount = 0;
        int duplicateRelations = 0;

        // Every resource is written to the ALL output.
        outputCount.output = OUTPUT.ALL;
        outputCount.count = 0;
        context.write(key, outputCount);

        bySubjectRecord.clearRelations();
        String lastValue = null;
        for (Text value : values) {
            context.getCounter(Counters.VALUES).increment(1);
            if (PREDICATE_TEXT.equals(value)) {
                keyPredicateCount++;
            } else if (OBJECT_TEXT.equals(value)) {
                keyObjectCount++;
            } else if (CONTEXT_TEXT.equals(value)) {
                keyContextCount++;
            } else if (SUBJECT_TEXT.equals(value)) {
                throw new IllegalArgumentException("Reducer got a SUBJECT value!? Should only be \"PREDICATE\", \"OBJECT\", \"CONTEXT\" or a relation String.");
            } else if (value.getLength() > 0) {
                String valueString = value.toString();
                if (!valueString.equals(lastValue)) {
                    bySubjectRecord.addRelation(valueString);
                    relationsCount++;
                    lastValue = valueString;
                } else {
                    // Skip consecutive duplicate relations.
                    duplicateRelations++;
                }
            }
        }

        if (relationsCount > 0) {
            if (duplicateRelations > 0) {
                context.getCounter(Counters.DUPLICATE_RELATIONS).increment(duplicateRelations);
            }
            // The docIds should match the OUTPUT.ALL hash values.
            bySubjectRecord.setId(docId);
            bySubjectRecord.setSubject(key.toString());
            if (bySubjectRecord.getRelationsCount() != relationsCount) {
                System.out.println("Too many relations. Only indexing " + bySubjectRecord.getRelationsCount() + " of " + relationsCount + ". Subject is: " + key.toString());
                context.getCounter(Counters.TOO_MANY_RELATIONS).increment(1);
            }
            context.write(key, bySubjectRecord);
            // Remember this docId as the previous id for the next record written.
            bySubjectRecord.setPreviousId(docId);
            context.getCounter(Counters.KEY_SUBJECT).increment(relationsCount);
        }
        if (keyPredicateCount > 0) {
            outputCount.output = OUTPUT.PREDICATE;
            outputCount.count = keyPredicateCount;
            context.write(key, outputCount);
            context.getCounter(Counters.KEY_PREDICATE).increment(keyPredicateCount);
        }
        if (keyObjectCount > 0) {
            outputCount.output = OUTPUT.OBJECT;
            outputCount.count = keyObjectCount;
            context.write(key, outputCount);
            context.getCounter(Counters.KEY_OBJECT).increment(keyObjectCount);
        }
        if (keyContextCount > 0) {
            outputCount.output = OUTPUT.CONTEXT;
            outputCount.count = keyContextCount;
            context.write(key, outputCount);
            context.getCounter(Counters.KEY_CONTEXT).increment(keyContextCount);
        }
        docId++;
    }
}
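
/*
 * Hypothetical driver sketch (an illustration, not part of the Glimmer source):
 * it shows how this reducer could be wired into a job using the standard
 * Hadoop 2.x org.apache.hadoop.mapreduce API. The job name, the input/output
 * paths and the use of TuplesToResourcesMapper as the mapper are assumptions;
 * the real Glimmer driver may configure the job differently, in particular its
 * custom output format backed by ResourceRecordWriter.
 */
class ResourcesReducerDriverSketch {
    public static void main(String[] args) throws Exception {
        org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
        org.apache.hadoop.mapreduce.Job job = org.apache.hadoop.mapreduce.Job.getInstance(conf, "glimmer-resources-preprocessor");
        job.setJarByClass(ResourcesReducer.class);

        job.setMapperClass(TuplesToResourcesMapper.class);
        job.setReducerClass(ResourcesReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // A single reducer is required: docIds are assigned from the sorted
        // order of subjects seen by this one reducer (see the docId field above).
        job.setNumReduceTasks(1);

        org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new org.apache.hadoop.fs.Path(args[0]));
        org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job, new org.apache.hadoop.fs.Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}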