package com.yahoo.glimmer.indexing.generator;

/*
 * Copyright (c) 2012 Yahoo! Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 * See accompanying LICENSE file.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

import com.yahoo.glimmer.indexing.generator.TermValue.Type;

public class TermReduce extends Reducer<TermKey, TermValue, IntWritable, IndexRecordWriterValue> {
    private static final Log LOG = LogFactory.getLog(TermReduce.class);

    public static final String MAX_INVERTEDLIST_SIZE_PARAMETER = "maxInvertiedListSize";
    public static final String MAX_POSITIONLIST_SIZE_PARAMETER = "maxPositionListSize";

    private IntWritable writerKey;
    private IndexRecordWriterTermValue writerTermValue;
    private IndexRecordWriterDocValue writerDocValue;
    private IndexRecordWriterSizeValue writerSizeValue;
    private ArrayList<Long> predicateIds;
    private long termKeysProcessed;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        writerKey = new IntWritable();
        writerTermValue = new IndexRecordWriterTermValue();
        writerDocValue = new IndexRecordWriterDocValue();
        writerSizeValue = new IndexRecordWriterSizeValue();
        predicateIds = new ArrayList<Long>();
    }

    @Override
    public void reduce(TermKey key, Iterable<TermValue> values, Context context) throws IOException, InterruptedException {
        // Skip null keys and keys with an empty term.
        if (key == null || "".equals(key.getTerm())) {
            return;
        }

        if (termKeysProcessed % 10000 == 0) {
            String statusString = "Reducing " + key.toString();
            context.setStatus(statusString);
            LOG.info(statusString);
        }

        writerKey.set(key.getIndex());

        if (key.getIndex() == DocumentMapper.ALIGNMENT_INDEX) {
            // For the alignment index, the 'documents' are the IDs of the predicates in which the term occurs.
            long lastPredicateId = Long.MIN_VALUE;
            for (TermValue value : values) {
                if (value.getType() != Type.INDEX_ID) {
                    throw new IllegalStateException("Got a " + value.getType() + " value when expecting only " + Type.INDEX_ID);
                }
                if (lastPredicateId != value.getV1()) {
                    lastPredicateId = value.getV1();
                    predicateIds.add(lastPredicateId);
                }
            }

            writerTermValue.setTerm(key.getTerm());
            writerTermValue.setOccurrenceCount(0);
            writerTermValue.setTermFrequency(predicateIds.size());
            writerTermValue.setSumOfMaxTermPositions(0);

            context.write(writerKey, writerTermValue);

            for (Long predicateId : predicateIds) {
                writerDocValue.setDocument(predicateId);
                context.write(writerKey, writerDocValue);
            }
            predicateIds.clear();
        } else if (TermKey.DOC_SIZE_TERM.equals(key.getTerm())) {
            // Write .sizes files
            Iterator<TermValue> valuesIt = values.iterator();
            while (valuesIt.hasNext()) {
                TermValue value = valuesIt.next();
                if (Type.DOC_SIZE != value.getType()) {
                    throw new IllegalStateException("Got a " + value.getType() + " value when expecting only " + Type.DOC_SIZE);
                }
                writerSizeValue.setDocument(value.getV1());
                writerSizeValue.setSize(value.getV2());
                context.write(writerKey, writerSizeValue);
            }
        } else {
            // Normal term: TERM_STATS values sort first, followed by OCCURRENCE values
            // ordered by (docId, position).
            int termFrequency = 0;
            int termCount = 0;
            int sumOfMaxTermPositions = 0;

            TermValue value = null;
            Iterator<TermValue> valuesIt = values.iterator();
            while (valuesIt.hasNext()) {
                value = valuesIt.next();
                if (Type.TERM_STATS != value.getType()) {
                    break;
                }
                termFrequency++;
                termCount += value.getV1();
                sumOfMaxTermPositions += value.getV2();
            }

            if (Type.OCCURRENCE != value.getType()) {
                throw new IllegalStateException("Got a " + value.getType() + " value when expecting only " + Type.OCCURRENCE);
            }

            writerTermValue.setTerm(key.getTerm());
            writerTermValue.setOccurrenceCount(termCount);
            writerTermValue.setTermFrequency(termFrequency);
            writerTermValue.setSumOfMaxTermPositions(sumOfMaxTermPositions);

            context.write(writerKey, writerTermValue);

            TermValue prevValue = new TermValue();
            prevValue.set(value);

            while (value != null && value.getType() == Type.OCCURRENCE) {
                long docId = value.getV1();
                if (docId < 0) {
                    throw new IllegalStateException("Negative DocID. Key:" + key + "\nValue:" + value);
                }
                if (docId != prevValue.getV1()) {
                    // New document: write out the previous document's postings...
                    writerDocValue.setDocument(prevValue.getV1());
                    context.write(writerKey, writerDocValue);
                    // ...then start collecting occurrences for the new docId.
                    writerDocValue.clearOccerrences();
                    writerDocValue.addOccurrence(value.getV2());
                } else {
                    writerDocValue.addOccurrence(value.getV2());
                }
                prevValue.set(value);

                boolean last = false;
                if (valuesIt.hasNext()) {
                    value = valuesIt.next();

                    // Duplicate occurrences should never reach the reducer.
                    if (value.equals(prevValue)) {
                        throw new IllegalStateException("For indexId " + key.getIndex() + " and term " + key.getTerm() + " got a duplicate occurrence "
                                + value.toString());
                    }
                    // Legacy duplicate-skipping code; unreachable now that duplicates throw above.
                    while (value.equals(prevValue) && valuesIt.hasNext()) {
                        value = valuesIt.next();
                    }
                    if (value.equals(prevValue) && !valuesIt.hasNext()) {
                        last = true;
                    }
                } else {
                    last = true;
                }

                if (last) {
                    // This is the last occurrence: write out the remaining positions.
                    writerDocValue.setDocument(prevValue.getV1());
                    if (writerDocValue.getDocument() < 0) {
                        throw new IllegalStateException("Negative DocID. Key:" + key + "\nprevValue:" + prevValue + "\nValue:" + value + "\nwriterDocValue:"
                                + writerDocValue);
                    }
                    context.write(writerKey, writerDocValue);
                    writerDocValue.clearOccerrences();
                    value = null;
                }
            }
        }

        termKeysProcessed++;
    }
}
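
/*
 * Usage sketch (hypothetical): the real job driver lives elsewhere in the
 * Glimmer indexing pipeline, and the job name below is an assumption, but a
 * Hadoop Job using this reducer would typically be wired along these lines:
 *
 *     Job job = Job.getInstance(new Configuration(), "glimmer-index-generator");
 *     job.setMapOutputKeyClass(TermKey.class);
 *     job.setMapOutputValueClass(TermValue.class);
 *     job.setReducerClass(TermReduce.class);
 *     job.setOutputKeyClass(IntWritable.class);
 *     job.setOutputValueClass(IndexRecordWriterValue.class);
 *     job.waitForCompletion(true);
 */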