/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.dkpro.bigdata.collocations; import java.io.IOException; import java.util.Iterator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.Reducer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Reducer for Pass 1 of the collocation identification job. Generates counts for ngrams and subgrams. */ public class CollocReducer extends Reducer<GramKey, Gram, Gram, Gram> { private static final Logger log = LoggerFactory.getLogger(CollocReducer.class); public static final String MIN_SUPPORT = "minSupport"; public static final int DEFAULT_MIN_SUPPORT = 2; public enum Skipped { LESS_THAN_MIN_SUPPORT, MALFORMED_KEY_TUPLE, MALFORMED_TUPLE, MALFORMED_TYPES, MALFORMED_UNIGRAM } private int minSupport; /** * collocation finder: pass 1 reduce phase: * <p> * given input from the mapper, * </p> * <pre> * k:head_subgram,ngram, v:ngram:partial freq * k:head_subgram v:head_subgram:partial freq * k:tail_subgram,ngram, v:ngram:partial freq * k:tail_subgram v:tail_subgram:partial freq * k:unigram v:unigram:partial freq * </pre> * sum gram frequencies and output for llr calculation * <p> * output is: * </p> * <pre> * k:ngram:ngramfreq v:head_subgram:head_subgramfreq * k:ngram:ngramfreq v:tail_subgram:tail_subgramfreq * k:unigram:unigramfreq v:unigram:unigramfreq * </pre> * <p> * Each ngram's frequency is essentially counted twice, once for head, once for tail. * frequency should be the same for the head and tail. Fix this to count only for the * head and move the count into the value? * </p> */ @Override protected void reduce(GramKey key, Iterable<Gram> values, Context context) throws IOException, InterruptedException { Gram.Type keyType = key.getType(); if (keyType == Gram.Type.UNIGRAM) { // sum frequencies for unigrams. processUnigram(values.iterator(), context); } else if (keyType == Gram.Type.HEAD || keyType == Gram.Type.TAIL) { // sum frequencies for subgrams, ngram and collect for each ngram. processSubgram(values.iterator(), context); } else { context.getCounter(Skipped.MALFORMED_TYPES).increment(1); } } @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Configuration conf = context.getConfiguration(); this.minSupport = conf.getInt(MIN_SUPPORT, DEFAULT_MIN_SUPPORT); boolean emitUnigrams = conf.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS); emitUnigrams=true; log.info("Min support is {}", minSupport); log.info("Emit Unitgrams is {}", emitUnigrams); } /** * Sum frequencies for unigrams and deliver to the collector */ protected void processUnigram(Iterator<Gram> values, Context context) throws IOException, InterruptedException { int freq = 0; Gram value = null; // accumulate frequencies from values. while (values.hasNext()) { value = values.next(); freq += value.getFrequency(); } if (freq < minSupport) { context.getCounter(Skipped.LESS_THAN_MIN_SUPPORT).increment(1); return; } value.setFrequency(freq); context.write(value, value); } /** Sum frequencies for subgram, ngrams and deliver ngram, subgram pairs to the collector. * <p> * Sort order guarantees that the subgram/subgram pairs will be seen first and then * subgram/ngram1 pairs, subgram/ngram2 pairs ... subgram/ngramN pairs, so frequencies for * ngrams can be calcualted here as well. * </p> * We end up calculating frequencies for ngrams for each sugram (head, tail) here, which is * some extra work. * @throws InterruptedException */ protected void processSubgram(Iterator<Gram> values, Context context) throws IOException, InterruptedException { Gram subgram = null; Gram currentNgram = null; while (values.hasNext()) { Gram value = values.next(); if (value.getType() == Gram.Type.HEAD || value.getType() == Gram.Type.TAIL) { // collect frequency for subgrams. if (subgram == null) { subgram = new Gram(value); } else { subgram.incrementFrequency(value.getFrequency()); } } else if (!value.equals(currentNgram)) { // we've collected frequency for all subgrams and we've encountered a new ngram. // collect the old ngram if there was one and we have sufficient support and // create the new ngram. if (currentNgram != null) { if (currentNgram.getFrequency() < minSupport) { context.getCounter(Skipped.LESS_THAN_MIN_SUPPORT).increment(1); } else { context.write(currentNgram, subgram); } } currentNgram = new Gram(value); } else { currentNgram.incrementFrequency(value.getFrequency()); } } // collect last ngram. if (currentNgram != null) { if (currentNgram.getFrequency() < minSupport) { context.getCounter(Skipped.LESS_THAN_MIN_SUPPORT).increment(1); return; } context.write(currentNgram, subgram); } } }