/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.vectorizer.collocations.llr;

import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.common.lucene.IteratorTokenStream;
import org.apache.mahout.math.function.ObjectIntProcedure;
import org.apache.mahout.math.map.OpenObjectIntHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * Pass 1 of the Collocation discovery job, which generates ngrams and emits each ngram along with its
 * component (n-1)-gram and unigram. Input is a SequenceFile<Text,StringTuple>, where the key is a
 * document id and the value is the tokenized document.
 */
public class CollocMapper extends Mapper<Text, StringTuple, GramKey, Gram> {

  private static final byte[] EMPTY = new byte[0];

  public static final String MAX_SHINGLE_SIZE = "maxShingleSize";

  private static final int DEFAULT_MAX_SHINGLE_SIZE = 2;

  public enum Count {
    NGRAM_TOTAL
  }

  private static final Logger log = LoggerFactory.getLogger(CollocMapper.class);

  private int maxShingleSize;

  private boolean emitUnigrams;

  /**
   * Collocation finder: pass 1 map phase.
   * <p/>
   * Receives a token stream which gets passed through a Lucene ShingleFilter. The ShingleFilter delivers
   * ngrams of the appropriate size, which are then decomposed into head and tail subgrams and collected
   * in the following manner:
   * <p/>
   * <pre>
   * k:head_key,           v:head_subgram
   * k:head_key,ngram_key, v:ngram
   * k:tail_key,           v:tail_subgram
   * k:tail_key,ngram_key, v:ngram
   * </pre>
   * <p/>
   * The 'head' or 'tail' prefix specifies whether the subgram in question is the head or the tail of the
   * ngram. In this implementation the head of the ngram is an (n-1)-gram and the tail is a unigram.
   * <p/>
   * For example, given 'click and clack' and an ngram length of 3:
   * <pre>
   * k: head_'click and',                          v:head_'click and'
   * k: head_'click and',ngram_'click and clack',  v:ngram_'click and clack'
   * k: tail_'clack',                              v:tail_'clack'
   * k: tail_'clack',ngram_'click and clack',      v:ngram_'click and clack'
   * </pre>
   * <p/>
   * Also counts the total number of ngrams encountered and adds it to the counter
   * CollocMapper.Count.NGRAM_TOTAL.
   *
   * @throws IOException if there's a problem with the ShingleFilter reading data or the collector collecting output.
   */
  @Override
  protected void map(Text key, StringTuple value, final Context context) throws IOException, InterruptedException {

    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxShingleSize);
    try {
      int count = 0; // ngram count

      OpenObjectIntHashMap<String> ngrams =
          new OpenObjectIntHashMap<String>(value.getEntries().size() * (maxShingleSize - 1));
      OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size());

      // Run the document's tokens through the ShingleFilter, tallying shingles (ngrams) and, if requested,
      // unigrams. On the first pass through the loop, before incrementToken() has been called, the term
      // attribute is still empty, so both branches skip it.
      do {
        String term = sf.getAttribute(CharTermAttribute.class).toString();
        String type = sf.getAttribute(TypeAttribute.class).type();
        if ("shingle".equals(type)) {
          count++;
          ngrams.adjustOrPutValue(term, 1, 1);
        } else if (emitUnigrams && !term.isEmpty()) { // unigram
          unigrams.adjustOrPutValue(term, 1, 1);
        }
      } while (sf.incrementToken());

      final GramKey gramKey = new GramKey();

      // Emit each ngram twice, once under its head subgram's key and once under its tail subgram's key,
      // alongside the subgrams themselves, as described in the javadoc above.
      ngrams.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String term, int frequency) {
          // obtain components, the leading (n-1)gram and the trailing unigram.
          int i = term.lastIndexOf(' '); // TODO: fix for non-whitespace delimited languages.
          if (i != -1) { // bigram, trigram etc
            try {
              Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM);
              Gram head = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD);
              Gram tail = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL);

              gramKey.set(head, EMPTY);
              context.write(gramKey, head);

              gramKey.set(head, ngram.getBytes());
              context.write(gramKey, ngram);

              gramKey.set(tail, EMPTY);
              context.write(gramKey, tail);

              gramKey.set(tail, ngram.getBytes());
              context.write(gramKey, ngram);
            } catch (IOException e) {
              throw new IllegalStateException(e);
            } catch (InterruptedException e) {
              throw new IllegalStateException(e);
            }
          }
          return true;
        }
      });

      // Emit each unigram under its own key.
      unigrams.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String term, int frequency) {
          try {
            Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM);

            gramKey.set(unigram, EMPTY);
            context.write(gramKey, unigram);
          } catch (IOException e) {
            throw new IllegalStateException(e);
          } catch (InterruptedException e) {
            throw new IllegalStateException(e);
          }
          return true;
        }
      });

      context.getCounter(Count.NGRAM_TOTAL).increment(count);

      sf.end();
    } finally {
      Closeables.closeQuietly(sf);
    }
  }

  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    this.maxShingleSize = conf.getInt(MAX_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
    this.emitUnigrams = conf.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);
    if (log.isInfoEnabled()) {
      log.info("Max Ngram size is {}", this.maxShingleSize);
      log.info("Emit Unigrams is {}", emitUnigrams);
    }
  }
}
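
/*
 * Illustrative sketch only, not part of the original Mahout source: a minimal example of how the mapper
 * above could be wired into a pass-1 Hadoop job using the configuration keys read in setup(). CollocDriver
 * in this package is the real entry point and additionally configures the combine/reduce side and the
 * GramKey partitioning and grouping, which are omitted here. The class name, job name, and the input and
 * output paths below are hypothetical.
 */
final class CollocPass1JobSketch {

  private CollocPass1JobSketch() { }

  static org.apache.hadoop.mapreduce.Job createJob(Configuration conf) throws IOException {
    // Knobs read by CollocMapper.setup(): maximum shingle (ngram) size and whether to emit unigrams.
    conf.setInt(CollocMapper.MAX_SHINGLE_SIZE, 3);
    conf.setBoolean(CollocDriver.EMIT_UNIGRAMS, true);

    org.apache.hadoop.mapreduce.Job job = new org.apache.hadoop.mapreduce.Job(conf, "colloc-pass1-sketch");
    job.setJarByClass(CollocMapper.class);
    job.setMapperClass(CollocMapper.class);

    // Output types match the Mapper<Text, StringTuple, GramKey, Gram> declaration above; with no reducer
    // configured in this sketch, the identity reducer passes the (GramKey, Gram) pairs straight through.
    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setOutputKeyClass(GramKey.class);
    job.setOutputValueClass(Gram.class);

    // Input is the SequenceFile<Text, StringTuple> of tokenized documents described in the class javadoc.
    job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class);
    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(
        job, new org.apache.hadoop.fs.Path("tokenized-documents"));   // hypothetical input path
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(
        job, new org.apache.hadoop.fs.Path("subgrams"));              // hypothetical output path
    return job;
  }
}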