/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dkpro.bigdata.collocations;
import static org.apache.uima.fit.util.JCasUtil.select;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.math.function.ObjectIntProcedure;
import org.apache.mahout.math.map.OpenObjectIntHashMap;
import org.apache.uima.cas.CASException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.metadata.ResourceMetaData;
import org.apache.uima.resource.metadata.impl.ResourceMetaData_impl;
import org.apache.uima.util.XMLParser;
import org.dkpro.bigdata.io.hadoop.CASWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
/**
* Pass 1 of the collocation discovery job: generates ngrams and emits each ngram together with its
* component head (n-1)-gram and tail unigram. Input is a SequenceFile<Text, CASWritable>, where the
* key is a document id and the value is the analyzed document as a CAS.
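* <p>
* A minimal sketch of how a driver might configure and submit this mapper (assuming the Hadoop 2
* {@code Job} API; the actual CollocDriver may differ):
* </p>
*
* <pre>
* Configuration conf = new Configuration();
* conf.setInt(CollocDriver.WINDOW_SIZE, 3);                                  // co-occurrence window size
* conf.set(CollocDriver.WINDOW_TYPE, CollocMapper.Window.SENTENCE.name());  // DOCUMENT, SENTENCE or C_WINDOW
* conf.setBoolean(CollocDriver.EMIT_UNIGRAMS, true);                         // also emit unigram counts
* Job job = Job.getInstance(conf, "colloc-pass1");
* job.setMapperClass(CollocMapper.class);
* job.setMapOutputKeyClass(GramKey.class);
* job.setMapOutputValueClass(Gram.class);
* </pre>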
*/
public class CollocMapper
extends Mapper<Text, CASWritable, GramKey, Gram>
{
private static final byte[] EMPTY = new byte[0];
public static final String MAX_SHINGLE_SIZE = "maxShingleSize";
private static final int DEFAULT_MAX_SHINGLE_SIZE = 2;
public enum Count
{
NGRAM_TOTAL, OVERFLOW, MULTIWORD, EMITTED_UNIGRAM, SENTENCES, LEMMA, DOCSIZE, EMPTYDOC, WINDOWS
}
public enum Window
{
DOCUMENT, SENTENCE, S_WINDOW, C_WINDOW, FIXED
}
private static final Logger log = LoggerFactory.getLogger(CollocMapper.class);
private boolean emitUnigrams;
private Collection<String> multiwords;
private ResourceMetaData metadata;
private GramKey gramKey;
private int window = 3;
private Window windowMode = Window.SENTENCE;
private final int MAX_NGRAMS = 5000;
// terms containing any of the characters below are rejected by isValid()
Pattern pattern = Pattern.compile(".*[\"\'#§$%&:\\+!,-]+.*");
Class<? extends Annotation> annotation = Lemma.class;
/**
* Used by FeatureCountHadoopDriver to map each CAS to a set of features, e.g. its n-grams or
* cooccurrences.
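*
* <p>
* A hypothetical implementation (the class and helper names below are illustrative only) might
* emit adjacent-lemma pairs per CAS:
* </p>
*
* <pre>
* class AdjacentLemmaPairExtractor implements CountableFeaturePairExtractor {
*     public void configure(JobConf job) {
*         // read any job-specific parameters here
*     }
*
*     public void extract(Context context, JCas jcas, int lemmaCount) {
*         Lemma previous = null;
*         for (Lemma lemma : JCasUtil.select(jcas, Lemma.class)) {
*             if (previous != null) {
*                 // derive a feature from (previous, lemma) and write it to the context
*             }
*             previous = lemma;
*         }
*     }
* }
* </pre>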
*/
public interface CountableFeaturePairExtractor
{
public void configure(JobConf job);
public void extract(final Context context, final JCas jcas, int lemmaCount);
}
/**
* Collocation finder: pass 1 map phase.
* <p>
* Receives a CAS and, depending on the configured {@link Window} mode, collects co-occurring
* lemmas from the whole document, from each sentence, or from fixed-size windows. Each resulting
* ngram is decomposed into head and tail subgrams which are collected in the following manner
* </p>
*
* <pre>
* k:head_key, v:head_subgram
* k:head_key,ngram_key, v:ngram
* k:tail_key, v:tail_subgram
* k:tail_key,ngram_key, v:ngram
* </pre>
* <p>
* The 'head' or 'tail' prefix is used to specify whether the subgram in question is the head or
* tail of the ngram. In this implementation the head of the ngram is a (n-1)gram, and the tail
* is a (1)gram.
* </p>
* For example, given 'click and clack' and an ngram length of 3:
*
* <pre>
* k: head_'click and' v:head_'click and'
* k: head_'click and',ngram_'click and clack' v:ngram_'click and clack'
* k: tail_'clack', v:tail_'clack'
* k: tail_'clack',ngram_'click and clack' v:ngram_'click and clack'
* </pre>
* <p>
* Also counts the total number of ngrams encountered and adds it to the counter
* {@link Count#NGRAM_TOTAL}.
* </p>
*
* @throws IOException
*             if there is a problem reading the CAS or writing to the output collector.
* @throws InterruptedException
*             if writing to the context is interrupted.
*/
@Override
protected void map(Text key, CASWritable value, final Context context)
throws IOException, InterruptedException
{
gramKey = new GramKey();
try {
final JCas jcas = value.getCAS().getJCas();
int lemmaCount = jcas.getAnnotationIndex(Lemma.type).size();
context.getCounter(Count.LEMMA).increment(lemmaCount);
context.getCounter(Count.DOCSIZE).increment(jcas.getDocumentText().length());
if (this.windowMode == Window.DOCUMENT) {
extractWholeDocument(context, jcas, lemmaCount);
}
else if (this.windowMode == Window.SENTENCE) {
extractSentence(context, jcas, lemmaCount);
}
else if (this.windowMode == Window.C_WINDOW)
{
extractWindow(context, jcas, lemmaCount);
}
}
catch (NullPointerException e1) {
// documents without text or without the expected annotations surface as NPEs; count them as empty
context.getCounter(Count.EMPTYDOC).increment(1);
}
catch (CASException e1) {
log.error("Could not obtain a JCas from the CAS", e1);
}
}
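/**
* Collects co-occurrences from consecutive, non-overlapping windows of {@code window} lemmas.
* Within each window every ordered pair of valid lemmas (including a lemma paired with itself) is
* counted. For example, with {@code window == 3} and the lemma sequence [a, b, c, d, e, f, g],
* the windows are [a, b, c] and [d, e, f]; the trailing 'g' does not fill a complete window and
* is ignored. Accumulated pairs are flushed once roughly 10000 of them have been collected.
*/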
private void extractWindow(final Context context, final JCas jcas, int lemmaCount)
{
OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>(lemmaCount * 4);
OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(lemmaCount);
int counta = 0;
int ngramcount = 0;
// int count = collectCooccurencesFromCas(context, jcas, ngrams, unigrams);
ArrayList<Lemma> terms = new ArrayList<Lemma>();
for (final Lemma term : JCasUtil.select(jcas, Lemma.class)) {
terms.add(term);
}
for (int wcount = 0; wcount < (terms.size() / window); wcount++) {
for (int i = 0; i < window; i++) {
if ((wcount * window) + i >= terms.size()) {
break;
}
String termText = terms.get((wcount * window) + i).getValue().toLowerCase();
if (!isValid(termText)) {
continue;
}
int countb = 0;
context.getCounter(Count.WINDOWS).increment(1);
unigrams.adjustOrPutValue(termText, 1, 1);
for (int j = 0; j < window; j++) {
if ((wcount * window) + j >= terms.size()) {
break;
}
String termText2 = terms.get((wcount * window) + j).getValue().toLowerCase();
if (!isValid(termText2)) {
continue;
}
ngrams.adjustOrPutValue(termText + "\t" + termText2, 1, 1);
if (ngramcount++ > 10000) {
flushCollocations(context, ngrams, unigrams);
context.getCounter(Count.NGRAM_TOTAL).increment(ngramcount);
ngrams = new OpenObjectIntHashMap<String>(lemmaCount * 4);
unigrams = new OpenObjectIntHashMap<String>(lemmaCount);
ngramcount = 0;
}
context.getCounter("test", "iteration").increment(1);
if (countb++ > 1000) {
break;
}
}
if (counta++ > 1000) {
break;
}
}
}
flushCollocations(context, ngrams, unigrams);
context.getCounter(Count.NGRAM_TOTAL).increment(ngramcount);
}
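/**
* Collects co-occurrences of valid lemmas within each sentence of the document. Unigram and pair
* counts are flushed (and the SENTENCES and NGRAM_TOTAL counters updated) once roughly 10000
* pairs have accumulated, to bound memory usage for long documents.
*/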
private int extractSentence(final Context context, final JCas jcas, int lemmaCount)
{
OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>(lemmaCount * 4);
OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(lemmaCount);
int sentenceCount = 0;
int count = 0;
// sliding buffer of the last <window> sentences; currently maintained but not otherwise used
Annotation[] previous = new Annotation[window];
for (final Annotation sentence : select(jcas, Sentence.class)) {
for (int j = 0; j < previous.length - 1; j++) {
previous[j] = previous[j + 1];
}
previous[previous.length - 1] = sentence;
sentenceCount++;
count += collectCooccurencesFromCoveringAnnotation(context, jcas, sentence, ngrams,
unigrams);
if (count > 10000) {
flushCollocations(context, ngrams, unigrams);
// I suspect the clear method is not working properly
ngrams = new OpenObjectIntHashMap<String>(lemmaCount * 4);
unigrams = new OpenObjectIntHashMap<String>(lemmaCount);
context.getCounter(Count.SENTENCES).increment(sentenceCount);
context.getCounter(Count.NGRAM_TOTAL).increment(count);
count = 0;
sentenceCount = 0;
}
}
flushCollocations(context, ngrams, unigrams);
// account for the sentences and pairs that did not trigger an intermediate flush
context.getCounter(Count.SENTENCES).increment(sentenceCount);
context.getCounter(Count.NGRAM_TOTAL).increment(count);
return count;
}
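/**
* Collects co-occurrences of valid lemmas across the whole document, i.e. every valid lemma is
* paired with every other valid lemma (capped at roughly 1000 outer and 1000 inner lemmas to
* bound the quadratic blow-up). Accumulated pairs are flushed once roughly 10000 of them have
* been collected.
*/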
private void extractWholeDocument(final Context context, final JCas jcas, int lemmaCount)
{
OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>(lemmaCount * 4);
OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(lemmaCount);
int counta = 0;
int i = 0;
// int count = collectCooccurencesFromCas(context, jcas, ngrams, unigrams);
for (final Lemma term : JCasUtil.select(jcas, Lemma.class)) {
String termText = term.getValue().toLowerCase();
// look up the POS of the lemma (currently collected but not used for filtering)
POS pos = null;
for (POS p : JCasUtil.selectCovered(jcas, POS.class, term)) {
pos = p;
}
if (!isValid(termText)) {
continue;
}
int countb = 0;
unigrams.adjustOrPutValue(termText, 1, 1);
for (final Lemma term2 : JCasUtil.select(jcas, Lemma.class)) {
final String termText2 = term2.getValue().toLowerCase();
if (!isValid(termText2)) {
continue;
}
ngrams.adjustOrPutValue(termText + "\t" + termText2, 1, 1);
if (i++ > 10000) {
flushCollocations(context, ngrams, unigrams);
context.getCounter(Count.NGRAM_TOTAL).increment(i);
ngrams = new OpenObjectIntHashMap<String>(lemmaCount * 4);
unigrams = new OpenObjectIntHashMap<String>(lemmaCount);
i = 0;
}
context.getCounter("test", "iteration").increment(1);
if (countb++ > 1000) {
break;
}
}
if (counta++ > 1000) {
break;
}
}
flushCollocations(context, ngrams, unigrams);
context.getCounter(Count.NGRAM_TOTAL).increment(i);
}
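/**
* Returns the lower-cased textual form of the given annotation: the covered text for a Token,
* the lemma value for a Lemma, or the stem value for a Stem.
*/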
private String getValue(final Annotation term)
{
if (term instanceof Token) {
return ((Token) term).getCoveredText().toLowerCase();
}
if (term instanceof Lemma) {
return ((Lemma) term).getValue().toLowerCase();
}
if (term instanceof Stem) {
return ((Stem) term).getValue().toLowerCase();
}
throw new UnsupportedOperationException("Unknown annotation type "
+ term.getClass().getCanonicalName());
}
private void flushCollocations(final Context context, OpenObjectIntHashMap<String> ngrams,
OpenObjectIntHashMap<String> unigrams)
{
ngrams.forEachPair(new ObjectIntProcedure<String>()
{
@Override
public boolean apply(String term, int frequency)
{
// obtain components, the leading (n-1)gram and the trailing unigram.
int i = term.lastIndexOf('\t');
if (i != -1) { // bigram, trigram etc
try {
Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM);
Gram head = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD);
Gram tail = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL);
gramKey.set(head, EMPTY);
context.write(gramKey, head);
gramKey.set(head, ngram.getBytes());
context.write(gramKey, ngram);
gramKey.set(tail, EMPTY);
context.write(gramKey, tail);
gramKey.set(tail, ngram.getBytes());
context.write(gramKey, ngram);
}
catch (IOException e) {
throw new IllegalStateException(e);
}
catch (InterruptedException e) {
throw new IllegalStateException(e);
}
}
return true;
}
});
unigrams.forEachPair(new ObjectIntProcedure<String>()
{
@Override
public boolean apply(String term, int frequency)
{
try {
Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM);
gramKey.set(unigram, EMPTY);
context.write(gramKey, unigram);
}
catch (IOException e) {
throw new IllegalStateException(e);
}
catch (InterruptedException e) {
throw new IllegalStateException(e);
}
return true;
}
});
unigrams.clear();
ngrams.clear();
}
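/**
* Counts unigrams and co-occurring lemma pairs among the lemmas covered by the given annotation
* (typically a sentence). Pairs of identical lemma texts are not counted. Returns the number of
* pairs considered; if more than about 1000 covered lemmas are processed, the OVERFLOW counter is
* incremented and the method returns early.
*/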
private int collectCooccurencesFromCoveringAnnotation(final Context context, JCas jcas,
final Annotation sentence, OpenObjectIntHashMap<String> ngrams,
OpenObjectIntHashMap<String> unigrams)
{
int count = 0;
int i = 0;
if (sentence != null) {
for (final Lemma term : JCasUtil.selectCovered(jcas, Lemma.class, sentence)) {
final String termText = term.getValue().toLowerCase();
if (!isValid(termText)) {
continue;
}
String left = termText;
unigrams.adjustOrPutValue(left, 1, 1);
for (final Lemma term2 : JCasUtil.selectCovered(jcas, Lemma.class, sentence)) {
final String termText2 = term2.getValue().toLowerCase();
if (!isValid(termText2)) {
continue;
}
if (!left.equals(termText2)) {
ngrams.adjustOrPutValue(left + "\t" + termText2, 1, 1);
}
count++;
}
if (i++ > 1000) {
context.getCounter(Count.OVERFLOW).increment(1);
return count;
}
}
}
return count;
}
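/**
* Filters out terms that are unlikely to be useful collocation components: single-character
* terms, terms containing quotes, punctuation or special characters matched by {@code pattern},
* and terms containing "..". For example, "click" is valid, while "a", "don't" and "foo.." are
* not.
*/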
private boolean isValid(final String termText)
{
return !(termText.length() == 1 || pattern.matcher(termText).matches() || termText
.contains(".."));
}
@Override
protected void setup(Context context)
throws IOException, InterruptedException
{
super.setup(context);
Configuration conf = context.getConfiguration();
this.window = conf.getInt(CollocDriver.WINDOW_SIZE, 3);
this.windowMode = Window
.valueOf(conf.get(CollocDriver.WINDOW_TYPE, Window.SENTENCE.name()));
this.emitUnigrams = conf.getBoolean(CollocDriver.EMIT_UNIGRAMS,
CollocDriver.DEFAULT_EMIT_UNIGRAMS);
this.metadata = new ResourceMetaData_impl();
final XMLParser aParser = org.apache.uima.UIMAFramework.getXMLParser();
// try {
//
// this.metadata = aParser.parseResourceMetaData(new XMLInputSource(new StringInputStream(
// Metadata.getMetadata()), new File(".")));
// }
// catch (final InvalidXMLException e1) {
// // TODO Auto-generated catch block
// e1.printStackTrace();
// }
if (log.isInfoEnabled()) {
// log.info("Max Ngram size is {}", this.maxShingleSize);
log.info("Emit Unitgrams is {}", emitUnigrams);
log.info("Window Mode is {}", this.windowMode.name());
log.info("Window Size is {}", window);
log.info("Emit Unitgrams is {}", emitUnigrams);
}
}
}