package edu.stanford.nlp.pipeline;
import hu.u_szeged.utils.NLPUtils;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Timing;
public class MweDictAnnotator implements Annotator {
private Timing timer;
private boolean VERBOSE;
private Map<List<String>, Map<String, Integer>> mweFreqs;
public MweDictAnnotator(String file) {
this(false, file);
}
public MweDictAnnotator(boolean verbose, String file) {
timer = new Timing();
this.VERBOSE = verbose;
if (VERBOSE) {
timer.start();
System.err.print("Adding normalized token annotation...");
}
mweFreqs = new HashMap<List<String>, Map<String, Integer>>();
List<String> lines = new LinkedList<String>();
NLPUtils.readDocToCollection(file, lines);
for (String line : lines) {
String[] lineParts = line.split("\t");
Map<String, Integer> frequencies = new HashMap<String, Integer>();
for (int col = 1; col < lineParts.length; ++col) {
frequencies.put(lineParts[col], Integer.parseInt(lineParts[++col]));
}
List<String> tokens = new LinkedList<String>();
for (String token : lineParts[0].split(" ")) {
tokens.add(token);
}
mweFreqs.put(tokens, frequencies);
}
if (VERBOSE) {
System.err.print("MWE dictionary ");
timer.done();
}
}
private List<String> getLemmatizedListForm(List<CoreLabel> coreLabels) {
List<String> lemmatized = new ArrayList<String>(coreLabels.size());
for (int st = 0; st < coreLabels.size(); ++st) {
if (st == coreLabels.size() - 1)
lemmatized.add(coreLabels.get(st).getString(LemmaAnnotation.class).toLowerCase());
else
lemmatized.add(coreLabels.get(st).word().toLowerCase());
}
return lemmatized;
}
@Override
public void annotate(Annotation annotation) {
if (VERBOSE) {
timer.start();
System.err.print("Adding MWE token annotation...");
}
if (annotation.has(SentencesAnnotation.class)) {
List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
for (CoreMap sentence : sentences) {
List<CoreLabel> successiveTokens = new ArrayList<CoreLabel>(4);
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
for (int i = 0; i < tokens.size(); i++) {
CoreLabel token = tokens.get(i);
token.set(MWEAnnotation.class, "O");
if (successiveTokens.size() == 4) {
successiveTokens.remove(0);
}
successiveTokens.add(token);
int size = successiveTokens.size();
for (int p = 0; size > 1 && p < size - 1; ++p) {
List<String> lemmatized = getLemmatizedListForm(successiveTokens.subList(p, size));
Map<String, Integer> freqs = mweFreqs.get(lemmatized);
if (freqs != null) {
boolean firstFreq = true;
for (Entry<String, Integer> freq : freqs.entrySet()) {
String type = transformType(freq.getKey());
for (int s = 0; s < lemmatized.size(); s++) {
CoreLabel prevToken = tokens.get(i - s);
String prevMweLabel = prevToken.get(MWEAnnotation.class).replace("O", "");
boolean insertDelimiter = prevMweLabel.length() > 0 && firstFreq;
if (s == 0) {
prevMweLabel += (prevMweLabel.length() == 0 ? "" : (insertDelimiter ? "@" : "|")) + "E-";
} else if (s == lemmatized.size() - 1) {
prevMweLabel += (prevMweLabel.length() == 0 ? "" : (insertDelimiter ? "@" : "|")) + "B-";
} else {
prevMweLabel += (prevMweLabel.length() == 0 ? "" : (insertDelimiter ? "@" : "|")) + "I-";
}
prevMweLabel += type + "_" + Integer.toString(freq.getValue());
tokens.get(i - s).set(MWEAnnotation.class, prevMweLabel);
}
firstFreq = false;
}
}
}
}
}
} else {
throw new RuntimeException("unable to find words/tokens in: " + annotation);
}
if (VERBOSE)
timer.stop("done.");
}
private String transformType(String type) {
if (type.equals("link:")) {
return "L";
} else if (type.equals("italic:")) {
return "I";
} else if (type.equals("linkItalicBold:")) {
return "LIB";
} else if (type.equals("bold:")) {
return "B";
} else if (type.equals("linkBold:")) {
return "LB";
} else if (type.equals("linkItalic:")) {
return "LI";
} else if (type.equals("boldItalic:")) {
return "IB";
} else {
return "N/A";
}
}
public static class MWEAnnotation implements CoreAnnotation<String> {
public Class<String> getType() {
return String.class;
}
}
@Override
public Set<Requirement> requires() {
return Collections.unmodifiableSet(new ArraySet<Requirement>());
}
@Override
public Set<Requirement> requirementsSatisfied() {
return Collections.singleton(TOKENIZE_REQUIREMENT);
}
}