package edu.stanford.nlp.pipeline;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.*;
import edu.stanford.nlp.ie.crf.CRFBiasedClassifier;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.logging.Redwood;
public class TrueCaseAnnotator implements Annotator {
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(TrueCaseAnnotator.class);
@SuppressWarnings("unchecked")
private final CRFBiasedClassifier<CoreLabel> trueCaser;
private final Map<String,String> mixedCaseMap;
private final boolean overwriteText;
private final boolean verbose;
public static final String DEFAULT_MODEL_BIAS = "INIT_UPPER:-0.7,UPPER:-0.7,O:0";
private static final String DEFAULT_OVERWRITE_TEXT = "false";
private static final String DEFAULT_VERBOSE = "false";
public TrueCaseAnnotator() {
this(true);
}
public TrueCaseAnnotator(boolean verbose) {
this(System.getProperty("truecase.model", DefaultPaths.DEFAULT_TRUECASE_MODEL),
System.getProperty("truecase.bias", DEFAULT_MODEL_BIAS),
System.getProperty("truecase.mixedcasefile", DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST),
Boolean.parseBoolean(System.getProperty("truecase.overwriteText", TrueCaseAnnotator.DEFAULT_OVERWRITE_TEXT)),
verbose);
}
public TrueCaseAnnotator(Properties properties) {
this(properties.getProperty("truecase.model", DefaultPaths.DEFAULT_TRUECASE_MODEL),
properties.getProperty("truecase.bias", TrueCaseAnnotator.DEFAULT_MODEL_BIAS),
properties.getProperty("truecase.mixedcasefile", DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST),
Boolean.parseBoolean(properties.getProperty("truecase.overwriteText", TrueCaseAnnotator.DEFAULT_OVERWRITE_TEXT)),
Boolean.parseBoolean(properties.getProperty("truecase.verbose", TrueCaseAnnotator.DEFAULT_VERBOSE)));
}
public TrueCaseAnnotator(String modelLoc,
String classBias,
String mixedCaseFileName,
boolean overwriteText,
boolean verbose) {
this.overwriteText = overwriteText;
this.verbose = verbose;
Properties props = PropertiesUtils.asProperties(
"loadClassifier", modelLoc,
"mixedCaseMapFile", mixedCaseFileName,
"classBias", classBias);
trueCaser = new CRFBiasedClassifier<>(props);
if (modelLoc != null) {
trueCaser.loadClassifierNoExceptions(modelLoc, props);
} else {
throw new RuntimeException("Model location not specified for true-case classifier!");
}
if (classBias != null) {
StringTokenizer biases = new java.util.StringTokenizer(classBias,",");
while (biases.hasMoreTokens()) {
StringTokenizer bias = new java.util.StringTokenizer(biases.nextToken(),":");
String cname = bias.nextToken();
double w = Double.parseDouble(bias.nextToken());
trueCaser.setBiasWeight(cname,w);
if (this.verbose) log.info("Setting bias for class " + cname + " to " + w);
}
}
// Load map containing mixed-case words:
mixedCaseMap = loadMixedCaseMap(mixedCaseFileName);
}
@Override
public void annotate(Annotation annotation) {
if (verbose) {
log.info("Adding true-case annotation...");
}
if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
// classify tokens for each sentence
for (CoreMap sentence: annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
List<CoreLabel> output = this.trueCaser.classifySentence(tokens);
for (int i = 0, size = tokens.size(); i < size; i++) {
// add the truecaser tag to each token
String neTag = output.get(i).get(CoreAnnotations.AnswerAnnotation.class);
tokens.get(i).set(CoreAnnotations.TrueCaseAnnotation.class, neTag);
setTrueCaseText(tokens.get(i));
}
}
} else {
throw new RuntimeException("unable to find sentences in: " + annotation);
}
}
private void setTrueCaseText(CoreLabel l) {
String trueCase = l.getString(CoreAnnotations.TrueCaseAnnotation.class);
String text = l.word();
String trueCaseText = text;
switch (trueCase) {
case "UPPER":
trueCaseText = text.toUpperCase();
break;
case "LOWER":
trueCaseText = text.toLowerCase();
break;
case "INIT_UPPER":
trueCaseText = Character.toTitleCase(text.charAt(0)) + text.substring(1).toLowerCase();
break;
case "O":
// The model predicted mixed case, so lookup the map:
String lower = text.toLowerCase();
if (mixedCaseMap.containsKey(lower)) {
trueCaseText = mixedCaseMap.get(lower);
}
// else leave it as it was?
break;
}
// System.err.println(text + " was classified as " + trueCase + " and so became " + trueCaseText);
l.set(CoreAnnotations.TrueCaseTextAnnotation.class, trueCaseText);
if (overwriteText) {
l.set(CoreAnnotations.TextAnnotation.class, trueCaseText);
l.set(CoreAnnotations.ValueAnnotation.class, trueCaseText);
}
}
private static Map<String,String> loadMixedCaseMap(String mapFile) {
Map<String,String> map = Generics.newHashMap();
try {
BufferedReader br = IOUtils.readerFromString(mapFile);
for (String line : ObjectBank.getLineIterator(br)) {
line = line.trim();
String[] els = line.split("\\s+");
if (els.length != 2) {
throw new RuntimeException("Wrong format: " + mapFile);
}
map.put(els[0], els[1]);
}
br.close();
} catch (IOException e) {
throw new RuntimeIOException(e);
}
return map;
}
@Override
public Set<Class<? extends CoreAnnotation>> requires() {
return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
CoreAnnotations.TextAnnotation.class,
CoreAnnotations.TokensAnnotation.class,
CoreAnnotations.PositionAnnotation.class,
CoreAnnotations.SentencesAnnotation.class
)));
}
@Override
public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
CoreAnnotations.TrueCaseTextAnnotation.class,
CoreAnnotations.TrueCaseAnnotation.class,
CoreAnnotations.AnswerAnnotation.class,
CoreAnnotations.ShapeAnnotation.class
)));
}
}