package edu.stanford.nlp.pipeline;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.*;

import edu.stanford.nlp.ie.crf.CRFBiasedClassifier;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * Annotator that tags every token with a true-case class and the
 * corresponding re-cased text.
 *
 * <p>Each sentence's tokens are run through a {@link CRFBiasedClassifier}; the
 * predicted label is stored under {@code TrueCaseAnnotation} and the re-cased
 * word under {@code TrueCaseTextAnnotation}. The labels handled explicitly are
 * {@code UPPER}, {@code LOWER}, {@code INIT_UPPER} and {@code O}; for
 * {@code O} the re-cased form is looked up in a mixed-case map loaded from a
 * file. When {@code overwriteText} is enabled the token's text and value
 * annotations are replaced with the re-cased word as well.
 *
 * <p>Recognized properties: {@code truecase.model}, {@code truecase.bias},
 * {@code truecase.mixedcasefile}, {@code truecase.overwriteText} and
 * {@code truecase.verbose}.
 */
public class TrueCaseAnnotator implements Annotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(TrueCaseAnnotator.class);

  /** Classifier that predicts the true-case class of each token. */
  private final CRFBiasedClassifier<CoreLabel> trueCaser;

  /** Maps a lower-cased word to its mixed-case spelling; consulted for the {@code O} class. */
  private final Map<String,String> mixedCaseMap;

  /** Whether the token's text and value annotations are replaced by the true-cased text. */
  private final boolean overwriteText;

  /** Whether progress messages are logged. */
  private final boolean verbose;

  /** Default class biases, as comma-separated {@code CLASS:weight} pairs. */
  public static final String DEFAULT_MODEL_BIAS = "INIT_UPPER:-0.7,UPPER:-0.7,O:0";
  private static final String DEFAULT_OVERWRITE_TEXT = "false";
  private static final String DEFAULT_VERBOSE = "false";

  /** Creates a verbose annotator configured from system properties and built-in defaults. */
  public TrueCaseAnnotator() {
    this(true);
  }

  /**
   * Creates an annotator configured from the {@code truecase.*} system
   * properties, falling back to the built-in defaults.
   *
   * @param verbose whether progress messages are logged
   */
  public TrueCaseAnnotator(boolean verbose) {
    this(System.getProperty("truecase.model", DefaultPaths.DEFAULT_TRUECASE_MODEL),
         System.getProperty("truecase.bias", DEFAULT_MODEL_BIAS),
         System.getProperty("truecase.mixedcasefile", DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST),
         Boolean.parseBoolean(System.getProperty("truecase.overwriteText", TrueCaseAnnotator.DEFAULT_OVERWRITE_TEXT)),
         verbose);
  }

  /**
   * Creates an annotator configured from the {@code truecase.*} entries of the
   * given properties, falling back to the built-in defaults.
   *
   * @param properties configuration source
   */
  public TrueCaseAnnotator(Properties properties) {
    this(properties.getProperty("truecase.model", DefaultPaths.DEFAULT_TRUECASE_MODEL),
         properties.getProperty("truecase.bias", TrueCaseAnnotator.DEFAULT_MODEL_BIAS),
         properties.getProperty("truecase.mixedcasefile", DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST),
         Boolean.parseBoolean(properties.getProperty("truecase.overwriteText", TrueCaseAnnotator.DEFAULT_OVERWRITE_TEXT)),
         Boolean.parseBoolean(properties.getProperty("truecase.verbose", TrueCaseAnnotator.DEFAULT_VERBOSE)));
  }

  /**
   * Creates an annotator from explicit settings.
   *
   * @param modelLoc location of the true-case classifier model; must not be {@code null}
   * @param classBias comma-separated {@code CLASS:weight} pairs applied as bias weights
   * @param mixedCaseFileName location of the mixed-case map file
   * @param overwriteText whether to replace the token text and value with the true-cased text
   * @param verbose whether progress messages are logged
   * @throws RuntimeException if {@code modelLoc} is {@code null}
   */
  public TrueCaseAnnotator(String modelLoc,
                           String classBias,
                           String mixedCaseFileName,
                           boolean overwriteText,
                           boolean verbose) {
    // Checked up front so the descriptive error is raised before modelLoc is
    // handed to the property builder or the classifier.
    if (modelLoc == null) {
      throw new RuntimeException("Model location not specified for true-case classifier!");
    }

    this.overwriteText = overwriteText;
    this.verbose = verbose;

    Properties props = PropertiesUtils.asProperties(
        "loadClassifier", modelLoc,
        "mixedCaseMapFile", mixedCaseFileName,
        "classBias", classBias);
    trueCaser = new CRFBiasedClassifier<>(props);
    trueCaser.loadClassifierNoExceptions(modelLoc, props);

    if (classBias != null) {
      // Each comma-separated entry has the form CLASS:weight.
      StringTokenizer biases = new StringTokenizer(classBias, ",");
      while (biases.hasMoreTokens()) {
        StringTokenizer bias = new StringTokenizer(biases.nextToken(), ":");
        String cname = bias.nextToken();
        double w = Double.parseDouble(bias.nextToken());
        trueCaser.setBiasWeight(cname, w);
        if (this.verbose) {
          log.info("Setting bias for class " + cname + " to " + w);
        }
      }
    }

    // Load map containing mixed-case words:
    mixedCaseMap = loadMixedCaseMap(mixedCaseFileName);
  }

  /**
   * Classifies the tokens of every sentence and stores the true-case class and
   * true-cased text on each token.
   *
   * @param annotation document to annotate; must contain sentences
   * @throws RuntimeException if the annotation has no sentences annotation
   */
  @Override
  public void annotate(Annotation annotation) {
    if (verbose) {
      log.info("Adding true-case annotation...");
    }

    if ( ! annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
      throw new RuntimeException("unable to find sentences in: " + annotation);
    }

    // classify tokens for each sentence
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      List<CoreLabel> output = this.trueCaser.classifySentence(tokens);
      for (int i = 0, size = tokens.size(); i < size; i++) {
        // add the truecaser tag to each token
        CoreLabel token = tokens.get(i);
        String neTag = output.get(i).get(CoreAnnotations.AnswerAnnotation.class);
        token.set(CoreAnnotations.TrueCaseAnnotation.class, neTag);
        setTrueCaseText(token);
      }
    }
  }

  /**
   * Derives the true-cased text of a token from its true-case class and stores
   * it on the token. Classes other than {@code UPPER}, {@code LOWER},
   * {@code INIT_UPPER} and {@code O} leave the word unchanged.
   *
   * @param l token that already carries a {@code TrueCaseAnnotation}
   */
  private void setTrueCaseText(CoreLabel l) {
    String trueCase = l.getString(CoreAnnotations.TrueCaseAnnotation.class);
    String text = l.word();
    String trueCaseText = text;

    switch (trueCase) {
      case "UPPER":
        trueCaseText = text.toUpperCase();
        break;
      case "LOWER":
        trueCaseText = text.toLowerCase();
        break;
      case "INIT_UPPER":
        // An empty word has no first character to capitalize; leave it as is.
        if ( ! text.isEmpty()) {
          trueCaseText = Character.toTitleCase(text.charAt(0)) + text.substring(1).toLowerCase();
        }
        break;
      case "O":
        // The model predicted mixed case, so lookup the map:
        String lower = text.toLowerCase();
        if (mixedCaseMap.containsKey(lower)) {
          trueCaseText = mixedCaseMap.get(lower);
        }
        // else leave it as it was?
        break;
    }

    l.set(CoreAnnotations.TrueCaseTextAnnotation.class, trueCaseText);

    if (overwriteText) {
      l.set(CoreAnnotations.TextAnnotation.class, trueCaseText);
      l.set(CoreAnnotations.ValueAnnotation.class, trueCaseText);
    }
  }

  /**
   * Reads the mixed-case map. Every line must consist of exactly two
   * whitespace-separated fields: the key and the spelling it maps to.
   *
   * @param mapFile location of the map file
   * @return map from the first field of each line to the second
   * @throws RuntimeException if a line does not have exactly two fields
   * @throws RuntimeIOException if the file cannot be read
   */
  private static Map<String,String> loadMixedCaseMap(String mapFile) {
    Map<String,String> map = Generics.newHashMap();
    // try-with-resources closes the reader even when a malformed line aborts the load.
    try (BufferedReader br = IOUtils.readerFromString(mapFile)) {
      for (String line : ObjectBank.getLineIterator(br)) {
        line = line.trim();
        String[] els = line.split("\\s+");
        if (els.length != 2) {
          throw new RuntimeException("Wrong format: " + mapFile);
        }
        map.put(els[0], els[1]);
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    return map;
  }

  /** Returns the annotations that must be present before this annotator runs. */
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.PositionAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class
    )));
  }

  /** Returns the annotations this annotator declares as satisfied. */
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        CoreAnnotations.TrueCaseTextAnnotation.class,
        CoreAnnotations.TrueCaseAnnotation.class,
        CoreAnnotations.AnswerAnnotation.class,
        CoreAnnotations.ShapeAnnotation.class
    )));
  }

}