package edu.stanford.nlp.sequences; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.util.StringUtils; import edu.stanford.nlp.util.Function; import edu.stanford.nlp.objectbank.LineIterator; import edu.stanford.nlp.objectbank.IteratorFromReaderFactory; import java.io.PrintWriter; import java.io.Reader; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * adapt from Jenny's TrueCasingDocumentReaderAndWriter.java * @author Pi-Chuan Chang */ public class TrueCasingForNISTDocumentReaderAndWriter implements DocumentReaderAndWriter<CoreLabel> { public static final String THREE_CLASSES_PROPERTY = "3class"; public static final boolean THREE_CLASSES = Boolean.parseBoolean(System.getProperty(THREE_CLASSES_PROPERTY, "false")); /** * */ private static final long serialVersionUID = -3000389291781534479L; private IteratorFromReaderFactory<List<CoreLabel>> factory; private Boolean verboseForTrueCasing = false; private static Pattern alphabet = Pattern.compile("[A-Za-z]+"); /** * for test only **/ public static void main(String[] args) throws IOException{ Reader reader = new BufferedReader(new FileReader(args[0])); TrueCasingForNISTDocumentReaderAndWriter raw = new TrueCasingForNISTDocumentReaderAndWriter(); raw.init(null); Iterator<List<CoreLabel>> it = raw.getIterator(reader); while(it.hasNext()) { List<CoreLabel> l = it.next(); for (CoreLabel cl : l) { System.out.println(cl); } System.out.println("========================================"); } } public void init(SeqClassifierFlags flags) { verboseForTrueCasing = flags.verboseForTrueCasing; factory = LineIterator.getFactory(new LineToTrueCasesParser()); // todo } public static Set knownWords = null; public static boolean known(String s) { return knownWords.contains(s.toLowerCase()); } public Iterator<List<CoreLabel>> getIterator(Reader r) { return factory.getIterator(r); } public void printAnswers(List<CoreLabel> doc, PrintWriter out) { List<String> sentence = new ArrayList<String>(); int wrong = 0; for (CoreLabel wi : doc) { StringBuilder sb = new StringBuilder(); if (! wi.get(CoreAnnotations.AnswerAnnotation.class).equals(wi.get(CoreAnnotations.GoldAnswerAnnotation.class))) { wrong++; } if (!THREE_CLASSES && wi.get(CoreAnnotations.AnswerAnnotation.class).equals("UPPER")) { sb.append(wi.word().toUpperCase()); } else if (wi.get(CoreAnnotations.AnswerAnnotation.class).equals("LOWER")) { sb.append(wi.word().toLowerCase()); } else if (wi.get(CoreAnnotations.AnswerAnnotation.class).equals("INIT_UPPER")) { sb.append(wi.word().substring(0,1).toUpperCase()) .append(wi.word().substring(1)); } else if (wi.get(CoreAnnotations.AnswerAnnotation.class).equals("O")) { // in this case, if it cotains a-z at all, then append "MIX" at the end sb.append(wi.word()); Matcher alphaMatcher = alphabet.matcher(wi.word()); if (alphaMatcher.matches()) { sb.append("/MIX"); } } if (verboseForTrueCasing) { sb.append("/GOLD-") .append(wi.get(CoreAnnotations.GoldAnswerAnnotation.class)) .append("/GUESS-") .append(wi.get(CoreAnnotations.AnswerAnnotation.class)); } sentence.add(sb.toString()); } out.print(StringUtils.join(sentence, " ")); System.err.printf("> wrong = %d ; total = %d\n", wrong, doc.size()); out.println(); } public static class LineToTrueCasesParser implements Function<String,List<CoreLabel>> { private static Pattern allLower = Pattern.compile("[^A-Z]*?[a-z]+[^A-Z]*?"); private static Pattern allUpper = Pattern.compile("[^a-z]*?[A-Z]+[^a-z]*?"); private static Pattern startUpper = Pattern.compile("[A-Z].*"); public List<CoreLabel> apply(String line) { List<CoreLabel> doc = new ArrayList<CoreLabel>(); int pos = 0; //line = line.replaceAll(" +"," "); //System.err.println("pichuan: processing line = "+line); String[] toks = line.split(" "); for (String word : toks) { CoreLabel wi = new CoreLabel(); Matcher lowerMatcher = allLower.matcher(word); if (lowerMatcher.matches()) { wi.set(CoreAnnotations.AnswerAnnotation.class, "LOWER"); wi.set(CoreAnnotations.GoldAnswerAnnotation.class, "LOWER"); } else { Matcher upperMatcher = allUpper.matcher(word); if (!THREE_CLASSES && upperMatcher.matches()) { wi.set(CoreAnnotations.AnswerAnnotation.class, "UPPER"); wi.set(CoreAnnotations.GoldAnswerAnnotation.class, "UPPER"); } else { Matcher startUpperMatcher = startUpper.matcher(word); boolean isINIT_UPPER; // = false; if (word.length() > 1) { String w2 = word.substring(1); String lcw2 = w2.toLowerCase(); isINIT_UPPER = w2.equals(lcw2); } else { isINIT_UPPER = false; } if (startUpperMatcher.matches() && isINIT_UPPER) { wi.set(CoreAnnotations.AnswerAnnotation.class, "INIT_UPPER"); wi.set(CoreAnnotations.GoldAnswerAnnotation.class, "INIT_UPPER"); } else { wi.set(CoreAnnotations.AnswerAnnotation.class, "O"); wi.set(CoreAnnotations.GoldAnswerAnnotation.class, "O"); } } } wi.setWord(word.toLowerCase()); wi.set(CoreAnnotations.PositionAnnotation.class, pos + ""); doc.add(wi); pos++; } return doc; } } }