package edu.stanford.nlp.sequences; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.CoreAnnotations.AfterAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.BeforeAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.PositionAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.UnknownAnnotation; import edu.stanford.nlp.process.PTBTokenizer; import edu.stanford.nlp.objectbank.XMLBeginEndIterator; import edu.stanford.nlp.process.WordToSentenceProcessor; import edu.stanford.nlp.util.StringUtils; import java.io.PrintWriter; import java.io.Reader; import java.io.StringReader; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * @author Jenny Finkel */ public class TrueCasingDocumentReaderAndWriter implements DocumentReaderAndWriter { private static final long serialVersionUID = 1731527027473052481L; public void init(SeqClassifierFlags flags) {} private static final Pattern sgml = Pattern.compile("<[^>]*>"); private static final Pattern allLower = Pattern.compile("[^A-Z]*?[a-z]+[^A-Z]*?"); private static final Pattern allUpper = Pattern.compile("[^a-z]*?[A-Z]+[^a-z]*?"); private static final Pattern startUpper = Pattern.compile("[A-Z].*"); private static WordToSentenceProcessor<CoreLabel,Object,Object> wts = new WordToSentenceProcessor<CoreLabel,Object,Object>(); public static Set<String> knownWords; // = null; public static boolean known(String s) { return knownWords.contains(s.toLowerCase()); } public Iterator<List<CoreLabel>> getIterator(Reader r) { List<List<CoreLabel>> documents = new ArrayList<List<CoreLabel>>(); String s = StringUtils.slurpReader(r); Set<String> wordsSeenOnce = new HashSet<String>(); Set<String> wordsSeenMultiple = new HashSet<String>(); XMLBeginEndIterator xmlIter = new XMLBeginEndIterator(new StringReader(s), "TEXT"); while (xmlIter.hasNext()) { PTBTokenizer<CoreLabel> ptb = PTBTokenizer.newPTBTokenizer(new StringReader((String)xmlIter.next()), false, true); List<CoreLabel> document = new ArrayList<CoreLabel>(); Set<String> words = new HashSet<String>(); while (ptb.hasNext()) { CoreLabel w = ptb.next(); words.add(w.word().toLowerCase()); Matcher m = sgml.matcher(w.word()); if (m.matches()) { if (document.size() > 0) { documents.addAll(wts.process(document)); document = new ArrayList<CoreLabel>(); } continue; } document.add(w); } if (document.size() > 0) { documents.addAll(wts.process(document)); } for (String word : words) { if (wordsSeenMultiple.contains(word)) { /* continue */ } else if (wordsSeenOnce.contains(word)) { wordsSeenOnce.remove(word); wordsSeenMultiple.add(word); } else { wordsSeenOnce.add(word); } } } xmlIter = new XMLBeginEndIterator(new StringReader(s), "TXT"); while (xmlIter.hasNext()) { PTBTokenizer<CoreLabel> ptb = PTBTokenizer.newPTBTokenizer(new StringReader((String)xmlIter.next()), false, true); List<CoreLabel> document = new ArrayList<CoreLabel>(); Set<String> words = new HashSet<String>(); while (ptb.hasNext()) { CoreLabel w = ptb.next(); words.add(w.word().toLowerCase()); Matcher m = sgml.matcher(w.word()); if (m.matches()) { if (document.size() > 0) { documents.addAll(wts.process(document)); document = new ArrayList<CoreLabel>(); } continue; } document.add(w); } if (document.size() > 0) { documents.addAll(wts.process(document)); } for (String word : words) { if (wordsSeenMultiple.contains(word)) { /* continue */ } else if (wordsSeenOnce.contains(word)) { wordsSeenOnce.remove(word); wordsSeenMultiple.add(word); } else { wordsSeenOnce.add(word); } } } knownWords = wordsSeenMultiple; knownWords.addAll(wordsSeenOnce); wordsSeenMultiple = null; List<List<CoreLabel>> docs = new ArrayList<List<CoreLabel>>(); for (List<CoreLabel> document : documents) { System.err.println(document); List<CoreLabel> doc = new ArrayList<CoreLabel>(); int pos = 0; for (CoreLabel w : document) { CoreLabel wi = new CoreLabel(); Matcher lowerMatcher = allLower.matcher(w.word()); if (lowerMatcher.matches()) { wi.set(AnswerAnnotation.class, "LOWER"); } else { Matcher upperMatcher = allUpper.matcher(w.word()); if (upperMatcher.matches()) { wi.set(AnswerAnnotation.class, "UPPER"); } else { Matcher startUpperMatcher = startUpper.matcher(w.word()); if (startUpperMatcher.matches()) { wi.set(AnswerAnnotation.class, "INIT_UPPER"); } else { wi.set(AnswerAnnotation.class, "O"); } } } wi.setWord(w.word().toLowerCase()); wi.set(UnknownAnnotation.class, (wordsSeenOnce.contains(w.word().toLowerCase()) ? "true" : "false")); wi.set(PositionAnnotation.class, Integer.toString(pos)); if (wi.get(UnknownAnnotation.class).equals("true")) { System.err.println(wi.word()+" :: "+wi.get(UnknownAnnotation.class)+" :: "+wi.get(PositionAnnotation.class)); } doc.add(wi); pos++; } System.err.println(); docs.add(doc); } return docs.iterator(); } public void printAnswers(List<CoreLabel> doc, PrintWriter out) { for (CoreLabel wi : doc) { // cdm note: jan 2009: This used to pad with the {Prev,After}SGMLAnnotation // but I think this was just wrong, and it should have been the regular // whitespace annotation. I changed it to that while removing SGML String prev = wi.get(BeforeAnnotation.class); out.print(prev); String w = wi.word(); if (wi.get(AnswerAnnotation.class).equals("UPPER")) { out.print(w.toUpperCase()); } else if (wi.get(AnswerAnnotation.class).equals("LOWER")) { out.print(w.toLowerCase()); } else if (wi.get(AnswerAnnotation.class).equals("INIT_UPPER")) { out.print(w.substring(0,1).toUpperCase()); out.print(w.substring(1)); } else { out.print(w); } String after = wi.get(AfterAnnotation.class); out.print(after); } out.println(); } }