package edu.stanford.nlp.sequences;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;
import edu.stanford.nlp.objectbank.XMLBeginEndIterator;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.BufferedReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * DocumentReader for MUC format.
 *
 * <p>Reading: each {@code <DOC>} element of the input is tokenized and turned into a list of
 * {@link CoreLabel}s. SGML tags are not emitted as tokens; instead they update the running
 * paragraph / sentence / section / entity state that is attached to the following word tokens.
 *
 * <p>Writing: {@link #printAnswers(List, PrintWriter)} prints the tokens back out, wrapping runs
 * of identically-labelled tokens in {@code ENAMEX} / {@code TIMEX} / {@code NUMEX} tags.
 *
 * @author Jenny Finkel
 */
public class MUCDocumentReaderAndWriter implements DocumentReaderAndWriter<CoreLabel> {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log =
      Redwood.channels(MUCDocumentReaderAndWriter.class);

  private static final long serialVersionUID = -8334720781758500037L;

  /** Label carried by tokens that are not inside any entity tag. */
  private static final String BACKGROUND = "O";

  private SeqClassifierFlags flags;
  private IteratorFromReaderFactory<List<CoreLabel>> factory;

  /**
   * Stores the flags and builds the factory that splits the input on {@code DOC} elements and
   * hands each one to a {@link MUCDocumentParser}.
   *
   * @param flags classifier flags; only stored by this class
   */
  @Override
  public void init(SeqClassifierFlags flags) {
    this.flags = flags;
    factory = XMLBeginEndIterator.getFactory("DOC", new MUCDocumentParser(), true, true);
  }

  /**
   * Returns an iterator over the documents read from {@code r}.
   * {@link #init(SeqClassifierFlags)} must have been called first, since it creates the factory.
   *
   * @param r source of the MUC text
   * @return iterator producing one token list per document
   */
  @Override
  public Iterator<List<CoreLabel>> getIterator(Reader r) {
    return factory.getIterator(r);
  }

  /** Converts the text of one document into a list of annotated word tokens. */
  static class MUCDocumentParser implements Function<String, List<CoreLabel>> {

    /** Any SGML tag; group 1 is the tag name (including a leading '/' on closing tags). */
    private static final Pattern sgml = Pattern.compile("<([^>\\s]*)[^>]*>");

    /** Opening entity tag; group 1 is the entity class, group 2 the TYPE value. */
    private static final Pattern beginEntity =
        Pattern.compile("<(ENAMEX|TIMEX|NUMEX) TYPE=\"([a-z]+)\"[^>]*>", Pattern.CASE_INSENSITIVE);

    /**
     * Closing entity tag. Case-insensitive so that it accepts the closing form of every tag
     * that {@link #beginEntity} accepts.
     */
    private static final Pattern endEntity =
        Pattern.compile("</(ENAMEX|TIMEX|NUMEX)>", Pattern.CASE_INSENSITIVE);

    /**
     * Tokenizes {@code doc} and returns its word tokens.
     *
     * <p>Each returned token carries its word, original text, before/after text, word / sentence /
     * paragraph position, section, entity type (as the answer) and entity class. The text of
     * skipped tags is folded into the before text of the next token and, except for {@code <doc>},
     * into the after text of the previous token. Entity tags contribute only their surrounding
     * whitespace there, not the tag text itself.
     *
     * @param doc the document text, or {@code null}
     * @return the word tokens, or {@code null} if {@code doc} is {@code null}
     */
    @Override
    public List<CoreLabel> apply(String doc) {
      if (doc == null) {
        return null;
      }

      String section = "";
      String entity = BACKGROUND;
      String entityClass = "";
      int pNum = 0;
      int sNum = 0;
      int wNum = 0;

      PTBTokenizer<CoreLabel> ptb =
          PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(doc)), false, true);
      List<CoreLabel> words = ptb.tokenize();

      List<CoreLabel> result = new ArrayList<>();
      CoreLabel prev = null;   // last word token emitted so far, if any
      String prevString = "";  // skipped text waiting to be prepended to the next word token

      for (CoreLabel word : words) {
        String text = word.word();
        Matcher tagMatcher = sgml.matcher(text);

        if (!tagMatcher.matches()) {
          // An ordinary word: emit it with the current state attached.
          CoreLabel wi = new CoreLabel();
          wi.setWord(text);
          wi.set(CoreAnnotations.OriginalTextAnnotation.class, word.originalText());
          wi.set(CoreAnnotations.BeforeAnnotation.class, prevString + word.before());
          wi.set(CoreAnnotations.AfterAnnotation.class, word.after());
          wi.set(CoreAnnotations.WordPositionAnnotation.class, "" + wNum);
          wi.set(CoreAnnotations.SentencePositionAnnotation.class, "" + sNum);
          wi.set(CoreAnnotations.ParaPositionAnnotation.class, "" + pNum);
          wi.set(CoreAnnotations.SectionAnnotation.class, section);
          wi.set(CoreAnnotations.AnswerAnnotation.class, entity);
          wi.set(CoreAnnotations.EntityClassAnnotation.class, entityClass);
          wNum++;
          prevString = "";
          result.add(wi);
          prev = wi;
          continue;
        }

        Matcher entityMatcher;
        if (text.equalsIgnoreCase("<p>")) {
          // New paragraph: sentence and word counters restart.
          pNum++;
          sNum = 0;
          wNum = 0;
          appendToAfter(prev, word.originalText() + word.after());
          prevString += word.before() + word.originalText();
        } else if (text.equalsIgnoreCase("<s>")) {
          // New sentence: word counter restarts.
          sNum++;
          wNum = 0;
          appendToAfter(prev, word.originalText() + word.after());
          prevString += word.before() + word.originalText();
        } else if ((entityMatcher = beginEntity.matcher(text)).matches()) {
          // Entity tags are re-generated on output, so only their whitespace is kept.
          entityClass = entityMatcher.group(1);
          entity = entityMatcher.group(2);
          appendToAfter(prev, word.after());
          prevString += word.before();
        } else if (endEntity.matcher(text).matches()) {
          entityClass = "";
          entity = BACKGROUND;
          appendToAfter(prev, word.after());
          prevString += word.before();
        } else if (text.equalsIgnoreCase("<doc>")) {
          prevString += word.before() + word.originalText();
        } else if (text.equalsIgnoreCase("</doc>")) {
          // prev is null when the document contains no word tokens; the helper guards that.
          appendToAfter(prev, word.originalText());
        } else {
          // Any other tag names the current section. Locale.ROOT keeps the result
          // independent of the JVM's default locale.
          section = tagMatcher.group(1).toUpperCase(Locale.ROOT);
          appendToAfter(prev, word.originalText() + word.after());
          prevString += word.before() + word.originalText();
        }
      }

      return result;
    }

    /** Appends {@code suffix} to the after text of {@code prev}; does nothing if {@code prev} is null. */
    private static void appendToAfter(CoreLabel prev, String suffix) {
      if (prev != null) {
        String after = prev.get(CoreAnnotations.AfterAnnotation.class);
        prev.set(CoreAnnotations.AfterAnnotation.class, after + suffix);
      }
    }
  }

  /**
   * Prints {@code doc} to {@code pw}, wrapping each run of consecutive tokens that share a
   * non-background answer in an entity tag with that answer as its {@code TYPE}. Each token is
   * printed as its before text followed by its original text; the after text of the last token
   * is printed at the end, followed by a line break.
   *
   * @param doc tokens to print; each must carry an answer annotation
   * @param pw destination
   * @throws IllegalArgumentException if a token's answer is not one of the known entity types
   */
  @Override
  public void printAnswers(List<CoreLabel> doc, PrintWriter pw) {
    String prevAnswer = BACKGROUND;
    String prevClass = "";
    String afterLast = "";

    for (CoreLabel word : doc) {
      String answer = word.get(CoreAnnotations.AnswerAnnotation.class);

      // Close the open entity, if any, before this token's leading text.
      if (!prevAnswer.equals(BACKGROUND) && !prevAnswer.equals(answer)) {
        pw.print("</" + prevClass + ">");
        prevClass = "";
      }

      pw.print(word.get(CoreAnnotations.BeforeAnnotation.class));

      // Open a new entity after the leading text so the tag hugs the token.
      if (!answer.equals(BACKGROUND) && !answer.equals(prevAnswer)) {
        prevClass = entityClassFor(answer);
        pw.print("<" + prevClass + " TYPE=\"" + answer + "\">");
      }

      pw.print(word.get(CoreAnnotations.OriginalTextAnnotation.class));
      afterLast = word.get(CoreAnnotations.AfterAnnotation.class);
      prevAnswer = answer;
    }

    if (!prevAnswer.equals(BACKGROUND)) {
      pw.print("</" + prevClass + ">");
    }
    pw.println(afterLast);
  }

  /**
   * Maps an entity type to the tag name used to mark it up.
   *
   * @param answer entity type, matched case-insensitively
   * @return {@code "ENAMEX"}, {@code "TIMEX"} or {@code "NUMEX"}
   * @throws IllegalArgumentException if {@code answer} is not a known entity type
   */
  private static String entityClassFor(String answer) {
    if (answer.equalsIgnoreCase("PERSON")
        || answer.equalsIgnoreCase("ORGANIZATION")
        || answer.equalsIgnoreCase("LOCATION")) {
      return "ENAMEX";
    }
    if (answer.equalsIgnoreCase("DATE") || answer.equalsIgnoreCase("TIME")) {
      return "TIMEX";
    }
    if (answer.equalsIgnoreCase("PERCENT") || answer.equalsIgnoreCase("MONEY")) {
      return "NUMEX";
    }
    // Report the problem to the caller rather than terminating the whole JVM.
    log.info("unknown type: " + answer);
    throw new IllegalArgumentException("unknown type: " + answer);
  }

}