package edu.stanford.nlp.pipeline;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import edu.stanford.nlp.ling.CoreAnnotations.AfterAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.BeforeAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.DocDateAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.ForcedSentenceEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.OriginalTextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.XmlContextAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.XMLUtils;
/**
 * An annotator which removes all XML tags (as identified by the tokenizer) and can
 * selectively keep the text between them. It can also add sentence-ending markers
 * depending on the XML tag.
 *
 * This is a modification of {@code CleanXmlAnnotator} that is even more tolerant of
 * malformed XML than the original (e.g. unmatched close tags are warned about and
 * skipped rather than causing a failure).
 *
 * @author John Bauer
 * @author BerendGabor
 */
public class MyCleanXmlAnnotator implements Annotator {
/**
* A regular expression telling us where to look for tokens we care about
*/
private final Pattern xmlTagMatcher;
public static final String DEFAULT_XML_TAGS = ".*";
/**
* This regular expression tells us which tags end a sentence... for example, <p> would be a great candidate
*/
private final Pattern sentenceEndingTagMatcher;
public static final String DEFAULT_SENTENCE_ENDERS = "";
/**
* This tells us which XML tags wrap document date
*/
private final Pattern dateTagMatcher;
public static final String DEFAULT_DATE_TAGS = "datetime|date";
public MyCleanXmlAnnotator() {
this(DEFAULT_XML_TAGS, DEFAULT_SENTENCE_ENDERS, DEFAULT_DATE_TAGS);
}
public MyCleanXmlAnnotator(String xmlTagsToRemove, String sentenceEndingTags, String dateTags) {
if (xmlTagsToRemove != null) {
xmlTagMatcher = Pattern.compile(xmlTagsToRemove);
if (sentenceEndingTags != null && sentenceEndingTags.length() > 0) {
sentenceEndingTagMatcher = Pattern.compile(sentenceEndingTags);
} else {
sentenceEndingTagMatcher = null;
}
} else {
xmlTagMatcher = null;
sentenceEndingTagMatcher = null;
}
if (dateTags != null) {
dateTagMatcher = Pattern.compile(dateTags, Pattern.CASE_INSENSITIVE);
} else {
dateTagMatcher = null;
}
}
public void annotate(Annotation annotation) {
if (annotation.has(TokensAnnotation.class)) {
List<CoreLabel> tokens = annotation.get(TokensAnnotation.class);
List<CoreLabel> dateTokens = new ArrayList<CoreLabel>();
List<CoreLabel> newTokens = process(tokens, dateTokens);
// We assume that if someone is using this annotator, they don't
// want the old tokens any more and get rid of them
annotation.set(TokensAnnotation.class, newTokens);
// if the doc date was found, save it. it is used by SUTime (inside the "ner" annotator)
if (dateTokens.size() > 0) {
StringBuffer os = new StringBuffer();
boolean first = true;
for (CoreLabel t : dateTokens) {
if (!first)
os.append(" ");
os.append(t.word());
first = false;
}
// System.err.println("DOC DATE IS: " + os.toString());
annotation.set(DocDateAnnotation.class, os.toString());
}
}
}
public List<CoreLabel> process(List<CoreLabel> tokens) {
return process(tokens, null);
}
public List<CoreLabel> process(List<CoreLabel> tokens, List<CoreLabel> dateTokens) {
List<String> history = new ArrayList<String>(5);
// As we are processing, this stack keeps track of which tags we
// are currently inside
List<String> enclosingTags = new LinkedList<String>();
// here we keep track of the current enclosingTags
// this lets multiple tokens reuse the same tag stack
List<String> currentTagSet = null;
// How many matching tags we've seen
int matchDepth = 0;
// stores the filtered tags as we go
List<CoreLabel> newTokens = new ArrayList<CoreLabel>();
// we use this to store the before & after annotations if the
// tokens were tokenized for "invertible"
StringBuilder removedText = new StringBuilder();
// we keep track of this so we can look at the last tag after
// we're outside the loop
// TODO additionally added in Szeged to overcome the issue of being even more admissible with flowingness
List<String> endTags = new LinkedList<String>();
for (CoreLabel token : tokens) {
// TODO additionally added in Szeged to overome some previous (probably by now not current) bug
String word = token.word().replace((char) 160, ' ').trim();
if (history.size() == 5) {
history.remove(0);
}
history.add(word);
// TODO additionally added in Szeged to overome some previous (probably by now not current) bug
XMLUtils.XMLTag tag = XMLUtils.parseTag(word.toLowerCase().replace("'", "\"").replaceAll("(a +href=)[^\"]", "$1\""));
// If it's not a tag, we do manipulations such as unescaping
if (tag == null) {
Iterator<String> endingIt = endTags.iterator();
while (endingIt.hasNext()) {
String t = endingIt.next();
if (enclosingTags.remove(t))
endingIt.remove();
}
for (String endTag : endTags) {
System.err.println("Got a close tag " + endTag + " found after " + history + " which does not match " + "any open tag");
}
endTags.clear();
// TODO: put this into the lexer instead of here
token.setWord(XMLUtils.unescapeStringForXML(token.word()));
// TODO: was there another annotation that also represents the word?
if (matchDepth > 0 || xmlTagMatcher == null || xmlTagMatcher.matcher("").matches()) {
newTokens.add(token);
}
// if we removed any text, and the tokens are "invertible" and therefore keep track of their
// before/after text, append what we removed to the appropriate tokens
if (removedText.length() > 0) {
boolean added = false;
String before = token.get(BeforeAnnotation.class);
if (before != null) {
token.set(BeforeAnnotation.class, removedText + before);
added = true;
}
if (added && newTokens.size() > 1) {
CoreLabel previous = newTokens.get(newTokens.size() - 2);
String after = previous.get(AfterAnnotation.class);
if (after != null)
previous.set(AfterAnnotation.class, after + removedText);
else
previous.set(AfterAnnotation.class, removedText.toString());
}
removedText = new StringBuilder();
}
if (currentTagSet == null) {
// We wrap the list in an unmodifiable list because we reuse the same list object many times.
// We don't want to let someone modify one list and screw up all the others.
currentTagSet = Collections.unmodifiableList(new ArrayList<String>(enclosingTags));
}
token.set(XmlContextAnnotation.class, currentTagSet);
// is this token part of the doc date sequence?
if (dateTagMatcher != null && currentTagSet.size() > 0 && dateTagMatcher.matcher(currentTagSet.get(currentTagSet.size() - 1)).matches()) {
dateTokens.add(token);
}
continue;
}
// At this point, we know we have a tag
// we are removing a token and its associated text... keep track of that
String currentRemoval = token.get(BeforeAnnotation.class);
if (currentRemoval != null)
removedText.append(currentRemoval);
currentRemoval = token.get(OriginalTextAnnotation.class);
if (currentRemoval != null)
removedText.append(currentRemoval);
if (token == tokens.get(tokens.size() - 1)) {
currentRemoval = token.get(AfterAnnotation.class);
if (currentRemoval != null)
removedText.append(currentRemoval);
}
// If the tag matches the sentence ending tags, and we have some existing words,
// mark that word as being somewhere we want to end the sentence.
if (sentenceEndingTagMatcher != null && sentenceEndingTagMatcher.matcher(tag.name).matches() && newTokens.size() > 0) {
CoreLabel previous = newTokens.get(newTokens.size() - 1);
previous.set(ForcedSentenceEndAnnotation.class, true);
}
if (xmlTagMatcher == null)
continue;
if (tag.isSingleTag) {
continue;
}
// at this point, we can't reuse the "currentTagSet" vector any more, since the current tag set has changed
currentTagSet = null;
if (tag.isEndTag) {
endTags.add(tag.name);
} else {
// open tag, since all other cases are exhausted
enclosingTags.add(tag.name);
if (xmlTagMatcher.matcher(tag.name).matches())
matchDepth++;
}
}
if (enclosingTags.size() > 0) {
System.err.println("Unclosed tags: " + enclosingTags);
}
// If we ended with a string of xml tokens, that text needs to be
// appended to the "AfterAnnotation" of one of the tokens...
// Note that we clear removedText when we see a real token, so
// if removedText is not empty, that must be because we just
// dropped an xml tag. Therefore we ignore that old After
// annotation, since that text was already absorbed in the Before
// annotation of the xml tag we threw away
if (newTokens.size() > 0 && removedText.length() > 0) {
CoreLabel lastToken = newTokens.get(newTokens.size() - 1);
// sometimes AfterAnnotation seems to be null even when we are
// collecting before & after annotations, but OriginalTextAnnotation
// is only non-null if we are invertible. Hopefully.
if (lastToken.get(OriginalTextAnnotation.class) != null) {
lastToken.set(AfterAnnotation.class, removedText.toString());
}
}
return newTokens;
}
@Override
public Set<Requirement> requires() {
return Collections.singleton(TOKENIZE_REQUIREMENT);
}
@Override
public Set<Requirement> requirementsSatisfied() {
return Collections.singleton(CLEAN_XML_REQUIREMENT);
}
}