package edu.stanford.nlp.objectbank;
import edu.stanford.nlp.util.Function;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.util.AbstractIterator;
import edu.stanford.nlp.util.XMLUtils;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.Reader;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;
/**
 * A class which iterates over Strings occurring between the begin and end of
 * a selected tag or tags. The element is specified by a regexp, matched
 * against the name of the element (i.e., excluding the angle bracket
 * characters) using <code>matches()</code>.
 * The class ignores all other characters in the input Reader.
 * <p>
 * Each extracted String is passed through a <code>Function</code> before it
 * is returned, so the iterator can vend arbitrary objects. The iterator
 * always reads one element ahead; because <code>null</code> marks the end of
 * the iteration, a function that returns <code>null</code> for some element
 * ends the iteration at that element.
 * <p>
 * Nesting is not tracked: an element ends at the first end tag whose name
 * matches the pattern, even if an element of the same name was opened inside it.
 *
 * @author Teg Grenager (grenager@stanford.edu)
 */
public class XMLBeginEndIterator<E> extends AbstractIterator<E> implements Tokenizer<E> {

  /** Pattern that a tag name must fully match to delimit an element. */
  private final Pattern tagNamePattern;
  /** Buffered view of the Reader handed to the constructor. */
  private final BufferedReader in;
  /** Converts the raw text of each element into the object that is returned. */
  private final Function<String,E> op;
  /** Whether non-delimiting tags found inside an element are kept in its text. */
  private final boolean keepInternalTags;
  /** Whether the matching begin and end tags themselves are kept in the text. */
  private final boolean keepDelimitingTags;
  private E nextToken; // stores the read-ahead next token to return

  /**
   * Creates an iterator returning the raw text of each matching element,
   * dropping both internal and delimiting tags.
   *
   * @param in The Reader to extract elements from
   * @param tagNameRegexp Regexp that the element name must match
   */
  @SuppressWarnings({"unchecked"}) // Can't seem to do IdentityFunction without warning!
  public XMLBeginEndIterator(Reader in, String tagNameRegexp) {
    this(in, tagNameRegexp, new IdentityFunction(), false);
  }

  /**
   * Creates an iterator returning the raw text of each matching element,
   * dropping the delimiting tags.
   *
   * @param in The Reader to extract elements from
   * @param tagNameRegexp Regexp that the element name must match
   * @param keepInternalTags Whether tags inside the element are kept in the text
   */
  @SuppressWarnings({"unchecked"})
  public XMLBeginEndIterator(Reader in, String tagNameRegexp, boolean keepInternalTags) {
    this(in, tagNameRegexp, new IdentityFunction(), keepInternalTags);
  }

  /**
   * Creates an iterator that applies <code>op</code> to the text of each
   * matching element, dropping the delimiting tags.
   *
   * @param in The Reader to extract elements from
   * @param tagNameRegexp Regexp that the element name must match
   * @param op Function applied to the text of every element
   * @param keepInternalTags Whether tags inside the element are kept in the text
   */
  public XMLBeginEndIterator(Reader in, String tagNameRegexp, Function<String,E> op, boolean keepInternalTags) {
    this(in, tagNameRegexp, op, keepInternalTags, false);
  }

  /**
   * Creates an iterator returning the raw text of each matching element.
   *
   * @param in The Reader to extract elements from
   * @param tagNameRegexp Regexp that the element name must match
   * @param keepInternalTags Whether tags inside the element are kept in the text
   * @param keepDelimitingTags Whether the matching begin and end tags are kept in the text
   */
  @SuppressWarnings({"unchecked"})
  public XMLBeginEndIterator(Reader in, String tagNameRegexp, boolean keepInternalTags, boolean keepDelimitingTags) {
    this(in, tagNameRegexp, new IdentityFunction(), keepInternalTags, keepDelimitingTags);
  }

  /**
   * Creates an iterator that applies <code>op</code> to the text of each
   * matching element. The Reader is wrapped in a BufferedReader and the first
   * element is read immediately, so construction already consumes input.
   *
   * @param in The Reader to extract elements from
   * @param tagNameRegexp Regexp that the element name must match
   * @param op Function applied to the text of every element
   * @param keepInternalTags Whether tags inside the element are kept in the text
   * @param keepDelimitingTags Whether the matching begin and end tags are kept in the text
   */
  public XMLBeginEndIterator(Reader in, String tagNameRegexp, Function<String,E> op, boolean keepInternalTags, boolean keepDelimitingTags) {
    this.tagNamePattern = Pattern.compile(tagNameRegexp);
    this.op = op;
    this.keepInternalTags = keepInternalTags;
    this.keepDelimitingTags = keepDelimitingTags;
    this.in = new BufferedReader(in);
    setNext();
  }

  /**
   * Reads the next element ahead of time and stores the converted result in
   * <code>nextToken</code>, or stores <code>null</code> when no element is left.
   */
  private void setNext() {
    String s = getNext();
    // null is the end-of-input marker, not element text: never hand it to the
    // conversion function, which may reject null or map it to a real value
    // (the latter would make the iterator endless).
    nextToken = (s == null) ? null : parseString(s);
  }

  /**
   * Scans forward to the next begin tag whose name matches the pattern and
   * collects the text up to the first matching end tag.
   *
   * @return The text of the next element, or <code>null</code> if there is no
   *         next element, the input ends inside an element, or reading fails
   */
  private String getNext() {
    try {
      XMLUtils.XMLTag tag;
      do {
        // there may or may not be text before the next tag, but we discard it
        XMLUtils.readUntilTag(in);
        tag = XMLUtils.readAndParseTag(in);
        if (tag == null) {
          return null; // couldn't find any more tags, so no more elements
        }
      } while ( ! tagNamePattern.matcher(tag.name).matches() || tag.isEndTag);

      StringBuilder result = new StringBuilder();
      if (keepDelimitingTags) {
        result.append(tag.toString());
      }
      while (true) {
        String text = XMLUtils.readUntilTag(in);
        if (text != null) {
          // if the text isn't null, we append it
          result.append(text);
        }
        String tagString = XMLUtils.readTag(in);
        tag = XMLUtils.parseTag(tagString);
        if (tag == null) {
          return null; // unexpected end of this element, so no more elements
        }
        if (tagNamePattern.matcher(tag.name).matches() && tag.isEndTag) {
          // this is our end tag so we stop
          if (keepDelimitingTags) {
            result.append(tagString);
          }
          return result.toString();
        }
        // not our end tag, so we optionally append it and keep going
        if (keepInternalTags) {
          result.append(tagString);
        }
      }
    } catch (Exception e) {
      // Report the failure, then end the iteration exactly as for a truncated
      // element. Returning the partial text instead would present a broken
      // element as complete and, if the failure repeats on every read, would
      // make hasNext() stay true forever.
      e.printStackTrace();
      return null;
    }
  }

  /**
   * Converts the text of one element into the object to return by applying
   * the function given at construction time. Note that this method is called
   * from the constructor (for the read-ahead), so an override must not rely
   * on subclass state that is initialized later.
   *
   * @param s The text of the element; never <code>null</code> when called by this class
   * @return The converted element
   */
  protected E parseString(String s) {
    return op.apply(s);
  }

  /**
   * @return Whether another element is available
   */
  @Override
  public boolean hasNext() {
    return nextToken != null;
  }

  /**
   * Returns the next element and reads ahead to the one after it.
   *
   * @return The next element
   * @throws NoSuchElementException If no element is left
   */
  @Override
  public E next() {
    if (nextToken == null) {
      throw new NoSuchElementException();
    }
    E token = nextToken;
    setNext();
    return token;
  }

  /**
   * Returns the next element without consuming it.
   *
   * @return The element the next call to <code>next()</code> would return,
   *         or <code>null</code> if no element is left
   */
  public E peek() {
    return nextToken;
  }

  /**
   * Returns pieces of text in element as a List of tokens.
   *
   * @return A list of all tokens remaining in the underlying Reader
   */
  public List<E> tokenize() {
    List<E> result = new ArrayList<E>();
    while (hasNext()) {
      result.add(next());
    }
    return result;
  }

  /**
   * Returns a factory that vends XMLBeginEndIterators that read the contents
   * of a given Reader and return the text between the begin and end of the
   * matching elements, dropping internal and delimiting tags.
   *
   * @param tag The tag the XMLBeginEndIterator will match on
   * @return The IteratorFromReaderFactory
   */
  public static IteratorFromReaderFactory<String> getFactory(String tag) {
    return new XMLBeginEndIterator.XMLBeginEndIteratorFactory<String>(tag, new IdentityFunction<String>(), false, false);
  }

  /**
   * Returns a factory that vends XMLBeginEndIterators returning element text.
   *
   * @param tag The tag the XMLBeginEndIterator will match on
   * @param keepInternalTags Whether tags inside the element are kept in the text
   * @param keepDelimitingTags Whether the matching begin and end tags are kept in the text
   * @return The IteratorFromReaderFactory
   */
  public static IteratorFromReaderFactory<String> getFactory(String tag, boolean keepInternalTags, boolean keepDelimitingTags) {
    return new XMLBeginEndIterator.XMLBeginEndIteratorFactory<String>(tag, new IdentityFunction<String>(), keepInternalTags, keepDelimitingTags);
  }

  /**
   * Returns a factory that vends XMLBeginEndIterators applying
   * <code>op</code> to the text of each element, dropping internal and
   * delimiting tags.
   *
   * @param tag The tag the XMLBeginEndIterator will match on
   * @param op Function applied to the text of every element
   * @return The IteratorFromReaderFactory
   */
  public static <E> IteratorFromReaderFactory<E> getFactory(String tag, Function<String,E> op) {
    return new XMLBeginEndIterator.XMLBeginEndIteratorFactory<E>(tag, op, false, false);
  }

  /**
   * Returns a factory that vends XMLBeginEndIterators applying
   * <code>op</code> to the text of each element.
   *
   * @param tag The tag the XMLBeginEndIterator will match on
   * @param op Function applied to the text of every element
   * @param keepInternalTags Whether tags inside the element are kept in the text
   * @param keepDelimitingTags Whether the matching begin and end tags are kept in the text
   * @return The IteratorFromReaderFactory
   */
  public static <E> IteratorFromReaderFactory<E> getFactory(String tag, Function<String,E> op, boolean keepInternalTags, boolean keepDelimitingTags) {
    return new XMLBeginEndIterator.XMLBeginEndIteratorFactory<E>(tag, op, keepInternalTags, keepDelimitingTags);
  }

  /**
   * Factory that remembers one iterator configuration and builds an
   * XMLBeginEndIterator with it for every Reader it is given.
   */
  static class XMLBeginEndIteratorFactory<E> implements IteratorFromReaderFactory<E> {

    private final String tag;
    private final Function<String,E> op;
    private final boolean keepInternalTags;
    private final boolean keepDelimitingTags;

    /**
     * @param tag Regexp that the element name must match
     * @param op Function applied to the text of every element
     * @param keepInternalTags Whether tags inside the element are kept in the text
     * @param keepDelimitingTags Whether the matching begin and end tags are kept in the text
     */
    public XMLBeginEndIteratorFactory(String tag, Function<String,E> op, boolean keepInternalTags, boolean keepDelimitingTags) {
      this.tag = tag;
      this.op = op;
      this.keepInternalTags = keepInternalTags;
      this.keepDelimitingTags = keepDelimitingTags;
    }

    /**
     * @param r The Reader to extract elements from
     * @return A new XMLBeginEndIterator over <code>r</code> with this factory's settings
     */
    public Iterator<E> getIterator(Reader r) {
      return new XMLBeginEndIterator<E>(r, tag, op, keepInternalTags, keepDelimitingTags);
    }

  }

  /**
   * Prints every matching element of a file, each preceded by a separator line.
   *
   * @param args The file name, the element name regexp, and "true" to keep internal tags
   * @throws IOException If the file cannot be opened or closed
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 3) {
      System.err.println("usage: XMLBeginEndIterator file element keepInternalBoolean");
      return;
    }
    Reader in = new FileReader(args[0]);
    try {
      Iterator<String> iter = new XMLBeginEndIterator<String>(in, args[1], args[2].equalsIgnoreCase("true"));
      while (iter.hasNext()) {
        String s = iter.next();
        System.out.println("*************************************************");
        System.out.println(s);
      }
    } finally {
      // release the file handle even if iteration or printing fails
      in.close();
    }
  }

} // end class XMLBeginEndIterator