package edu.stanford.nlp.process;
import java.io.*;
import java.net.*;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.*;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.objectbank.XMLBeginEndIterator;
import edu.stanford.nlp.util.Function;
/**
* Produces a list of sentences from either a plain text or XML document.
* <p>
* Tokenization: The default tokenizer is {@link PTBTokenizer}. If null is passed to
* <code>setTokenizerFactory</code>, then whitespace tokenization is assumed.
* <p>
* Adding a new document type requires two steps:
* <ol>
* <li> Add a new DocType.
* <li> Create an iterator for the new DocType and modify the iterator() function to return the new iterator.
* </ol>
* <p>
* NOTE: This implementation should <em>not</em> use external libraries since it is used in the parser.
*
* @author Spence Green
*/
public class DocumentPreprocessor implements Iterable<List<HasWord>> {
/** Supported input document formats: raw text or XML markup. */
public static enum DocType {Plain, XML}
// Input source: exactly one of these is non-null. When only a path is
// given, the Reader is created lazily in iterator().
private Reader inputReader = null;
private String inputPath = null;
private DocType docType = DocType.Plain;
//Configurable options
// Tokenizer used to split the character stream into words; null means the
// input is pre-tokenized and is split on whitespace instead.
private TokenizerFactory<? extends HasWord> tokenizerFactory = PTBTokenizer.factory();
// Character encoding for Readers opened from a path; null selects the
// platform default.
private String encoding = null;
// Tokens that end a sentence (used when no explicit sentenceDelimiter is set).
private String[] sentenceFinalPuncWords = {".", "?", "!"};
// Optional post-processing applied to each completed sentence.
private Function<List<HasWord>,List<HasWord>> escaper = null;
// If non-null, sentences are assumed to be pre-delimited by this string.
private String sentenceDelimiter = null;
/**
 * Example: if the words are already POS tagged and look like
 * foo_VB, you want to set the tagDelimiter to "_"
 */
private String tagDelimiter = null;
/**
 * When doing XML parsing, only accept text in between tags that
 * match this regular expression. Defaults to everything.
 */
private String elementDelimiter = ".*";
//From PTB conventions
// Tokens that may follow sentence-final punctuation and still belong to the
// preceding sentence (closing quotes and brackets).
private final String[] sentenceFinalFollowers = {")", "]", "\"", "\'", "''", "-RRB-", "-RSB-", "-RCB-"};
/**
 * Constructs a preprocessor that treats an existing reader as a plain
 * text document.
 *
 * @param input An existing reader
 */
public DocumentPreprocessor(Reader input) {
  this(input, DocType.Plain);
}

/**
 * Constructs a preprocessor that reads a document of the given type from
 * an existing reader.
 *
 * @param input An existing reader
 * @param type The kind of document to expect (plain text or XML)
 */
public DocumentPreprocessor(Reader input, DocType type) {
  if (input == null) {
    throw new RuntimeException("Cannot read from null object!");
  }
  inputReader = input;
  docType = type;
}
/**
 * Constructs a preprocessor that treats the document at a path as plain
 * text. The path can be either a filesystem location or a URL.
 *
 * @param docPath Location of the document
 */
public DocumentPreprocessor(String docPath) {
  this(docPath, DocType.Plain);
}

/**
 * Constructs a preprocessor that reads a document of the given type from a
 * path, which can be either a filesystem location or a URL.
 *
 * @param docPath Location of the document
 * @param type The kind of document to expect (plain text or XML)
 */
public DocumentPreprocessor(String docPath, DocType type) {
  if (docPath == null) {
    throw new RuntimeException("Cannot open null document path!");
  }
  inputPath = docPath;
  docType = type;
}
/**
 * Set the character encoding used when Readers are created from a path.
 * <p>
 * Previously, an encoding that the JVM does not support was silently
 * ignored (the old encoding was kept); this now fails loudly, as the
 * documented contract promises.
 *
 * @param encoding The character encoding used by Readers
 * @throws IllegalCharsetNameException If the name is syntactically illegal
 *         or the JVM does not support the named character set.
 */
public void setEncoding(String encoding) throws IllegalCharsetNameException {
  // Charset.isSupported itself throws IllegalCharsetNameException for
  // syntactically illegal names; additionally reject legal-but-unknown
  // names instead of silently keeping the previous encoding.
  if ( ! Charset.isSupported(encoding)) {
    throw new IllegalCharsetNameException(encoding);
  }
  this.encoding = encoding;
}
/**
 * Sets the end-of-sentence delimiters.
 * <p>
 * For newline tokenization, use the argument {"\n"}.
 *
 * @param puncWords Tokens that terminate a sentence
 */
public void setSentenceFinalPuncWords(String[] puncWords) {
  this.sentenceFinalPuncWords = puncWords;
}

/**
 * Sets the factory from which to produce a {@link Tokenizer}. The default is
 * {@link PTBTokenizer}.
 * <p>
 * NOTE: If a null argument is used, then the document is assumed to be tokenized
 * and DocumentPreprocessor performs no tokenization.
 *
 * @param factory The tokenizer factory, or null for whitespace tokenization
 */
public void setTokenizerFactory(TokenizerFactory<? extends HasWord> factory) {
  this.tokenizerFactory = factory;
}

/**
 * Set an escaper, applied to each completed sentence before it is returned.
 *
 * @param escaperFn The escaper
 */
public void setEscaper(Function<List<HasWord>,List<HasWord>> escaperFn) {
  this.escaper = escaperFn;
}

/**
 * Make the processor assume that the document is already delimited
 * by the supplied parameter.
 *
 * @param delimiter The sentence delimiter
 */
public void setSentenceDelimiter(String delimiter) {
  this.sentenceDelimiter = delimiter;
}

/**
 * Split tags from tokens. The tag will be placed in the TagAnnotation of
 * the returned label.
 * <p>
 * Note that for strings that contain two or more instances of the tag delimiter,
 * the last instance is treated as the split point.
 * <p>
 * The tag delimiter should not contain any characters that must be escaped in a Java
 * regex.
 *
 * @param delimiter POS tag delimiter
 */
public void setTagDelimiter(String delimiter) {
  this.tagDelimiter = delimiter;
}

/**
 * Only read text from between these XML tokens if in XML mode.
 * Otherwise, will read from all tokens.
 *
 * @param regex Regular expression matched against XML element names
 */
public void setElementDelimiter(String regex) {
  this.elementDelimiter = regex;
}
/**
 * Returns sentences until the document is exhausted. Calls close() if the end
 * of the document is reached. Otherwise, the user is required to close the
 * stream.
 * <p>
 * If the input path cannot be opened, an error is printed to stderr and an
 * empty iterator is returned (no exception is thrown).
 */
public Iterator<List<HasWord>> iterator() {
  try {
    // Lazily open the reader when this object was constructed from a path.
    if (inputReader == null) {
      inputReader = getReaderFromPath(inputPath);
    }
    //TODO: Add new document types here
    if (docType == DocType.Plain) {
      return new PlainTextIterator();
    } else if (docType == DocType.XML) {
      return new XMLIterator();
    }
  } catch (IOException e) {
    System.err.printf("%s: Could not open path %s\n", this.getClass().getName(), inputPath);
  }
  // Fall-through (unknown DocType, or the path could not be opened):
  // behave as an empty document.
  return new Iterator<List<HasWord>>() {
    public boolean hasNext() { return false; }
    public List<HasWord> next() { throw new NoSuchElementException(); }
    // Fixed: remove() was previously a silent no-op, which violates the
    // Iterator contract and is inconsistent with PlainTextIterator and
    // XMLIterator, both of which throw.
    public void remove() { throw new UnsupportedOperationException(); }
  };
}
/**
 * Opens a buffered Reader for a path that may be either a URL or a
 * filesystem location, honoring the configured character encoding.
 *
 * @param path A URL or a file path
 * @return A buffered reader over the resource
 * @throws IOException If the path is neither a readable URL nor an existing file
 */
private Reader getReaderFromPath(String path) throws IOException {
  //Check if it is a URL first, otherwise look for a file
  try {
    URL url = new URL(path);
    URLConnection connection = url.openConnection();
    InputStream urlStream = connection.getInputStream();
    // Fixed: the configured encoding was previously ignored for URL input.
    return new BufferedReader((encoding == null) ?
        new InputStreamReader(urlStream) :
        new InputStreamReader(urlStream, encoding));
  } catch(MalformedURLException e) {
    //Do nothing: the path may be a file
  }
  File file = new File(path);
  if (file.exists()) {
    InputStream fileStream = new FileInputStream(file);
    // Fixed: with no encoding set, files were previously read through an
    // unbuffered FileReader; now both branches are buffered. The
    // encoding == null branch still uses the platform default charset.
    return new BufferedReader((encoding == null) ?
        new InputStreamReader(fileStream) :
        new InputStreamReader(fileStream, encoding));
  }
  throw new IOException("Unable to open " + path);
}
/**
 * Iterates over the sentences of a plain text document. Tokens are read from
 * the enclosing class's inputReader and grouped into sentences using either
 * the configured sentence-final punctuation or an explicit sentence
 * delimiter, honoring PTB-style "followers" (closing quotes/brackets) that
 * attach to the preceding sentence.
 */
private class PlainTextIterator implements Iterator<List<HasWord>> {
// Token source for the document.
private Tokenizer<? extends HasWord> tokenizer;
// Tokens that terminate a sentence.
private Set<String> sentDelims;
// Tokens that may follow a sentence terminator and still be attached to
// the current sentence (closing quotes/brackets, per PTB conventions).
private Set<String> delimFollowers = new HashSet<String>(Arrays.asList(sentenceFinalFollowers));
// Splits a "word<tagDelimiter>TAG" token at the LAST occurrence of the
// delimiter; null when tokens are untagged.
private Function<String, String[]> splitTag;
// The buffered next sentence; null when not yet computed or when the
// document is exhausted.
private List<HasWord> nextSent = null;
// A token read past the current sentence boundary; it seeds the next sentence.
private List<HasWord> nextSentCarryover = new ArrayList<HasWord>();
/** Configures sentence-boundary detection, the tokenizer, and tag splitting. */
public PlainTextIterator() {
// Establish how to find sentence boundaries
sentDelims = new HashSet<String>();
boolean eolIsSignificant = false;
if (sentenceDelimiter == null) {
if (sentenceFinalPuncWords != null) {
sentDelims = new HashSet<String>(Arrays.asList(sentenceFinalPuncWords));
}
} else {
// An explicit delimiter replaces punctuation-based splitting, and
// followers do not apply.
sentDelims.add(sentenceDelimiter);
delimFollowers = new HashSet<String>();
// A whitespace delimiter (e.g. "\n") makes end-of-line significant.
eolIsSignificant = sentenceDelimiter.matches("\\s+");
if(eolIsSignificant) // For Stanford English Tokenizer
sentDelims.add(PTBLexer.NEWLINE_TOKEN);
}
// Setup the tokenizer
if(tokenizerFactory == null) {
// Pre-tokenized input: split on whitespace only.
tokenizer = WhitespaceTokenizer.
newWordWhitespaceTokenizer(inputReader, eolIsSignificant);
} else {
if(eolIsSignificant)
tokenizerFactory.setOptions("tokenizeNLs");//wsg2010: This key currently used across all tokenizers
tokenizer = tokenizerFactory.getTokenizer(inputReader);
}
// If tokens are tagged, then we must split them
// Note that if the token contains two or more instances of the delimiter, then the last
// instance is regarded as the split point.
if (tagDelimiter != null) {
splitTag = new Function<String,String[]>() {
// The lookahead ensures the delimiter matches only when no later
// occurrence exists, i.e. the last one in the token.
private final String splitRegex = String.format("%s(?!.*%s)",tagDelimiter,tagDelimiter);
public String[] apply(String in) {
final String[] splits = in.trim().split(splitRegex);
if(splits.length == 2)
return splits;
else {
// No clean word/tag split: return the token unchanged.
String[] oldStr = {in};
return oldStr;
}
}
};
}
}
/**
 * Reads tokens until a complete sentence is buffered in nextSent, or sets
 * nextSent to null (and closes the reader) when the document is exhausted.
 */
private void primeNext() {
// Start the sentence with any token carried over past the previous boundary.
nextSent = new ArrayList<HasWord>(nextSentCarryover);
nextSentCarryover.clear();
boolean seenBoundary = false;
while (tokenizer.hasNext()) {
HasWord token = tokenizer.next();
if (splitTag != null) {
// Separate the POS tag from the word, if one is present.
String[] toks = splitTag.apply(token.word());
token.setWord(toks[0]);
if(toks.length == 2 && token instanceof HasTag) {
//wsg2011: Some of the underlying tokenizers return old
//JavaNLP labels. We could convert to CoreLabel here, but
//we choose a conservative implementation....
((HasTag) token).setTag(toks[1]);
}
}
if (sentDelims.contains(token.word())) {
seenBoundary = true;
} else if (seenBoundary && !delimFollowers.contains(token.word())) {
// First token that cannot extend the finished sentence: save it
// for the next sentence and stop reading.
nextSentCarryover.add(token);
break;
}
// Whitespace and newline pseudo-tokens may mark boundaries but are
// never part of the sentence itself.
if ( ! (token.word().matches("\\s+") ||
token.word().equals(PTBLexer.NEWLINE_TOKEN))) {
nextSent.add(token);
}
// If there are no words that can follow a sentence delimiter,
// then there are two cases. In one case is we already have a
// sentence, in which case there is no reason to look at the
// next token, since that just causes buffering without any
// chance of the current sentence being extended, since
// delimFollowers = {}. In the other case, we have an empty
// sentence, which at this point means the sentence delimiter
// was a whitespace token such as \n. We might as well keep
// going as if we had never seen anything.
if (seenBoundary && delimFollowers.size() == 0) {
if (nextSent.size() > 0) {
break;
} else {
seenBoundary = false;
}
}
}
if (nextSent.size() == 0 && nextSentCarryover.size() == 0) {
// Nothing buffered and nothing carried over: the document is done.
IOUtils.closeIgnoringExceptions(inputReader);
inputReader = null;
nextSent = null;
} else if (escaper != null) {
nextSent = escaper.apply(nextSent);
}
}
public boolean hasNext() {
if (nextSent == null) {
primeNext();
}
return nextSent != null;
}
/** Returns the buffered sentence; the following call re-primes lazily. */
public List<HasWord> next() {
if (nextSent == null) {
primeNext();
}
if (nextSent == null) {
throw new NoSuchElementException();
}
List<HasWord> thisIteration = nextSent;
nextSent = null;
return thisIteration;
}
/** Removal is not supported. */
public void remove() { throw new UnsupportedOperationException(); }
}
/**
 * Iterates over sentences in an XML document by extracting the text between
 * begin/end tags whose names match elementDelimiter, and running each
 * extracted block through a PlainTextIterator.
 */
private class XMLIterator implements Iterator<List<HasWord>> {
// Yields the raw text found between matching begin/end tags.
private final XMLBeginEndIterator<String> xmlItr;
// Reader over the whole document. The outer inputReader field is
// temporarily repointed at per-element StringReaders below, so the
// original is saved here for closing at end of document.
private final Reader originalDocReader;
private PlainTextIterator plainItr; // = null;
private List<HasWord> nextSent; // = null;
public XMLIterator() {
xmlItr = new XMLBeginEndIterator<String>(inputReader, elementDelimiter);
originalDocReader = inputReader;
primeNext();
}
/** Advances nextSent to the next sentence, or null at end of document. */
private void primeNext() {
// It is necessary to loop because if a document has a pattern
// that goes: <tag></tag> the xmlItr will return an empty
// string, which the plainItr will process to null. If we
// didn't loop to find the next tag, the iterator would stop.
do {
if (plainItr != null && plainItr.hasNext()) {
nextSent = plainItr.next();
} else if (xmlItr.hasNext()) {
// Repoint the outer inputReader at this element's text so the new
// PlainTextIterator (which reads that field) tokenizes just this block.
String block = xmlItr.next();
inputReader = new BufferedReader(new StringReader(block));
plainItr = new PlainTextIterator();
if (plainItr.hasNext()) {
nextSent = plainItr.next();
} else {
nextSent = null;
}
} else {
// Document exhausted: release the underlying reader.
IOUtils.closeIgnoringExceptions(originalDocReader);
nextSent = null;
break;
}
} while (nextSent == null);
}
public boolean hasNext() {
return nextSent != null;
}
/** Returns the buffered sentence and eagerly primes the next one. */
public List<HasWord> next() {
if (nextSent == null) {
throw new NoSuchElementException();
}
List<HasWord> thisSentence = nextSent;
primeNext();
return thisSentence;
}
/** Removal is not supported. */
public void remove() { throw new UnsupportedOperationException(); }
}
/**
 * This provides a simple test method for DocumentPreprocessor. <br/>
 * Usage:
 * java
 * DocumentPreprocessor -file filename [-xml tag] [-suppressEscaping] [-noTokenization]
 * <p>
 * A filename is required. The code doesn't run as a filter currently.
 * <p>
 * tag is the element name of the XML from which to extract text. It can
 * be a regular expression which is called on the element with the
 * matches() method, such as 'TITLE|P'.
 * <p>
 * NOTE: -xml replaces the preprocessor, so it should appear before any
 * other options on the command line.
 *
 * @param args Command-line arguments
 */
public static void main(String[] args) {
  if (args.length < 1) {
    System.err.println("usage: DocumentPreprocessor filename [OPTS]");
    System.exit(-1);
  }
  DocumentPreprocessor docPreprocessor = new DocumentPreprocessor(args[0]);
  for (int i = 1; i < args.length; i++) {
    if (args[i].equals("-xml")) {
      docPreprocessor = new DocumentPreprocessor(args[0], DocType.XML);
      // Fixed: the element-name regex was previously passed to
      // setTagDelimiter, so -xml never actually selected XML elements
      // (and clobbered the POS-tag delimiter instead).
      docPreprocessor.setElementDelimiter(args[++i]);
    } else if (args[i].equals("-suppressEscaping")) {
      String options = "ptb3Escaping=false";
      docPreprocessor.setTokenizerFactory(PTBTokenizer.factory(new WordTokenFactory(), options));
    } else if (args[i].equals("-tokenizerOptions") && i+1 < args.length) {
      String options = args[++i];
      docPreprocessor.setTokenizerFactory(PTBTokenizer.factory(new WordTokenFactory(), options));
    } else if (args[i].equals("-noTokenization")) {
      // Assume pre-tokenized input, one sentence per line.
      docPreprocessor.setTokenizerFactory(null);
      docPreprocessor.setSentenceDelimiter(System.getProperty("line.separator"));
    } else if (args[i].equals("-tag")) {
      docPreprocessor.setTagDelimiter(args[++i]);
    }
  }
  docPreprocessor.setEncoding("UTF-8");
  int numSents = 0;
  // Print each sentence on one line to stdout; lengths/stats to stderr.
  for (List<HasWord> sentence : docPreprocessor) {
    numSents++;
    System.err.println("Length: " + sentence.size());
    boolean printSpace = false;
    for (HasWord word : sentence) {
      if (printSpace) System.out.print(" ");
      printSpace = true;
      System.out.print(word.word());
    }
    System.out.println();
  }
  System.err.println("Read in " + numSents + " sentences.");
}
}