package edu.stanford.nlp.process;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Properties;
import java.util.Set;
import java.util.function.Function;
import java.util.regex.Pattern;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.objectbank.XMLBeginEndIterator;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * Produces a list of sentences from either a plain text or XML document.
 * This class acts like a Reader: it allows you to make a single pass through a
 * list of sentences in a document. If you need to pass through the document
 * multiple times, then you need to create a second DocumentPreprocessor.
 * <p>
 * Tokenization: The default tokenizer is {@link PTBTokenizer}. If null is passed
 * to {@code setTokenizerFactory}, then whitespace tokenization is assumed.
 * <p>
 * Adding a new document type requires two steps:
 * <ol>
 * <li> Add a new DocType.
 * <li> Create an iterator for the new DocType and modify the iterator()
 * function to return the new iterator.
 * </ol>
 * <p>
 * NOTE: This implementation should <em>not</em> use external libraries since it
 * is used in the parser.
 *
 * @author Spence Green
 */
public class DocumentPreprocessor implements Iterable<List<HasWord>> {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(DocumentPreprocessor.class);

  public enum DocType { Plain, XML }

  // todo: Should probably change this to be a regex, but I've added some multi-character punctuation in the meantime
  private static final String[] DEFAULT_SENTENCE_DELIMS = {".", "?", "!", "!!", "!!!", "??", "?!", "!?"};

  // inputReader is used in a fairly yucky way at the moment to communicate
  // from an XMLIterator across to a PlainTextIterator. Maybe redo by making
  // the inner classes static and explicitly passing things around.
  private Reader inputReader;
  private final DocType docType;

  // Configurable options
  private TokenizerFactory<? extends HasWord> tokenizerFactory = PTBTokenizer.coreLabelFactory();
  private String[] sentenceFinalPuncWords = DEFAULT_SENTENCE_DELIMS;
  private Function<List<HasWord>,List<HasWord>> escaper; // = null;
  private String sentenceDelimiter; // = null;

  /**
   * Example: if the words are already POS tagged and look like
   * foo_VB, you want to set the tagDelimiter to "_".
   */
  private String tagDelimiter; // = null;

  /**
   * When doing XML parsing, only accept text in between tags that
   * match this regular expression. Defaults to everything.
   */
  private String elementDelimiter = ".*";

  private static final Pattern wsPattern = Pattern.compile("\\s+");

  // From PTB conventions
  private final String[] sentenceFinalFollowers = {")", "]", "\"", "\'", "''", "-RRB-", "-RSB-", "-RCB-"};

  private boolean keepEmptySentences; // = false;

  /**
   * Constructs a preprocessor from an existing Reader.
   *
   * @param input An existing reader
   */
  public DocumentPreprocessor(Reader input) {
    this(input, DocType.Plain);
  }

  public DocumentPreprocessor(Reader input, DocType t) {
    if (input == null) {
      throw new IllegalArgumentException("Cannot read from null object!");
    }
    docType = t;
    inputReader = input;
  }
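  // Example (a minimal sketch, not part of the API): reading sentences from an
  // in-memory string. The text literal is illustrative only.
  //
  //   Reader r = new StringReader("This is a test. It has two sentences.");
  //   DocumentPreprocessor dp = new DocumentPreprocessor(r);
  //   for (List<HasWord> sentence : dp) {
  //     System.out.println(sentence);
  //   }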
  public DocumentPreprocessor(String docPath) {
    this(docPath, DocType.Plain, "UTF-8");
  }

  public DocumentPreprocessor(String docPath, DocType t) {
    this(docPath, t, "UTF-8");
  }

  /**
   * Constructs a preprocessor from a file at a path, which can be either
   * a filesystem location, a classpath entry, or a URL.
   *
   * @param docPath The path
   * @param t The document type (plain text or XML)
   * @param encoding The character encoding used by Readers
   */
  public DocumentPreprocessor(String docPath, DocType t, String encoding) {
    if (docPath == null) {
      throw new IllegalArgumentException("Cannot open null document path!");
    }
    docType = t;
    try {
      inputReader = IOUtils.readerFromString(docPath, encoding);
    } catch (IOException ioe) {
      throw new RuntimeIOException(String.format("%s: Could not open path %s", this.getClass().getName(), docPath), ioe);
    }
  }

  /**
   * Sets whether the tokenizer keeps empty sentences in
   * whitespace mode. Useful for programs that want to echo blank
   * lines. Not relevant for the non-whitespace mode.
   */
  public void setKeepEmptySentences(boolean keepEmptySentences) {
    this.keepEmptySentences = keepEmptySentences;
  }

  /**
   * Sets the end-of-sentence delimiters.
   * <p>
   * For newline tokenization, use the argument {"\n"}.
   *
   * @param sentenceFinalPuncWords An array of words that count as sentence-final punctuation.
   */
  public void setSentenceFinalPuncWords(String[] sentenceFinalPuncWords) {
    this.sentenceFinalPuncWords = sentenceFinalPuncWords;
  }

  /**
   * Sets the factory from which to produce a {@link Tokenizer}. The default is
   * {@link PTBTokenizer}.
   * <p>
   * NOTE: If a null argument is used, then the document is assumed to be tokenized
   * and DocumentPreprocessor performs no tokenization.
   */
  public void setTokenizerFactory(TokenizerFactory<? extends HasWord> newTokenizerFactory) {
    tokenizerFactory = newTokenizerFactory;
  }

  /**
   * Sets an escaper.
   *
   * @param e The escaper
   */
  public void setEscaper(Function<List<HasWord>,List<HasWord>> e) {
    escaper = e;
  }

  /**
   * Makes the processor assume that the document is already delimited
   * by the supplied parameter.
   *
   * @param s The sentence delimiter
   */
  public void setSentenceDelimiter(String s) {
    sentenceDelimiter = s;
  }

  /**
   * Splits tags from tokens. The tag will be placed in the TagAnnotation of
   * the returned label.
   * <p>
   * Note that for strings that contain two or more instances of the tag delimiter,
   * the last instance is treated as the split point.
   * <p>
   * The tag delimiter should not contain any characters that must be escaped in a Java
   * regex.
   *
   * @param s POS tag delimiter
   */
  public void setTagDelimiter(String s) {
    tagDelimiter = s;
  }

  /**
   * Only reads text from inside XML elements that match this regular expression
   * when in XML mode; otherwise, text is read from all elements.
   * <p>
   * <i>Note:</i> This class implements an approximation to XML via regex.
   */
  public void setElementDelimiter(String s) {
    elementDelimiter = s;
  }

  /**
   * Returns sentences until the document is exhausted. Calls close() if the end of the document
   * is reached. Otherwise, the user is required to close the stream.
   *
   * @return An Iterator over sentences (each a List of word tokens).
   * Although the type is given as {@code List<HasWord>}, in practice you get a List of CoreLabel,
   * and you can cast down to that. (Someday we might manage to fix the generic typing....)
   */
  @Override
  public Iterator<List<HasWord>> iterator() {
    // Add new document types here
    if (docType == DocType.Plain) {
      return new PlainTextIterator();
    } else if (docType == DocType.XML) {
      return new XMLIterator();
    } else {
      throw new IllegalStateException("Someone didn't add a handler for a new docType.");
    }
  }
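  // Example (a minimal sketch): in practice each token is a CoreLabel, so it can
  // be cast down when token-level annotations are needed. "tagged.txt" is a
  // hypothetical file whose tokens look like "dog_NN".
  //
  //   DocumentPreprocessor dp = new DocumentPreprocessor("tagged.txt");
  //   dp.setTagDelimiter("_");
  //   for (List<HasWord> sentence : dp) {
  //     for (HasWord w : sentence) {
  //       CoreLabel cl = (CoreLabel) w;
  //       System.out.println(cl.word() + "/" + cl.tag());
  //     }
  //   }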
  private class PlainTextIterator implements Iterator<List<HasWord>> {

    private final Tokenizer<? extends HasWord> tokenizer;
    private final Set<String> sentDelims;
    private final Set<String> delimFollowers;
    private final Function<String, String[]> splitTag;
    private List<HasWord> nextSent; // = null;
    private final List<HasWord> nextSentCarryover = Generics.newArrayList();

    public PlainTextIterator() {
      // Establish how to find sentence boundaries
      boolean eolIsSignificant = false;
      sentDelims = Generics.newHashSet();
      if (sentenceDelimiter == null) {
        if (sentenceFinalPuncWords != null) {
          sentDelims.addAll(Arrays.asList(sentenceFinalPuncWords));
        }
        delimFollowers = Generics.newHashSet(Arrays.asList(sentenceFinalFollowers));
      } else {
        sentDelims.add(sentenceDelimiter);
        delimFollowers = Generics.newHashSet();
        eolIsSignificant = wsPattern.matcher(sentenceDelimiter).matches();
        if (eolIsSignificant) {
          // For Stanford English Tokenizer
          sentDelims.add(PTBTokenizer.getNewlineToken());
        }
      }

      // Set up the tokenizer
      if (tokenizerFactory == null) {
        eolIsSignificant = sentDelims.contains(WhitespaceLexer.NEWLINE);
        tokenizer = WhitespaceTokenizer.newWordWhitespaceTokenizer(inputReader, eolIsSignificant);
      } else {
        if (eolIsSignificant) {
          tokenizer = tokenizerFactory.getTokenizer(inputReader, "tokenizeNLs");
        } else {
          tokenizer = tokenizerFactory.getTokenizer(inputReader);
        }
      }

      // If tokens are tagged, then we must split them.
      // Note that if the token contains two or more instances of the delimiter, then the last
      // instance is regarded as the split point.
      if (tagDelimiter == null) {
        splitTag = null;
      } else {
        splitTag = new Function<String,String[]>() {
          private final String splitRegex = String.format("%s(?!.*%s)", tagDelimiter, tagDelimiter);
          @Override
          public String[] apply(String in) {
            final String[] splits = in.trim().split(splitRegex);
            if (splits.length == 2) {
              return splits;
            } else {
              String[] oldStr = {in};
              return oldStr;
            }
          }
        };
      }
    }

    private void primeNext() {
      if (inputReader == null) {
        // We have already run out of input and closed the reader, so just return.
        return;
      }
      nextSent = Generics.newArrayList(nextSentCarryover);
      nextSentCarryover.clear();
      boolean seenBoundary = false;

      if (!tokenizer.hasNext()) {
        IOUtils.closeIgnoringExceptions(inputReader);
        inputReader = null;
        // nextSent = null; // WRONG: There may be something in it from the nextSentCarryover
        if (nextSent.isEmpty()) {
          nextSent = null;
        }
        return;
      }

      do {
        HasWord token = tokenizer.next();
        if (splitTag != null) {
          String[] toks = splitTag.apply(token.word());
          token.setWord(toks[0]);
          if (token instanceof Label) {
            ((Label) token).setValue(toks[0]);
          }
          if (toks.length == 2 && token instanceof HasTag) {
            // wsg2011: Some of the underlying tokenizers return old
            // JavaNLP labels. We could convert to CoreLabel here, but
            // we choose a conservative implementation....
            ((HasTag) token).setTag(toks[1]);
          }
        }

        if (sentDelims.contains(token.word())) {
          seenBoundary = true;
        } else if (seenBoundary && !delimFollowers.contains(token.word())) {
          nextSentCarryover.add(token);
          break;
        }
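        // Add the token to the current sentence unless it is whitespace or the
        // newline pseudo-token produced when newline tokenization is on.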
        if (!(wsPattern.matcher(token.word()).matches() ||
              token.word().equals(PTBTokenizer.getNewlineToken()))) {
          nextSent.add(token);
        }

        // If there are no words that can follow a sentence delimiter,
        // then there are two cases. In one case, we already have a
        // sentence, in which case there is no reason to look at the
        // next token, since that just causes buffering without any
        // chance of the current sentence being extended, since
        // delimFollowers = {}. In the other case, we have an empty
        // sentence, which at this point means the sentence delimiter
        // was a whitespace token such as \n. We might as well keep
        // going as if we had never seen anything.
        if (seenBoundary && delimFollowers.isEmpty()) {
          if (!nextSent.isEmpty() || keepEmptySentences) {
            break;
          } else {
            seenBoundary = false;
          }
        }
      } while (tokenizer.hasNext());

      if (nextSent.isEmpty() && nextSentCarryover.isEmpty() && !keepEmptySentences) {
        IOUtils.closeIgnoringExceptions(inputReader);
        inputReader = null;
        nextSent = null;
      } else if (escaper != null) {
        nextSent = escaper.apply(nextSent);
      }
    }

    @Override
    public boolean hasNext() {
      if (nextSent == null) {
        primeNext();
      }
      return nextSent != null;
    }

    @Override
    public List<HasWord> next() {
      if (nextSent == null) {
        primeNext();
      }
      if (nextSent == null) {
        throw new NoSuchElementException();
      }
      List<HasWord> thisIteration = nextSent;
      nextSent = null;
      return thisIteration;
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }

  } // end class PlainTextIterator
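  // Example (a minimal sketch): XML mode extracts text from elements whose name
  // matches a regular expression. "doc.xml" and the element name "p" are
  // illustrative only.
  //
  //   DocumentPreprocessor dp = new DocumentPreprocessor("doc.xml", DocType.XML);
  //   dp.setElementDelimiter("p");
  //   for (List<HasWord> sentence : dp) {
  //     System.out.println(sentence);
  //   }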
  private class XMLIterator implements Iterator<List<HasWord>> {

    private final XMLBeginEndIterator<String> xmlItr;
    private final Reader originalDocReader;
    private PlainTextIterator plainItr; // = null;
    private List<HasWord> nextSent; // = null;

    public XMLIterator() {
      xmlItr = new XMLBeginEndIterator<>(inputReader, elementDelimiter);
      originalDocReader = inputReader;
      primeNext();
    }

    private void primeNext() {
      // It is necessary to loop because if a document has a pattern
      // that goes: <tag></tag> the xmlItr will return an empty
      // string, which the plainItr will process to null. If we
      // didn't loop to find the next tag, the iterator would stop.
      do {
        if (plainItr != null && plainItr.hasNext()) {
          nextSent = plainItr.next();
        } else if (xmlItr.hasNext()) {
          String block = xmlItr.next();
          inputReader = new BufferedReader(new StringReader(block));
          plainItr = new PlainTextIterator();
          if (plainItr.hasNext()) {
            nextSent = plainItr.next();
          } else {
            nextSent = null;
          }
        } else {
          IOUtils.closeIgnoringExceptions(originalDocReader);
          nextSent = null;
          break;
        }
      } while (nextSent == null);
    }

    @Override
    public boolean hasNext() {
      return nextSent != null;
    }

    @Override
    public List<HasWord> next() {
      if (nextSent == null) {
        throw new NoSuchElementException();
      }
      List<HasWord> thisSentence = nextSent;
      primeNext();
      return thisSentence;
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }

  } // end class XMLIterator


  private static String usage() {
    StringBuilder sb = new StringBuilder();
    String nl = System.lineSeparator();
    sb.append(String.format("Usage: java %s [OPTIONS] [file] [< file]%n%n", DocumentPreprocessor.class.getName()));
    sb.append("Options:").append(nl);
    sb.append("-xml delim              : XML input with associated delimiter.").append(nl);
    sb.append("-encoding type          : Input encoding (default: UTF-8).").append(nl);
    sb.append("-printSentenceLengths   : Print the length of each sentence to stderr.").append(nl);
    sb.append("-noTokenization         : Split on newline delimiters only.").append(nl);
    sb.append("-printOriginalText      : Print the original, not normalized form of tokens.").append(nl);
    sb.append("-suppressEscaping       : Suppress PTB escaping.").append(nl);
    sb.append("-tokenizerOptions opts  : Specify custom tokenizer options.").append(nl);
    sb.append("-tag delim              : Input tokens are tagged. Split tags.").append(nl);
    sb.append("-whitespaceTokenization : Whitespace tokenization only.").append(nl);
    return sb.toString();
  }

  private static Map<String,Integer> argOptionDefs() {
    Map<String,Integer> argOptionDefs = Generics.newHashMap();
    argOptionDefs.put("help", 0);
    argOptionDefs.put("xml", 1);
    argOptionDefs.put("encoding", 1);
    argOptionDefs.put("printSentenceLengths", 0);
    argOptionDefs.put("printOriginalText", 0);
    argOptionDefs.put("noTokenization", 0);
    argOptionDefs.put("suppressEscaping", 0);
    argOptionDefs.put("tag", 1);
    argOptionDefs.put("tokenizerOptions", 1);
    argOptionDefs.put("whitespaceTokenization", 0);
    return argOptionDefs;
  }
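  // Example command lines (file names are illustrative):
  //
  //   java edu.stanford.nlp.process.DocumentPreprocessor -encoding UTF-8 doc.txt
  //   java edu.stanford.nlp.process.DocumentPreprocessor -xml p doc.xml
  //   java edu.stanford.nlp.process.DocumentPreprocessor -whitespaceTokenization < tokenized.txt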
  /**
   * A simple, deterministic sentence-splitter. This method only supports the English
   * tokenizer, so for other languages you should run the tokenizer first and then
   * run this sentence splitter with the "-whitespaceTokenization" option.
   *
   * @param args Command-line arguments
   */
  public static void main(String[] args) throws IOException {
    final Properties options = StringUtils.argsToProperties(args, argOptionDefs());
    if (options.containsKey("help")) {
      log.info(usage());
      return;
    }

    // Command-line flags
    String encoding = options.getProperty("encoding", "utf-8");
    boolean printSentenceLengths = PropertiesUtils.getBool(options, "printSentenceLengths", false);
    String xmlElementDelimiter = options.getProperty("xml", null);
    DocType docType = xmlElementDelimiter == null ? DocType.Plain : DocType.XML;
    String sentenceDelimiter = options.containsKey("noTokenization") ? System.getProperty("line.separator") : null;
    String tagDelimiter = options.getProperty("tag", null);
    String[] sentenceDelims = null;

    // Set up the TokenizerFactory
    int numFactoryFlags = 0;
    boolean suppressEscaping = options.containsKey("suppressEscaping");
    if (suppressEscaping) numFactoryFlags += 1;
    boolean customTokenizer = options.containsKey("tokenizerOptions");
    if (customTokenizer) numFactoryFlags += 1;
    boolean printOriginalText = options.containsKey("printOriginalText");
    if (printOriginalText) numFactoryFlags += 1;
    boolean whitespaceTokenization = options.containsKey("whitespaceTokenization");
    if (whitespaceTokenization) numFactoryFlags += 1;
    if (numFactoryFlags > 1) {
      log.info("Only one tokenizer flag allowed at a time: ");
      log.info("  -suppressEscaping, -tokenizerOptions, -printOriginalText, -whitespaceTokenization");
      return;
    }

    TokenizerFactory<? extends HasWord> tf = null;
    if (suppressEscaping) {
      tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "ptb3Escaping=false");
    } else if (customTokenizer) {
      tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), options.getProperty("tokenizerOptions"));
    } else if (printOriginalText) {
      tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");
    } else if (whitespaceTokenization) {
      List<String> whitespaceDelims = new ArrayList<>(Arrays.asList(DocumentPreprocessor.DEFAULT_SENTENCE_DELIMS));
      whitespaceDelims.add(WhitespaceLexer.NEWLINE);
      sentenceDelims = whitespaceDelims.toArray(new String[whitespaceDelims.size()]);
    } else {
      tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    }

    String fileList = options.getProperty("", null);
    String[] files = fileList == null ? new String[1] : fileList.split("\\s+");

    int numSents = 0;
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, encoding), true);
    for (String file : files) {
      DocumentPreprocessor docPreprocessor;
      if (file == null || file.isEmpty()) {
        docPreprocessor = new DocumentPreprocessor(new InputStreamReader(System.in, encoding));
      } else {
        docPreprocessor = new DocumentPreprocessor(file, docType, encoding);
      }
      if (docType == DocType.XML) {
        docPreprocessor.setElementDelimiter(xmlElementDelimiter);
      }
      docPreprocessor.setTokenizerFactory(tf);
      if (sentenceDelimiter != null) {
        docPreprocessor.setSentenceDelimiter(sentenceDelimiter);
      }
      if (tagDelimiter != null) {
        docPreprocessor.setTagDelimiter(tagDelimiter);
      }
      if (sentenceDelims != null) {
        docPreprocessor.setSentenceFinalPuncWords(sentenceDelims);
      }

      for (List<HasWord> sentence : docPreprocessor) {
        numSents++;
        if (printSentenceLengths) {
          System.err.printf("Length: %d%n", sentence.size());
        }
        boolean printSpace = false;
        for (HasWord word : sentence) {
          if (printOriginalText) {
            CoreLabel cl = (CoreLabel) word;
            if (!printSpace) {
              pw.print(cl.get(CoreAnnotations.BeforeAnnotation.class));
              printSpace = true;
            }
            pw.print(cl.get(CoreAnnotations.OriginalTextAnnotation.class));
            pw.print(cl.get(CoreAnnotations.AfterAnnotation.class));
          } else {
            if (printSpace) pw.print(" ");
            printSpace = true;
            pw.print(word.word());
          }
        }
        pw.println();
      }
    }
    pw.close();
    System.err.printf("Read in %d sentences.%n", numSents);
  }

}