DocumentPreprocessor.java example

Explorer

stanford-stemming-server-master
- TaggerDemo.java
- src
  - edu
    - stanford
      - nlp
        international
        morph
        MorphoFeatureSpecification.java
        MorphoFeatures.java
        io
        BZip2PipedOutputStream.java
        EncodingFileReader.java
        EncodingPrintWriter.java
        ExtensionFileFilter.java
        FileUtils.java
        IOUtils.java
        InDataStreamFile.java
        NumberRangesFileFilter.java
        OutDataStreamFile.java
        PrintFile.java
        RuntimeIOException.java
        ling
        AnnotationLookup.java
        CategoryWordTag.java
        CategoryWordTagFactory.java
        CoreAnnotation.java
        CoreAnnotations.java
        CoreLabel.java
        CyclicCoreLabel.java
        Datum.java
        Document.java
        Featurizable.java
        HasCategory.java
        HasContext.java
        HasIndex.java
        HasOffset.java
        HasTag.java
        HasWord.java
        Label.java
        LabelFactory.java
        Labeled.java
        LabeledWord.java
        Sentence.java
        StringLabel.java
        StringLabelFactory.java
        TaggedWord.java
        TaggedWordFactory.java
        ValueLabel.java
        Word.java
        WordFactory.java
        WordLemmaTag.java
        WordLemmaTagFactory.java
        WordTag.java
        WordTagFactory.java
        math
        ArrayMath.java
        SloppyMath.java
        maxent
        CGRunner.java
        Convert.java
        DataGeneric.java
        Experiments.java
        Feature.java
        Features.java
        Problem.java
        iis
        LambdaSolve.java
        movetrees
        EmptyTreeLeaf.java
        HasTrace.java
        objectbank
        IdentityFunction.java
        IteratorFromReaderFactory.java
        LineIterator.java
        ObjectBank.java
        ReaderIteratorFactory.java
        TokenizerFactory.java
        XMLBeginEndIterator.java
        optimization
        CGMinimizer.java
        DiffFloatFunction.java
        DiffFunction.java
        Evaluator.java
        FloatFunction.java
        Function.java
        HasEvaluators.java
        Minimizer.java
        QNMinimizer.java
        StochasticCalculateMethods.java
        process
        AbstractTokenizer.java
        Americanize.java
        CoreLabelTokenFactory.java
        CoreTokenFactory.java
        DocumentPreprocessor.java
        LexedTokenFactory.java
        ListProcessor.java
        Morpha.java
        Morphology.java
        PTB2TextLexer.java
        PTBLexer.java
        PTBTokenizer.java
        StemmingServer.java
        Tokenizer.java
        TokenizerAdapter.java
        TransformXML.java
        WhitespaceLexer.java
        WhitespaceTokenizer.java
        WordShapeClassifier.java
        WordToSentenceProcessor.java
        WordTokenFactory.java
        sequences
        BestSequenceFinder.java
        DocumentReaderAndWriter.java
        ExactBestSequenceFinder.java
        PlainTextDocumentReaderAndWriter.java
        SeqClassifierFlags.java
        SequenceModel.java
        stats
        AbstractCounter.java
        ClassicCounter.java
        Counter.java
        Counters.java
        IntCounter.java
        TwoDimensionalCounter.java
        tagger
        common
        TaggerConstants.java
        io
        TSVTaggedFileReader.java
        TaggedFileReader.java
        TaggedFileRecord.java
        TextTaggedFileReader.java
        TreeTaggedFileReader.java
        maxent
        ASBCunkDict.java
        AmbiguityClass.java
        AmbiguityClasses.java
        CTBunkDict.java
        CountWrapper.java
        CtbDict.java
        DataWordTag.java
        Dictionary.java
        DictionaryExtractor.java
        Extractor.java
        ExtractorDistsim.java
        ExtractorFrames.java
        ExtractorFramesRare.java
        ExtractorVerbalVBNZero.java
        Extractors.java
        FeatureKey.java
        History.java
        HistoryTable.java
        LambdaSolveTagger.java
        MaxentTagger.java
        MaxentTaggerGUI.java
        MaxentTaggerServer.java
        PairsHolder.java
        ReadDataTagged.java
        TTags.java
        TagCount.java
        TaggerConfig.java
        TaggerExperiments.java
        TaggerFeature.java
        TaggerFeatures.java
        TemplateHash.java
        TestClassifier.java
        TestSentence.java
        documentation
        TaggerDemo.java
        trees
        AbstractCollinsHeadFinder.java
        AbstractTreebankLanguagePack.java
        BobChrisTreeNormalizer.java
        CollinsHeadFinder.java
        CollocationFinder.java
        CompositeTreeTransformer.java
        CompositeTreebank.java
        Constituent.java
        ConstituentFactory.java
        CoordinationTransformer.java
        Dependencies.java
        Dependency.java
        DependencyFactory.java
        DependencyTreeTransformer.java
        DiskTreebank.java
        EnglishGrammaticalRelations.java
        GrammaticalRelation.java
        GrammaticalStructure.java
        GrammaticalStructureFactory.java
        HeadFinder.java
        Labeled.java
        LabeledConstituent.java
        LabeledScoredTreeFactory.java
        LabeledScoredTreeNode.java
        LabeledScoredTreeReaderFactory.java
        MemoryTreebank.java
        ModCollinsHeadFinder.java
        NPTmpRetainingTreeNormalizer.java
        NamedDependency.java
        PennTreeReader.java
        PennTreeReaderFactory.java
        PennTreebankLanguagePack.java
        PennTreebankTokenizer.java
        QPTreeTransformer.java
        SemanticHeadFinder.java
        SimpleConstituent.java
        SimpleConstituentFactory.java
        SimpleTree.java
        SimpleTreeFactory.java
        SimpleTreeReaderFactory.java
        TransformingTreebank.java
        Tree.java
        TreeCoreAnnotations.java
        TreeFactory.java
        TreeFunctions.java
        TreeGraph.java
        TreeGraphNode.java
        TreeGraphNodeFactory.java
        TreeNormalizer.java
        TreePrint.java
        TreeReader.java
        TreeReaderFactory.java
        TreeTokenizerFactory.java
        TreeTransformer.java
        TreeVisitor.java
        Treebank.java
        TreebankLanguagePack.java
        Trees.java
        TypedDependency.java
        UnnamedDependency.java
        WordNetConnection.java
        WordStemmer.java
        international
        pennchinese
        CHTBLexer.java
        CHTBTokenizer.java
        CTBTreeReaderFactory.java
        ChineseEnglishWordMap.java
        ChineseHeadFinder.java
        ChineseTreebankLanguagePack.java
        FragDiscardingPennTreeReader.java
        tregex
        CoordinationPattern.java
        DescriptionPattern.java
        ParseException.java
        Relation.java
        SimpleCharStream.java
        Token.java
        TokenMgrError.java
        TregexMatcher.java
        TregexParser.java
        TregexParserConstants.java
        TregexParserTokenManager.java
        TregexPattern.java
        TregexPatternCompiler.java
        VariableStrings.java
        tsurgeon
        AdjoinNode.java
        AdjoinToFootNode.java
        AdjoinToHeadNode.java
        AuxiliaryTree.java
        CoindexNodes.java
        CoindexationGenerator.java
        DeleteNode.java
        ExciseNode.java
        FetchNode.java
        HoldTreeNode.java
        InsertNode.java
        JJTTsurgeonParserState.java
        MoveNode.java
        Node.java
        ParseException.java
        PruneNode.java
        RelabelNode.java
        ReplaceNode.java
        SimpleCharStream.java
        SimpleNode.java
        Token.java
        TokenMgrError.java
        TreeLocation.java
        Tsurgeon.java
        TsurgeonParser.java
        TsurgeonParserConstants.java
        TsurgeonParserTokenManager.java
        TsurgeonParserTreeConstants.java
        TsurgeonPattern.java
        TsurgeonPatternRoot.java
        util
        AbstractIterator.java
        ArrayCoreMap.java
        ArrayMap.java
        ArrayUtils.java
        BinaryHeapPriorityQueue.java
        ByteStreamGobbler.java
        CollectionFactory.java
        CollectionUtils.java
        CollectionValuedMap.java
        ConcatenationIterator.java
        CoreMap.java
        DataFilePaths.java
        DeltaCollectionValuedMap.java
        DeltaMap.java
        ErasureUtils.java
        Factory.java
        FilePathProcessor.java
        FileProcessor.java
        Filter.java
        FilteredIterator.java
        Filters.java
        FixedPrioritiesPriorityQueue.java
        Function.java
        Generics.java
        HashIndex.java
        HashableCoreMap.java
        IdentityHashSet.java
        Index.java
        IntPair.java
        IntQuadruple.java
        IntTriple.java
        IntTuple.java
        IntUni.java
        Interner.java
        MapFactory.java
        MetaClass.java
        MutableDouble.java
        MutableInteger.java
        Pair.java
        PriorityQueue.java
        ReflectionLoading.java
        Scored.java
        ScoredObject.java
        Sets.java
        StreamGobbler.java
        StringUtils.java
        Timing.java
        Triple.java
        TypesafeMap.java
        XMLUtils.java
        concurrent
        SynchronizedInterner.java

package edu.stanford.nlp.process;

import java.io.*;
import java.net.*;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.*;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.objectbank.XMLBeginEndIterator;
import edu.stanford.nlp.util.Function;

/**
 * Produces a list of sentences from either a plain text or XML document.
 * <p>
 * Tokenization: The default tokenizer is {@link PTBTokenizer}. If null is passed to
 * <code>setTokenizerFactory</code>, then whitespace tokenization is assumed.
 * <p>
 * Adding a new document type requires two steps:
 * <ol>
 * <li> Add a new DocType.
 * <li> Create an iterator for the new DocType and modify the iterator() function to return the new iterator.
 * </ol>
 * <p>
 * NOTE: This implementation should <em>not</em> use external libraries since it is used in the parser.
 *
 * @author Spence Green
 */
public class DocumentPreprocessor implements Iterable<List<HasWord>> {

  /** The kinds of document this class can read; iterator() picks the sentence iterator by this value. */
  public static enum DocType {Plain, XML}

  // Source of the document text.  Opened lazily from inputPath by iterator() when it is
  // still null; closed and reset to null by the plain-text iterator once it runs out of
  // tokens; temporarily pointed at per-element StringReaders by the XML iterator.
  private Reader inputReader = null;
  // Filesystem path or URL given to the String constructors; null when a Reader was supplied.
  private String inputPath = null;
  // How the document is interpreted; set by the constructors.
  private DocType docType = DocType.Plain;

  //Configurable options
  // Factory for the tokenizer; when null a whitespace tokenizer is used instead.
  private TokenizerFactory<? extends HasWord> tokenizerFactory = PTBTokenizer.factory();
  // Charset name stored by setEncoding(); null means none has been configured.
  private String encoding = null;
  // Tokens that end a sentence; consulted only while sentenceDelimiter is null.
  private String[] sentenceFinalPuncWords = {".", "?", "!"};
  // Optional transformation applied to every sentence before it is handed out.
  private Function<List<HasWord>,List<HasWord>> escaper = null;
  // When non-null, this token (rather than sentenceFinalPuncWords) marks sentence
  // boundaries and no follower tokens are attached to the finished sentence.
  private String sentenceDelimiter = null;
  /**
   * Example: if the words are already POS tagged and look like
   * foo_VB, you want to set the tagDelimiter to "_"
   */
  private String tagDelimiter = null;
  /**
   * When doing XML parsing, only accept text in between tags that
   * match this regular expression.  Defaults to everything.
   */
  private String elementDelimiter = ".*";

  //From PTB conventions
  // Tokens (closing brackets and quotes) that still belong to the current sentence
  // when they directly follow a sentence-final token.
  private final String[] sentenceFinalFollowers = {")", "]", "\"", "\'", "''", "-RRB-", "-RSB-", "-RCB-"};

  /**
   * Creates a plain-text preprocessor that reads from an already opened reader.
   *
   * @param input The reader supplying the document text; must not be null
   */
  public DocumentPreprocessor(Reader input) {
    this(input, DocType.Plain);
  }

  /**
   * Creates a preprocessor of the given document type on top of an already opened reader.
   *
   * @param input The reader supplying the document text
   * @param t How the document is to be interpreted (plain text or XML)
   * @throws RuntimeException If <code>input</code> is null
   */
  public DocumentPreprocessor(Reader input, DocType t) {
    if (input == null) {
      throw new RuntimeException("Cannot read from null object!");
    }

    this.inputReader = input;
    this.docType = t;
  }

  /**
   * Creates a plain-text preprocessor for the document stored at the given location.
   * The location may name a file on the local filesystem or be a URL; it is not
   * opened until the document is iterated over.
   *
   * @param docPath Filesystem path or URL of the document; must not be null
   */
  public DocumentPreprocessor(String docPath) {
    this(docPath, DocType.Plain);
  }

  /**
   * Creates a preprocessor of the given document type for the document stored at the
   * given location.  The location is only remembered here; it is opened when the
   * document is iterated over.
   *
   * @param docPath Filesystem path or URL of the document
   * @param t How the document is to be interpreted (plain text or XML)
   * @throws RuntimeException If <code>docPath</code> is null
   */
  public DocumentPreprocessor(String docPath, DocType t) {
    if (docPath == null) {
      throw new RuntimeException("Cannot open null document path!");
    }

    this.inputPath = docPath;
    this.docType = t;
  }

  /**
   * Records the character encoding to use when a document file is opened from a path.
   * <p>
   * The name is stored only if this JVM supports the charset.  A legally named but
   * unsupported charset is ignored and the previous setting stays in effect.
   *
   * @param encoding Name of the character encoding
   * @throws IllegalCharsetNameException If the given name is not a legal charset name
   */
  public void setEncoding(String encoding) throws IllegalCharsetNameException {
    final boolean supported = Charset.isSupported(encoding);
    if (supported) {
      this.encoding = encoding;
    }
  }

  /**
   * Replaces the set of tokens that terminate a sentence.  These are consulted only
   * while no explicit sentence delimiter has been set.
   * <p>
   * For newline tokenization, use the argument {"\n"}.
   *
   * @param sentenceFinalPuncWords The tokens that end a sentence; the array is stored
   *        as given (not copied) and may be null, in which case no token ends a sentence
   */
  public void setSentenceFinalPuncWords(String[] sentenceFinalPuncWords) {
    this.sentenceFinalPuncWords = sentenceFinalPuncWords;
  }

  /**
   * Chooses the factory used to create the {@link Tokenizer} for the document.
   * {@link PTBTokenizer} is used unless this method is called.
   * <p>
   * NOTE: Passing null declares the document to be tokenized already; the text is
   * then merely split on whitespace.
   *
   * @param newTokenizerFactory The factory to use, or null for whitespace splitting
   */
  public void setTokenizerFactory(TokenizerFactory<? extends HasWord> newTokenizerFactory) {
    this.tokenizerFactory = newTokenizerFactory;
  }

  /**
   * Installs a function that is applied to every sentence before it is returned
   * by the iterator.
   *
   * @param e The escaper, or null to return sentences unchanged
   */
  public void setEscaper(Function<List<HasWord>,List<HasWord>> e) {
    this.escaper = e;
  }

  /**
   * Declares that the sentences of the document are already separated by the given
   * token.  Once set, this token is used as the sentence boundary in place of the
   * sentence-final punctuation words.
   *
   * @param s The sentence delimiter
   */
  public void setSentenceDelimiter(String s) {
    this.sentenceDelimiter = s;
  }

  /**
   * Enables splitting of pre-tagged tokens such as <code>foo_VB</code> into word and tag.
   * When the token's label supports tags, the tag is stored on the returned label.
   * <p>
   * If a token contains the delimiter more than once, the split happens at its
   * last occurrence.
   * <p>
   * The delimiter is inserted into a Java regex as is, so it should not contain
   * any characters that would need escaping there.
   *
   * @param s POS tag delimiter
   */
  public void setTagDelimiter(String s) {
    this.tagDelimiter = s;
  }

  /**
   * Restricts XML processing to text found inside elements matching the given
   * regular expression.  Without a call to this method every element is accepted.
   * The setting is only used in XML mode.
   *
   * @param s Regular expression selecting the XML elements to read
   */
  public void setElementDelimiter(String s) {
    this.elementDelimiter = s;
  }


  /**
   * Returns sentences until the document is exhausted.  The underlying reader is closed
   * once the end of the document is reached; if iteration stops earlier, the user is
   * required to close the stream.
   * <p>
   * If the document was given as a path it is opened here.  When it cannot be opened
   * a message is written to standard error and an empty iterator is returned.
   *
   * @return An iterator over the sentences of the document; never null
   */
  public Iterator<List<HasWord>> iterator() {
    try {
      if (inputReader == null)
        inputReader = getReaderFromPath(inputPath);

      //TODO: Add new document types here
      if (docType == DocType.Plain) {
        return new PlainTextIterator();
      } else if (docType == DocType.XML) {
        return new XMLIterator();
      }

    } catch (IOException e) {
      System.err.printf("%s: Could not open path %s\n", this.getClass().getName(), inputPath);
    }

    // Fallback for an unreadable path or an unknown document type: an iterator without
    // elements.  Like the other iterators of this class it does not support removal.
    return new Iterator<List<HasWord>>() {
      public boolean hasNext() { return false; }
      public List<HasWord> next() { throw new NoSuchElementException(); }
      public void remove() { throw new UnsupportedOperationException(); }
    };
  }


  /**
   * Opens a reader on the document found at <code>path</code>.
   * <p>
   * The path is first interpreted as a URL.  If it is not a well-formed URL, it is
   * treated as a location on the local filesystem.  A configured encoding is used to
   * decode either kind of source; without one the platform default charset applies.
   *
   * @param path A URL or a filesystem path
   * @return A buffered reader positioned at the start of the document
   * @throws IOException If the path is null, the URL cannot be read, no such file
   *         exists, or the configured encoding cannot be used
   */
  private Reader getReaderFromPath(String path) throws IOException {
    // new File(null) would fail with a NullPointerException.  Report a missing path
    // like any other unreadable location so that callers only have to handle IOException.
    if (path == null) {
      throw new IOException("Unable to open " + path);
    }

    InputStream stream = null;

    //Check if it is a URL first, otherwise look for a file
    try {
      URL url = new URL(path);
      URLConnection connection = url.openConnection();
      stream = connection.getInputStream();
    } catch(MalformedURLException e) {
      //Do nothing: the path may be a file
    }

    if (stream == null) {
      File file = new File(path);
      if ( ! file.exists()) {
        throw new IOException("Unable to open " + path);
      }
      stream = new FileInputStream(file);
    }

    try {
      Reader decoder = (encoding == null) ? new InputStreamReader(stream) : new InputStreamReader(stream, encoding);
      return new BufferedReader(decoder);
    } catch (IOException e) {
      // The decoder could not be created (e.g. unsupported encoding):
      // release the stream opened above before reporting the failure.
      try {
        stream.close();
      } catch (IOException ignored) {
        // nothing more can be done; the original failure is the one worth reporting
      }
      throw e;
    }
  }

  private class PlainTextIterator implements Iterator<List<HasWord>> {

    // Source of tokens; created in the constructor on top of the enclosing inputReader.
    private Tokenizer<? extends HasWord> tokenizer;
    // Token strings that mark the end of a sentence.
    private Set<String> sentDelims;
    // Token strings that still belong to a sentence when they follow its final delimiter;
    // emptied by the constructor when an explicit sentence delimiter is configured.
    private Set<String> delimFollowers = new HashSet<String>(Arrays.asList(sentenceFinalFollowers));
    // Splits "word<delim>tag" into {word, tag}; stays null unless a tag delimiter is configured.
    private Function<String, String[]> splitTag;
    // The sentence that the next call to next() will return, or null if none is buffered.
    private List<HasWord> nextSent = null;
    // A token read past the end of the previous sentence; it starts the following one.
    private List<HasWord> nextSentCarryover = new ArrayList<HasWord>();

    /**
     * Prepares sentence splitting for the reader currently held by the enclosing
     * preprocessor: works out the boundary tokens, creates the tokenizer and, if a tag
     * delimiter is configured, the function that separates words from their tags.
     */
    public PlainTextIterator() {
      // Step 1: decide which tokens end a sentence
      boolean newlinesMatter = false;
      if (sentenceDelimiter != null) {
        // An explicit delimiter replaces the punctuation words and disables followers
        sentDelims = new HashSet<String>();
        sentDelims.add(sentenceDelimiter);
        delimFollowers = new HashSet<String>();
        newlinesMatter = sentenceDelimiter.matches("\\s+");
        if (newlinesMatter) {
          // For Stanford English Tokenizer
          sentDelims.add(PTBLexer.NEWLINE_TOKEN);
        }
      } else if (sentenceFinalPuncWords != null) {
        sentDelims = new HashSet<String>(Arrays.asList(sentenceFinalPuncWords));
      } else {
        sentDelims = new HashSet<String>();
      }

      // Step 2: create the tokenizer
      if (tokenizerFactory != null) {
        if (newlinesMatter) {
          //wsg2010: This key currently used across all tokenizers
          tokenizerFactory.setOptions("tokenizeNLs");
        }
        tokenizer = tokenizerFactory.getTokenizer(inputReader);
      } else {
        tokenizer = WhitespaceTokenizer.newWordWhitespaceTokenizer(inputReader, newlinesMatter);
      }

      // Step 3: tagged tokens have to be taken apart.  The regex matches only the last
      // occurrence of the delimiter, so that is where a token is split.
      if (tagDelimiter != null) {
        final String lastDelimiterRegex = String.format("%s(?!.*%s)",tagDelimiter,tagDelimiter);
        splitTag = new Function<String,String[]>() {
          public String[] apply(String in) {
            final String[] parts = in.trim().split(lastDelimiterRegex);
            // Anything other than exactly {word, tag} is passed through untouched
            return (parts.length == 2) ? parts : new String[] {in};
          }
        };
      }
    }

    /**
     * Reads tokens from the tokenizer until the next sentence is complete and stores
     * that sentence in <code>nextSent</code>.
     * <p>
     * A sentence ends after a token from <code>sentDelims</code>, extended by any directly
     * following tokens from <code>delimFollowers</code>.  The first token that belongs to the
     * following sentence is kept in <code>nextSentCarryover</code>.  Whitespace and newline
     * tokens are never added to a sentence.
     * <p>
     * When no tokens are left, the enclosing preprocessor's reader is closed and both it and
     * <code>nextSent</code> are set to null.  Otherwise the escaper, if any, is applied to the
     * sentence.
     */
    private void primeNext() {
      // Begin with the token that was read past the end of the previous sentence, if any
      nextSent = new ArrayList<HasWord>(nextSentCarryover);
      nextSentCarryover.clear();
      boolean seenBoundary = false;

      while (tokenizer.hasNext()) {

        HasWord token = tokenizer.next();
        if (splitTag != null) {
          // toks is {word, tag} when the token could be split and {original text} otherwise
          String[] toks = splitTag.apply(token.word());
          token.setWord(toks[0]);
          if(toks.length == 2 && token instanceof HasTag) {
            //wsg2011: Some of the underlying tokenizers return old
            //JavaNLP labels.  We could convert to CoreLabel here, but
            //we choose a conservative implementation....
            ((HasTag) token).setTag(toks[1]);
          }
        }

        if (sentDelims.contains(token.word())) {
          seenBoundary = true;
        } else if (seenBoundary && !delimFollowers.contains(token.word())) {
          // First token after the boundary that is not a follower: it opens the next
          // sentence, so save it for the next call and stop here.
          nextSentCarryover.add(token);
          break;
        }

        // Whitespace and newline tokens can act as boundaries above but are not part of
        // the sentence itself.
        if ( ! (token.word().matches("\\s+") || 
                token.word().equals(PTBLexer.NEWLINE_TOKEN))) {
          nextSent.add(token);
        }

        // If there are no words that can follow a sentence delimiter,
        // then there are two cases.  In one case is we already have a
        // sentence, in which case there is no reason to look at the
        // next token, since that just causes buffering without any
        // chance of the current sentence being extended, since
        // delimFollowers = {}.  In the other case, we have an empty
        // sentence, which at this point means the sentence delimiter
        // was a whitespace token such as \n.  We might as well keep
        // going as if we had never seen anything.
        if (seenBoundary && delimFollowers.size() == 0) {
          if (nextSent.size() > 0) {
            break;
          } else {
            seenBoundary = false;
          }
        }
      }

      if (nextSent.size() == 0 && nextSentCarryover.size() == 0) {
        // Nothing was read and nothing is pending: the document is exhausted
        IOUtils.closeIgnoringExceptions(inputReader);
        inputReader = null;
        nextSent = null;
      } else if (escaper != null) {
        nextSent = escaper.apply(nextSent);
      }
    }

    /**
     * Tells whether another sentence is available, reading it from the tokenizer first
     * if none is buffered yet.
     *
     * @return true if a call to next() will return a sentence
     */
    public boolean hasNext() {
      if (nextSent != null) {
        return true;
      }
      primeNext();
      return nextSent != null;
    }

    /**
     * Hands out the buffered sentence, reading one from the tokenizer first if necessary.
     *
     * @return The next sentence of the document
     * @throws NoSuchElementException If the document has no more sentences
     */
    public List<HasWord> next() {
      if ( ! hasNext()) {
        throw new NoSuchElementException();
      }
      // Empty the buffer so that the following call reads a fresh sentence
      final List<HasWord> sentence = nextSent;
      nextSent = null;
      return sentence;
    }

    /**
     * Not supported: sentences cannot be removed from the document.
     *
     * @throws UnsupportedOperationException Always
     */
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }

  /**
   * Iterates over the sentences of an XML document.  The text blocks delivered by an
   * {@link XMLBeginEndIterator} (configured with the element delimiter) are each split
   * into sentences by a {@link PlainTextIterator}.  To do so, the enclosing
   * preprocessor's reader is pointed at one block at a time.
   */
  private class XMLIterator implements Iterator<List<HasWord>> {

    // Delivers the text blocks taken from the XML document
    private final XMLBeginEndIterator<String> xmlItr;
    // The reader of the whole document; kept so it can be closed at the very end
    private final Reader originalDocReader;
    // Sentence splitter for the block currently being processed; null before the first block
    private PlainTextIterator blockSentences;
    // The sentence the next call to next() returns; null once the document is exhausted
    private List<HasWord> upcoming;

    /**
     * Starts reading from the reader currently held by the enclosing preprocessor and
     * buffers the first sentence.
     */
    public XMLIterator() {
      originalDocReader = inputReader;
      xmlItr = new XMLBeginEndIterator<String>(inputReader, elementDelimiter);
      primeNext();
    }

    /**
     * Buffers the next sentence.  Blocks that yield no sentence (for instance the empty
     * string produced by <code>&lt;tag&gt;&lt;/tag&gt;</code>) are skipped rather than
     * ending the iteration.  When no blocks are left the document reader is closed.
     */
    private void primeNext() {
      for (;;) {
        if (blockSentences != null && blockSentences.hasNext()) {
          // The current block still has sentences
          upcoming = blockSentences.next();
        } else if ( ! xmlItr.hasNext()) {
          // No further blocks: the document is finished
          IOUtils.closeIgnoringExceptions(originalDocReader);
          upcoming = null;
          return;
        } else {
          // Move on to the next block and let a fresh plain-text iterator read it
          final String block = xmlItr.next();
          inputReader = new BufferedReader(new StringReader(block));
          blockSentences = new PlainTextIterator();
          upcoming = blockSentences.hasNext() ? blockSentences.next() : null;
        }

        if (upcoming != null) {
          return;
        }
      }
    }

    /**
     * @return true if a call to next() will return a sentence
     */
    public boolean hasNext() {
      return upcoming != null;
    }

    /**
     * Returns the buffered sentence and buffers the one after it.
     *
     * @return The next sentence of the document
     * @throws NoSuchElementException If the document has no more sentences
     */
    public List<HasWord> next() {
      final List<HasWord> current = upcoming;
      if (current == null) {
        throw new NoSuchElementException();
      }
      primeNext();
      return current;
    }

    /**
     * Not supported: sentences cannot be removed from the document.
     *
     * @throws UnsupportedOperationException Always
     */
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }


  /**
   * This provides a simple test method for DocumentPreprocessor. <br/>
   * Usage:
   * java
   * DocumentPreprocessor filename [-xml tag] [-tag delimiter] [-suppressEscaping]
   * [-tokenizerOptions options] [-noTokenization]
   * <p>
   * A filename is required and must be the first argument. The code doesn't run as a
   * filter currently.
   * <p>
   * tag is the element name of the XML from which to extract text.  It can
   * be a regular expression which is called on the element with the
   * matches() method, such as 'TITLE|P'.
   * <p>
   * delimiter is the string that separates a word from its POS tag in input that is
   * already tagged, such as '_' for foo_VB.
   * <p>
   * The sentences are printed to standard output, one per line with tokens separated
   * by single spaces.  Sentence lengths and the sentence count go to standard error.
   *
   * @param args Command-line arguments
   */
  public static void main(String[] args) {
    if (args.length < 1) {
      System.err.println("usage: DocumentPreprocessor filename [OPTS]");
      System.exit(-1);
    }

    DocumentPreprocessor docPreprocessor = new DocumentPreprocessor(args[0]);

    for (int i = 1; i < args.length; i++) {
      // -xml and -tag consume the following argument; fail with a usage message rather
      // than an ArrayIndexOutOfBoundsException when it is missing.
      if ((args[i].equals("-xml") || args[i].equals("-tag")) && i+1 >= args.length) {
        System.err.println("usage: DocumentPreprocessor filename [OPTS]");
        System.err.println("Option " + args[i] + " requires an argument");
        System.exit(-1);
      }

      if (args[i].equals("-xml")) {
        // Switch the existing instance to XML mode instead of creating a new one, so
        // that options given earlier on the command line are not lost.
        docPreprocessor.docType = DocType.XML;
        // The argument selects the XML elements to read; it is not a POS tag delimiter.
        docPreprocessor.setElementDelimiter(args[++i]);

      } else if (args[i].equals("-suppressEscaping")) {
        String options = "ptb3Escaping=false";
        docPreprocessor.setTokenizerFactory(PTBTokenizer.factory(new WordTokenFactory(),options));

      } else if (args[i].equals("-tokenizerOptions") && i+1 < args.length) {
        String options = args[i+1];
        docPreprocessor.setTokenizerFactory(PTBTokenizer.factory(new WordTokenFactory(),options));
        i++;
      } else if (args[i].equals("-noTokenization")) {
        // Input is taken to be one whitespace-separated sentence per line
        docPreprocessor.setTokenizerFactory(null);
        docPreprocessor.setSentenceDelimiter(System.getProperty("line.separator"));

      } else if (args[i].equals("-tag")) {
        docPreprocessor.setTagDelimiter(args[++i]);
      }
    }

    docPreprocessor.setEncoding("UTF-8");

    int numSents = 0;
    for (List<HasWord> sentence : docPreprocessor) {
      numSents++;
      System.err.println("Length: " + sentence.size());
      boolean printSpace = false;
      for (HasWord word : sentence) {
        if (printSpace) System.out.print(" ");
        printSpace = true;
        System.out.print(word.word());
      }
      System.out.println();
    }
    System.err.println("Read in " + numSents + " sentences.");
  }

}