CorpusDictionary.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.wordseg;

import java.util.*;
import java.io.*;


import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.trees.international.pennchinese.ChineseUtils;
import edu.stanford.nlp.util.Generics;

/**
 * A set of dictionary words loaded from a file, with simple membership
 * queries (for example, to check if a bigram exists in bakeoff corpora).
 * The dictionaries that this class reads have to be in UTF-8.
 *
 * @author Huihsin Tseng
 * @author Pichuan Chang
 */
public class CorpusDictionary {

  private static final Redwood.RedwoodChannels logger = Redwood.channels(CorpusDictionary.class);

  /** The dictionary entries; assigned exactly once, in the constructor. */
  private final Set<String> oneWord;

  /**
   * Load a dictionary of words without normalizing the entries.
   *
   * @param filename A file of words, one per line. It must be in UTF-8.
   * @throws RuntimeIOException If the dictionary cannot be opened or read
   */
  public CorpusDictionary(String filename) {
    this(filename, false);
  }

  /**
   * Load a dictionary of words, optionally normalizing each entry.
   *
   * @param filename A file of words, one per line. It must be in UTF-8.
   * @param normalize If true, each word is passed through
   *     {@code ChineseUtils.normalize} before it is stored
   * @throws RuntimeIOException If the dictionary cannot be opened or read
   */
  public CorpusDictionary(String filename, boolean normalize) {
    oneWord = readDict(filename, normalize);
  }

  /**
   * Return the set of words held by this dictionary.
   * This is the internal set itself, not a copy, so changes made through
   * the returned reference are visible to {@link #contains(String)}.
   *
   * @return The set of loaded words
   */
  public Set<String> getTable() {
    return oneWord;
  }


  /**
   * Read a dictionary file into a set, one entry per non-empty line.
   * Each line is trimmed; a line that had leading/trailing whitespace, or
   * that is empty after trimming, is reported on the error stream. Empty
   * lines are not added to the result.
   *
   * @param filename Name of the dictionary, handed to
   *     {@code IOUtils.getInputStreamFromURLOrClasspathOrFileSystem}
   * @param normalize If true, apply {@code ChineseUtils.normalize} (with the
   *     ASCII, ASCII, NORMALIZE settings) to each word before adding it
   * @return The set of words read
   * @throws RuntimeIOException If opening, reading or closing the input fails
   */
  private static Set<String> readDict(String filename, boolean normalize)  {
    Set<String> word = Generics.newHashSet();

    logger.info("Loading " + (normalize ? "normalized" : "unnormalized") + " dictionary from " + filename);

    // try-with-resources closes both the reader and the underlying stream even
    // when reading fails part-way through; a failure in close() is still
    // reported through the catch clause below.
    try (InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename);
         BufferedReader wordDetectorReader = new BufferedReader(new InputStreamReader(is, "UTF-8"))) {
      int i = 0; // 1-based line number, used only in diagnostics
      for (String wordDetectorLine; (wordDetectorLine = wordDetectorReader.readLine()) != null; ) {
        i++;
        int origLeng = wordDetectorLine.length();
        wordDetectorLine = wordDetectorLine.trim();
        int newLeng = wordDetectorLine.length();
        if (newLeng != origLeng) {
          EncodingPrintWriter.err.println("Line " + i + " of " + filename + " has leading/trailing whitespace: |" + wordDetectorLine + "|", "UTF-8");
        }
        if (newLeng == 0) {
          EncodingPrintWriter.err.println("Line " + i + " of " + filename + " is empty", "UTF-8");
        } else {
          if (normalize) {
            wordDetectorLine = ChineseUtils.normalize(wordDetectorLine,
                                                      ChineseUtils.ASCII,
                                                      ChineseUtils.ASCII,
                                                      ChineseUtils.NORMALIZE);
          }
          word.add(wordDetectorLine);
        }
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    return word;
  }

  /**
   * Test whether a word is in the dictionary.
   *
   * @param word The string to look up
   * @return true if the dictionary contains exactly this string
   */
  public boolean contains(String word) {
    return getTable().contains(word);
  }

  /**
   * Return the membership of a word as a string flag.
   *
   * @param a1 The string to look up
   * @return "1" if the dictionary contains {@code a1}, otherwise "0"
   */
  public String getW(String a1) {
    if (contains(a1)) {
      return "1";
    }
    return "0";
  }

} // end class CorpusDictionary