package edu.stanford.nlp.wordseg;

import java.io.ObjectOutputStream;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.process.ChineseDocumentToSentenceProcessor;
import edu.stanford.nlp.trees.international.pennchinese.ChineseUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * This class provides a main method that loads various dictionaries and
 * saves them in serialized form. At runtime, the dictionaries are compiled into a
 * word list that is used as a feature in the segmenter.
 *
 * The features are added in the method
 * {@link Sighan2005DocumentReaderAndWriter#addDictionaryFeatures}.
 *
 * @author Pi-Chuan Chang
 */
public class ChineseDictionary {

  private static final boolean DEBUG = false;

  // todo [2017]: This should be redone sometime to not have such a hardcoded upper limit.
  public static final int MAX_LEXICON_LENGTH = 6;

  private static final Redwood.RedwoodChannels logger = Redwood.channels(ChineseDictionary.class);

  @SuppressWarnings({"unchecked"})
  private final Set<String>[] words_ = new HashSet[MAX_LEXICON_LENGTH + 1];

  private final ChineseDocumentToSentenceProcessor cdtos_; // = null;

  private void serializeDictionary(String serializePath) {
    logger.info("Serializing dictionaries to " + serializePath + " ... ");
    try (ObjectOutputStream oos = IOUtils.writeStreamFromString(serializePath)) {
      // oos.writeObject(MAX_LEXICON_LENGTH);
      oos.writeObject(words_);
      // oos.writeObject(cdtos_);
      logger.info("done.");
    } catch (Exception e) {
      logger.error("Failed", e);
      throw new RuntimeIOException(e);
    }
  }

  @SuppressWarnings({"unchecked"})
  private static Set<String>[] loadDictionary(String serializePath) {
    Set<String>[] dict = new HashSet[MAX_LEXICON_LENGTH + 1];
    for (int i = 0; i <= MAX_LEXICON_LENGTH; i++) {
      dict[i] = Generics.newHashSet();
    }

    // logger.info("loading dictionaries from " + serializePath + "...");
    try {
      // once we read MAX_LEXICON_LENGTH and cdtos as well;
      // now these files only store the one object we care about
      // ChineseDictionary.MAX_LEXICON_LENGTH = (int) ois.readObject();
      dict = IOUtils.readObjectFromURLOrClasspathOrFileSystem(serializePath);
    } catch (Exception e) {
      logger.error("Failed to load Chinese dictionary " + serializePath, e);
      throw new RuntimeException(e);
    }
    return dict;
  }

  public ChineseDictionary(String dict) {
    this(new String[] { dict });
  }

  public ChineseDictionary(String[] dicts) {
    this(dicts, null);
  }

  public ChineseDictionary(String[] dicts, ChineseDocumentToSentenceProcessor cdtos) {
    this(dicts, cdtos, false);
  }

  /**
   * The first argument can be one file path, or multiple file paths separated by
   * commas.
   */
  public ChineseDictionary(String serDicts, ChineseDocumentToSentenceProcessor cdtos, boolean expandMidDot) {
    this(serDicts.split(","), cdtos, expandMidDot);
  }

  public ChineseDictionary(String[] dicts, ChineseDocumentToSentenceProcessor cdtos, boolean expandMidDot) {
    logger.info(String.format("Loading Chinese dictionaries from %d file%s:%n", dicts.length,
        (dicts.length == 1) ? "" : "s"));
    for (String dict : dicts) {
      logger.info("  " + dict);
    }

    for (int i = 0; i <= MAX_LEXICON_LENGTH; i++) {
      words_[i] = Generics.newHashSet();
    }
    this.cdtos_ = cdtos;

    for (String dict : dicts) {
      if (dict.endsWith("ser.gz")) {
        // TODO: the way this is written does not work if we allow dictionaries
        // to have different settings of MAX_LEXICON_LENGTH
        Set<String>[] dictwords = loadDictionary(dict);
        for (int i = 0; i <= MAX_LEXICON_LENGTH; i++) {
          words_[i].addAll(dictwords[i]);
          dictwords[i] = null;
        }
      } else {
        addDict(dict, expandMidDot);
      }
    }

    int total = 0;
    for (int i = 0; i <= MAX_LEXICON_LENGTH; i++) {
      total += words_[i].size();
    }
    logger.info(String.format("Done. Unique words in ChineseDictionary: %d.%n", total));
  }

  private static final Pattern midDot = Pattern.compile(ChineseUtils.MID_DOT_REGEX_STR);

  private void addDict(String dict, boolean expandMidDot) {
    String content = IOUtils.slurpFileNoExceptions(dict, "utf-8");
    String[] lines = content.split("\n");
    logger.info("  " + dict + ": " + lines.length + " entries");

    for (String line : lines) {
      line = line.trim();
      // normalize any midDot
      if (expandMidDot) {
        // normalize variant middot characters down to U+00B7
        line = line.replaceAll(ChineseUtils.MID_DOT_REGEX_STR, "\u00B7");
      }
      addOneDict(line);
      if (DEBUG) EncodingPrintWriter.err.println("ORIG: " + line, "UTF-8");

      // also add the variant of the entry with the middot removed
      if (expandMidDot && midDot.matcher(line).find()) {
        line = line.replaceAll(ChineseUtils.MID_DOT_REGEX_STR, "");
        if (DEBUG) EncodingPrintWriter.err.println("ALSO: " + line, "UTF-8");
        addOneDict(line);
      }
    }
  }

  private void addOneDict(String item) {
    int length = item.length();
    if (length == 0) {
      // Do nothing for empty items
    } else if (length <= MAX_LEXICON_LENGTH - 1) {
      if (cdtos_ != null) {
        item = cdtos_.normalization(item);
      }
      if (DEBUG) EncodingPrintWriter.err.println("DICT: " + item, "UTF-8");
      words_[length].add(item);
    } else {
      // insist on new String as it may save memory
      String subItem = new String(item.substring(0, MAX_LEXICON_LENGTH));
      if (cdtos_ != null) {
        subItem = cdtos_.normalization(subItem);
      }
      if (DEBUG) EncodingPrintWriter.err.println("DICT: " + subItem, "UTF-8");
      // length=MAX_LEXICON_LENGTH and MAX_LEXICON_LENGTH+
      words_[MAX_LEXICON_LENGTH].add(subItem);
    }
  }

  public boolean contains(String word) {
    int length = word.length();
    if (length <= MAX_LEXICON_LENGTH - 1) {
      return words_[length].contains(word);
    } else {
      length = MAX_LEXICON_LENGTH;
      return words_[length].contains(word.substring(0, MAX_LEXICON_LENGTH));
    }
  }

  public static void main(String[] args) {
    String inputDicts = "/u/nlp/data/chinese-dictionaries/plain/ne_wikipedia-utf8.txt,/u/nlp/data/chinese-dictionaries/plain/newsexplorer_entities_utf8.txt,/u/nlp/data/chinese-dictionaries/plain/Ch-name-list-utf8.txt,/u/nlp/data/chinese-dictionaries/plain/wikilex-20070908-zh-en.txt,/u/nlp/data/chinese-dictionaries/plain/adso-1.25-050405-monolingual-clean.utf8.txt,/u/nlp/data/chinese-dictionaries/plain/lexicon_108k_normalized.txt,/u/nlp/data/chinese-dictionaries/plain/lexicon_mandarintools_normalized.txt,/u/nlp/data/chinese-dictionaries/plain/harbin-ChineseNames_utf8.txt,/u/nlp/data/chinese-dictionaries/plain/lexicon_HowNet_normalized.txt";
    String output = "/u/nlp/data/gale/segtool/stanford-seg/classifiers/dict-chris6.ser.gz";

    Map<String,Integer> flagMap = Generics.newHashMap();
    flagMap.put("-inputDicts", 1);
    flagMap.put("-output", 1);
    Map<String,String[]> argsMap = StringUtils.argsToMap(args, flagMap);
    // args = argsMap.get(null);

    if (argsMap.keySet().contains("-inputDicts")) {
      inputDicts = argsMap.get("-inputDicts")[0];
    }
    if (argsMap.keySet().contains("-output")) {
      output = argsMap.get("-output")[0];
    }

    String[] dicts = inputDicts.split(",");
    ChineseDocumentToSentenceProcessor cdtos = new ChineseDocumentToSentenceProcessor(null);
    boolean expandMidDot = true;

    ChineseDictionary dict = new ChineseDictionary(dicts, cdtos, expandMidDot);
    dict.serializeDictionary(output);

    /*
    // ChineseDictionary dict = new ChineseDictionary(args[0]);
    for (int i = 0; i <= MAX_LEXICON_LENGTH; i++) {
      logger.info("Length: " + i + ": " + dict.words[i].size());
    }
    for (int i = 0; i <= MAX_LEXICON_LENGTH; i++) {
      logger.info("Length: " + i + ": " + dict.words[i].size());
      if (dict.words[i].size() < 1000) {
        for (String word : dict.words[i]) {
          EncodingPrintWriter.err.println(word, "UTF-8");
        }
      }
    }
    for (int i = 1; i < args.length; i++) {
      logger.info(args[i] + " " + Boolean.valueOf(dict.contains(args[i])).toString());
    }
    */
  }

}