package edu.stanford.nlp.wordseg; import java.util.*; import java.io.*; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.io.EncodingPrintWriter; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.io.RuntimeIOException; import edu.stanford.nlp.trees.international.pennchinese.ChineseUtils; import edu.stanford.nlp.util.Generics; /** * Check if a bigram exists in bakeoff corpora. * The dictionaries that this class reads have to be in UTF-8. * * @author Huihsin Tseng * @author Pichuan Chang */ public class CorpusDictionary { private static Redwood.RedwoodChannels logger = Redwood.channels(CorpusDictionary.class); private Set<String> oneWord; // = null; /** Load a dictionary of words. * * @param filename A file of words, one per line. It must be in UTF-8. */ public CorpusDictionary(String filename) { this(filename, false); } public CorpusDictionary(String filename, boolean normalize) { if (oneWord == null) { oneWord = readDict(filename, normalize); } } public Set<String> getTable() { return oneWord; } private static Set<String> readDict(String filename, boolean normalize) { Set<String> word = Generics.newHashSet(); logger.info("Loading " + (normalize ? "normalized" : "unnormalized") + " dictionary from " + filename); try { InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename); BufferedReader wordDetectorReader = new BufferedReader(new InputStreamReader(is, "UTF-8")); int i = 0; for (String wordDetectorLine; (wordDetectorLine = wordDetectorReader.readLine()) != null; ) { i++; //String[] fields = wordDetectorLine.split(" "); //logger.debug("DEBUG: "+filename+" "+wordDetectorLine); int origLeng = wordDetectorLine.length(); wordDetectorLine = wordDetectorLine.trim(); int newLeng = wordDetectorLine.length(); if (newLeng != origLeng) { EncodingPrintWriter.err.println("Line " + i + " of " + filename + " has leading/trailing whitespace: |" + wordDetectorLine + "|", "UTF-8"); } if (newLeng == 0) { EncodingPrintWriter.err.println("Line " + i + " of " + filename + " is empty", "UTF-8"); } else { if (normalize) { wordDetectorLine = ChineseUtils.normalize(wordDetectorLine, ChineseUtils.ASCII, ChineseUtils.ASCII, ChineseUtils.NORMALIZE); } word.add(wordDetectorLine); } } is.close(); } catch (IOException e) { throw new RuntimeIOException(e); } return word; } public boolean contains(String word) { return getTable().contains(word); } public String getW(String a1) { if (contains(a1)) return "1"; return "0"; } } // end class CorpusDictionary