package chipmunk.segmenter; import java.io.Serializable; import java.util.HashSet; import java.util.List; import java.util.Set; import marmot.util.LineIterator; public class Dictionary implements Serializable { private static final long serialVersionUID = 1L; private StringNormalizer normalizer_; private Set<String> set_; private int min_count_ = 5; public Dictionary(String path, String lang, int max_length) { normalizer_ = StringNormalizer.rawCreate(lang); init(path, max_length); } private void init(String path, int max_length) { set_ = new HashSet<>(); LineIterator iterator = new LineIterator(path); while (iterator.hasNext()) { List<String> line = iterator.next(); assert line.size() == 1 || line.size() == 2 : line; int count; String segment; if (line.size() == 2) { segment = line.get(1); count = Integer.parseInt(line.get(0)); } else { segment = line.get(0); count = min_count_; } if (count >= min_count_) { segment = normalizer_.normalize(segment); if (segment.length() <= max_length) { set_.add(segment); } } } } boolean contains(String segment) { return set_.contains(segment); } public int size() { return set_.size(); } }