// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package experimental.morfessor;
import java.io.Serializable;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import marmot.util.CollectableDouble;
import marmot.util.FileUtils;
import marmot.util.LineIterator;
import marmot.util.StringUtils;
import marmot.util.StringUtils.Mode;
import marmot.util.Trie;
/**
 * Dictionary-based compound splitter.
 *
 * <p>Known words are stored together with a count in a {@link Trie}. A compound
 * is segmented by enumerating every way of covering it completely with words
 * from the trie; the candidate segmentations are collected in a
 * {@link PriorityQueue} and the head of that queue is used as the split.
 */
public class CompoundSplitter implements Serializable, Splitter {

    private static final long serialVersionUID = 1L;

    /** Vocabulary trie; each terminal node carries the count registered for that word. */
    private final Trie<CollectableDouble> trie_;

    /**
     * Whether words and compounds are passed through
     * {@code StringUtils.normalize(..., Mode.lower)} before the trie is accessed.
     */
    private final boolean normalize_;

    /**
     * Creates an empty splitter.
     *
     * @param normalize if {@code true}, every added word and every compound to be
     *                  segmented is normalized with {@code Mode.lower} first
     */
    public CompoundSplitter(boolean normalize) {
        trie_ = new Trie<CollectableDouble>();
        normalize_ = normalize;
    }

    /** Creates an empty splitter that does not normalize its input. */
    public CompoundSplitter() {
        this(false);
    }

    /**
     * Registers a word in the vocabulary trie.
     *
     * @param word  the word to add (normalized first if normalization is enabled)
     * @param count the count stored with the word
     */
    public void addWord(String word, double count) {
        String key = normalize_ ? StringUtils.normalize(word, Mode.lower) : word;
        trie_.addWord(key, new CollectableDouble(count));
    }

    /**
     * Collects every segmentation of {@code compound.substring(start_index)} into
     * vocabulary words, each one chained onto {@code segment}.
     *
     * <p>The trie is walked character by character from its root. Whenever a
     * terminal node is reached, a new {@link Segment} ending just past the current
     * character is created. If that segment ends the compound it is a complete
     * segmentation and is added to the result; otherwise the remainder of the
     * compound is segmented recursively with the new segment as predecessor.
     *
     * <p>No normalization is applied here; see {@link #segment(String, Scorer)}.
     *
     * @param segment     the segment preceding {@code start_index}, or {@code null}
     *                    at the start of the compound
     * @param compound    the string to segment
     * @param start_index index of the first character that is not yet covered
     * @param scorer      scorer handed to every created {@link Segment}
     * @return the final segments of all complete segmentations; the queue is built
     *         without a comparator, so it relies on the natural ordering of
     *         {@link Segment}. Empty if the remainder cannot be covered.
     */
    public PriorityQueue<Segment> segment(Segment segment, String compound, int start_index, Scorer scorer) {
        PriorityQueue<Segment> results = new PriorityQueue<>();
        final int length = compound.length();
        Trie<CollectableDouble> node = trie_;

        for (int pos = start_index; pos < length; pos++) {
            node = node.getChild(compound.charAt(pos));
            if (node == null) {
                // No vocabulary word continues with this character.
                break;
            }
            if (!node.isTerminal()) {
                continue;
            }

            int end = pos + 1;
            Segment candidate = new Segment(segment, end, node.getContent().getValue(), scorer);
            if (end == length) {
                // The candidate word reaches the end of the compound.
                results.add(candidate);
            } else {
                // Try to cover the rest of the compound behind the candidate word.
                results.addAll(segment(candidate, compound, end, scorer));
            }
        }
        return results;
    }

    /**
     * Collects every segmentation of {@code compound} into vocabulary words.
     *
     * @param compound the string to segment (normalized first if normalization is
     *                 enabled)
     * @param scorer   scorer handed to every created {@link Segment}
     * @return the final segments of all complete segmentations, possibly empty
     */
    public PriorityQueue<Segment> segment(String compound, Scorer scorer) {
        String form = normalize_ ? StringUtils.normalize(compound, Mode.lower) : compound;
        return segment(null, form, 0, scorer);
    }

    /** Returns the underlying vocabulary trie (the live instance, not a copy). */
    public Trie<CollectableDouble> getTrie() {
        return trie_;
    }

    /**
     * Command-line entry point: builds a normalizing splitter from a vocabulary
     * file and saves it with {@code FileUtils.saveToFile}.
     *
     * <p>Options: {@code --vocab-file} (required), {@code --splitter-file}
     * (required), {@code --min-freq} (integer, default 10) and
     * {@code --min-length} (integer, default 3). If the arguments cannot be
     * parsed, the errors and the usage are printed to standard error and the
     * process exits with status 1.
     *
     * @param args command-line arguments
     * @throws JSAPException if the option definitions cannot be registered
     */
    public static void main(String[] args) throws JSAPException {
        JSAP jsap = new JSAP();

        FlaggedOption opt = new FlaggedOption("vocab-file")
                .setRequired(true)
                .setLongFlag("vocab-file");
        jsap.registerParameter(opt);

        opt = new FlaggedOption("splitter-file")
                .setRequired(true)
                .setLongFlag("splitter-file");
        jsap.registerParameter(opt);

        opt = new FlaggedOption("min-freq")
                .setRequired(false)
                .setStringParser(JSAP.INTEGER_PARSER)
                .setLongFlag("min-freq")
                .setDefault("10");
        jsap.registerParameter(opt);

        opt = new FlaggedOption("min-length")
                .setRequired(false)
                .setStringParser(JSAP.INTEGER_PARSER)
                .setLongFlag("min-length")
                .setDefault("3");
        jsap.registerParameter(opt);

        JSAPResult config = jsap.parse(args);
        if (!config.success()) {
            Iterator<?> errs = config.getErrorMessageIterator();
            while (errs.hasNext()) {
                System.err.println("Error: " + errs.next());
            }
            System.err.println("Usage: ");
            System.err.println(jsap.getUsage());
            System.err.println(jsap.getHelp());
            System.err.println();
            System.exit(1);
        }

        int min_freq = config.getInt("min-freq");
        int min_length = config.getInt("min-length");

        CompoundSplitter splitter = new CompoundSplitter(true);
        splitter.loadFile(config.getString("vocab-file"), min_freq, min_length);
        FileUtils.saveToFile(splitter, config.getString("splitter-file"));
    }

    /**
     * Adds the words of a vocabulary file to this splitter.
     *
     * <p>Each line is expected to provide the word form as its first field and an
     * integer frequency as its second field. Lines that yield no fields at all
     * are skipped. Forms shorter than {@code min_length} are ignored.
     *
     * <p>Reading stops at the first entry whose frequency is below
     * {@code min_freq}; later lines are never examined.
     * NOTE(review): this presumably relies on the file being sorted by
     * descending frequency -- confirm against the vocabulary format.
     *
     * @param filename   path of the vocabulary file
     * @param min_freq   minimal frequency; reading stops at the first entry below it
     * @param min_length minimal form length (in chars) for a word to be added
     * @throws NumberFormatException     if a frequency field is not an integer
     * @throws IndexOutOfBoundsException if a non-empty line has no second field
     */
    public void loadFile(String filename, int min_freq, int min_length) {
        LineIterator iterator = new LineIterator(filename);
        while (iterator.hasNext()) {
            List<String> fields = iterator.next();
            if (fields.isEmpty()) {
                // Nothing to read on this line (e.g. a blank line); do not abort the load.
                continue;
            }

            String form = fields.get(0);
            int freq = Integer.parseInt(fields.get(1));
            if (freq < min_freq) {
                break;
            }
            if (form.length() >= min_length) {
                addWord(form, freq);
            }
        }
    }

    /**
     * Splits a word into morphs using the head of the segmentation queue.
     *
     * <p>The chain of segments is followed from the last segment back to the
     * first; the word is cut at the end index of every segment that has a
     * successor, and the pieces are assembled in left-to-right order.
     *
     * <p>NOTE(review): the cut indices are computed on the string handled by
     * {@link #segment(String, Scorer)}, which may be normalized, but are applied
     * to the original {@code word}. This assumes normalization preserves the
     * string length -- confirm.
     *
     * @param word   the word to split
     * @param scorer scorer used to rank the candidate segmentations
     * @return the morphs of the segmentation at the head of the queue, or an
     *         immutable single-element list containing {@code word} if no
     *         segmentation was found
     */
    public List<String> split(String word, Scorer scorer) {
        PriorityQueue<Segment> queue = segment(word, scorer);
        if (queue.isEmpty()) {
            return Collections.singletonList(word);
        }

        LinkedList<String> morphs = new LinkedList<>();
        Segment best = queue.poll();

        // The last segment ends at the end of the word, so start cutting there
        // and walk the predecessors right to left.
        int from_index = word.length();
        for (Segment run = best.getPreviousSegment(); run != null; run = run.getPreviousSegment()) {
            int to_index = run.getIndex();
            morphs.addFirst(word.substring(to_index, from_index));
            from_index = to_index;
        }
        // Whatever is left in front of the first cut is the first morph.
        morphs.addFirst(word.substring(0, from_index));
        return morphs;
    }

    /**
     * Splits a word into morphs, ranking the candidate segmentations with a
     * {@link MeanScorer}.
     *
     * @param form the word to split
     * @return the morphs, or a single-element list containing {@code form} if no
     *         segmentation was found
     */
    @Override
    public List<String> split(String form) {
        return split(form, new MeanScorer());
    }
}