// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package experimental.morfessor;
import java.io.IOException;
import java.io.Writer;
import java.util.Iterator;
import java.util.List;
import marmot.util.FileUtils;
import marmot.util.LineIterator;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
public class CorpusSegmenter {
public static void main(String[] args) throws JSAPException, IOException {
FlaggedOption opt;
JSAP jsap = new JSAP();
opt = new FlaggedOption("text-file").setRequired(true).setLongFlag(
"text-file");
jsap.registerParameter(opt);
opt = new FlaggedOption("morfessor").setRequired(false).setLongFlag(
"morfessor");
jsap.registerParameter(opt);
opt = new FlaggedOption("encoder").setRequired(false).setLongFlag(
"encoder");
jsap.registerParameter(opt);
opt = new FlaggedOption("splitter").setRequired(false).setLongFlag(
"splitter");
jsap.registerParameter(opt);
opt = new FlaggedOption("out-file").setRequired(true).setLongFlag(
"out-file");
jsap.registerParameter(opt);
JSAPResult config = jsap.parse(args);
if (!config.success()) {
for (Iterator<?> errs = config.getErrorMessageIterator();
errs.hasNext();) {
System.err.println("Error: " + errs.next());
}
System.err.println("Usage: ");
System.err.println(jsap.getUsage());
System.err.println(jsap.getHelp());
System.err.println();
System.exit(1);
}
Splitter splitter = null;
if (config.getString("splitter") != null) {
splitter = FileUtils.loadFromFile(config.getString("splitter"));
} else if (config.getString("morfessor") != null) {
CharEncoder encoder = null;
if (config.getString("encoder") != null) {
try {
encoder = FileUtils.<CharEncoder>loadFromFile(config.getString("encoder"));
} catch (RuntimeException e) {
System.err.println("Caught :" + e);
System.err.println("Trying text model ..." );
encoder = CharEncoder.loadFromFile(config.getString("encoder"));
}
}
splitter = new Morfessor(config.getString("morfessor"), encoder);
} else {
System.err.println("Error: Either splitter or morfessor must be specified!");
System.exit(1);
}
Writer writer = FileUtils.openFileWriter(config.getString("out-file"));
LineIterator iterator = new LineIterator(config.getString("text-file"));
int count = 0;
long time = System.currentTimeMillis();
while (iterator.hasNext() /*&& count < 1000000*/) {
List<String> line = iterator.next();
boolean first = true;
for (String word : line) {
List<String> morphs = splitter.split(word);
for (String morph : morphs) {
if (!first)
writer.write(' ');
writer.write(morph);
first = false;
}
}
writer.write('\n');
count++;
if (count % 100000 == 0) {
double delta = (System.currentTimeMillis() - time) / 1000.;
System.err.format("Processing at %g lines/s.\n", count / delta);
}
}
writer.close();
}
}