package chipmunk.segmenter;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Logger;
import marmot.util.FileUtils;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
/**
 * Command-line driver that trains and scores a {@link Segmenter} on numbered
 * data chunks found under {@code <dir>/<lang>/}.
 *
 * <p>For every processed chunk {@code i} the files {@code <dir>/<lang>/<i>.trn}
 * and {@code <dir>/<lang>/<i>.tst} are read, a segmenter is trained on the
 * former and scored on the latter. The per-chunk report and the average
 * F-score are written to the log; if {@code --out} is non-empty the segmented
 * test data is additionally written to {@code <out>/<lang>/<i>.tst}.
 */
public class SegmenterExperiment {

    /** Name and long flag of the option carrying extra stem dictionary paths. */
    private static final String STEM_DICT = "stem-dict";

    /**
     * Parses the command line, runs the experiment and logs the scores.
     *
     * <p>When {@code --num-chunks} is below 10, only the single chunk whose
     * index equals that value is processed and the reported average covers
     * just that chunk; otherwise chunks {@code 0 .. num-chunks - 1} are
     * processed.
     *
     * @param args command-line arguments; on a parse failure the errors and
     *             usage are printed to standard error and the JVM exits with
     *             status 1
     * @throws JSAPException declared for the JSAP parameter registration calls
     * @throws IOException   declared for the data reading and writing calls
     */
    public static void main(String[] args) throws JSAPException, IOException {
        JSAP jsap = new JSAP();
        registerExperimentParameters(jsap);

        SegmenterOptions options = new SegmenterOptions();
        options.registerOptions(jsap);

        JSAPResult config = jsap.parse(args);
        if (!config.success()) {
            reportParseFailure(jsap, config);
        }
        options.setOptions(config);

        String dir = config.getString("dir");
        String out = config.getString("out");
        String lang = options.getString(SegmenterOptions.LANG);
        int requestedChunks = config.getInt("num-chunks");
        boolean useDict = config.getBoolean("use-dict");
        String stemDicts = config.getString(STEM_DICT);

        String dictPath = buildDictionaryPaths(useDict, dir, lang, stemDicts);
        options.setOption(SegmenterOptions.VERBOSE, true);
        options.setOption(SegmenterOptions.DICTIONARY_PATHS, dictPath);

        Logger logger = Logger.getLogger(SegmenterTrainer.class.getName());

        // Reader over the full training set; every chunk is passed through
        // its map(...) method before being used.
        String globalTrainFile = String.format("%s/%s/trn", dir, lang);
        int tagLevel = options.getInt(SegmenterOptions.TAG_LEVEL);
        SegmentationDataReader globalReader = new SegmentationDataReader(
                globalTrainFile, lang, tagLevel);

        // A value below 10 selects exactly one chunk (the one with that index).
        boolean singleChunk = requestedChunks < 10;
        int firstChunk = singleChunk ? requestedChunks : 0;
        int chunkLimit = singleChunk ? requestedChunks + 1 : requestedChunks;
        int averagedChunks = singleChunk ? 1 : requestedChunks;

        double fscoreTotal = 0.0;
        for (int chunk = firstChunk; chunk < chunkLimit; chunk++) {
            System.err.format("chunk: %d\n", chunk);

            String trainPath = String.format("%s/%s/%d.trn", dir, lang, chunk);
            String testPath = String.format("%s/%s/%d.tst", dir, lang, chunk);

            SegmentationDataReader trainReader =
                    new SegmentationDataReader(trainPath, lang, 0);
            List<Word> trainWords = globalReader.map(trainReader.getData());

            SegmentationDataReader testReader =
                    new SegmentationDataReader(testPath, lang, 0);
            List<Word> testWords = globalReader.map(testReader.getData());

            Segmenter segmenter = new SegmenterTrainer(options).train(trainWords);

            Scorer scorer = new Scorer();
            scorer.eval(testWords, segmenter);

            String report = String.format("%s F1 of chunk %d: %s\n", lang, chunk,
                    scorer.report());
            logger.info(report);
            fscoreTotal += scorer.getFscore();

            if (out.isEmpty()) {
                continue;
            }
            FileUtils.mkDir(String.format("%s/%s", out, lang));
            String outPath = String.format("%s/%s/%d.tst", out, lang, chunk);
            segmenter.segmentToFile(outPath, testWords);
        }

        double average = fscoreTotal / averagedChunks;
        logger.info(String.format("%s Average F1: %g\n", lang, average));
    }

    /**
     * Registers the options owned by this driver ({@code dir}, {@code out},
     * {@code stem-dict}, {@code num-chunks}, {@code use-dict}) on the parser,
     * in that order.
     */
    private static void registerExperimentParameters(JSAP jsap) throws JSAPException {
        jsap.registerParameter(
                new FlaggedOption("dir").setRequired(true).setLongFlag("dir"));
        jsap.registerParameter(
                new FlaggedOption("out").setRequired(true).setLongFlag("out").setDefault(""));
        jsap.registerParameter(
                new FlaggedOption(STEM_DICT).setRequired(true).setLongFlag(STEM_DICT).setDefault("_"));
        jsap.registerParameter(
                new FlaggedOption("num-chunks")
                        .setStringParser(JSAP.INTEGER_PARSER)
                        .setLongFlag("num-chunks")
                        .setDefault("10"));
        jsap.registerParameter(
                new FlaggedOption("use-dict")
                        .setStringParser(JSAP.BOOLEAN_PARSER)
                        .setLongFlag("use-dict")
                        .setDefault("true"));
    }

    /**
     * Prints every parse error followed by usage and help text to standard
     * error, then terminates the JVM with exit status 1.
     */
    private static void reportParseFailure(JSAP jsap, JSAPResult config) {
        Iterator<?> messages = config.getErrorMessageIterator();
        while (messages.hasNext()) {
            System.err.println("Error: " + messages.next());
        }
        System.err.println("Usage: ");
        System.err.println(jsap.getUsage());
        System.err.println(jsap.getHelp());
        System.err.println();
        System.exit(1);
    }

    /**
     * Builds the space-separated dictionary path list handed to the segmenter
     * options.
     *
     * @return {@code "_"} when dictionaries are disabled; otherwise the three
     *         per-language dictionary files under {@code dir}, followed by
     *         {@code stemDicts} unless that value is {@code "_"}
     */
    private static String buildDictionaryPaths(boolean useDict, String dir,
            String lang, String stemDicts) {
        if (!useDict) {
            return "_";
        }
        String paths = String.format(
                "%s/%s/wiktionary.txt %s/%s/aspell.txt %s/%s/wordlist.txt",
                dir, lang, dir, lang, dir, lang);
        return stemDicts.equals("_") ? paths : paths + " " + stemDicts;
    }
}