// Copyright 2013 Thomas Müller // This file is part of MarMoT, which is licensed under GPLv3. package experimental.morfessor; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.Serializable; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import marmot.util.FileUtils; import marmot.util.StringUtils; import marmot.util.StringUtils.Mode; import com.martiansoftware.jsap.FlaggedOption; import com.martiansoftware.jsap.JSAP; import com.martiansoftware.jsap.JSAPException; import com.martiansoftware.jsap.JSAPResult; public class Morfessor implements Serializable, Splitter { private static final long serialVersionUID = 1L; private CharEncoder encoder_; private ViterbiDecoder viterbi_; private Expander expander_; public Morfessor(String train_dir) { this(train_dir, null); } public Morfessor(String train_dir, CharEncoder encoder) { viterbi_ = new ViterbiDecoder(train_dir + "/viterbitagsplit2.ii.probs.gz"); expander_ = new Expander(train_dir + "/viterbitagsplit2.ii.tagged.gz"); encoder_ = encoder; } private List<Morpheme> split_(String word) { return expander_.expand(viterbi_.split(word)); } public void setEncoder(CharEncoder encoder) { encoder_ = encoder; } public List<String> split(String word) { return split(word, 200); } public List<String> split(String word, int length_limit) { List<String> list = new LinkedList<String>(); for (String token : Vocab.tokenize(word)) { if (Vocab.isSpecial(token)) { list.add(token); continue; } String new_token = StringUtils.normalize(token, Mode.lower); if (new_token.length() != token.length()) { System.err.println(new_token + " ==> " + token); } if (new_token.length() > length_limit) { list.add(new_token); } else { if (encoder_ != null) new_token = encoder_.encode(new_token); List<Morpheme> morphemes = split_(new_token); int last_index = 0; for (Morpheme morph : morphemes) { int new_index = last_index + morph.getMorpheme().length(); String string = token.substring(last_index, new_index); last_index = new_index; list.add(string); } } } return list; } public static void main(String[] args) throws JSAPException, IOException { FlaggedOption opt; JSAP jsap = new JSAP(); opt = new FlaggedOption("morfessor").setRequired(true).setLongFlag( "morfessor"); jsap.registerParameter(opt); opt = new FlaggedOption("encoder").setRequired(false).setLongFlag( "encoder"); jsap.registerParameter(opt); JSAPResult config = jsap.parse(args); if (!config.success()) { for (Iterator<?> errs = config.getErrorMessageIterator(); errs .hasNext();) { System.err.println("Error: " + errs.next()); } System.err.println("Usage: "); System.err.println(jsap.getUsage()); System.err.println(jsap.getHelp()); System.err.println(); System.exit(1); } CharEncoder encoder = null; if (config.getString("encoder") != null) { encoder = FileUtils.loadFromFile(config.getString("encoder")); } Morfessor m = new Morfessor(config.getString("morfessor"), encoder); String commandLine; BufferedReader console = new BufferedReader(new InputStreamReader( System.in)); while (true) { System.out.print("morfessor> "); commandLine = console.readLine(); if (commandLine == null) { break; } if (commandLine.equals("")) { continue; } if (commandLine.equals("q") || commandLine.equals("e") || commandLine.equals("exit") || commandLine.equals("quit")) { break; } for (String morph : m.split(commandLine)) { System.out.print(' '); System.out.print(morph); } System.out.println(); } } // private void test(String filename) { // int error = 0; // int total = 0; // // try { // BufferedReader reader = File.openFile(filename); // // while (reader.ready()) { // String line = reader.readLine(); // // if (line.startsWith("#")) { // continue; // } // // int index; // int space_index = line.indexOf(' '); // int tab_index = line.indexOf('\t'); // // if (space_index == -1) { // index = tab_index; // } else if (tab_index == -1) { // index = space_index; // } else { // index = Math.min(tab_index, space_index); // } // // line = line.substring(index + 1); // List<Morpheme> morphs = Morpheme.split(line.trim()); // String word = Morpheme.join(morphs, false, false, ""); // List<Morpheme> morphs2 = split_(word); // // if (!morphs.equals(morphs2)) { // if (error < 100) { // System.err.format("%s : %s <-> %s\n", word, morphs, // morphs2); // } // error++; // } // total++; // } // // } catch (IOException e) { // throw new RuntimeException(e); // } // // System.err.println("Error rate: " + error + " / " + total); // } }