package chipmunk.segmenter;

import java.io.Serializable;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.StringParser;

/**
 * Typed, named option store for the segmenter.
 *
 * <p>Every option has a name (one of the string constants below), a default
 * value and a comment. The runtime class of the default value fixes the type
 * of the option: later assignments through {@link #setOption(String, Object)}
 * must supply a value of exactly the same class.
 *
 * <p>Options can be exposed as command-line flags via
 * {@link #registerOptions(JSAP)} and read back from a parse result via
 * {@link #setOptions(JSAPResult)}.
 */
public class SegmenterOptions implements Serializable {

    private static final long serialVersionUID = 1L;

    public static final String CRF_MODE = "crf-mode";
    public static final String NUM_ITERATIONS = "num-iterations";
    public static final String AVERAGING = "averaging";
    public static final String PENALTY = "penalty";
    public static final String SEED = "seed";
    public static final String MAX_CHARACTER_WINDOW = "max-character-window";
    public static final String USE_SEGMENT_CONTEXT = "use-segment-context";
    public static final String USE_CHARACTER_FEATURE = "use-character-feature";
    public static final String DICTIONARY_PATHS = "dictionary-paths";
    public static final String LANG = "lang";
    public static final String VERBOSE = "verbose";
    public static final String TAG_LEVEL = "tag-level";

    /** Option name to (comment, current value). */
    private final Map<String, Option> map_;

    /** Lazily created RNG seeded from {@link #SEED}; see {@link #getRandom()}. */
    private Random random_;

    /** A single option: its comment and its current value. */
    private static class Option implements Serializable {

        private static final long serialVersionUID = 1L;

        String comment_;
        Object value_;

        Option(String comment, Object value) {
            comment_ = comment;
            value_ = value;
        }
    }

    /** Creates an option set populated with the default values. */
    public SegmenterOptions() {
        map_ = new HashMap<>();
        setDefaults();
    }

    /** Registers every known option together with its default value. */
    private void setDefaults() {
        addOption(NUM_ITERATIONS, 15, "Num iterations (perceptron)");
        addOption(AVERAGING, true, "Whether to use averaging (perceptron)");
        addOption(CRF_MODE, false, "Train CRF instead of perceptron");
        addOption(PENALTY, 0.0, "Quadratic penalty coefficient (CRF)");
        addOption(SEED, 42L, "RNG seed");
        addOption(MAX_CHARACTER_WINDOW, 3, "Maximum character window around segment");
        addOption(USE_SEGMENT_CONTEXT, true,
                "Whether to join character window and segment feature");
        addOption(USE_CHARACTER_FEATURE, true, "Use Ruokolinen-style character features.");
        addOption(DICTIONARY_PATHS, "_", "Space separated list of dictionary files or '_'");
        addOption(LANG, "_", "Iso3 language code. Only used to canonicalize forms");
        addOption(TAG_LEVEL, 0, "The tag level to use");
        addOption(VERBOSE, false, "Verbosity");
    }

    /** Adds (or overwrites) the option {@code name} with the given value and comment. */
    private void addOption(String name, Object value, String comment) {
        map_.put(name, new Option(comment, value));
    }

    /**
     * Looks up an option by name.
     *
     * @throws RuntimeException if no option with that name exists
     */
    private Option getOption(String name) {
        Option opt = map_.get(name);
        if (opt == null) {
            throw new RuntimeException("No such option: " + name);
        }
        return opt;
    }

    /** Returns the current, untyped value of the option {@code name}. */
    private Object getObject(String name) {
        return getOption(name).value_;
    }

    /**
     * Returns the value of an integer option.
     *
     * @throws RuntimeException   if no option with that name exists
     * @throws ClassCastException if the option does not hold an {@link Integer}
     */
    public Integer getInt(String name) {
        return (Integer) getObject(name);
    }

    /**
     * Returns the value of a string option.
     *
     * @throws RuntimeException   if no option with that name exists
     * @throws ClassCastException if the option does not hold a {@link String}
     */
    public String getString(String name) {
        return (String) getObject(name);
    }

    /**
     * Returns the value of a boolean option.
     *
     * @throws RuntimeException   if no option with that name exists
     * @throws ClassCastException if the option does not hold a {@link Boolean}
     */
    public Boolean getBoolean(String name) {
        return (Boolean) getObject(name);
    }

    /**
     * Returns the value of a double option.
     *
     * @throws RuntimeException   if no option with that name exists
     * @throws ClassCastException if the option does not hold a {@link Double}
     */
    public Double getDouble(String name) {
        return (Double) getObject(name);
    }

    /**
     * Returns the random number generator of this option set.
     *
     * <p>The generator is created on first use from the current {@link #SEED}
     * value and the same instance is returned on subsequent calls. Changing
     * the seed through {@link #setOption(String, Object)} discards the cached
     * generator, so the next call creates one from the new seed.
     */
    public Random getRandom() {
        if (random_ == null) {
            random_ = new Random(getLong(SEED));
        }
        return random_;
    }

    /** Returns the value of a long option, unboxed. */
    private long getLong(String name) {
        return (Long) getObject(name);
    }

    /**
     * Registers one flagged, non-required command-line parameter per option.
     *
     * <p>The option name is used both as parameter id and as long flag, the
     * current value (via {@code toString()}) as default, and the comment as
     * usage name.
     *
     * @throws JSAPException    declared because {@code jsap.registerParameter} is called
     * @throws RuntimeException if an option value has a type for which no
     *                          parser is known
     */
    public void registerOptions(JSAP jsap) throws JSAPException {
        for (Map.Entry<String, Option> entry : map_.entrySet()) {
            String name = entry.getKey();
            String comment = entry.getValue().comment_;
            Object value = entry.getValue().value_;

            StringParser parser = parserFor(name, value);

            FlaggedOption opt = new FlaggedOption(name).setStringParser(parser)
                    .setLongFlag(name).setDefault(value.toString())
                    .setRequired(false).setUsageName(comment);
            jsap.registerParameter(opt);
        }
    }

    /** Selects the JSAP parser matching the runtime class of {@code value}. */
    private static StringParser parserFor(String name, Object value) {
        Class<?> type = value.getClass();
        if (type == String.class) {
            return JSAP.STRING_PARSER;
        }
        if (type == Boolean.class) {
            return JSAP.BOOLEAN_PARSER;
        }
        if (type == Integer.class) {
            return JSAP.INTEGER_PARSER;
        }
        if (type == Long.class) {
            return JSAP.LONG_PARSER;
        }
        if (type == Double.class) {
            return JSAP.DOUBLE_PARSER;
        }
        throw new RuntimeException(String.format(
                "Unknown type: %s %s\n", name, type));
    }

    /**
     * Copies the value of every known option from a parse result.
     *
     * @throws NullPointerException if {@code config} yields no value for an option
     * @throws RuntimeException     if a value has the wrong type
     */
    public void setOptions(JSAPResult config) {
        for (Map.Entry<String, Option> entry : map_.entrySet()) {
            String name = entry.getKey();
            setOption(name, config.getObject(name));
        }
    }

    /**
     * Assigns a new value to an existing option.
     *
     * @param name      name of the option
     * @param new_value new value; must be non-null and of exactly the same
     *                  class as the current value
     * @throws RuntimeException     if no such option exists or the value has
     *                              the wrong type
     * @throws NullPointerException if {@code new_value} is {@code null}
     */
    public void setOption(String name, Object new_value) {
        Option opt = getOption(name);

        if (new_value == null) {
            // Same exception type as the implicit dereference would raise,
            // but with a message that identifies the offending option.
            throw new NullPointerException(String.format(
                    "Value for option '%s' must not be null, needed: %s",
                    name, opt.value_.getClass()));
        }

        if (opt.value_.getClass() != new_value.getClass()) {
            throw new RuntimeException(String.format(
                    "Value is of wrong type, provided: %s, needed: %s",
                    new_value.getClass(), opt.value_.getClass()));
        }

        // A cached generator was built from the old seed; drop it so that the
        // new seed takes effect. Re-assigning the same seed keeps the cache.
        if (SEED.equals(name) && !new_value.equals(opt.value_)) {
            random_ = null;
        }

        opt.value_ = new_value;
    }

    /**
     * Returns the dictionary paths configured in {@link #DICTIONARY_PATHS}.
     *
     * <p>The option value is split on runs of whitespace; empty tokens and the
     * placeholder {@code "_"} are skipped.
     *
     * @return a newly created collection, empty if no dictionary is configured
     */
    public Collection<String> getDictionaries() {
        List<String> dicts = new LinkedList<>();
        String dict_string = getString(SegmenterOptions.DICTIONARY_PATHS);
        for (String dict : dict_string.split("\\s+")) {
            if (!(dict.isEmpty() || dict.equals("_"))) {
                dicts.add(dict);
            }
        }
        return dicts;
    }
}