// Copyright 2015 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.

package lemming.lemma;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import marmot.util.Counter;

/**
 * Trainer that collects, for every key derived from a training instance, a
 * {@link Counter} over the lemmas observed with that key, and hands the
 * resulting table to a {@link SimpleLemmatizer}.
 */
public class SimpleLemmatizerTrainer implements LemmatizerGeneratorTrainer {

    /**
     * Options specific to {@link SimpleLemmatizerTrainer}. The constructor
     * registers three boolean options on top of whatever {@link LemmaOptions}
     * provides: {@code handle-unseen} (default {@code false}),
     * {@code use-backup} (default {@code true}) and
     * {@code abstain-if-ambigious} (default {@code false}).
     *
     * <p>The spelling of {@link #ABSTAIN_IF_AMBIGIOUS}, its string value and
     * {@link #getAbstainIfAmbigous()} is kept as-is because these names are
     * part of the public interface and of the option key read at runtime.
     */
    public static class SimpleLemmatizerTrainerOptions extends LemmaOptions {

        private static final long serialVersionUID = 1L;

        /** Option key; how the lemmatizer interprets it is not visible in this file. */
        public static final String HANDLE_UNSEEN = "handle-unseen";

        /** Option key; when set, {@code train} also indexes instances by their simple key. */
        public static final String USE_BACKUP = "use-backup";

        /** Option key; how the lemmatizer interprets it is not visible in this file. */
        public static final String ABSTAIN_IF_AMBIGIOUS = "abstain-if-ambigious";

        /** Registers the default values; use {@link #newInstance()} from outside. */
        private SimpleLemmatizerTrainerOptions() {
            map_.put(HANDLE_UNSEEN, false);
            map_.put(USE_BACKUP, true);
            map_.put(ABSTAIN_IF_AMBIGIOUS, false);
        }

        /**
         * @return a fresh options object carrying the default values
         */
        public static SimpleLemmatizerTrainerOptions newInstance() {
            return new SimpleLemmatizerTrainerOptions();
        }

        /**
         * @return the current value of the {@code handle-unseen} option
         */
        public boolean getHandleUnseen() {
            Boolean flag = (Boolean) getOption(HANDLE_UNSEEN);
            return flag;
        }

        /**
         * @return the current value of the {@code use-backup} option
         */
        public boolean getUseBackup() {
            Boolean flag = (Boolean) getOption(USE_BACKUP);
            return flag;
        }

        /**
         * @return the current value of the {@code abstain-if-ambigious} option
         */
        public boolean getAbstainIfAmbigous() {
            Boolean flag = (Boolean) getOption(ABSTAIN_IF_AMBIGIOUS);
            return flag;
        }

    }

    private SimpleLemmatizerTrainerOptions options_;

    /** Creates a trainer configured with the default options. */
    public SimpleLemmatizerTrainer() {
        options_ = SimpleLemmatizerTrainerOptions.newInstance();
    }

    /**
     * Builds the key-to-lemma-count table from the training instances and
     * wraps it in a {@link SimpleLemmatizer}.
     *
     * @param instances     training instances to count
     * @param dev_instances not read by this implementation
     * @return a {@link SimpleLemmatizer} constructed from this trainer's
     *         options and the collected counts
     */
    @Override
    public LemmatizerGenerator train(List<LemmaInstance> instances,
            List<LemmaInstance> dev_instances) {
        Map<String, Counter<String>> lemmaCounts = new HashMap<>();

        for (LemmaInstance current : instances) {
            // getUsePos() is not declared in this file; presumably inherited
            // from LemmaOptions -- TODO confirm.
            if (options_.getUsePos()) {
                addToMap(SimpleLemmatizer.toKey(current), lemmaCounts, current);
            }

            // An instance is counted under both keys when both options are on.
            if (options_.getUseBackup()) {
                addToMap(SimpleLemmatizer.toSimpleKey(current), lemmaCounts, current);
            }
        }

        return new SimpleLemmatizer(options_, lemmaCounts);
    }

    /**
     * Adds the instance's count to the tally of its lemma under {@code key},
     * creating the counter for that key on first use.
     *
     * @param key      table key to count under
     * @param map      table being filled
     * @param instance instance supplying the lemma and its count
     */
    private void addToMap(String key, Map<String, Counter<String>> map,
            LemmaInstance instance) {
        Counter<String> existing = map.get(key);
        Counter<String> tally = existing;

        if (existing == null) {
            tally = new Counter<>();
            map.put(key, tally);
        }

        tally.increment(instance.getLemma(), instance.getCount());
    }

    /**
     * @return the options object this trainer was created with
     */
    @Override
    public LemmaOptions getOptions() {
        return options_;
    }

}