// Copyright 2013 Thomas Müller // This file is part of MarMoT, which is licensed under GPLv3. package marmot.morph; import java.util.ArrayList; import java.util.List; import lemming.lemma.LemmaInstance; import lemming.lemma.Lemmatizer; import lemming.lemma.ranker.RankerCandidate; import marmot.core.Model; import marmot.core.Sequence; import marmot.core.SimpleTagger; import marmot.core.State; import marmot.core.Token; import marmot.core.WeightVector; public class MorphTagger extends SimpleTagger { private static final long serialVersionUID = 1L; private transient Lemmatizer lemmatizer_; public MorphTagger(Model model, int order, WeightVector weight_vector) { super(model, order, weight_vector); } public void setPipeLineLemmatizer(Lemmatizer lemmatizer) { lemmatizer_ = lemmatizer; } protected void addIndexes(Sequence sequence) { MorphModel model = (MorphModel) getModel(); for (Token token : sequence) { Word word = (Word) token; model.addIndexes(word, false); } } @Override public List<List<String>> tag(Sequence sequence) { addIndexes(sequence); return super.tag(sequence); } public List<List<String>> tagWithLemma(Sequence sequence) { addIndexes(sequence); List<State> states = tag_states(sequence); List<List<String>> list = new ArrayList<>(sequence.size()); int token_index = 0; for (State state : states) { List<String> lemma_tags = new ArrayList<>(); List<String> tags = indexesToStrings(stateToIndexes(state)); String lemma = null; if (state.getLemmaCandidates() != null) { RankerCandidate candidate = RankerCandidate.bestCandidate(state.getLemmaCandidates()); lemma = candidate.getLemma(); } else if (lemmatizer_ != null) { Word word = (Word) sequence.get(token_index); LemmaInstance instance = LemmaInstance.getInstance(word); instance.setPosTag(tags.get(0)); if (1 < tags.size()) { instance.setMorphTag(tags.get(1)); } lemma = lemmatizer_.lemmatize(instance); } lemma_tags.add(lemma); lemma_tags.addAll(tags); list.add(lemma_tags); token_index ++; } return list; } }