/** * Copyright (C) 2012 cogroo <cogroo@cogroo.org> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.cogroo.tools.postag; import java.io.IOException; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; import opennlp.tools.dictionary.Dictionary; import opennlp.tools.util.featuregen.DictionaryFeatureGenerator; import opennlp.tools.util.featuregen.WindowFeatureGenerator; import org.cogroo.dictionary.FeatureDictionary; import org.cogroo.dictionary.impl.FSADictionary; import org.cogroo.dictionary.impl.FSAFeatureDictionary; import org.cogroo.util.PairWordPOSTag; public class PortugueseExtPOSContextGenerator extends PortuguesePOSContextGenerator { private FSADictionary trans; private FeatureDictionary feat; private WindowFeatureGenerator dfg; public PortugueseExtPOSContextGenerator(Dictionary dict) { this(0, dict); } public PortugueseExtPOSContextGenerator(int cacheSize, Dictionary dict) { super(cacheSize, dict); // load transitivity dic try { this.trans = FSADictionary .createFromResources("/fsa_dictionaries/pos/pt_br_trans.dict"); this.feat = FSAFeatureDictionary .createFromResources("/fsa_dictionaries/featurizer/pt_br_feats.dict"); dfg = new WindowFeatureGenerator(new DictionaryFeatureGenerator("loc_prep=", new Dictionary(getClass().getResourceAsStream( "/dictionaries/pt_br/locucoes_prepositivas.xml"))), 2, 2); } catch (IllegalArgumentException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } @Override protected void getContext(final int index, String[] sequence, String[] priorDecisions, Object[] additionalContext, List<String> modContext) { super.getContext(index, sequence, priorDecisions, additionalContext, modContext); // locucoes // this.dfg.createFeatures(modContext, sequence, index, null); // this.dfg.clearAdaptiveData(); // Check transitivity if (index > 0) addTransitivity("prev_", sequence[index - 1], modContext); addTransitivity("", sequence[index], modContext); if (index < sequence.length - 1) addTransitivity("nxt_", sequence[index + 1], modContext); // specific features: // 'a' if ("a".equals(sequence[index].toLowerCase())) { if (index > 0 && isVerbTag(priorDecisions[index - 1])) { modContext.add("spec_a=v+a"); addVerbTransitivityAndLemma("spec_a", sequence[index - 1], modContext); } else if (index > 1 && isVerbTag(priorDecisions[index - 2])) { addVerbTransitivityAndLemma2("spec_pa", sequence[index - 2], priorDecisions[index - 1], modContext); } if (index < sequence.length - 1) { String next = sequence[index + 1]; if (next.length() > 2) { if (matchFeature(next, "n", "F")) { modContext.add("spec_af"); if (index > 0) modContext.add("spec_af|" + priorDecisions[index - 1]); } else if (matchFeature(next, "n", "M")) { modContext.add("spec_am"); if (index > 0) modContext.add("spec_am|" + priorDecisions[index - 1]); } } } } // 'que' if ("que".equals(sequence[index].toLowerCase())) { if (index < sequence.length - 1) { // at least one before and one after boolean nextIsVerb = false; boolean prevIsVerb = false; boolean prevprevIsVerb = false; nextIsVerb = isCanBeAVerb(sequence[index + 1]); if (index > 0) { modContext.add("spec_que_prev=" + priorDecisions[index - 1]); prevIsVerb = isCanBeAVerb(sequence[index - 1]); if (index > 1) { modContext.add("spec_que_pprev=" + priorDecisions[index - 2]); prevprevIsVerb = isCanBeAVerb(sequence[index - 2]); } } if (nextIsVerb) { modContext.add("spec_que=nv"); } if(prevIsVerb) { modContext.add("spec_que=pv"); modContext.add("spec_que_pv=" + priorDecisions[index - 1]); } if(prevprevIsVerb) { modContext.add("spec_que=ppv"); modContext.add("spec_que_ppv=" + priorDecisions[index - 2]); } } } } private boolean matchFeature(String next, String tag, String feature) { String[] features = feat.getFeatures(next, null); if (features == null) features = feat.getFeatures(next.toLowerCase(), null); if (features != null) { for (String f : features) { if (f.contains(feature)) { return true; } } } return false; } private boolean isCanBeAVerb(String candidate) { return trans.getTags(candidate) != null; } private void addVerbTransitivityAndLemma2(String prefix, String ppVerb, String prevTag, List<String> modContext) { List<PairWordPOSTag> tagsAndLemmas = trans.getTagsAndLemms(ppVerb); if (tagsAndLemmas != null && tagsAndLemmas.size() > 0) { for (PairWordPOSTag pairWordPOSTag : tagsAndLemmas) { modContext.add(prefix + "_lm=" + prevTag + "|" + pairWordPOSTag.getWord()); // adds the lemma modContext.add(prefix + "_tr=" + prevTag + "|" + pairWordPOSTag.getPosTag()); // adds the transitivity modContext.add(prefix + "_lmtr=" + prevTag + "|" + pairWordPOSTag.getWord() + "|" + pairWordPOSTag.getPosTag()); // adds // the // transitivity } } } private void addVerbTransitivityAndLemma(String prefix, String verb, List<String> modContext) { List<PairWordPOSTag> tagsAndLemmas = trans.getTagsAndLemms(verb); if (tagsAndLemmas != null && tagsAndLemmas.size() > 0) { for (PairWordPOSTag pairWordPOSTag : tagsAndLemmas) { modContext.add(prefix + "_lm=" + pairWordPOSTag.getWord()); // adds the // lemma modContext.add(prefix + "_tr=" + pairWordPOSTag.getPosTag()); // adds // the // transitivity modContext.add(prefix + "_lmtr=" + pairWordPOSTag.getWord() + "|" + pairWordPOSTag.getPosTag()); // adds the transitivity } } } private boolean isVerbTag(String string) { return string.startsWith("v-"); } private void addTransitivity(String prefix, String tok, List<String> modContext) { if (trans != null) { String[] tags = trans.getTags(tok); if (tags != null) { if (tags.length > 1) { Set<String> trans = new HashSet<String>(Arrays.asList(tags)); for (String t : trans) { modContext.add(prefix + "trans=" + t); } } else { modContext.add(prefix + "trans=" + tags[0]); } } } } }