package hu.ppke.itk.nlpg.purepos.common.lemma; import hu.ppke.itk.nlpg.docmodel.IToken; import hu.ppke.itk.nlpg.purepos.model.ICombiner; import hu.ppke.itk.nlpg.purepos.model.IVocabulary; import hu.ppke.itk.nlpg.purepos.model.ModelData; import hu.ppke.itk.nlpg.purepos.model.internal.AbstractRawModelData; import hu.ppke.itk.nlpg.purepos.model.internal.LogLinearBiCombiner; import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.tuple.Pair; public class LemmaUtil { public static Map<IToken, Pair<ILemmaTransformation<String, Integer>, Double>> batchConvert( Map<ILemmaTransformation<String, Integer>, Double> probMap, String word, IVocabulary<String, Integer> vocab) { Map<IToken, Pair<ILemmaTransformation<String, Integer>, Double>> ret = new HashMap<IToken, Pair<ILemmaTransformation<String, Integer>, Double>>(); for (Map.Entry<ILemmaTransformation<String, Integer>, Double> entry : probMap .entrySet()) { IToken lemma = entry.getKey().convert(word, vocab); ret.put(lemma, Pair.of(entry.getKey(), entry.getValue())); // Pair<ILemmaTransformation<String, Integer>, Double> ent = ret.get(lemma); // if (ent == null) { // ret.put(lemma, Pair.of(entry.getKey(), entry.getValue())); // } else if (ent.getRight() < entry.getValue()) { // ret.put(lemma, Pair.of(entry.getKey(), entry.getValue())); // } } return ret; } public static ILemmaTransformation<String, Integer> defaultLemmaRepresentation( String word, String stem, Integer tag) { return new SuffixLemmaTransformation(word, stem, tag); } public static ILemmaTransformation<String, Integer> defaultLemmaRepresentation( IToken tok, ModelData<String, Integer> data ) { Integer t = data.tagVocabulary.getIndex(tok.getTag()); return defaultLemmaRepresentation(tok.getToken(), tok.getStem(), t); } public static ICombiner defaultCombiner() { return new LogLinearBiCombiner(); } public static void storeLemma(String word, String lemma, Integer tag, String tagString, AbstractRawModelData<String, Integer> rawModelData) { rawModelData.lemmaUnigramModel.increment(lemma); int count = 1; ILemmaTransformation<String, Integer> lemmaTrans = defaultLemmaRepresentation( word, lemma, tag); rawModelData.lemmaSuffixTree.addWord(word, lemmaTrans, count, lemmaTrans.minimalCutLength()); // rawModelData.lemmaFreqTree.addWord(lemma, mainPosTag(tagString), // count); } protected static Pattern mainPosPat = Pattern .compile("\\[([^.\\]]*)[.\\]]"); public static String mainPosTag(String tag) { Matcher matcher = mainPosPat.matcher(tag); matcher.find(); try { return matcher.group(1); } catch (Exception e) { System.err.println(tag); return null; } } }