/**
 * Copyright (C) 2012 cogroo <cogroo@cogroo.org>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.cogroo.tools.featurizer;

import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import org.cogroo.tools.chunker2.TokenTag;

import opennlp.tools.util.featuregen.StringPattern;
import opennlp.tools.util.featuregen.TokenClassFeatureGenerator;

/**
 * A context generator for the Featurizer.
 */
public class DefaultFeaturizerContextGenerator implements
    FeaturizerContextGenerator {

  protected final String SE = "*SE*";
  protected final String SB = "*SB*";
  private static final int PREFIX_LENGTH = 4;
  private static final int SUFFIX_LENGTH = 4;

  private TokenClassFeatureGenerator tokenClassFeatureGenerator = new TokenClassFeatureGenerator();

  // TODO: this is language dependent!
  private NumberFormat nf = NumberFormat.getInstance(new Locale("pt"));

  private boolean isWiderContext;
  private boolean isSuffixFeats;
  private boolean isHiphenedFeats;
  private boolean isNumberFeats;
  private boolean isClassFeatures;

  /**
   * Creates the context generator with the given feature flags. Each letter
   * enables a feature group: {@code w} = wider (5-token) window, {@code s} =
   * suffix/prefix features, {@code h} = hyphen/underscore features,
   * {@code n} = number features, {@code c} = token-class features.
   * The default is {@code shnc}.
   *
   * @param flags the feature flags
   */
  public DefaultFeaturizerContextGenerator(String flags) {
    this.isWiderContext = flags.contains("w");
    this.isSuffixFeats = flags.contains("s");
    this.isHiphenedFeats = flags.contains("h");
    this.isNumberFeats = flags.contains("n");
    this.isClassFeatures = flags.contains("c");
  }

  protected static String[] getPrefixes(String lex) {
    String[] prefs = new String[PREFIX_LENGTH];
    for (int li = 0, ll = PREFIX_LENGTH; li < ll; li++) {
      prefs[li] = lex.substring(0, Math.min(li + 1, lex.length()));
    }
    return prefs;
  }

  protected static String[] getSuffixes(String lex) {
    String[] suffs = new String[SUFFIX_LENGTH];
    for (int li = 0, ll = SUFFIX_LENGTH; li < ll; li++) {
      suffs[li] = lex.substring(Math.max(lex.length() - li - 1, 0));
    }
    return suffs;
  }

  public String[] getContext(int index, TokenTag[] sequence,
      String[] priorDecisions, Object[] additionalContext) {
    String[] w = new String[sequence.length];
    String[] t = new String[sequence.length];
    TokenTag.extract(sequence, w, t);

    return getContext(index, w, t, priorDecisions);
  }

  /**
   * Returns the context for predicting the feature tag of the token at the
   * specified index, given the tokens, their POS tags and the previous
   * predictions.
   *
   * @param i
   *          The index of the token for which the context is provided.
   * @param toks
   *          The tokens in the sentence.
   * @param tags
   *          The POS tags of the tokens.
   * @param preds
   *          The predictions already assigned to the previous tokens in the
   *          sentence.
   * @return The context for predicting the feature tag of the token at the
   *         specified index.
   */
  public String[] getContext(int i, String[] toks, String[] tags,
      String[] preds) {
    List<String> e = new ArrayList<String>();

    if (isWiderContext)
      createWindowFeats(i, toks, tags, preds, e);
    else
      create3WindowFeats(i, toks, tags, preds, e);

    if (i > 0)
      wrappWindowFeatures("prev_", i - 1, toks, tags, preds, e);

    wrappWindowFeatures("", i, toks, tags, preds, e);

    if (i < toks.length - 1)
      wrappWindowFeatures("nxt_", i + 1, toks, tags, preds, e);

    String[] context = e.toArray(new String[e.size()]);

    return context;
  }

  private void wrappWindowFeatures(String prefix, int i, String[] toks,
      String[] tags, String[] preds, List<String> e) {
    String lex = toks[i];
    List<String> features = new ArrayList<String>();

    if (isClassFeatures)
      tokenClassFeatureGenerator.createFeatures(features, toks, i, preds);

    if (isNumberFeats)
      createNumberFeats(i, toks, features);

    boolean suffixesCollected = false;
    if (isHiphenedFeats) {
      if (lex.length() >= 3) {
        if (lex.contains("_")) {
          createGroupSuffixex("us_", lex, features);
          suffixesCollected = true;
        }
        if (lex.contains("-")) {
          createGroupSuffixex("hf_", lex, features);
          suffixesCollected = true;
        }
      }
    }

    if (!suffixesCollected && isSuffixFeats) {
      createSuffixFeats(i, toks, tags, preds, features);
    }

    for (String f : features) {
      e.add(prefix + f);
    }
  }

  private static final Pattern UNDERLINE_PATTERN = Pattern.compile("[_-]");

  private void createGroupSuffixex(String pre, String lex, List<String> e) {
    String[] parts = UNDERLINE_PATTERN.split(lex);
    if (parts.length < 2) // this case is handled already
      return;
    for (int i = 0; i < parts.length; i++) {
      e.add(pre + "up_" + i + "=" + parts[i]);
      String prefix = pre + "prsf_" + i + "=";
      String[] suffixes = getSuffixes(parts[i]);
      for (String suf : suffixes) {
        e.add(prefix + suf);
      }
    }
  }

  private void createNumberFeats(int i, String[] toks, List<String> e) {
    String lex = toks[i];

    // only tokens made of digits (and no letters) get number features
    StringPattern sp = StringPattern.recognize(lex);
    if (sp.containsDigit() && !sp.containsLetters()) {
      // TODO: make it generic !! this is only for Portuguese!
      String num = lex; // we need only the decimal separator
      try {
        Number number = nf.parse(num);
        if (number != null) {
          // bucket the magnitude instead of using the raw value
          Double value = Math.abs(number.doubleValue());
          if (value >= 2) {
            e.add("num=h2");
          } else if (value >= 1) {
            e.add("num=h1");
          } else if (value > 0) {
            e.add("num=h0");
          } else {
            e.add("num=zero");
          }
        } else {
          e.add("numNull");
        }
      } catch (ParseException e1) {
        // not parseable as a Portuguese number; record that instead
        // System.err.println("failed to parse num: " + num);
        e.add("notNum");
      }
    }
  }

  private void createSuffixFeats(int i, String[] toks, String[] tags,
      String[] preds, List<String> e) {
    String lex = toks[i];

    // do some basic suffix analysis
    String[] suffs = getSuffixes(lex);
    for (int j = 0; j < suffs.length; j++) {
      e.add("suf=" + suffs[j]);
    }

    String[] prefs = getPrefixes(lex);
    for (int j = 0; j < prefs.length; j++) {
      e.add("pre=" + prefs[j]);
    }

    // see if the word has any special characters
    if (lex.indexOf('-') != -1) {
      e.add("h");
    }
  }

  // 0.9674293472168595
  private void createWindowFeats(int i, String[] toks, String[] tags,
      String[] preds, List<String> feats) {
    // Words in a 5-word window
    String w_2, w_1, w0, w1, w2;
    // Tags in a 5-word window
    String t_2, t_1, t0, t1, t2;
    // Previous predictions
    String p_2, p_1;

    w_2 = w_1 = w0 = w1 = w2 = null;
    t_2 = t_1 = t0 = t1 = t2 = null;
    p_1 = p_2 = null;

    if (i < 2) {
      w_2 = "w_2=bos";
      t_2 = "t_2=bos";
      p_2 = "p_2=bos";
    } else {
      w_2 = "w_2=" + toks[i - 2];
      t_2 = "t_2=" + tags[i - 2];
      p_2 = "p_2=" + preds[i - 2];
    }

    if (i < 1) {
      w_1 = "w_1=bos";
      t_1 = "t_1=bos";
      p_1 = "p_1=bos";
    } else {
      w_1 = "w_1=" + toks[i - 1];
      t_1 = "t_1=" + tags[i - 1];
      p_1 = "p_1=" + preds[i - 1];
    }

    w0 = "w0=" + toks[i];
    t0 = "t0=" + tags[i];

    if (i + 1 >= toks.length) {
      w1 = "w1=eos";
      t1 = "t1=eos";
    } else {
      w1 = "w1=" + toks[i + 1];
      t1 = "t1=" + tags[i + 1];
    }

    if (i + 2 >= toks.length) {
      w2 = "w2=eos";
      t2 = "t2=eos";
    } else {
      w2 = "w2=" + toks[i + 2];
      t2 = "t2=" + tags[i + 2];
    }

    String[] features = new String[] {
        // add word features
        w_2, w_1, w0, w1, w2, w_1 + w0, w0 + w1,
        // add tag features
        t_2, t_1, t0, t1, t2, t_2 + t_1, t_1 + t0, t0 + t1, t1 + t2,
        t_2 + t_1 + t0, t_1 + t0 + t1, t0 + t1 + t2,
        // add pred tags
        p_2, p_1, p_2 + p_1,
        // add pred and tag
        p_1 + t_2, p_1 + t_1, p_1 + t0, p_1 + t1, p_1 + t2, p_1 + t_2 + t_1,
        p_1 + t_1 + t0, p_1 + t0 + t1, p_1 + t1 + t2, p_1 + t_2 + t_1 + t0,
        p_1 + t_1 + t0 + t1, p_1 + t0 + t1 + t2,
        // add pred and word
        p_1 + w_2, p_1 + w_1, p_1 + w0, p_1 + w1, p_1 + w2, p_1 + w_1 + w0,
        p_1 + w0 + w1 };

    feats.addAll(Arrays.asList(features));
  }

  // 0.9670307770871996
  private void create3WindowFeats(int i, String[] toks, String[] tags,
      String[] preds, List<String> feats) {
    // Words in a 3-word window
    String w_1, w0, w1;
    // Tags in a 3-word window
    String t_1, t0, t1;
    // Previous predictions
    String p_2, p_1;

    w_1 = w0 = w1 = null;
    t_1 = t0 = t1 = null;
    p_1 = p_2 = null;

    if (i < 2) {
      p_2 = "p_2=bos";
    } else {
      p_2 = "p_2=" + preds[i - 2];
    }

    if (i < 1) {
      w_1 = "w_1=bos";
      t_1 = "t_1=bos";
      p_1 = "p_1=bos";
    } else {
      w_1 = "w_1=" + toks[i - 1];
      t_1 = "t_1=" + tags[i - 1];
      p_1 = "p_1=" + preds[i - 1];
    }

    w0 = "w0=" + toks[i];
    t0 = "t0=" + tags[i];

    if (i + 1 >= toks.length) {
      w1 = "w1=eos";
      t1 = "t1=eos";
    } else {
      w1 = "w1=" + toks[i + 1];
      t1 = "t1=" + tags[i + 1];
    }

    String[] features = new String[] {
        // add word features
        w_1, w0, w1, w_1 + w0, w0 + w1,
        // add tag features
        t_1, t0, t1, t_1 + t0, t0 + t1, t_1 + t0 + t1,
        // add pred tags
        p_2, p_1, p_2 + p_1,
        // add pred and tag
        p_1 + t_1, p_1 + t0, p_1 + t1, p_1 + t_1 + t0, p_1 + t0 + t1,
        p_1 + t_1 + t0 + t1,
        // add pred and word
        p_1 + w_1, p_1 + w0, p_1 + w1, p_1 + w_1 + w0, p_1 + w0 + w1 };

    feats.addAll(Arrays.asList(features));
  }
}
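
// ---------------------------------------------------------------------------
// Minimal usage sketch (not part of the original API). It only illustrates
// how the context generator might be called directly; the sentence, the POS
// tags and the previous featurizer predictions below are invented sample
// values, not taken from any CoGrOO corpus or model.
// ---------------------------------------------------------------------------
class DefaultFeaturizerContextGeneratorUsageSketch {

  public static void main(String[] args) {
    // "wshnc" turns on the wider 5-token window plus the suffix, hyphen,
    // number and token-class feature groups.
    DefaultFeaturizerContextGenerator cg =
        new DefaultFeaturizerContextGenerator("wshnc");

    // Invented example: "O menino comprou 2,5 quilos" with made-up tags and
    // one previous prediction per already-processed token.
    String[] toks = { "O", "menino", "comprou", "2,5", "quilos" };
    String[] tags = { "art", "n", "v-fin", "num", "n" };
    String[] preds = { "M=S", "M=S", "-" };

    // Features for deciding the outcome of token index 3 ("2,5"); the
    // Portuguese NumberFormat parses it as 2.5, which is bucketed as "num=h2".
    for (String feature : cg.getContext(3, toks, tags, preds)) {
      System.out.println(feature);
    }
  }
}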