package edu.stanford.nlp.sequences; import java.util.*; import java.io.Serializable; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.util.PaddedList; /** * This is the abstract class that all feature factories must * subclass. It also defines most of the basic {@link Clique}s * that you would want to make features over. It contains a * convenient method, getCliques(maxLeft, maxRight) which will give * you all the cliques within the specified limits. * * @param <IN> The type of the items in the PaddedList from which features * are extracted * * @author Jenny Finkel */ public abstract class FeatureFactory<IN> implements Serializable { private static final long serialVersionUID = 7249250071983091694L; protected SeqClassifierFlags flags; public FeatureFactory() {} public void init (SeqClassifierFlags flags) { this.flags = flags; } public static final Clique cliqueC = Clique.valueOf(new int[] {0}); public static final Clique cliqueCpC = Clique.valueOf(new int[] {-1, 0}); public static final Clique cliqueCp2C = Clique.valueOf(new int[] {-2, 0}); public static final Clique cliqueCp3C = Clique.valueOf(new int[] {-3, 0}); public static final Clique cliqueCp4C = Clique.valueOf(new int[] {-4, 0}); public static final Clique cliqueCp5C = Clique.valueOf(new int[] {-5, 0}); public static final Clique cliqueCpCp2C = Clique.valueOf(new int[] {-2, -1, 0}); public static final Clique cliqueCpCp2Cp3C = Clique.valueOf(new int[] {-3, -2, -1, 0}); public static final Clique cliqueCpCp2Cp3Cp4C = Clique.valueOf(new int[] {-4, -3, -2, -1, 0}); public static final Clique cliqueCpCp2Cp3Cp4Cp5C = Clique.valueOf(new int[] {-5, -4, -3, -2, -1, 0}); public static final Clique cliqueCnC = Clique.valueOf(new int[] {0, 1}); public static final Clique cliqueCpCnC = Clique.valueOf(new int[] {-1, 0, 1}); public static final List<Clique> knownCliques = Arrays.asList(cliqueC, cliqueCpC, cliqueCp2C, cliqueCp3C, cliqueCp4C, cliqueCp5C, cliqueCpCp2C, cliqueCpCp2Cp3C, cliqueCpCp2Cp3Cp4C, cliqueCpCp2Cp3Cp4Cp5C, cliqueCnC, cliqueCpCnC); public List<Clique> getCliques() { return getCliques(flags.maxLeft, flags.maxRight); } public static List<Clique> getCliques(int maxLeft, int maxRight) { List<Clique> cliques = new ArrayList<>(); for (Clique c : knownCliques) { if (-c.maxLeft() <= maxLeft && c.maxRight() <= maxRight) { cliques.add(c); } } return cliques; } /** * This method returns a {@link Collection} of the features * calculated for the word at the specified position in info (the list of * words) for the specified {@link Clique}. * It should return the actual String features, <b>NOT</b> wrapped in any * other object, as the wrapping * will be done automatically. * Because it takes a {@link PaddedList} you don't * need to worry about indices which are outside of the list. * * @param info A PaddedList of the feature-value pairs * @param position The current position to extract features at * @param clique The particular clique for which to extract features. It * should be a member of the knownCliques list. * @return A {@link Collection} of the features * calculated for the word at the specified position in info. */ public abstract Collection<String> getCliqueFeatures(PaddedList<IN> info, int position, Clique clique); /** Makes more complete feature names out of partial feature names, by * adding a suffix to the String feature name, adding results to an * accumulator * * @param accumulator The output features are added here * @param addend The base set of features * @param suffix The suffix added to each feature in the addend set */ @SuppressWarnings({"MethodMayBeStatic"}) protected void addAllInterningAndSuffixing(Collection<String> accumulator, Collection<String> addend, String suffix) { boolean nonNullSuffix = suffix != null && ! suffix.isEmpty(); if (nonNullSuffix) { suffix = '|' + suffix; } // boolean intern2 = flags.intern2; for (String feat : addend) { if (nonNullSuffix) { feat = feat.concat(suffix); } // if (intern2) { // feat = feat.intern(); // } accumulator.add(feat); } } /** * Convenience methods for subclasses which use CoreLabel. Gets the * word after applying any wordFunction present in the * SeqClassifierFlags. * * @param label A CoreLabel * @return The TextAnnotation of the label, perhaps after passing it through * a function (flags.wordFunction) */ protected String getWord(CoreLabel label) { String word = label.getString(CoreAnnotations.TextAnnotation.class); if (flags.wordFunction != null) { word = flags.wordFunction.apply(word); } return word; } }