package edu.stanford.nlp.international.morph;

import java.io.Serializable;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;

/**
 * Morphological feature specification for surface forms in a given language.
 * Currently supported feature names are the values of {@link MorphoFeatureType}.
 *
 * @author Spence Green
 *
 */
public abstract class MorphoFeatureSpecification implements Serializable {

  private static final long serialVersionUID = -5720683653931585664L;

  // Delimiter for associating a surface form with a morphological analysis, e.g.,
  //
  //     his~#PRP_3ms
  //
  public static final String MORPHO_MARK = "~#";

  /** Delimiter that {@link #splitMorphString} uses to separate the lemma from the morph analysis. */
  public static final String LEMMA_MARK = "|||";

  /** Analysis value returned by {@link #splitMorphString} when no morph string is supplied. */
  public static final String NO_ANALYSIS = "XXX";

  // Literal-match pattern for LEMMA_MARK, compiled once instead of on every
  // splitMorphString call. Static, so it is not part of the serialized form.
  private static final Pattern LEMMA_MARK_PATTERN = Pattern.compile(Pattern.quote(LEMMA_MARK));

  // WSGDEBUG --
  //   Added NNUM and NGEN for nominals in Arabic
  /** Names of the supported morphological features. */
  public enum MorphoFeatureType {
    TENSE, DEF, ASP, MOOD, NNUM, NUM, NGEN, GEN, CASE, PER, POSS, VOICE, OTHER, PROP
  }

  /** Feature types that have been switched on through {@link #activate}. */
  protected final Set<MorphoFeatureType> activeFeatures;

  /** Creates a specification with no active features. */
  public MorphoFeatureSpecification() {
    activeFeatures = Generics.newHashSet();
  }

  /**
   * Adds the given feature type to the set of active features.
   *
   * @param feat the feature type to activate
   */
  public void activate(MorphoFeatureType feat) {
    activeFeatures.add(feat);
  }

  /**
   * Reports whether the given feature type has been activated.
   *
   * @param feat the feature type to look up
   * @return {@code true} if {@code feat} is in the active set
   */
  public boolean isActive(MorphoFeatureType feat) {
    return activeFeatures.contains(feat);
  }

  /**
   * Returns the values associated with a feature type. The exact contents are
   * defined by the implementing subclass.
   *
   * @param feat the feature type
   * @return the list of values for {@code feat}
   */
  public abstract List<String> getValues(MorphoFeatureType feat);

  /**
   * Converts a string specification into a {@link MorphoFeatures} object. The
   * accepted format is defined by the implementing subclass.
   *
   * @param spec the string specification
   * @return the features built from {@code spec}
   */
  public abstract MorphoFeatures strToFeatures(String spec);

  /**
   * Returns the lemma as pair.first() and the morph analysis as pair.second().
   * <p>
   * When {@code morphStr} is {@code null}, empty, or whitespace only (as judged by
   * {@link String#trim()}), the pair is ({@code word}, {@link #NO_ANALYSIS}).
   * Otherwise {@code morphStr} is split on {@link #LEMMA_MARK} and must yield
   * exactly two tokens. Trailing empty tokens are discarded by the split, so a
   * string that ends with the mark is rejected.
   *
   * @param word the word used as pair.first() when no morph string is available
   * @param morphStr the string holding lemma and analysis separated by {@link #LEMMA_MARK}
   * @return the (lemma, analysis) pair
   * @throws IllegalArgumentException if {@code morphStr} does not split into exactly two tokens
   */
  public static Pair<String,String> splitMorphString(String word, String morphStr) {
    if (morphStr == null || morphStr.trim().isEmpty()) {
      return new Pair<>(word, NO_ANALYSIS);
    }
    String[] toks = LEMMA_MARK_PATTERN.split(morphStr);
    if (toks.length != 2) {
      // IllegalArgumentException is a RuntimeException, so existing handlers still apply.
      throw new IllegalArgumentException("Invalid morphology string: " + morphStr);
    }
    return new Pair<>(toks[0], toks[1]);
  }

  /** Returns the string form of the active feature set. */
  @Override
  public String toString() {
    return activeFeatures.toString();
  }
}