// ArabicMorphoFeatureSpecification.java example

// Explorer
// Stanford-NLP-master
// - CoreNLP-master
package edu.stanford.nlp.international.arabic; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.*;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatures;

/**
 * Extracts morphosyntactic features from BAMA/SAMA analyses. Compatible with both the
 * long tags in the ATB and the output of MADA.
 *
 * @author Spence Green
 *
 */
public class ArabicMorphoFeatureSpecification extends MorphoFeatureSpecification  {

  /** Redwood logging channel for this class (never reassigned, hence final). */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ArabicMorphoFeatureSpecification.class);

  private static final long serialVersionUID = 4448045447200922076L;

  // Value inventories for each feature type. The extraction rules below index
  // into these arrays by position, so the element order must not change.

  // Definiteness: [0] is assigned when no DET marker is present, [1] when one is.
  private static final String[] defVals = {"I", "D"};
  // Case values; only referenced by commented-out code, since getValues() rejects CASE.
  private static final String[] caseVals = {"NOM","ACC","GEN"};
  // Gender: [0] masculine, [1] feminine.
  private static final String[] genVals = {"M","F"};
  // Number: [0] singular, [1] dual, [2] plural.
  private static final String[] numVals = {"SG","DU","PL"};
  // Person: first, second, third.
  private static final String[] perVals = {"1","2","3"};
  // Possessiveness is a single-valued flag.
  private static final String[] possVals = {"POSS"};
  // Voice: [0] active (the default), [1] passive.
  private static final String[] voiceVals = {"ACT","PASS"};
  // Mood letters, matching the letters captured by pMood.
  private static final String[] moodVals = {"I","S","J"};
  // Tense: [0] for PV specs, [1] for IV specs, [2] for CV specs.
  private static final String[] tenseVals = {"PAST","PRES","IMP"};

  // Standard feature tuple (e.g., "3MS", "1P", etc.): one digit followed by
  // one or two upper-case letters.
  private static final Pattern pFeatureTuple = Pattern.compile("(\\d\\p{Upper}\\p{Upper}?)");

  // Demonstrative pronouns: captures everything after "DEM_PRON" so that it can
  // be handed directly to the phi-feature helper, bypassing pFeatureTuple.
  // (Original note: "Demonstrative pronouns do not have number".)
  private static final Pattern pDemPronounFeatures = Pattern.compile("DEM_PRON(.+)");

  // Verbal patterns
  // A VSUFF_DO spec that also matches this pattern is not treated as pronominal.
  private static final Pattern pVerbMood = Pattern.compile("MOOD|SUBJ");
  // Captures the single mood letter that follows "_MOOD:".
  private static final Pattern pMood = Pattern.compile("_MOOD:([ISJ])");
  // Presence of any of these markers routes a spec to the verb rules.
  private static final Pattern pVerbTenseMarker = Pattern.compile("IV|PV|CV");
  // Nominal specs matching this pattern receive no default gender, number,
  // or definiteness value.
  private static final Pattern pNounNoMorph = Pattern.compile("PROP|QUANT");

  /**
   * Returns the inventory of values that the given feature type can take in Arabic.
   *
   * The returned list is a fixed-size view over the corresponding static array.
   *
   * @param feat the feature type to look up
   * @return the possible values for {@code feat}
   * @throws RuntimeException if {@code feat} is CASE, which is presently unsupported
   * @throws IllegalArgumentException if {@code feat} is any other type not handled here
   */
  @Override
  public List<String> getValues(MorphoFeatureType feat) {
    switch (feat) {
      case DEF:
        return Arrays.asList(defVals);
      case CASE:
        // caseVals exists but is deliberately not exposed yet.
        throw new RuntimeException(this.getClass().getName() + ": Case is presently unsupported!");
      case GEN:
        return Arrays.asList(genVals);
      case NUM:
        return Arrays.asList(numVals);
      case PER:
        return Arrays.asList(perVals);
      case POSS:
        return Arrays.asList(possVals);
      case VOICE:
        return Arrays.asList(voiceVals);
      case MOOD:
        return Arrays.asList(moodVals);
      case TENSE:
        return Arrays.asList(tenseVals);
      default:
        throw new IllegalArgumentException("Arabic does not support feature type: " + feat.toString());
    }
  }

  /**
   * Hand-written rules to convert SAMA analyses to feature structures.
   *
   * A null or empty spec (the boundary symbol) yields an empty feature set.
   * Otherwise possessiveness is checked first, and then exactly one of three
   * rule sets is applied, tested in this order: nominal, pronominal, verbal.
   * A spec that matches none of them keeps only the possessive feature, if any.
   *
   * @param spec the morphological analysis string; may be null or empty
   * @return a new {@link ArabicMorphoFeatures} holding the extracted features
   */
  @Override
  public MorphoFeatures strToFeatures(String spec) {
    MorphoFeatures features = new ArabicMorphoFeatures();

    // Check for the boundary symbol
    if (spec == null || spec.isEmpty()) {
      return features;
    }

    // Possessiveness
    if (isActive(MorphoFeatureType.POSS) && spec.contains("POSS")) {
      features.addFeature(MorphoFeatureType.POSS, possVals[0]);
    }

    // Nominals and pronominals. Mona ignores Pronominals in ERTS, but they seem to help...
    // NSUFF -- declinable nominals
    // VSUFF -- enclitic pronominals
    // PRON -- ordinary pronominals
    if (spec.contains("NSUFF") || spec.contains("NOUN") || spec.contains("ADJ")) {
      addNominalFeatures(features, spec);
    } else if (spec.contains("PRON") || (spec.contains("VSUFF_DO") && !pVerbMood.matcher(spec).find())) {
      addPronominalFeatures(features, spec);
    } else if (pVerbTenseMarker.matcher(spec).find()) {
      // Verbs (marked for tense)
      addVerbalFeatures(features, spec);
    }
    return features;
  }

  /**
   * Adds nominal gender, number, definiteness, and proper-noun features.
   *
   * Nominal phi feature indicators are different than the indicators
   * processed by processInflectionalFeatures(), so they are matched directly.
   */
  private void addNominalFeatures(MorphoFeatures features, String spec) {
    // Specs matching pNounNoMorph (PROP or QUANT) get no default values below.
    final boolean allowsDefaults = !pNounNoMorph.matcher(spec).find();

    if (isActive(MorphoFeatureType.NGEN)) {
      if (spec.contains("FEM")) {
        features.addFeature(MorphoFeatureType.NGEN, genVals[1]);
      } else if (spec.contains("MASC") || allowsDefaults) {
        // Masculine is the default when no gender marker is present
        features.addFeature(MorphoFeatureType.NGEN, genVals[0]);
      }
    }

    // WSGDEBUG -- Number for nominals only
    if (isActive(MorphoFeatureType.NNUM)) {
      if (spec.contains("DU")) {
        features.addFeature(MorphoFeatureType.NNUM, numVals[1]);
      } else if (spec.contains("PL")) {
        features.addFeature(MorphoFeatureType.NNUM, numVals[2]);
      } else if (allowsDefaults) {
        // Singular is the default; the spec is not checked for "SG"
        features.addFeature(MorphoFeatureType.NNUM, numVals[0]);
      }
    }

    // Definiteness
    if (isActive(MorphoFeatureType.DEF)) {
      if (spec.contains("DET")) {
        features.addFeature(MorphoFeatureType.DEF, defVals[1]);
      } else if (allowsDefaults) {
        features.addFeature(MorphoFeatureType.DEF, defVals[0]);
      }
    }

    // Proper nouns (probably a stupid feature)
    if (isActive(MorphoFeatureType.PROP) && spec.contains("PROP")) {
      // The feature is a flag, so it carries an empty value
      features.addFeature(MorphoFeatureType.PROP, "");
    }
  }

  /**
   * Adds features for pronominals, including enclitic direct-object suffixes.
   *
   * Demonstratives have no leading person digit, so the text after DEM_PRON is
   * passed straight to the helper instead of going through pFeatureTuple.
   */
  private void addPronominalFeatures(MorphoFeatures features, String spec) {
    if (!spec.contains("DEM_PRON")) {
      processInflectionalFeatures(features, spec);
      return;
    }

    // NOTE(review): unlike every other feature in this class, DEF is added here
    // without an isActive(DEF) check. Left as-is because guarding it would change
    // the tags produced for demonstratives -- confirm whether this is intended.
    features.addFeature(MorphoFeatureType.DEF, defVals[0]);

    Matcher demMatcher = pDemPronounFeatures.matcher(spec);
    if (demMatcher.find()) {
      processInflectionalFeaturesHelper(features, demMatcher.group(1));
    }
  }

  /**
   * Adds tense, phi (gender/number/person), mood, and voice features for verbs.
   */
  private void addVerbalFeatures(MorphoFeatures features, String spec) {
    // Tense feature; the markers are tested in the order PV, IV, CV
    if (isActive(MorphoFeatureType.TENSE)) {
      if (spec.contains("PV")) {
        features.addFeature(MorphoFeatureType.TENSE, tenseVals[0]);
      } else if (spec.contains("IV")) {
        features.addFeature(MorphoFeatureType.TENSE, tenseVals[1]);
      } else if (spec.contains("CV")) {
        features.addFeature(MorphoFeatureType.TENSE, tenseVals[2]);
      }
    }

    // Inflectional features
    processInflectionalFeatures(features, spec);

    if (isActive(MorphoFeatureType.MOOD)) {
      Matcher moodMatcher = pMood.matcher(spec);
      if (moodMatcher.find()) {
        String moodStr = moodMatcher.group(1);
        switch (moodStr) {
          case "I":
            features.addFeature(MorphoFeatureType.MOOD, moodVals[0]);
            break;
          case "S":
            features.addFeature(MorphoFeatureType.MOOD, moodVals[1]);
            break;
          case "J":
            features.addFeature(MorphoFeatureType.MOOD, moodVals[2]);
            break;
        }
      }
    }

    // Voice: active unless the spec is explicitly marked passive
    if (isActive(MorphoFeatureType.VOICE)) {
      if (spec.contains("PASS")) {
        features.addFeature(MorphoFeatureType.VOICE, voiceVals[1]);
      } else {
        features.addFeature(MorphoFeatureType.VOICE, voiceVals[0]);
      }
    }
  }

  /**
   * Extract features from a standard phi feature specification.
   *
   * Locates the first feature tuple (per pFeatureTuple) in the spec and passes
   * only that tuple to the helper. Does nothing when no tuple is present.
   *
   * @param feats the feature set to add to
   * @param spec the full analysis string to search
   */
  private void processInflectionalFeatures(MorphoFeatures feats, String spec) {
    final Matcher tupleMatcher = pFeatureTuple.matcher(spec);
    if (!tupleMatcher.find()) {
      return;
    }
    processInflectionalFeaturesHelper(feats, tupleMatcher.group(1));
  }

  /**
   * Reads gender, number, and person off an already-isolated phi string and
   * adds whichever of GEN, NUM, and PER are active.
   *
   * @param feats the feature set to add to
   * @param spec the phi string: a feature tuple, or the text following DEM_PRON
   */
  private void processInflectionalFeaturesHelper(MorphoFeatures feats, String spec) {
    if (isActive(MorphoFeatureType.GEN)) {
      // "M" is tested before "F", so a string containing both is masculine
      String gender = null;
      if (spec.contains("M")) {
        gender = genVals[0];
      } else if (spec.contains("F")) {
        gender = genVals[1];
      }
      if (gender != null) {
        feats.addFeature(MorphoFeatureType.GEN, gender);
      }
    }

    if (isActive(MorphoFeatureType.NUM)) {
      // Number is the final letter; the suffixes line up with numVals by index
      final String[] numberSuffixes = {"S", "D", "P"};
      for (int i = 0; i < numberSuffixes.length; i++) {
        if (spec.endsWith(numberSuffixes[i])) {
          feats.addFeature(MorphoFeatureType.NUM, numVals[i]);
          break;
        }
      }
    }

    if (isActive(MorphoFeatureType.PER)) {
      // The first person value found anywhere in the string wins
      for (String person : perVals) {
        if (spec.contains(person)) {
          feats.addFeature(MorphoFeatureType.PER, person);
          break;
        }
      }
    }
  }

  /**
   * Converts features specifications to labels for tagging, and back.
   *
   * A tag has the form {@code BASE-FEAT:value-FEAT:value...}, with features
   * written in the declaration order of MorphoFeatureType.
   *
   * @author Spence Green
   *
   */
  public static class ArabicMorphoFeatures extends MorphoFeatures {

    private static final long serialVersionUID = -4611776415583633186L;

    /**
     * Parses the features out of a tag produced by {@link #getTag(String)}.
     *
     * Segments that do not split into exactly one key and one value are ignored.
     *
     * @param str the tag string; must not be null
     * @return a new feature set holding the parsed features
     * @throws IllegalArgumentException if a key is not a MorphoFeatureType name
     */
    @Override
    public MorphoFeatures fromTagString(String str) {
      String[] feats = str.split("\\-");
      MorphoFeatures mFeats = new ArabicMorphoFeatures();
      // First element is the base POS, so start at index 1
      for(int i = 1; i < feats.length; i++) {
        // A negative limit keeps trailing empty strings, so an empty-valued
        // feature (written by getTag as "PROP:") splits into two parts and is
        // retained instead of being skipped by the length check below.
        // Assumes KEY_VAL_DELIM is the same ":" that getTag writes -- TODO confirm.
        String[] keyValue = feats[i].split(KEY_VAL_DELIM, -1);
        if(keyValue.length != 2) continue;
        MorphoFeatureType fName = MorphoFeatureType.valueOf(keyValue[0].trim());
        mFeats.addFeature(fName, keyValue[1].trim());
      }
      return mFeats;
    }

    /**
     * Builds the tag for this feature set.
     *
     * @param basePartOfSpeech the base POS that prefixes the tag
     * @return the base POS followed by one {@code -FEAT:value} segment per feature present
     */
    @Override
    public String getTag(String basePartOfSpeech) {
      StringBuilder sb = new StringBuilder(basePartOfSpeech);
      // Iterate over feature list so that features are added in the same order
      // for every feature spec.
      for (MorphoFeatureType feat : MorphoFeatureType.values()) {
        if (hasFeature(feat)) {
          sb.append('-').append(feat).append(':').append(fSpec.get(feat));
        }
      }
      return sb.toString();
    }
  }

  /**
   * For debugging. Converts a set of long tags (BAMA analyses as in the ATB) to their morpho
   * feature specification. The input file should have one long tag per line.
   *
   * Prints each trimmed input line and its extracted features, separated by a
   * tab, to standard output, followed by the number of lines read.
   *
   * @param args two arguments: the input filename, and a comma-separated list
   *             of MorphoFeatureType names to activate
   */
  public static void main(String[] args) {
    if(args.length != 2) {
      System.err.printf("Usage: java %s filename feats%n", ArabicMorphoFeatureSpecification.class.getName());
      System.exit(-1);
    }

    MorphoFeatureSpecification fSpec = new ArabicMorphoFeatureSpecification();
    String[] feats = args[1].split(",");
    for(String feat : feats) {
      MorphoFeatureType fType = MorphoFeatureType.valueOf(feat);
      fSpec.activate(fType);
    }

    File fName = new File(args[0]);
    // try-with-resources closes the reader even if reading or feature
    // extraction throws partway through the file.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fName)))) {
      int nLine = 0;
      for(String line;(line = br.readLine()) != null; nLine++) {
        String trimmed = line.trim();
        MorphoFeatures mFeats = fSpec.strToFeatures(trimmed);
        System.out.printf("%s\t%s%n", trimmed, mFeats.toString());
      }
      System.out.printf("%nRead %d lines%n",nLine);

    } catch (IOException e) {
      // Also covers FileNotFoundException, which is an IOException
      e.printStackTrace();
    }
  }
}