ChineseMorphFeatureSets.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.ie;

import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.HashIndex;

import java.io.*;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A class for holding Chinese morphological features used for word segmentation and POS tagging.
 *
 * @author Galen Andrew
 */
public class ChineseMorphFeatureSets implements Serializable {

  private static final long serialVersionUID = -1055526945031459198L;

  private Index<String> featIndex = new HashIndex<>();
  private Map<String, Set<Character>> singletonFeatures = Generics.newHashMap();
  private Map<String, Pair<Set<Character>, Set<Character>>> affixFeatures = Generics.newHashMap();

  public Map<String, Set<Character>> getSingletonFeatures() {
    return singletonFeatures;
  }

  public Map<String, Pair<Set<Character>, Set<Character>>> getAffixFeatures() {
    return affixFeatures;
  }

  public ChineseMorphFeatureSets(String featureDir) {
    try {
      File dir = new File(featureDir);
      File[] files = dir.listFiles((dir1, name) -> name.endsWith(".gb"));
      for (File file : files) {
        getFeatures(file);
      }
    } catch (IOException e) {
      throw new RuntimeException("Error creating ChineseMaxentLexicon" + e);
    }
  }

  private enum FeatType {
    PREFIX, SUFFIX, SINGLETON
  }

  private void getFeatures(File file) throws IOException {
    BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file), "GB18030"));
    String filename = file.getName();
    String singleFeatName = filename;
    if (singleFeatName.indexOf('.') >= 0) {
      singleFeatName = singleFeatName.substring(0, filename.lastIndexOf('.'));
    }

    FeatType featType = null;
    for (FeatType ft : FeatType.values()) {
      if (filename.contains(ft.toString().toLowerCase())) {
        featType = ft;
        singleFeatName = singleFeatName.substring(0, filename.indexOf(ft.toString().toLowerCase()));
        if (singleFeatName.endsWith("_")) {
          singleFeatName = singleFeatName.substring(0, singleFeatName.lastIndexOf('_'));
        }
        break;
      }
    }

    featIndex.add(singleFeatName);
    String singleFeatIndexString = Integer.toString(featIndex.indexOf(singleFeatName));

    Set<Character> featureSet = Generics.newHashSet();
    String line;
    Pattern typedDoubleFeatPattern = Pattern.compile("([A-Za-z]+)\\s+(.)\\s+(.)\\s*");
    Pattern typedSingleFeatPattern = Pattern.compile("([A-Za-z]+)\\s+(.)\\s*");
    Pattern singleFeatPattern = Pattern.compile("(.)(?:\\s+[0-9]+)?\\s*");
    while ((line = in.readLine()) != null) {
      if (line.length() == 0) {
        continue;
      }

      if (featType == null) {
        Matcher typedDoubleFeatMatcher = typedDoubleFeatPattern.matcher(line);
        if (typedDoubleFeatMatcher.matches()) {
          String featName = typedDoubleFeatMatcher.group(1);
          featIndex.add(featName);
          String featIndexString = Integer.toString(featIndex.indexOf(featName));
          String prefixChar = typedDoubleFeatMatcher.group(2);
          addTypedFeature(featIndexString, prefixChar.charAt(0), true);
          String suffixChar = typedDoubleFeatMatcher.group(3);
          addTypedFeature(featIndexString, suffixChar.charAt(0), false);
          continue;
        }
      }

      Matcher typedSingleFeatMatcher = typedSingleFeatPattern.matcher(line);
      if (typedSingleFeatMatcher.matches()) {
        String featName = typedSingleFeatMatcher.group(1);
        featIndex.add(featName);
        String featIndexString = Integer.toString(featIndex.indexOf(featName));
        String charString = typedSingleFeatMatcher.group(2);
        switch (featType) {
          case PREFIX:
            addTypedFeature(featIndexString, charString.charAt(0), true);
            break;

          case SUFFIX:
            addTypedFeature(featIndexString, charString.charAt(0), false);
            break;

          case SINGLETON:
            throw new RuntimeException("ERROR: typed SINGLETON feature.");
        }
        continue;
      }

      Matcher singleFeatMatcher = singleFeatPattern.matcher(line);
      if (singleFeatMatcher.matches()) {
        String charString = singleFeatMatcher.group();
        featureSet.add(charString.charAt(0));
        continue;
      }

      if (line.startsWith("prefix") || line.startsWith("suffix")) {
        if (featureSet.size() > 0) {
          Pair<Set<Character>, Set<Character>> p = affixFeatures.get(singleFeatIndexString);
          if (p == null) {
            affixFeatures.put(singleFeatIndexString, p = new Pair<>());
          }
          if (featType == FeatType.PREFIX) {
            p.setFirst(featureSet);
          } else {
            p.setSecond(featureSet);
          }
          featureSet = Generics.newHashSet();
        }
        featType = FeatType.PREFIX;
        if (line.startsWith("prefix")) {
          featType = FeatType.PREFIX;
        } else if (line.startsWith("suffix")) {
          featType = FeatType.SUFFIX;
        }
      }
    }

    if (featureSet.size() > 0) {
      if (featType == FeatType.SINGLETON) {
        singletonFeatures.put(singleFeatIndexString, featureSet);
      } else {
        Pair<Set<Character>, Set<Character>> p = affixFeatures.get(singleFeatIndexString);
        if (p == null) {
          affixFeatures.put(singleFeatIndexString, p = new Pair<>());
        }
        if (featType == FeatType.PREFIX) {
          p.setFirst(featureSet);
        } else {
          p.setSecond(featureSet);
        }
      }
    }
  }

  private void addTypedFeature(String featName, char featChar, boolean isPrefix) {
    Pair<Set<Character>, Set<Character>> p = affixFeatures.get(featName);
    if (p == null) {
      affixFeatures.put(featName, p = new Pair<>());
    }
    Set<Character> feature;
    if (isPrefix) {
      feature = p.first();
      if (feature == null) {
        p.setFirst(feature = Generics.newHashSet());
      }
    } else {
      feature = p.second();
      if (feature == null) {
        p.setSecond(feature = Generics.newHashSet());
      }
    }
    feature.add(featChar);
  }

}