package edu.stanford.nlp.ie;

import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.Pair;

import java.io.*;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A class for holding Chinese morphological features used for word segmentation and POS tagging.
 * <p>
 * Features are loaded from every file ending in {@code .gb} in a directory; the files are read
 * as GB18030 text. Two kinds of feature are kept:
 * <ul>
 *   <li><i>singleton</i> features: a set of characters per feature;</li>
 *   <li><i>affix</i> features: a pair of character sets per feature, the first holding prefix
 *       characters and the second holding suffix characters.</li>
 * </ul>
 * Both maps are keyed by the decimal string form of the feature name's position in an internal
 * index, not by the feature name itself.
 *
 * @author Galen Andrew
 */
public class ChineseMorphFeatureSets implements Serializable {

  private static final long serialVersionUID = -1055526945031459198L;

  /** Line of the form {@code NAME c1 c2}: c1 is a prefix character, c2 a suffix character. */
  private static final Pattern TYPED_DOUBLE_FEAT_PATTERN = Pattern.compile("([A-Za-z]+)\\s+(.)\\s+(.)\\s*");

  /** Line of the form {@code NAME c}: c is a prefix or suffix character, per the current type. */
  private static final Pattern TYPED_SINGLE_FEAT_PATTERN = Pattern.compile("([A-Za-z]+)\\s+(.)\\s*");

  /** Line holding a single character, optionally followed by whitespace and a run of digits. */
  private static final Pattern SINGLE_FEAT_PATTERN = Pattern.compile("(.)(?:\\s+[0-9]+)?\\s*");

  private final Index<String> featIndex = new HashIndex<>();
  private final Map<String, Set<Character>> singletonFeatures = Generics.newHashMap();
  private final Map<String, Pair<Set<Character>, Set<Character>>> affixFeatures = Generics.newHashMap();

  /**
   * Returns the singleton features.
   *
   * @return the live internal map from feature index (as a string) to its character set
   */
  public Map<String, Set<Character>> getSingletonFeatures() {
    return singletonFeatures;
  }

  /**
   * Returns the affix features.
   *
   * @return the live internal map from feature index (as a string) to a pair whose first element
   *         is the prefix character set and whose second element is the suffix character set;
   *         either element may be {@code null} if no such characters were loaded
   */
  public Map<String, Pair<Set<Character>, Set<Character>>> getAffixFeatures() {
    return affixFeatures;
  }

  /**
   * Loads features from every {@code *.gb} file directly inside {@code featureDir}.
   *
   * @param featureDir path of the directory holding the feature files
   * @throws RuntimeException if the directory cannot be listed, or if reading a file fails
   *         (the underlying {@link IOException} is attached as the cause)
   */
  public ChineseMorphFeatureSets(String featureDir) {
    File dir = new File(featureDir);
    File[] files = dir.listFiles((dir1, name) -> name.endsWith(".gb"));
    if (files == null) {
      // listFiles returns null (rather than an empty array) when the path is not a readable directory
      throw new RuntimeException("Error creating ChineseMaxentLexicon: cannot list feature directory " + featureDir);
    }
    try {
      for (File file : files) {
        getFeatures(file);
      }
    } catch (IOException e) {
      throw new RuntimeException("Error creating ChineseMaxentLexicon" + e, e);
    }
  }

  private enum FeatType {
    PREFIX, SUFFIX, SINGLETON
  }

  /**
   * Reads one feature file and merges its contents into this object's maps.
   * <p>
   * The file name (minus its last extension) supplies a default feature name. If the file name
   * contains {@code prefix}, {@code suffix} or {@code singleton}, that sets the initial feature
   * type, and the default feature name is cut off just before that word (and before a trailing
   * underscore, if any).
   * <p>
   * Each non-empty line is then handled by the first rule that applies:
   * <ol>
   *   <li>{@code NAME c1 c2}, tried only while no type is in effect: c1 is recorded as a prefix
   *       character and c2 as a suffix character of feature NAME;</li>
   *   <li>{@code NAME c}: c is recorded as a prefix or suffix character of feature NAME,
   *       according to the type in effect;</li>
   *   <li>a single character, optionally followed by a number: the character joins the set being
   *       collected for the default feature name;</li>
   *   <li>a line starting with {@code prefix} or {@code suffix}: the set collected so far (if
   *       non-empty) is stored and the type in effect switches accordingly.</li>
   * </ol>
   * Lines matching none of these are ignored. At end of file any collected set is stored as a
   * singleton feature if the type in effect is SINGLETON, and as an affix set otherwise.
   *
   * @param file the feature file to read
   * @throws IOException if the file cannot be opened or read
   * @throws RuntimeException if a {@code NAME c} line occurs while the type is SINGLETON or while
   *         no type is in effect
   */
  private void getFeatures(File file) throws IOException {
    try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file), "GB18030"))) {
      String filename = file.getName();
      String singleFeatName = filename;
      if (singleFeatName.indexOf('.') >= 0) {
        singleFeatName = singleFeatName.substring(0, filename.lastIndexOf('.'));
      }

      FeatType featType = null;
      for (FeatType ft : FeatType.values()) {
        // Locale.ROOT: the default-locale lowercasing of e.g. "PREFIX" is not "prefix" everywhere
        String typeTag = ft.toString().toLowerCase(Locale.ROOT);
        if (filename.contains(typeTag)) {
          featType = ft;
          singleFeatName = singleFeatName.substring(0, filename.indexOf(typeTag));
          if (singleFeatName.endsWith("_")) {
            singleFeatName = singleFeatName.substring(0, singleFeatName.lastIndexOf('_'));
          }
          break;
        }
      }

      String singleFeatIndexString = indexKey(singleFeatName);
      Set<Character> featureSet = Generics.newHashSet();

      String line;
      while ((line = in.readLine()) != null) {
        if (line.length() == 0) {
          continue;
        }

        if (featType == null) {
          Matcher typedDoubleFeatMatcher = TYPED_DOUBLE_FEAT_PATTERN.matcher(line);
          if (typedDoubleFeatMatcher.matches()) {
            String featIndexString = indexKey(typedDoubleFeatMatcher.group(1));
            addTypedFeature(featIndexString, typedDoubleFeatMatcher.group(2).charAt(0), true);
            addTypedFeature(featIndexString, typedDoubleFeatMatcher.group(3).charAt(0), false);
            continue;
          }
        }

        Matcher typedSingleFeatMatcher = TYPED_SINGLE_FEAT_PATTERN.matcher(line);
        if (typedSingleFeatMatcher.matches()) {
          if (featType == null) {
            // Switching on a null enum would only produce an unexplained NullPointerException.
            throw new RuntimeException("ERROR: typed feature line \"" + line + "\" in " + filename
                + " but no prefix/suffix type is in effect.");
          }
          String featIndexString = indexKey(typedSingleFeatMatcher.group(1));
          char featChar = typedSingleFeatMatcher.group(2).charAt(0);
          switch (featType) {
            case PREFIX:
              addTypedFeature(featIndexString, featChar, true);
              break;
            case SUFFIX:
              addTypedFeature(featIndexString, featChar, false);
              break;
            case SINGLETON:
              throw new RuntimeException("ERROR: typed SINGLETON feature.");
          }
          continue;
        }

        Matcher singleFeatMatcher = SINGLE_FEAT_PATTERN.matcher(line);
        if (singleFeatMatcher.matches()) {
          // The match starts at the captured character, so charAt(0) is that character.
          featureSet.add(singleFeatMatcher.group().charAt(0));
          continue;
        }

        boolean prefixHeader = line.startsWith("prefix");
        if (prefixHeader || line.startsWith("suffix")) {
          if (!featureSet.isEmpty()) {
            // Flush what was collected under the previous type before switching.
            storeAffixSet(singleFeatIndexString, featType, featureSet);
            featureSet = Generics.newHashSet();
          }
          featType = prefixHeader ? FeatType.PREFIX : FeatType.SUFFIX;
        }
      }

      if (!featureSet.isEmpty()) {
        if (featType == FeatType.SINGLETON) {
          singletonFeatures.put(singleFeatIndexString, featureSet);
        } else {
          storeAffixSet(singleFeatIndexString, featType, featureSet);
        }
      }
    }
  }

  /** Registers {@code featName} in the feature index and returns its index as a decimal string. */
  private String indexKey(String featName) {
    featIndex.add(featName);
    return Integer.toString(featIndex.indexOf(featName));
  }

  /**
   * Installs {@code chars} as the prefix set (when {@code featType} is PREFIX) or as the suffix
   * set (for any other value, including {@code null}) of the affix feature stored under
   * {@code key}, replacing whatever set was there before.
   */
  private void storeAffixSet(String key, FeatType featType, Set<Character> chars) {
    Pair<Set<Character>, Set<Character>> p = affixFeatures.get(key);
    if (p == null) {
      affixFeatures.put(key, p = new Pair<>());
    }
    if (featType == FeatType.PREFIX) {
      p.setFirst(chars);
    } else {
      p.setSecond(chars);
    }
  }

  /**
   * Adds one character to the prefix or suffix set of an affix feature, creating the pair and
   * the set on first use.
   *
   * @param featName key of the feature in the affix map (the callers in this class pass the
   *                 feature's index rendered as a string)
   * @param featChar the character to add
   * @param isPrefix {@code true} to add to the prefix set, {@code false} for the suffix set
   */
  private void addTypedFeature(String featName, char featChar, boolean isPrefix) {
    Pair<Set<Character>, Set<Character>> p = affixFeatures.get(featName);
    if (p == null) {
      affixFeatures.put(featName, p = new Pair<>());
    }
    Set<Character> feature;
    if (isPrefix) {
      feature = p.first();
      if (feature == null) {
        p.setFirst(feature = Generics.newHashSet());
      }
    } else {
      feature = p.second();
      if (feature == null) {
        p.setSecond(feature = Generics.newHashSet());
      }
    }
    feature.add(featChar);
  }

}