package hu.u_szeged.kpe.features; import hu.u_szeged.kpe.candidates.NGram; import hu.u_szeged.kpe.candidates.NGramStats; import hu.u_szeged.kpe.readers.DocumentData; import java.util.AbstractMap.SimpleEntry; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation; import edu.stanford.nlp.pipeline.MweDictAnnotator.MWEAnnotation; import edu.stanford.nlp.util.CoreMap; /** * This class makes use of a list, crawled from Wikipedia containing MWEs and identifies NGrams that can be * mapped to any element of it. */ public class MweFeature extends Feature { /** * */ private static final long serialVersionUID = 3440662396687971296L; public MweFeature() { scale = Scale.BINARY; collectionToStoreDocVals = HashSet.class; } @Override public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck, List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) { boolean isMWE = false, isCompoundMwe = false, containsMwe = false; List<Entry<Character, Integer>> types = new ArrayList<Entry<Character, Integer>>(); List<String> markUps = new ArrayList<String>(); List<int[]> spans = new ArrayList<int[]>(); for (int i = 0; i < ngramForm.getKey().size(); ++i) { String mweAnnotation = ngramForm.getKey().get(i).get(MWEAnnotation.class); String[] mweAnnotationParts = mweAnnotation.split("@"); int removed = 0; for (int p = 0; p < mweAnnotationParts.length; ++p) { char type = mweAnnotationParts[p].charAt(0); String markUp = mweAnnotationParts[p].replaceAll("[BIE]-", ""); if (i == 0 && type != 'E' && type != 'O') { types.add(new SimpleEntry<Character, Integer>(type, i)); markUps.add(markUp); } else if (i > 0) { if (type == 'E') { Entry<Character, Integer> t = types.remove(p - removed); markUps.remove(p - removed); removed++; boolean acceptablePOSsequence = true; for (int token = t.getValue(); token <= i; ++token) { String tag = ngramForm.getKey().get(token).get(PartOfSpeechAnnotation.class); if ((token == t.getValue() || token == i) && !tag.matches("(?i)nn.{0,2}|jj.?")) { acceptablePOSsequence = false; break; } else if (!tag.matches("(?i)nn.{0,2}|jj.?|in|pos")) { acceptablePOSsequence = false; break; } } if (acceptablePOSsequence) { containsMwe = containsMwe || t.getKey() == 'B'; boolean fullScope = t.getValue() == 0 && i == ngramForm.getKey().size() - 1; if (t.getKey() == 'B' && !fullScope) spans.add(new int[] { t.getValue(), i }); isMWE = isMWE || (t.getKey() == 'B' && fullScope); } } else if (type == 'B') { types.add(new SimpleEntry<Character, Integer>(type, i)); markUps.add(markUp); } } } } if (spans.size() > 1) { for (int[] span : spans) { for (int[] span2 : spans) { if (span[1] >= span2[0] && span[0] == 0 && span2[1] == ngramForm.getKey().size() - 1) { isCompoundMwe = true; } } } } updateFeatureVals(this.getClass().getName() + "_NGram", isMWE ? 1.0d : 0.0d, docToCheck); updateFeatureVals(this.getClass().getName() + "_compoundNGram", isCompoundMwe ? 1.0d : 0.0d, docToCheck); updateFeatureVals(this.getClass().getName() + "_Containment", containsMwe ? 1.0d : 0.0d, docToCheck); } }