package hu.u_szeged.kpe.features;
import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.readers.DocumentData;
import hu.u_szeged.utils.NLPUtils;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.StopWordAnnotator.StopWordAnnotation;
import edu.stanford.nlp.util.CoreMap;
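/**
* Indicates how often a phrase occurred in an original form (i.e. before having been converted into its normalized
* form) that contained at least one stopword: for each document the number of such occurrences is related to the
* total number of occurrences of the phrase, and the per-document ratios are averaged.
*/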
public class StopWordFeature extends Feature {

  private static final long serialVersionUID = 8414326148202027597L;

  /**
   * Total number of occurrences seen so far, indexed by document. Created lazily by the first
   * {@code value} call and reset to {@code null} at the end of {@code aggregation}.
   */
  private List<Integer> occurrences;

  /** Creates the feature and marks it as numeric. */
  public StopWordFeature() {
    scale = Scale.NUMERIC;
  }

  /**
   * Records the occurrences of one n-gram form of a phrase for the given document.
   * <p>
   * The number of positions of the form is always added to the per-document occurrence counter. Only when at least
   * one token of the form is flagged as a stopword is the same number additionally passed to
   * {@code updateFeatureVals}.
   *
   * @param phrase      the (normalized) phrase; not read by this feature
   * @param length      not read by this feature
   * @param ngramForm   the n-gram form and its statistics; the size of its position collection is the occurrence count
   * @param train       not read by this feature
   * @param docToCheck  index of the document the occurrences belong to
   * @param listOfHashs not read by this feature
   * @param sentences   not read by this feature
   * @param docs        the documents under consideration; only their number is used, to size the counter list
   */
  public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck,
      List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) {
    int numOfOccurrences = ngramForm.getValue().getPositions().size();
    int numOfDocs = docs == null ? 0 : docs.length;
    if (occurrences == null) {
      occurrences = new ArrayList<Integer>(Math.max(numOfDocs, docToCheck + 1));
    }
    // Pad with zeros so that docToCheck is always addressable, even if it lies beyond docs.length.
    int requiredSize = Math.max(numOfDocs, docToCheck + 1);
    while (occurrences.size() < requiredSize) {
      occurrences.add(0);
    }
    occurrences.set(docToCheck, occurrences.get(docToCheck) + numOfOccurrences);

    if (containsStopWord(ngramForm.getKey())) {
      updateFeatureVals(numOfOccurrences, docToCheck);
    }
  }

  /**
   * Tells whether any token of the n-gram carries a {@code true} stopword annotation. Tokens without the annotation
   * are treated as non-stopwords instead of failing on unboxing a {@code null} value.
   */
  private static boolean containsStopWord(NGram ngram) {
    for (CoreLabel token : ngram) {
      if (Boolean.TRUE.equals(token.get(StopWordAnnotation.class))) {
        return true;
      }
    }
    return false;
  }

  /**
   * Computes, for every document, the last stored feature value divided by the total number of occurrences recorded
   * for that document, and returns the mean of these ratios. Documents without stored values or without recorded
   * occurrences contribute 0. The occurrence counters are cleared afterwards.
   * <p>
   * NOTE(review): only the last stored value per document is used; this presumably relies on how
   * {@code updateFeatureVals} stores values - confirm against {@code Feature}.
   *
   * @param docVals the values stored for this feature, one collection per document
   * @param phrase  not read by this feature
   * @param train   not read by this feature
   * @param length  not read by this feature
   * @return the mean of the per-document ratios as computed by {@code NLPUtils.mean}
   */
  protected double aggregation(List<Collection<Number>> docVals, String phrase, boolean train, List<int[]> length) {
    double[] perDocVals = new double[docVals.size()];
    for (int doc = 0; doc < perDocVals.length; ++doc) {
      Number numHavingStopword = lastValue(docVals.get(doc));
      // The counters may be missing (no value() call since the last aggregation) or shorter than docVals.
      int total = occurrences != null && doc < occurrences.size() ? occurrences.get(doc) : 0;
      if (numHavingStopword != null && total > 0) {
        perDocVals[doc] = numHavingStopword.doubleValue() / total;
      }
    }
    occurrences = null;
    return NLPUtils.mean(perDocVals);
  }

  /**
   * Returns the last element of the collection in iteration order, or {@code null} if the collection is
   * {@code null} or empty. Works for any collection type rather than requiring an {@code ArrayList}.
   */
  private static Number lastValue(Collection<Number> values) {
    if (values == null || values.isEmpty()) {
      return null;
    }
    if (values instanceof List) {
      return ((List<Number>) values).get(values.size() - 1);
    }
    Number last = null;
    for (Number value : values) {
      last = value;
    }
    return last;
  }
}