package hu.u_szeged.kpe.features;
import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.main.KPEFilter;
import hu.u_szeged.kpe.readers.DocumentData;
import hu.u_szeged.utils.NLPUtils;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.StopWordAnnotator.StopWordAnnotation;
import edu.stanford.nlp.util.CoreMap;
/**
 * Determines the POS sequence for candidate phrases.
 * <p>
 * For every candidate n-gram the feature registers values under keys built from the class name and the POS tags of
 * the n-gram's tokens. Tokens flagged by {@code StopWordAnnotation} are represented by the pseudo tag {@code MISC};
 * all other tags are truncated to their first two characters. Depending on {@code employBIESmarkup} (not declared in
 * this class), either one key per token is produced, carrying a B/I/E/S position marker, or a single key made of the
 * underscore-joined tag sequence of the whole n-gram.
 */
public class PosFeature extends NominalFeature {

  private static final long serialVersionUID = 8124439968184109852L;

  /** Number of leading characters kept from a POS tag. */
  private static final int TAG_PREFIX_LENGTH = 2;

  /**
   * Per-document sum of the position counts seen by {@link #value}; re-created whenever {@code documentToExamine}
   * equals -1 at the time {@link #value} is called.
   */
  private List<Integer> occurrences;

  /**
   * Creates the feature with a numeric scale, a dummy value of -1 and the sequential representation enabled.
   */
  public PosFeature() {
    scale = Scale.NUMERIC;
    dummyValue = -1;
    canBeRepresentedAsSequential = true;
  }

  /**
   * Intentionally empty: this feature reads nothing from the filter.
   *
   * @param kf the filter; ignored
   */
  public void setFeatureField(KPEFilter kf) {
  }

  /**
   * Registers the POS-based feature values of one candidate n-gram for one document.
   * <p>
   * The number of positions of the n-gram is added to the running total of {@code docToCheck} and is also the value
   * passed to {@code updateFeatureVals} for every key generated from the n-gram.
   *
   * @param phrase not used by this feature
   * @param length not used by this feature
   * @param ngramForm the candidate n-gram together with its statistics; its tokens supply the tags and its position
   *        list supplies the occurrence count
   * @param train not used by this feature
   * @param docToCheck index of the document the n-gram belongs to
   * @param listOfHashs not used by this feature
   * @param sentences not used by this feature
   * @param docs the documents being processed; only their number is used, to size the per-document counters
   */
  public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck,
      List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) {
    if (documentToExamine == -1) {
      // Start with a zero counter for each document.
      occurrences = new ArrayList<Integer>(docs.length);
      for (int i = 0; i < docs.length; ++i) {
        occurrences.add(0);
      }
    }
    int occurrence = ngramForm.getValue().getPositions().size();
    occurrences.set(docToCheck, occurrences.get(docToCheck) + occurrence);

    NGram ngram = ngramForm.getKey();
    int ngramSize = ngram.size();
    String featurePrefix = this.getClass().getName();
    // Only filled (and used) when the B/I/E/S markup is switched off.
    StringBuilder tagSequence = new StringBuilder();
    for (int position = 0; position < ngramSize; ++position) {
      String tag = coarseTag(ngram.get(position));
      if (employBIESmarkup) {
        String type = positionMarker(position, ngramSize);
        updateFeatureVals(featurePrefix + "_" + type + "_" + tag, occurrence, docToCheck);
      } else {
        tagSequence.append('_').append(tag);
      }
    }
    if (!employBIESmarkup) {
      updateFeatureVals(featurePrefix + tagSequence.toString(), occurrence, docToCheck);
    }
  }

  /**
   * Averages the per-document, occurrence-normalised feature values.
   * <p>
   * For each document the last value stored in its collection is divided by the occurrence total accumulated in
   * {@link #value}. Documents with no stored value or with a zero occurrence total contribute 0.0.
   *
   * @param docVals the stored values, one collection per document; every collection must be a {@link List}
   * @param phrase not used by this feature
   * @param train not used by this feature
   * @param length not used by this feature
   * @return the result of {@code NLPUtils.mean} over the per-document values
   * @throws ClassCastException if a per-document collection is not a {@link List}
   */
  protected double aggregation(List<Collection<Number>> docVals, String phrase, boolean train, List<int[]> length) {
    int docCount = docVals.size();
    double[] perDocFeatureVals = new double[docCount];
    for (int doc = 0; doc < docCount; ++doc) {
      // Only positional access to the last element is needed, so any List implementation is acceptable.
      List<Number> docNums = (List<Number>) docVals.get(doc);
      if (docNums.isEmpty()) {
        continue;
      }
      int docOccurrences = occurrences.get(doc);
      if (docOccurrences != 0) {
        perDocFeatureVals[doc] = docNums.get(docNums.size() - 1).doubleValue() / docOccurrences;
      }
    }
    return NLPUtils.mean(perDocFeatureVals);
  }

  /** Returns the S/B/I/E marker of the token at {@code position} within an n-gram of {@code ngramSize} tokens. */
  private static String positionMarker(int position, int ngramSize) {
    if (ngramSize == 1) {
      return "S";
    }
    if (position == 0) {
      return "B";
    }
    return position < ngramSize - 1 ? "I" : "E";
  }

  /** Returns {@code "MISC"} for tokens flagged as stop words, otherwise the first two characters of the token's tag. */
  private static String coarseTag(CoreLabel token) {
    if (token.get(StopWordAnnotation.class)) {
      return "MISC";
    }
    String tag = token.tag();
    return tag.substring(0, Math.min(TAG_PREFIX_LENGTH, tag.length()));
  }
}