package hu.u_szeged.kpe.features; import hu.u_szeged.kpe.candidates.NGram; import hu.u_szeged.kpe.candidates.NGramStats; import hu.u_szeged.kpe.main.KPEFilter; import hu.u_szeged.kpe.readers.DocumentData; import hu.u_szeged.utils.NLPUtils; import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.TreeSet; import edu.stanford.nlp.util.CoreMap; /** * Feature classes are responsible for various feature calculation. */ public abstract class Feature implements Serializable { public enum Scale { BINARY, NOMINAL, NUMERIC } private static final long serialVersionUID = 1L; protected static boolean binarizeNominals; protected boolean canBeRepresentedAsSequential; protected boolean employBIESmarkup; public static Map<String, Integer[]> dict; protected int numDoc; public static int numberOfDocs; /** its value represents how many documents have been keyphrased (including the one in progress) **/ protected int documentToExamine = -1; /** * dummy value is the one that is applied in those situations when a phrase was not present in a document at * all **/ protected double dummyValue = 0.0d; protected Class<?> collectionToStoreDocVals = ArrayList.class; protected Scale scale = null; protected Map<String, List<Collection<Number>>> featureVals = new HashMap<String, List<Collection<Number>>>(); protected int maxPhraseLength; public void setBinarization(boolean needToBinarizeNominals) { binarizeNominals = needToBinarizeNominals; } public abstract void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck, List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs); public void setFeatureField(KPEFilter kf) { dict = kf.getDictionary(); numDoc = kf.getDocsNumber(); maxPhraseLength = kf.getMaxPhraseLength(); } public void setEmployBIESmarkup(Boolean newValue) { if (canBeRepresentedAsSequential) { if (newValue == null) { try { throw new Exception("Too little BIES parameters were passed."); } catch (Exception e) { e.printStackTrace(); } } employBIESmarkup = newValue; } } public double getDummyVal() { return dummyValue; } public Map<String, List<Collection<Number>>> getVals() { return featureVals; } protected void updateFeatureVals(Number val, int docToCheck) { if (this instanceof SentiWordnetFeature || val.intValue() != -1) { updateFeatureVals(this.getClass().getName(), val, docToCheck, collectionToStoreDocVals); } } protected void updateFeatureVals(String key, Number val, int docToCheck) { updateFeatureVals(key, val, docToCheck, collectionToStoreDocVals); } @SuppressWarnings("unchecked") protected void updateFeatureVals(String key, Number val, int docToCheck, Class<?> collectionClass) { List<Collection<Number>> docFeatVals = featureVals.get(key); if (docFeatVals == null) { docFeatVals = new ArrayList<Collection<Number>>(numberOfDocs); for (int i = 0; i < numberOfDocs; ++i) { try { docFeatVals.add((Collection<Number>) collectionClass.newInstance()); } catch (Exception e) { e.printStackTrace(); } } featureVals.put(key, docFeatVals); } Collection<Number> vals = docFeatVals.get(docToCheck); if (vals instanceof Set<?> || vals instanceof LinkedList<?>) { vals.add(val == null ? null : val.doubleValue()); } else if (vals instanceof ArrayList<?>) { if (vals.size() > 0) { double lastVal = ((Number) ((ArrayList<?>) vals).get(vals.size() - 1)).doubleValue(); vals.add(val.doubleValue() + lastVal); } else { vals.add(val); } } else { System.err.println("No such collection was expected."); } documentToExamine = docToCheck; } protected double aggregation(List<Collection<Number>> docVals, String phrase, boolean train, List<int[]> length) { double[] perDocFeatureVals = new double[docVals.size()]; for (int doc = 0; doc < docVals.size(); ++doc) { Collection<Number> docNums = docVals.get(doc); if (docNums instanceof TreeSet<?>) { perDocFeatureVals[doc] = ((TreeSet<Number>) docVals.get(doc)).first().doubleValue(); } else if (docNums instanceof HashSet<?>) { perDocFeatureVals[doc] = docNums.contains(1.0d) ? 1.0d : 0.0d; } else if (docNums instanceof ArrayList<?>) { perDocFeatureVals[doc] = ((ArrayList<Number>) docNums).get(docNums.size() - 1).doubleValue(); } else if (docNums instanceof LinkedList<?>) { perDocFeatureVals[doc] = NLPUtils.mean(docVals.get(doc)); } else { System.err.println("Unexpected collection type " + docNums.getClass() + "\t" + this.getClass()); } } return NLPUtils.mean(perDocFeatureVals); } public Map<String, Double> aggregateVals(boolean train, String token, List<int[]> length, double[] dedicatedFeatures) { Map<String, Double> aggregatedVals = new HashMap<String, Double>(); for (Entry<String, List<Collection<Number>>> entry : featureVals.entrySet()) { double featureVal = aggregation(entry.getValue(), token, train, length); aggregatedVals.put(entry.getKey(), featureVal); if (this instanceof TfIdfFeature) dedicatedFeatures[0] = featureVal; else if (this instanceof FirstIndexFeature) dedicatedFeatures[1] = featureVal; } // reset it so the next time a set of documents are to be keyphrased this flag can start to count from the // beginning documentToExamine = -1; featureVals = new HashMap<String, List<Collection<Number>>>(); return aggregatedVals; } }