package hu.u_szeged.kpe.features;
import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGram.SequenceType;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.main.KPEFilter;
import hu.u_szeged.kpe.readers.DocumentData;
import hu.u_szeged.utils.NLPUtils;
import hu.u_szeged.utils.WikiQuery;
import hu.u_szeged.utils.WikiQuery.QueryType;
import java.util.AbstractSequentialList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.NormalizerAnnotator.NormalizerAnnotation;
import edu.stanford.nlp.pipeline.StopWordAnnotator.StopWordAnnotation;
import edu.stanford.nlp.util.CoreMap;
/**
 * Binary feature indicating whether a Wikipedia article (and hence a set of Wikipedia
 * categories) could be assigned to an NGram candidate. One feature value is emitted per
 * normalized category of the matching article.
 */
public class WikiFeature extends Feature {

  private static final long serialVersionUID = 1L;

  /**
   * Maps lower-cased article names to their set of normalized category strings, so repeated
   * candidates do not trigger repeated Wikipedia queries.
   * NOTE(review): shared, unsynchronized static state — not thread-safe; confirm the feature
   * extraction pipeline is single-threaded before relying on this.
   */
  private static Map<String, Set<String>> categoryCache;

  // TODO Q: Should the usage of categoryCache be limited somehow (e.g. by
  // constraining it not to become extremely big) when there are lots of
  // documents?? A: Probably, we shall return to this question after the first
  // OutOfMemoryException happened.

  public WikiFeature() {
    scale = Scale.BINARY;
    collectionToStoreDocVals = HashSet.class;
  }

  /**
   * Lazily initializes the shared category cache; subsequent calls are no-ops.
   *
   * @param kf the filter this feature belongs to (unused here)
   */
  public void setFeatureField(KPEFilter kf) {
    if (categoryCache == null) {
      categoryCache = new HashMap<>();
    }
  }

  /**
   * This pattern is used to remove prefixes of category links ("category:"/"portal:") and
   * optional suffix parts in parenthesis (e.g. disambiguation tags).
   */
  private static final Pattern CATEGORY_CLEANUP = Pattern.compile("(?i)(^(category:|portal:)|\\s\\([^()]+\\))");

  /**
   * Returns the normalized Wikipedia categories of the given article, querying Wikipedia and
   * populating {@link #categoryCache} on a cache miss.
   * <p>
   * A category is normalized by stripping the "category:"/"portal:" prefix and any
   * parenthesized suffix, keeping only non-numeric, non-stopword noun tokens in their
   * normalized (lemma-like) form, sorting them, and joining them into a single string.
   *
   * @param articleName the article name to look up (matched case-insensitively)
   * @return the set of normalized category strings; possibly empty, never {@code null}
   */
  private Set<String> getNormalizedWikiCategories(String articleName) {
    // Lower-case once and reuse for lookup, query and store (original recomputed it 3 times).
    // NOTE(review): default-locale toLowerCase — consider Locale.ROOT if article names may
    // be processed under e.g. a Turkish locale.
    String cacheKey = articleName.toLowerCase();
    Set<String> normalizedCategories = categoryCache.get(cacheKey);
    if (normalizedCategories == null) {
      normalizedCategories = new HashSet<>();
      // Suppression narrowed to the single unchecked cast of the untyped query result.
      @SuppressWarnings("unchecked")
      List<Object> categories = (List<Object>) WikiQuery.performQuery(cacheKey, QueryType.CATEGORY);
      // += 2 is used as every 2nd (even) Object is a count, while the odd indices stand for category names
      // the very last entry is not needed, as it contains the sum of the counts
      for (int i = 0; i < categories.size() - 1; i += 2) {
        String category = (String) categories.get(i);
        Matcher m = CATEGORY_CLEANUP.matcher(category);
        category = m.replaceAll("");
        List<String> tokens = new LinkedList<>();
        for (CoreLabel cl : new NGram(category)) {
          // Keep only noun tokens that contain no digit and are not stopwords.
          if (!cl.word().matches(".*\\d.*") && cl.tag().startsWith("NN") && !cl.get(StopWordAnnotation.class)) {
            tokens.add(cl.get(NormalizerAnnotation.class));
          }
        }
        Collections.sort(tokens);
        String joinedVersion = NLPUtils.join(tokens);
        if (joinedVersion.length() > 0) {
          normalizedCategories.add(joinedVersion);
        }
      }
      categoryCache.put(cacheKey, normalizedCategories);
    }
    return normalizedCategories;
  }

  /**
   * Emits a binary feature (value 1.0) for each normalized Wikipedia category that can be
   * assigned to the candidate n-gram's Wikipedia-form string.
   */
  public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck,
      List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) {
    String wikiForm = ngramForm.getKey().getSequenceAsString(SequenceType.WIKI_FROM).toLowerCase();
    for (String category : getNormalizedWikiCategories(wikiForm)) {
      updateFeatureVals(this.getClass().getName() + "_" + category, 1.0d, docToCheck);
    }
  }
}