WikiFeature.java example

Explorer

kpe-master
- src
  - edu
    - stanford
      - nlp
        pipeline
        HunTokenizerAnnotator.java
        MweDictAnnotator.java
        MyCleanXmlAnnotator.java
        NormalizerAnnotator.java
        OwnMorphaAnnotator.java
        OwnPOSTaggerAnnotator.java
        StopWordAnnotator.java
        SzTEAnnotationPipeline.java
        SzTECoreNLP.java
        process
        HunPTBLexer.java
        HunTokenizer.java
        tagger
        maxent
        OwnMaxentTagger.java
        OwnTestSentence.java
  - hu
    - u_szeged

package hu.u_szeged.kpe.features;

import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGram.SequenceType;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.main.KPEFilter;
import hu.u_szeged.kpe.readers.DocumentData;
import hu.u_szeged.utils.NLPUtils;
import hu.u_szeged.utils.WikiQuery;
import hu.u_szeged.utils.WikiQuery.QueryType;

import java.util.AbstractSequentialList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.NormalizerAnnotator.NormalizerAnnotation;
import edu.stanford.nlp.pipeline.StopWordAnnotator.StopWordAnnotation;
import edu.stanford.nlp.util.CoreMap;

/**
 * This class sets the feature which indicates whether a Wikipedia article could be assigned to an NGram.
 * <p>
 * For the Wikipedia form of a candidate NGram the categories of the matching article are queried via
 * {@link WikiQuery}, normalized, and one binary feature value is emitted per normalized category.
 */
public class WikiFeature extends Feature {
  private static final long serialVersionUID = 1L;

  /**
   * Maps a lower-cased article name to the set of its normalized category names, so that each article is
   * queried and normalized at most once. The map is created eagerly: it is static, hence it is not restored by
   * deserialization, and lazy creation in {@link #setFeatureField(KPEFilter)} would leave it {@code null}
   * whenever that method is not invoked before {@code value(...)}.
   * <p>
   * NOTE(review): a plain {@link HashMap} shared by all instances; not safe for concurrent use -- confirm
   * that features are computed from a single thread.
   */
  private static final Map<String, Set<String>> categoryCache = new HashMap<>();

  // TODO Q: Should the usage of categoryCache be limited somehow (e.g. by
  // constraining it not to become extremely big) when there are lots of
  // documents?? A: Probably, we shall return to this question after the first
  // OutOfMemoryException happened.

  /**
   * This pattern is used to remove prefixes of category links and optional suffix parts in parenthesis
   */
  private static final Pattern p = Pattern.compile("(?i)(^(category:|portal:)|\\s\\([^()]+\\))");

  /** Matches (as a whole) any token containing at least one digit; compiled once instead of per token. */
  private static final Pattern CONTAINS_DIGIT = Pattern.compile(".*\\d.*");

  public WikiFeature() {
    scale = Scale.BINARY;
    collectionToStoreDocVals = HashSet.class;
  }

  /**
   * Nothing to set up here: the category cache is initialized eagerly when the class is loaded.
   *
   * @param kf unused
   */
  public void setFeatureField(KPEFilter kf) {
    // intentionally empty, kept for interface compatibility
  }

  /**
   * Returns the normalized category names of the given article, consulting the cache first and querying
   * {@link WikiQuery} only on a cache miss.
   *
   * @param articleName name of the article; it is lower-cased (locale-independently) before being used as
   *        cache key and query string
   * @return the (cached, shared) set of normalized category names; possibly empty
   */
  private Set<String> getNormalizedWikiCategories(String articleName) {
    // Locale.ROOT keeps cache keys and query strings stable regardless of the JVM's default locale.
    String key = articleName.toLowerCase(Locale.ROOT);
    Set<String> normalizedCategories = categoryCache.get(key);
    if (normalizedCategories != null) {
      return normalizedCategories;
    }

    normalizedCategories = new HashSet<>();
    @SuppressWarnings("unchecked")
    List<Object> categories = (List<Object>) WikiQuery.performQuery(key, QueryType.CATEGORY);
    // The result alternates category name (0-based even index) and its count (0-based odd index).
    // The very last entry is not needed, as it contains the sum of the counts.
    for (int i = 0; i < categories.size() - 1; i += 2) {
      String joinedVersion = normalizeCategory((String) categories.get(i));
      if (joinedVersion.length() > 0) {
        normalizedCategories.add(joinedVersion);
      }
    }
    categoryCache.put(key, normalizedCategories);
    return normalizedCategories;
  }

  /**
   * Normalizes one raw category name: strips the prefix/parenthesized suffix matched by {@link #p}, keeps the
   * normalized form of those tokens that contain no digit, carry a tag starting with "NN" and are not
   * stop words, then sorts the kept forms and joins them with {@link NLPUtils#join}.
   *
   * @param rawCategory category name as returned by the query
   * @return the joined normalized tokens; empty if no token was kept
   */
  private static String normalizeCategory(String rawCategory) {
    Matcher m = p.matcher(rawCategory);
    String category = m.replaceAll("");
    List<String> tokens = new LinkedList<>();
    for (CoreLabel cl : new NGram(category)) {
      if (!CONTAINS_DIGIT.matcher(cl.word()).matches() && cl.tag().startsWith("NN")
          && !cl.get(StopWordAnnotation.class)) {
        tokens.add(cl.get(NormalizerAnnotation.class));
      }
    }
    // Sorting makes the result independent of the token order within the category name.
    Collections.sort(tokens);
    return NLPUtils.join(tokens);
  }

  /**
   * Emits a feature value of 1.0 for every normalized Wikipedia category of the candidate's Wikipedia form.
   * The feature name is this class' name, an underscore and the normalized category.
   * <p>
   * Only {@code ngramForm} and {@code docToCheck} are used by this implementation; the remaining parameters
   * are ignored.
   *
   * @param ngramForm entry whose key provides the Wikipedia form of the candidate
   * @param docToCheck passed through to {@code updateFeatureVals}
   */
  public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck,
      List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) {
    String wikiForm = ngramForm.getKey().getSequenceAsString(SequenceType.WIKI_FROM).toLowerCase(Locale.ROOT);
    for (String category : getNormalizedWikiCategories(wikiForm)) {
      updateFeatureVals(this.getClass().getName() + "_" + category, 1.0d, docToCheck);
    }
  }
}