AcronymFeature.java example

Explorer

kpe-master
- src
  - edu
    - stanford
      - nlp
        pipeline
        HunTokenizerAnnotator.java
        MweDictAnnotator.java
        MyCleanXmlAnnotator.java
        NormalizerAnnotator.java
        OwnMorphaAnnotator.java
        OwnPOSTaggerAnnotator.java
        StopWordAnnotator.java
        SzTEAnnotationPipeline.java
        SzTECoreNLP.java
        process
        HunPTBLexer.java
        HunTokenizer.java
        tagger
        maxent
        OwnMaxentTagger.java
        OwnTestSentence.java
  - hu
    - u_szeged

package hu.u_szeged.kpe.features;

import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGram.SequenceType;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.readers.DocumentData;
import hu.u_szeged.utils.NLPUtils;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;

import edu.stanford.nlp.util.CoreMap;

/**
 * Decides whether a given keyphrase aspirant is an extended form of an acronym present in its document.
 */
public class AcronymFeature extends Feature {
  private static final long serialVersionUID = -499383355487365213L;

  /** Acronyms collected from the documents handed to {@code value}; {@code null} until first use and after {@code aggregation}. */
  private Set<String> abbreviations;

  /**
   * Creates the feature as a binary one whose per-document values are stored in a {@link HashSet}.
   */
  public AcronymFeature() {
    scale = Scale.BINARY;
    collectionToStoreDocVals = HashSet.class;
  }

  /**
   * Method that checks whether a long term (of more than one token) could be an extended form of a possible acronym.<br />
   * Initials of the individual tokens of the longer form must be consistent with the abbreviation in the sense that they must not be such characters
   * that are not a successive character of the substring of the abbreviation that is not covered.<br />
   * E.g. for pairs
   * <ul>
   * <li>("UN", "United Nations") the result would be 1.0d</li>
   * <li>("UN", "United Kingdom") 0.0d</li>
   * <li>("UNO", "United Organization") gives the result of 1.0 as well</li>
   * </ul>
   * Both arguments are lower-cased independently of the default locale. Punctuation is removed from the acronym. In the long term, hyphens are
   * handled via {@code NLPUtils.join} (presumably re-joining the parts with spaces -- confirm against that helper), the word "and" between tokens
   * is dropped, and whitespace is collapsed and trimmed so that every space separates exactly two tokens.<br />
   * The long term is then scanned left to right, greedily matching the next uncovered acronym character. When a token initial does not match, or
   * the acronym is used up before the last token starts, the most recent match is retracted and excluded from further matching.
   * 
   * @param acronym
   *          abbreviation or acronym to check
   * @param longTerm
   *          possible extension of the shorter form of the other parameter
   * @return 1.0 if the longer term could be an extended form of the shorter form in question, 0.0 otherwise (including when the long term has
   *         fewer than two tokens or literally starts with the acronym)
   */
  public static double checkForAcronymity(String acronym, String longTerm) {
    // Locale.ROOT keeps the case folding stable regardless of the JVM's default locale.
    acronym = acronym.toLowerCase(Locale.ROOT).replaceAll("\\p{Punct}", "");
    longTerm = NLPUtils.join(longTerm.split("-")).toLowerCase(Locale.ROOT);
    longTerm = longTerm.replaceAll("\\s+and\\s+", " ");
    // Leading, trailing or repeated whitespace would otherwise produce token borders that sit on a space
    // character or past the end of the string, which makes a successful match impossible.
    longTerm = longTerm.replaceAll("\\s+", " ").trim();

    int firstSpace = longTerm.indexOf(' ');
    // An empty acronym is rejected here as well, because every string starts with "".
    if (firstSpace < 0 || longTerm.startsWith(acronym)) {
      return 0.0;
    }

    int length = longTerm.length();
    // tokenStart[i] is true when position i holds the first character of a token.
    boolean[] tokenStart = new boolean[length];
    tokenStart[0] = true;
    int lastTokenStart = 0;
    for (int space = firstSpace; space != -1; space = longTerm.indexOf(' ', space + 1)) {
      lastTokenStart = space + 1;
      tokenStart[lastTokenStart] = true;
    }

    // Positions of the long term currently matched to acronym characters, most recent on top.
    // The initial 0 is a sentinel: popping a 0 at a mismatching token initial means nothing is left to retract.
    Deque<Integer> matchedPositions = new ArrayDeque<Integer>();
    matchedPositions.push(0);
    // matchedSoFar[i] is the number of acronym characters covered after processing position i.
    int[] matchedSoFar = new int[length];
    // Positions whose match was retracted once and must not be matched again.
    boolean[] excluded = new boolean[length];

    for (int c = 0; c < length; ++c) {
      if (c > 0) {
        matchedSoFar[c] = matchedSoFar[c - 1];
      }
      boolean matches = longTerm.charAt(c) == acronym.charAt(matchedSoFar[c]);
      if (tokenStart[c]) {
        if (!matches) {
          // A token initial has to continue the acronym; undo the latest match and resume right after it.
          int retracted = matchedPositions.pop();
          if (retracted == 0) {
            return 0.0;
          }
          excluded[retracted] = true;
          for (int index = retracted; index <= c; ++index) {
            matchedSoFar[index]--;
          }
          c = retracted;
        } else {
          matchedSoFar[c]++;
          matchedPositions.push(c);
        }
      } else if (matches && !excluded[c]) {
        matchedSoFar[c]++;
        matchedPositions.push(c);
      }

      if (matchedSoFar[c] == acronym.length()) {
        if (c < lastTokenStart) {
          // The acronym was used up before the last token began: undo the latest match and keep scanning.
          int retracted = matchedPositions.pop();
          excluded[retracted] = true;
          for (int index = retracted; index <= c; ++index) {
            matchedSoFar[index]--;
          }
          c = retracted;
        } else {
          return 1.0;
        }
      }
    }
    return 0.0;
  }

  /**
   * Records 1 for the candidate if its original surface form (with a trailing "s" or "-s" stripped) is accepted by
   * {@link #checkForAcronymity(String, String)} for at least one known acronym, and 0 otherwise.<br />
   * The acronym set is built from {@code docs} on the first call and reused until {@code aggregation} clears it. The parameters {@code phrase},
   * {@code length}, {@code train}, {@code listOfHashs} and {@code sentences} are not used by this feature.
   * 
   * @param ngramForm
   *          entry whose key supplies the original character sequence of the candidate
   * @param docToCheck
   *          passed through unchanged to {@code updateFeatureVals}
   * @param docs
   *          documents whose acronym keys are collected when no acronym set is cached yet
   */
  public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck,
      List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) {

    if (abbreviations == null) {
      abbreviations = new HashSet<String>();
      for (DocumentData doc : docs) {
        abbreviations.addAll(doc.getAcronyms().keySet());
      }
    }

    String original = ngramForm.getKey().getSequenceAsString(SequenceType.ORIGINAL).replaceAll("-?s$", "");
    int matchingAcronyms = 0;

    // The feature is binary, so the first matching acronym is enough.
    for (String acronym : abbreviations) {
      if (checkForAcronymity(acronym, original) == 1.0) {
        matchingAcronyms = 1;
        break;
      }
    }
    updateFeatureVals(matchingAcronyms, docToCheck);
  }

  /**
   * Drops the cached acronym set, so that the next call of {@code value} rebuilds it from its documents, and then delegates to the superclass
   * aggregation.
   */
  protected double aggregation(List<Collection<Number>> docVals, String phrase, boolean train, List<int[]> length) {
    abbreviations = null;
    return super.aggregation(docVals, phrase, train, length);
  }
}