SuffixFeature.java example

Explorer

kpe-master
- src
  - edu
    - stanford
      - nlp
        pipeline
        HunTokenizerAnnotator.java
        MweDictAnnotator.java
        MyCleanXmlAnnotator.java
        NormalizerAnnotator.java
        OwnMorphaAnnotator.java
        OwnPOSTaggerAnnotator.java
        StopWordAnnotator.java
        SzTEAnnotationPipeline.java
        SzTECoreNLP.java
        process
        HunPTBLexer.java
        HunTokenizer.java
        tagger
        maxent
        OwnMaxentTagger.java
        OwnTestSentence.java
  - hu
    - u_szeged

package hu.u_szeged.kpe.features;

import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.readers.DocumentData;

import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;

/**
 * This feature assigns the character 2/3-gram suffixes for each token of the occurrences of the NGrams belonging to a
 * normalized form.
 * <p>
 * Suffix characters are written in reverse order (the last character of the token comes first). Two encodings exist,
 * selected by the {@code employBIESmarkup} flag, which is declared outside this class:
 * <ul>
 * <li>flag set: one feature per token and suffix length, named {@code <class>_<B|I|E|S>_<suffix>};</li>
 * <li>flag unset: one feature per suffix length for the whole n-gram, named {@code <class>_<suffix1>_<suffix2>...},
 * i.e. the per-token suffixes joined with underscores.</li>
 * </ul>
 */
public class SuffixFeature extends Feature {

  private static final long serialVersionUID = 4955319571139381492L;

  /** Number of suffix lengths produced: index 0 collects 2-character suffixes, index 1 collects 3-character ones. */
  private static final int SUFFIX_VARIANTS = 2;

  /**
   * Sets the configuration fields of this feature. The fields themselves are declared outside this class (presumably
   * in {@code Feature} - confirm there for their exact meaning).
   */
  public SuffixFeature() {
    scale = Scale.BINARY;
    dummyValue = -1;
    canBeRepresentedAsSequential = true;
    collectionToStoreDocVals = HashSet.class;
  }

  /**
   * Registers the suffix features of the given n-gram through {@code updateFeatureVals}, each with the value
   * {@code 1.0}.
   * <p>
   * Tokens are lower-cased with {@link Locale#ROOT} so that feature names do not depend on the default locale of the
   * machine. A suffix is extended backwards only while the characters are letters; the last character of a token is
   * always taken as is. An n-gram without tokens produces no feature, and a token whose word is {@code null} or empty
   * contributes no suffix characters (in the concatenated encoding it still contributes its {@code _} separator, so
   * the number of tokens stays visible in the feature name).
   *
   * @param phrase      not used by this feature
   * @param length      not used by this feature
   * @param ngramForm   entry whose key is the n-gram the suffixes are read from
   * @param train       not used by this feature
   * @param docToCheck  passed unchanged to {@code updateFeatureVals}
   * @param listOfHashs not used by this feature
   * @param sentences   not used by this feature
   * @param docs        not used by this feature
   */
  public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck,
      List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) {
    NGram ngram = ngramForm.getKey();
    int ngramLength = ngram.size();
    if (ngramLength == 0) {
      // nothing to describe; previously this ended in a NullPointerException on the never-created buffers
      return;
    }
    String featurePrefix = this.getClass().getName();

    // created up front so that they exist no matter what the first token looks like
    StringBuilder[] concatenatedSuffixes = new StringBuilder[SUFFIX_VARIANTS];
    for (int i = 0; i < concatenatedSuffixes.length; ++i) {
      concatenatedSuffixes[i] = new StringBuilder();
    }

    for (int tokenNumber = 0; tokenNumber < ngramLength; ++tokenNumber) {
      CoreLabel cl = ngram.get(tokenNumber);
      String rawWord = cl.word();
      String word = rawWord == null ? "" : rawWord.toLowerCase(Locale.ROOT);
      int wordLength = word.length();

      if (!employBIESmarkup) {
        // every token opens a new underscore-separated segment in each concatenated suffix
        for (StringBuilder suffix : concatenatedSuffixes) {
          suffix.append('_');
        }
      }
      if (wordLength == 0) {
        // no characters to read; charAt(wordLength - 1) would throw
        continue;
      }
      char lastChar = word.charAt(wordLength - 1);

      if (employBIESmarkup) {
        // position of the token inside the n-gram: Single, Begin, Inside or End
        String type = ngramLength == 1 ? "S" : (tokenNumber == 0 ? "B" : (tokenNumber < ngramLength - 1 ? "I" : "E"));
        StringBuilder tokenSuffix = new StringBuilder().append(lastChar);
        for (int i = 2; i <= wordLength && i < SUFFIX_VARIANTS + 2; ++i) {
          char charToAdd = word.charAt(wordLength - i);
          if (!Character.isLetter(charToAdd)) {
            break;
          }
          tokenSuffix.append(charToAdd);
          updateFeatureVals(featurePrefix + "_" + type + "_" + tokenSuffix, 1.0d, docToCheck);
        }
      } else {
        for (StringBuilder suffix : concatenatedSuffixes) {
          suffix.append(lastChar);
        }
        for (int i = 2; i <= wordLength && i < SUFFIX_VARIANTS + 2; ++i) {
          char charToAdd = word.charAt(wordLength - i);
          if (!Character.isLetter(charToAdd)) {
            break;
          }
          // the i-th character from the end belongs to every suffix of length i or more, i.e. to indices >= i - 2
          for (int s = i - 2; s < concatenatedSuffixes.length; ++s) {
            concatenatedSuffixes[s].append(charToAdd);
          }
        }
      }
    }

    if (!employBIESmarkup) {
      for (StringBuilder suffix : concatenatedSuffixes) {
        updateFeatureVals(featurePrefix + suffix.toString(), 1.0d, docToCheck);
      }
    }
  }
}