/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.ngram;

import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedList;

import opennlp.tools.util.StringList;

/**
 * Utility class for ngrams.
 * Some methods apply specifically to certain 'n' values, e.g. tri/bi/uni-grams.
 */
public class NGramUtils {

  /**
   * Calculates the probability of an ngram in a vocabulary using the Laplace smoothing algorithm.
   *
   * @param ngram the ngram to get the probability for
   * @param set   the vocabulary
   * @param k     the smoothing factor
   * @return the Laplace smoothing probability
   * @see <a href="https://en.wikipedia.org/wiki/Additive_smoothing">Additive Smoothing</a>
   */
  public static double calculateLaplaceSmoothingProbability(StringList ngram,
      Iterable<StringList> set, Double k) {
    return (count(ngram, set) + k) / (count(getNMinusOneTokenFirst(ngram), set) + k * 1);
  }

  /**
   * Calculates the probability of a unigram in a vocabulary using maximum likelihood estimation.
   *
   * @param word the only word in the unigram
   * @param set  the vocabulary
   * @return the maximum likelihood probability
   */
  public static double calculateUnigramMLProbability(String word, Collection<StringList> set) {
    double vocSize = 0d;
    for (StringList s : set) {
      vocSize += s.size();
    }
    return count(new StringList(word), set) / vocSize;
  }

  /**
   * Calculates the probability of a bigram in a vocabulary using maximum likelihood estimation.
   *
   * @param x0  first word in the bigram
   * @param x1  second word in the bigram
   * @param set the vocabulary
   * @return the maximum likelihood probability
   */
  public static double calculateBigramMLProbability(String x0, String x1,
      Collection<StringList> set) {
    return calculateNgramMLProbability(new StringList(x0, x1), set);
  }

  /**
   * Calculates the probability of a trigram in a vocabulary using maximum likelihood estimation.
   *
   * @param x0  first word in the trigram
   * @param x1  second word in the trigram
   * @param x2  third word in the trigram
   * @param set the vocabulary
   * @return the maximum likelihood probability
   */
  public static double calculateTrigramMLProbability(String x0, String x1, String x2,
      Iterable<StringList> set) {
    return calculateNgramMLProbability(new StringList(x0, x1, x2), set);
  }

  /**
   * Calculates the probability of an ngram in a vocabulary using maximum likelihood estimation.
   *
   * @param ngram an ngram
   * @param set   the vocabulary
   * @return the maximum likelihood probability
   */
  public static double calculateNgramMLProbability(StringList ngram, Iterable<StringList> set) {
    StringList ngramMinusOne = getNMinusOneTokenFirst(ngram);
    return count(ngram, set) / count(ngramMinusOne, set);
  }
  /**
   * Calculates the probability of a bigram in a vocabulary using the prior Laplace smoothing algorithm.
   *
   * @param x0  the first word in the bigram
   * @param x1  the second word in the bigram
   * @param set the vocabulary
   * @param k   the smoothing factor
   * @return the prior Laplace smoothing probability
   */
  public static double calculateBigramPriorSmoothingProbability(String x0, String x1,
      Collection<StringList> set, Double k) {
    return (count(new StringList(x0, x1), set) + k * calculateUnigramMLProbability(x1, set))
        / (count(new StringList(x0), set) + k * set.size());
  }

  /**
   * Calculates the probability of a trigram in a vocabulary using a linear interpolation algorithm.
   *
   * @param x0      the first word in the trigram
   * @param x1      the second word in the trigram
   * @param x2      the third word in the trigram
   * @param set     the vocabulary
   * @param lambda1 trigram interpolation factor
   * @param lambda2 bigram interpolation factor
   * @param lambda3 unigram interpolation factor
   * @return the linear interpolation probability
   */
  public static double calculateTrigramLinearInterpolationProbability(String x0, String x1,
      String x2, Collection<StringList> set, Double lambda1, Double lambda2, Double lambda3) {
    assert lambda1 + lambda2 + lambda3 == 1 : "lambdas sum should be equal to 1";
    assert lambda1 > 0 && lambda2 > 0 && lambda3 > 0 : "lambdas should all be greater than 0";

    return lambda1 * calculateTrigramMLProbability(x0, x1, x2, set)
        + lambda2 * calculateBigramMLProbability(x1, x2, set)
        + lambda3 * calculateUnigramMLProbability(x2, set);
  }

  /**
   * Calculates the probability of an ngram in a vocabulary using the missing probability mass algorithm.
   *
   * @param ngram    the ngram
   * @param discount discount factor
   * @param set      the vocabulary
   * @return the probability
   */
  public static double calculateMissingNgramProbabilityMass(StringList ngram, Double discount,
      Iterable<StringList> set) {
    Double missingMass = 0d;
    Double countWord = count(ngram, set);
    for (String word : flatSet(set)) {
      missingMass += (count(getNPlusOneNgram(ngram, word), set) - discount) / countWord;
    }
    return 1 - missingMass;
  }

  /**
   * Gets the (n-1)-gram of a given ngram, that is, the same ngram without its last word.
   *
   * @param ngram an ngram
   * @return an ngram
   */
  public static StringList getNMinusOneTokenFirst(StringList ngram) {
    String[] tokens = new String[ngram.size() - 1];
    for (int i = 0; i < ngram.size() - 1; i++) {
      tokens[i] = ngram.getToken(i);
    }
    return tokens.length > 0 ? new StringList(tokens) : null;
  }

  /**
   * Gets the (n-1)-gram of a given ngram, that is, the same ngram without its first word.
   *
   * @param ngram an ngram
   * @return an ngram
   */
  public static StringList getNMinusOneTokenLast(StringList ngram) {
    String[] tokens = new String[ngram.size() - 1];
    for (int i = 1; i < ngram.size(); i++) {
      tokens[i - 1] = ngram.getToken(i);
    }
    return tokens.length > 0 ? new StringList(tokens) : null;
  }
  private static StringList getNPlusOneNgram(StringList ngram, String word) {
    String[] tokens = new String[ngram.size() + 1];
    for (int i = 0; i < ngram.size(); i++) {
      tokens[i] = ngram.getToken(i);
    }
    tokens[tokens.length - 1] = word;
    return new StringList(tokens);
  }

  private static Double count(StringList ngram, Iterable<StringList> sentences) {
    Double count = 0d;
    for (StringList sentence : sentences) {
      // locate the first occurrence of the ngram's first token in the sentence
      int idx0 = indexOf(sentence, ngram.getToken(0));
      if (idx0 >= 0 && sentence.size() >= idx0 + ngram.size()) {
        // check that the remaining ngram tokens follow at consecutive positions
        boolean match = true;
        for (int i = 1; i < ngram.size(); i++) {
          String sentenceToken = sentence.getToken(idx0 + i);
          String ngramToken = ngram.getToken(i);
          match &= sentenceToken.equals(ngramToken);
        }
        if (match) {
          count++;
        }
      }
    }
    return count;
  }

  private static int indexOf(StringList sentence, String token) {
    for (int i = 0; i < sentence.size(); i++) {
      if (token.equals(sentence.getToken(i))) {
        return i;
      }
    }
    return -1;
  }

  private static Collection<String> flatSet(Iterable<StringList> set) {
    Collection<String> flatSet = new HashSet<>();
    for (StringList sentence : set) {
      for (String word : sentence) {
        flatSet.add(word);
      }
    }
    return flatSet;
  }

  /**
   * Gets the ngrams of dimension n of a certain input sequence of tokens.
   *
   * @param sequence a sequence of tokens
   * @param size     the size of the resulting ngrams
   * @return all the possible ngrams of the given size derivable from the input sequence
   */
  public static Collection<StringList> getNGrams(StringList sequence, int size) {
    Collection<StringList> ngrams = new LinkedList<>();
    if (size == -1 || size >= sequence.size()) {
      ngrams.add(sequence);
    } else {
      String[] ngram = new String[size];
      for (int i = 0; i < sequence.size() - size + 1; i++) {
        ngram[0] = sequence.getToken(i);
        for (int j = 1; j < size; j++) {
          ngram[j] = sequence.getToken(i + j);
        }
        ngrams.add(new StringList(ngram));
      }
    }
    return ngrams;
  }

}
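
/*
 * Illustrative usage sketch (not part of the class above): builds a tiny vocabulary of
 * tokenized sentences and queries a few of the helpers. The sentences and the smoothing
 * factor are made-up example values; java.util.Arrays would need to be imported.
 *
 *   Collection<StringList> vocabulary = Arrays.asList(
 *       new StringList("the", "cat", "sat"),
 *       new StringList("the", "dog", "sat"));
 *
 *   // maximum likelihood estimate of P("cat" | "the")
 *   double bigramMl = NGramUtils.calculateBigramMLProbability("the", "cat", vocabulary);
 *
 *   // Laplace-smoothed probability of the bigram ("the", "cat") with k = 1
 *   double laplace = NGramUtils.calculateLaplaceSmoothingProbability(
 *       new StringList("the", "cat"), vocabulary, 1.0d);
 *
 *   // all bigrams derivable from a token sequence
 *   Collection<StringList> bigrams = NGramUtils.getNGrams(new StringList("the", "cat", "sat"), 2);
 */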