// KeyPhraseExtractionAndSummary.java example
// (from the java_practical_semantic_web-master project)

package com.knowledgebooks.nlp;


import com.knowledgebooks.nlp.util.Document;
import com.knowledgebooks.public_domain.Stemmer;

import java.util.ArrayList;
import java.util.List;

/**
 * General NLP utilities for extracting key phrases from input text and for
 * generating short summaries of that text.
 *
 * <pre>
 *  Class to extract key-word based summarization from text. The algorithm is as follows:
 *
 *    1. find the most likely topic tags for the text
 *    2. locate words that contributed to forming these categories and
 *       set a weighting based on the relevance of the categories
 *    3. "smudge" out these weightings to surrounding words.
 *    4. using a threshold cutoff, locate summarization with high weightings
 * </pre>
 */

/**
 * Copyright Mark Watson 2008-2010. All Rights Reserved.
 * License: LGPL version 3 (http://www.gnu.org/licenses/lgpl-3.0.txt)
 */

public class KeyPhraseExtractionAndSummary {
  // Implementation overview:
  //   1. every word of the input is stemmed and handed to the AutoTagger, which
  //      returns one importance weight per stem;
  //   2. each sentence is scored from the weights of the words inside its
  //      boundaries, with penalties for short sentences;
  //   3. sentences scoring close to the best sentence are stored in the phrase
  //      list, after down-weighting common boilerplate (copyright notices etc.);
  //   4. if nothing was stored, a fallback pass scores sentences purely by their
  //      surface shape (length, character mix, first character).

  /** A sentence must beat this fraction of the best score to be considered at all. */
  private static final float CANDIDATE_CUTOFF = 0.75f;

  /** After boilerplate penalties, a sentence must still beat this fraction of the best score. */
  private static final float KEEP_CUTOFF = 0.6f;

  /** Minimum shape score a sentence needs in the fallback pass. */
  private static final float FALLBACK_MIN_SCORE = 5f;

  /**
   * Fallback scores are divided by this value before being stored. The original
   * author added this as a debugging aid so that fallback results are
   * recognizable by their tiny scores; it is kept because callers may rely on
   * the resulting score range.
   */
  private static final float FALLBACK_SCORE_DIVISOR = 25000f;

  private final PhraseList pl = new PhraseList();
  private final AutoTagger autoTagger = new AutoTagger();
  private final Document document;

  /** Never assigned anywhere in this class, so {@link #getKeyWords()} currently always returns null. */
  private String[] keyWords = null;

  /**
   * Analyzes {@code text} and builds the ranked phrase list that the accessor
   * methods and {@link #getSummary()} read from.
   *
   * @param text the input text to extract key phrases from and summarize
   */
  public KeyPhraseExtractionAndSummary(String text) {
    document = new Document(text);
    Stemmer stemmer = new Stemmer();

    int numWords = document.getNumWords();
    List<String> stems = new ArrayList<String>(numWords);
    for (int i = 0; i < numWords; i++) {
      stems.add(stemmer.stemOneWord(document.getWord(i)));
    }
    float[] weights = autoTagger.getWordImportanceWeights(stems);

    // NOTE(review): the boundary array is read BEFORE getTokens() is called and the
    // result of getTokens() is discarded. The original statement order is preserved
    // here because Document is not visible from this file - confirm whether
    // getTokens() has a side effect that this code depends on.
    int[] start = document.startSentenceBoundary;
    document.getTokens();
    if (start == null) {
      return; // no sentence information: leave the phrase list empty
    }
    int[] end = document.endSentenceBoundary;
    int numSentences = start.length;

    // Pass 1: score every sentence from the key word weights and remember the best score.
    float[] sentenceScores = new float[numSentences];
    float bestScore = 0f;
    for (int i = 0; i < numSentences; i++) {
      sentenceScores[i] = keywordScore(weights, start[i], end[i]);
      if (bestScore < sentenceScores[i]) {
        bestScore = sentenceScores[i];
      }
    }

    // Pass 2: store the higher ranked sentences in the phrase list.
    for (int i = 0; i < numSentences; i++) {
      if (sentenceScores[i] > CANDIDATE_CUTOFF * bestScore) {
        String s = document.getSentence(i);
        if (s == null) {
          continue; // the fallback pass below guards against null sentences as well
        }
        float score = applyBoilerplatePenalties(s, sentenceScores[i]);
        if (score > KEEP_CUTOFF * bestScore) {
          pl.addPhrase(s, score);
        }
      }
    }

    // Fallback: nothing was selected by key word weights, so rank sentences by shape.
    if (pl.getNumPhrases() == 0) {
      for (int i = 0; i < numSentences; i++) {
        String s = document.getSentence(i);
        if (s == null || s.length() == 0) {
          continue;
        }
        float score = shapeScore(s);
        if (score > FALLBACK_MIN_SCORE) {
          pl.addPhrase(s, score / FALLBACK_SCORE_DIVISOR);
        }
      }
    }

    // sort the phrase list, highest relevancy first:
    pl.sortPhrases();
  }

  /**
   * Sums the weights of the words from index {@code first} to {@code last}
   * (inclusive), normalizes by the span and penalizes short spans.
   *
   * @param weights per-word importance weights
   * @param first   index of the first word of the sentence
   * @param last    index of the last word of the sentence
   * @return the key word based score of the sentence
   */
  private static float keywordScore(float[] weights, int first, int last) {
    // Clamp to the weight array so that a boundary outside of it cannot raise
    // ArrayIndexOutOfBoundsException (presumably boundaries and weights share the
    // same indexing - TODO confirm against Document).
    int from = Math.max(first, 0);
    int to = Math.min(last, weights.length - 1);
    float score = 0f;
    for (int j = from; j <= to; j++) {
      score += weights[j];
    }
    // NOTE(review): the divisor is (last - first), one less than the number of
    // summed words; kept as in the original to leave existing scores unchanged.
    int span = last - first;
    if (span > 0) {
      score /= span;
    }
    if (span < 50) {
      score *= 0.65f; // penalize short segments
    }
    if (span < 25) {
      score *= 0.45f; // penalize very short segments
    }
    return score;
  }

  /**
   * Down-weights sentences that contain typical legal or attribution boilerplate.
   *
   * @param s     the sentence text (not null)
   * @param score the key word based score of the sentence
   * @return the score after all applicable penalties were multiplied in
   */
  private static float applyBoilerplatePenalties(String s, float score) {
    if (s.contains("without written permission")) {
      score *= 0.25f;
    }
    if (s.contains("from the Associated Press")) {
      score *= 0.25f;
    }
    if (s.contains("Copyright")) {
      score *= 0.8f;
    }
    if (s.startsWith("Copyright")) {
      score *= 0.3f;
    }
    return score;
  }

  /**
   * Scores a sentence only by its surface shape: the balance of letters/digits
   * against other characters, its length, its first character and a few
   * boilerplate markers.
   *
   * @param s the sentence text (not null, not empty)
   * @return the shape score; higher means more likely a useful sentence
   */
  private static float shapeScore(String s) {
    int length = s.length();
    float charBalance = 0;
    for (int j = 0; j < length; j++) {
      charBalance += Character.isLetterOrDigit(s.charAt(j)) ? 1 : -1;
    }
    float score = charBalance * 0.1f;

    // Lower-case once with a fixed locale so the ASCII markers match on every platform.
    String lower = s.toLowerCase(java.util.Locale.ENGLISH);
    if (lower.contains("copyright")) {
      score -= 3;
    }
    // The marker must be lower case too, otherwise it can never match the lower-cased text.
    if (lower.contains("the information contained in")) {
      score -= 7;
    }
    if (length < 40) {
      score -= 3;
    }
    if (length > 200) {
      score -= 2;
    }
    char firstChar = s.charAt(0);
    if (!Character.isLetterOrDigit(firstChar)) {
      score -= 4;
    }
    if (Character.isLowerCase(firstChar)) {
      score -= 5;
    }
    if (s.startsWith("By ")) {
      score -= 1;
    }
    return score;
  }

  /**
   * @return the key word array; this class never assigns it, so the result is currently always null
   */
  public String[] getKeyWords() {
    return keyWords;
  }

  /**
   * @return the number of phrases stored in the phrase list
   */
  public int getNumPhrases() {
    return pl.getNumPhrases();
  }

  /**
   * @param index position in the phrase list
   * @return the score of the phrase at {@code index}
   */
  public float getScore(int index) {
    return pl.getScore(index);
  }

  /**
   * @param index position in the phrase list
   * @return the text of the phrase at {@code index}
   */
  public String getPhrase(int index) {
    return pl.getPhrase(index);
  }

  /**
   * @return the internal phrase list itself (not a copy)
   */
  public PhraseList getAllPhrases() {
    return pl;
  }

  /**
   * Builds a short summary from the first phrases of the phrase list.
   *
   * @return the first phrase if there is exactly one, the first two phrases
   *         joined by two spaces if there are more, or an empty string if the
   *         phrase list is empty; the result is trimmed
   */
  public String getSummary() {
    int numPhrases = pl.size();
    if (numPhrases == 0) {
      return ""; // nothing was extracted, e.g. no sentence boundaries were found
    }
    if (numPhrases == 1) {
      return pl.getPhrase(0).trim();
    }
    return (pl.getPhrase(0) + "  " + pl.getPhrase(1)).trim();
  }

  /**
   * Small manual test: prints every extracted phrase with its score, then the summary.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    String s = "Sales of 10 cotton cloth and raw silk cocoons  are down in England and France due to competition from India. Cotton is easy to wash. President Bush, wearing a Strouds shirt, and Congress are concerned about US cotton and riso and Riso sales. Airline traffic is down this year.";
    KeyPhraseExtractionAndSummary e = new KeyPhraseExtractionAndSummary(s);
    int num = e.getNumPhrases();
    for (int i = 0; i < num; i++) {
      System.out.println("" + e.getScore(i) + " : " + e.getPhrase(i));
    }
    System.out.println("\nSummary:\n" + e.getSummary());
  }
}