package com.knowledgebooks.nlp;

import com.knowledgebooks.nlp.util.Document;
import com.knowledgebooks.public_domain.Stemmer;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/*
 * Copyright Mark Watson 2008-2010. All Rights Reserved.
 * License: LGPL version 3 (http://www.gnu.org/licenses/lgpl-3.0.txt)
 */

/**
 * General NLP utilities for extracting key phrases from input text and for
 * generating short summaries of that text.
 *
 * <pre>
 * Class to extract key-word based summarization from text. The algorithm is as follows:
 *
 * 1. find the most likely topic tags for the text
 * 2. locate words that contributed to forming these categories and
 *    set a weighting based on the relevance of the categories
 * 3. "smudge" out these weightings to surrounding words.
 * 4. using a threshold cutoff, locate summarization with high weightings
 * </pre>
 *
 * <p>In this class the per-word weights come from
 * {@code AutoTagger.getWordImportanceWeights}; steps 1-3 presumably happen
 * there (NOTE(review): confirm against AutoTagger). This class averages the
 * weights per sentence, applies length and boilerplate penalties, and keeps the
 * sentences that score close to the best one. If that yields nothing, sentences
 * are ranked by simple surface features instead.
 */
public class KeyPhraseExtractionAndSummary {

  /** Ranked sentences selected as key phrases; filled by the constructor. */
  private final PhraseList pl = new PhraseList();

  // (defun get-key-summarization (word-vector key-word-rankings &aux x y z v (ret '()))
  // NOTE(review): the line above looks like a leftover signature from a Lisp version.

  /** Tokenized form of the input text. */
  private final Document document;

  /** Never assigned anywhere in this class, so {@link #getKeyWords()} returns {@code null}. */
  private String[] keyWords = null;

  /** Supplies the per-word importance weights used for sentence ranking. */
  private final AutoTagger autoTagger = new AutoTagger();

  /**
   * Analyzes {@code text} and builds the ranked phrase list.
   *
   * <p>If the document exposes no sentence boundaries the phrase list is left
   * empty (and unsorted).
   *
   * @param text the text to extract key phrases and a summary from
   */
  public KeyPhraseExtractionAndSummary(String text) {
    document = new Document(text);

    Stemmer stemmer = new Stemmer();
    int numWords = document.getNumWords();
    List<String> stems = new ArrayList<String>(numWords);
    for (int i = 0; i < numWords; i++) {
      stems.add(stemmer.stemOneWord(document.getWord(i)));
    }
    float[] weights = autoTagger.getWordImportanceWeights(stems);

    int[] start = document.startSentenceBoundary;
    // Result is unused; the call is kept in case it has side effects inside Document.
    document.getTokens();
    if (start == null) {
      return;
    }
    int[] end = document.endSentenceBoundary;
    int sentenceCount = start.length;

    addPhrasesByWordWeights(weights, start, end);

    // No sentence qualified via word weights: fall back to surface-feature ranking.
    if (pl.getNumPhrases() == 0) {
      addPhrasesBySurfaceFeatures(sentenceCount);
    }

    // sort the phrase list, highest relevancy first:
    pl.sortPhrases();
  }

  /**
   * Scores every sentence by its averaged word weights and adds the ones that
   * score close to the best sentence to the phrase list.
   */
  private void addPhrasesByWordWeights(float[] weights, int[] start, int[] end) {
    int sentenceCount = start.length;
    float[] sentenceScores = new float[sentenceCount];
    float best = 0f;

    for (int i = 0; i < sentenceCount; i++) {
      int span = end[i] - start[i];
      for (int j = start[i]; j <= end[i]; j++) {
        sentenceScores[i] += weights[j];
      }
      // NOTE(review): span + 1 weights are summed but the divisor is span; kept as in
      // the original since the thresholds below were presumably tuned against it.
      if (span > 0) {
        sentenceScores[i] /= span;
      }
      if (span < 50) {
        sentenceScores[i] *= 0.65f; // penalize short segments
      }
      if (span < 25) {
        sentenceScores[i] *= 0.45f; // penalize very short segments
      }
      if (best < sentenceScores[i]) {
        best = sentenceScores[i];
      }
    }

    // store higher ranked sentences in the phrase list:
    for (int i = 0; i < sentenceCount; i++) {
      if (!(sentenceScores[i] > 0.75f * best)) {
        continue;
      }
      String sentence = document.getSentence(i);
      if (sentence == null) {
        continue; // same guard the fallback ranking applies
      }
      // Down-weight legal / attribution boilerplate.
      if (sentence.contains("without written permission")) {
        sentenceScores[i] *= 0.25f;
      }
      if (sentence.contains("from the Associated Press")) {
        sentenceScores[i] *= 0.25f;
      }
      if (sentence.contains("Copyright")) {
        sentenceScores[i] *= 0.8f;
      }
      if (sentence.startsWith("Copyright")) {
        sentenceScores[i] *= 0.3f;
      }
      // Re-check against a looser cutoff after the penalties were applied.
      if (sentenceScores[i] > 0.6f * best) {
        pl.addPhrase(sentence, sentenceScores[i]);
      }
    }
  }

  /**
   * Fallback ranking used when word weights selected nothing: scores each
   * sentence from its character make-up, length and first character, and adds
   * sentences scoring above 5 to the phrase list.
   */
  private void addPhrasesBySurfaceFeatures(int sentenceCount) {
    for (int i = 0; i < sentenceCount; i++) {
      String sentence = document.getSentence(i);
      if (sentence == null || sentence.length() == 0) {
        continue;
      }

      int length = sentence.length();
      // +1 for every letter or digit, -1 for every other character.
      float charBalance = 0;
      for (int j = 0; j < length; j++) {
        if (Character.isLetterOrDigit(sentence.charAt(j))) {
          charBalance += 1;
        } else {
          charBalance -= 1;
        }
      }
      float score = charBalance * 0.1f;

      // Locale.ROOT keeps the lower-casing independent of the default locale.
      String lower = sentence.toLowerCase(Locale.ROOT);
      if (lower.contains("copyright")) {
        score -= 3;
      }
      // The needle must be lower case too; the original mixed-case literal never matched.
      if (lower.contains("the information contained in")) {
        score -= 7;
      }
      if (length < 40) {
        score -= 3;
      }
      if (length > 200) {
        score -= 2;
      }

      char first = sentence.charAt(0);
      if (!Character.isLetterOrDigit(first)) {
        score -= 4;
      }
      if (Character.isLowerCase(first)) {
        score -= 5;
      }
      if (sentence.startsWith("By ")) {
        score -= 1;
      }

      if (score > 5) {
        // FOR DEBUG: bias score so I know this option used
        pl.addPhrase(sentence, (score / 25000));
      }
    }
  }

  /**
   * @return the key words; always {@code null} here because nothing in this class
   *     assigns the backing field
   */
  public String[] getKeyWords() {
    return keyWords;
  }

  /** @return the number of phrases in the phrase list */
  public int getNumPhrases() {
    return pl.getNumPhrases();
  }

  /**
   * @param index position in the phrase list
   * @return the score stored for the phrase at {@code index}
   */
  public float getScore(int index) {
    return pl.getScore(index);
  }

  /**
   * @param index position in the phrase list
   * @return the phrase text at {@code index}
   */
  public String getPhrase(int index) {
    return pl.getPhrase(index);
  }

  /** @return the underlying phrase list itself (not a copy) */
  public PhraseList getAllPhrases() {
    return pl;
  }

  /**
   * Builds a short summary from the first one or two phrases of the phrase list.
   *
   * @return the first phrase, or the first two joined by a space, trimmed; the
   *     empty string when no phrase was selected
   */
  public String getSummary() {
    int count = pl.size();
    if (count == 0) {
      // The constructor can legitimately leave the list empty.
      return "";
    }
    if (count == 1) {
      return pl.getPhrase(0).trim();
    }
    return (pl.getPhrase(0) + " " + pl.getPhrase(1)).trim();
  }

  /**
   * Small manual test: prints every ranked phrase with its score, then the summary.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    String s = "Sales of 10 cotton cloth and raw silk cocoons are down in England and France due to competition from India. Cotton is easy to wash. President Bush, wearing a Strouds shirt, and Congress are concerned about US cotton and riso and Riso sales. Airline traffic is down this year.";
    KeyPhraseExtractionAndSummary e = new KeyPhraseExtractionAndSummary(s);
    int num = e.getNumPhrases();
    for (int i = 0; i < num; i++) {
      System.out.println("" + e.getScore(i) + " : " + e.getPhrase(i));
    }
    System.out.println("\nSummary:\n" + e.getSummary());
  }
}