/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.summary.basic;

// JDK imports
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.Vector;

// Hadoop imports
import org.apache.hadoop.conf.Configuration;

// Lucene imports
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

// Nutch imports
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Summarizer;
import org.apache.nutch.searcher.Summary;
import org.apache.nutch.searcher.Summary.Ellipsis;
import org.apache.nutch.searcher.Summary.Fragment;
import org.apache.nutch.searcher.Summary.Highlight;
import org.apache.nutch.util.NutchConfiguration;

/**
 * Implements hit summarization.
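 *
 * <p>A minimal usage sketch (hypothetical values: {@code documentText} stands
 * in for the raw text to summarize, and the query string is only an example):
 *
 * <pre>
 *   Configuration conf = NutchConfiguration.create();
 *   BasicSummarizer summarizer = new BasicSummarizer();
 *   summarizer.setConf(conf);
 *   Query query = Query.parse("apache nutch", conf);
 *   Summary summary = summarizer.getSummary(documentText, query);
 *   System.out.println(summary);
 * </pre>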
 */
public class BasicSummarizer implements Summarizer {
  
  private int sumContext = 5;
  private int sumLength = 20;
  private Analyzer analyzer = null;
  private Configuration conf = null;
  
  /** Orders excerpts by their position in the document. */
  private final static Comparator ORDER_COMPARATOR = new Comparator() {
    public int compare(Object o1, Object o2) {
      return ((Excerpt) o1).getOrder() - ((Excerpt) o2).getOrder();
    }
  };
  
  /**
   * Orders excerpts by unique-token count, breaking ties on fragment count
   * (ascending, so the highest-scoring excerpts sort last).
   */
  private final static Comparator SCORE_COMPARATOR = new Comparator() {
    public int compare(Object o1, Object o2) {
      Excerpt excerpt1 = (Excerpt) o1;
      Excerpt excerpt2 = (Excerpt) o2;
      
      if (excerpt1 == null && excerpt2 != null) {
        return -1;
      } else if (excerpt1 != null && excerpt2 == null) {
        return 1;
      } else if (excerpt1 == null && excerpt2 == null) {
        return 0;
      }
      
      int numToks1 = excerpt1.numUniqueTokens();
      int numToks2 = excerpt2.numUniqueTokens();
      
      if (numToks1 < numToks2) {
        return -1;
      } else if (numToks1 == numToks2) {
        return excerpt1.numFragments() - excerpt2.numFragments();
      } else {
        return 1;
      }
    }
  };
  
  
  public BasicSummarizer() { }
  
  private BasicSummarizer(Configuration conf) {
    setConf(conf);
  }
  
  
  /* ----------------------------- *
   * <implementation:Configurable> *
   * ----------------------------- */
  
  public Configuration getConf() {
    return conf;
  }
  
  public void setConf(Configuration conf) {
    this.conf = conf;
    this.analyzer = new NutchDocumentAnalyzer(conf);
    this.sumContext = conf.getInt("searcher.summary.context", 5);
    this.sumLength = conf.getInt("searcher.summary.length", 20);
  }
  
  /* ------------------------------ *
   * </implementation:Configurable> *
   * ------------------------------ */
  
  
  /* --------------------------- *
   * <implementation:Summarizer> *
   * --------------------------- */
  
  public Summary getSummary(String text, Query query) {
    
    // Simplistic implementation.  Finds the first fragments in the document
    // containing any query terms.
    //
    // TODO: check that phrases in the query are matched in the fragment
    
    Token[] tokens = getTokens(text);   // parse text to token array
    
    if (tokens.length == 0)
      return new Summary();
    
    String[] terms = query.getTerms();
    HashSet highlight = new HashSet();  // put query terms in table
    for (int i = 0; i < terms.length; i++)
      highlight.add(terms[i]);
    
    // A list to store the document's excerpts.
    // (An excerpt is a Vector full of Fragments and Highlights.)
    List excerpts = new ArrayList();
    
    //
    // Iterate through all terms in the document
    //
    int lastExcerptPos = 0;
    for (int i = 0; i < tokens.length; i++) {
      //
      // If we find a term that's in the query...
      //
      if (highlight.contains(tokens[i].termText())) {
        //
        // Start searching at a point SUM_CONTEXT terms back,
        // and move SUM_CONTEXT terms into the future.
        //
        int startToken = (i > sumContext) ? i - sumContext : 0;
        int endToken = Math.min(i + sumContext, tokens.length);
        int offset = tokens[startToken].startOffset();
        int j = startToken;
        
        //
        // Iterate from the start point to the finish, adding
        // terms all the way.  The end of the passage is always
        // SUM_CONTEXT beyond the last query-term.
        //
        Excerpt excerpt = new Excerpt(i);
        if (i != 0) {
          excerpt.add(new Summary.Ellipsis());
        }
        
        //
        // Iterate through as long as we're before the end of
        // the document and we haven't hit the maximum number
        // of items in a summary.
        //
        while ((j < endToken) && (j - startToken < sumLength)) {
          //
          // Now grab the hit-element, if present
          //
          Token t = tokens[j];
          if (highlight.contains(t.termText())) {
            excerpt.addToken(t.termText());
            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
            excerpt.add(new Highlight(text.substring(t.startOffset(), t.endOffset())));
            offset = t.endOffset();
            endToken = Math.min(j + sumContext, tokens.length);
          }
          
          j++;
        }
        
        lastExcerptPos = endToken;
        
        //
        // We found the series of search-term hits and added
        // them (with intervening text) to the excerpt.  Now
        // we need to add the trailing edge of text.
        //
        // So if (j < tokens.length) then there is still trailing
        // text to add.  (We haven't hit the end of the source doc.)
        // Add the words since the last hit-term insert.
        //
        if (j < tokens.length) {
          excerpt.add(new Fragment(text.substring(offset, tokens[j].endOffset())));
        }
        
        //
        // Remember how many terms are in this excerpt
        //
        excerpt.setNumTerms(j - startToken);
        
        //
        // Store the excerpt for later sorting
        //
        excerpts.add(excerpt);
        
        //
        // Skip ahead: the next search for relevant excerpts
        // resumes SUM_CONTEXT terms past the end of this one,
        // at j + SUM_CONTEXT.
        //
        i = j + sumContext;
      }
    }
    
    // Sort the excerpts based on their score (best excerpts sort last)
    Collections.sort(excerpts, SCORE_COMPARATOR);
    
    //
    // If none of the query terms appear in the document, then we
    // just excerpt the first SUM_LENGTH words from the document.
    //
    if (excerpts.size() == 0) {
      Excerpt excerpt = new Excerpt(0);
      int excerptLen = Math.min(sumLength, tokens.length);
      lastExcerptPos = excerptLen;
      
      excerpt.add(new Fragment(text.substring(tokens[0].startOffset(),
                                              tokens[excerptLen - 1].startOffset())));
      excerpt.setNumTerms(excerptLen);
      excerpts.add(excerpt);
    }
    
    //
    // Now choose the best items from the excerpt set.
    // Stop when we have enough excerpts to build our Summary.
    //
    double tokenCount = 0;
    int numExcerpt = excerpts.size() - 1;
    List bestExcerpts = new ArrayList();
    while (tokenCount <= sumLength && numExcerpt >= 0) {
      Excerpt excerpt = (Excerpt) excerpts.get(numExcerpt--);
      bestExcerpts.add(excerpt);
      tokenCount += excerpt.getNumTerms();
    }
    
    // Sort the best excerpts based on their natural (document) order
    Collections.sort(bestExcerpts, ORDER_COMPARATOR);
    
    //
    // Now build our Summary from the best excerpts.
    //
    tokenCount = 0;
    numExcerpt = 0;
    Summary s = new Summary();
    while (tokenCount <= sumLength && numExcerpt < bestExcerpts.size()) {
      Excerpt excerpt = (Excerpt) bestExcerpts.get(numExcerpt++);
      double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
      for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
        Fragment f = (Fragment) e.nextElement();
        // Don't add fragments if it takes us over the max-limit
        if (tokenCount + tokenFraction <= sumLength) {
          s.add(f);
        }
        tokenCount += tokenFraction;
      }
    }
    
    // Append a trailing ellipsis if the summary stops before the
    // end of the document.
    if (tokenCount > 0 && lastExcerptPos < tokens.length)
      s.add(new Ellipsis());
    
    return s;
  }
  
  /* ---------------------------- *
   * </implementation:Summarizer> *
   * ---------------------------- */
  
  
  /** Maximum number of tokens to inspect when summarizing a document. */
  private static final int token_deep = 2000;
  
  /**
   * Class Excerpt represents a single passage found in the document, with
   * some appropriate regions highlighted.
   */
  class Excerpt {
    Vector passages = new Vector();
    SortedSet tokenSet = new TreeSet();
    int numTerms = 0;
    int order = 0;
    
    /** Creates an excerpt that remembers its position in the document. */
    public Excerpt(int order) {
      this.order = order;
    }
    
    /** Records a query token matched in this excerpt. */
    public void addToken(String token) {
      tokenSet.add(token);
    }
    
    /**
     * Return how many unique tokens we have.
     */
    public int numUniqueTokens() {
      return tokenSet.size();
    }
    
    /**
     * How many fragments we have.
     */
    public int numFragments() {
      return passages.size();
    }
    
    public void setNumTerms(int numTerms) {
      this.numTerms = numTerms;
    }
    
    public int getOrder() {
      return order;
    }
    
    public int getNumTerms() {
      return numTerms;
    }
    
    /**
     * Add a fragment to the list.
     */
    public void add(Fragment fragment) {
      passages.add(fragment);
    }
    
    /**
     * Return an Enumeration over all the fragments.
     */
    public Enumeration elements() {
      return passages.elements();
    }
  }
  
  private Token[] getTokens(String text) {
    ArrayList result = new ArrayList();
    TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
    Token token = null;
    // Stop once token_deep tokens have been collected, so very long
    // documents don't dominate summarization time.
    while (result.size() < token_deep) {
      try {
        token = ts.next();
      } catch (IOException e) {
        token = null;
      }
      if (token == null) {
        break;
      }
      result.add(token);
    }
    try {
      ts.close();
    } catch (IOException e) {
      // ignore
    }
    return (Token[]) result.toArray(new Token[result.size()]);
  }
  
  /**
   * Tests Summary-generation.  User inputs the name of a
   * text file and a query string.
   */
  public static void main(String argv[]) throws IOException {
    
    // Test arglist
    if (argv.length < 2) {
      System.out.println("Usage: java org.apache.nutch.summary.basic.BasicSummarizer <textfile> <queryStr>");
      return;
    }
    
    Configuration conf = NutchConfiguration.create();
    Summarizer s = new BasicSummarizer(conf);
    
    //
    // Parse the args
    //
    File textFile = new File(argv[0]);
    StringBuffer queryBuf = new StringBuffer();
    for (int i = 1; i < argv.length; i++) {
      queryBuf.append(argv[i]);
      queryBuf.append(" ");
    }
    
    //
    // Load the text file into a single string.
    //
    StringBuffer body = new StringBuffer();
    BufferedReader in = new BufferedReader(new FileReader(textFile));
    try {
      System.out.println("About to read " + textFile);
      String str = in.readLine();
      while (str != null) {
        body.append(str);
        body.append(' ');  // preserve a word boundary at each line break
        str = in.readLine();
      }
    } finally {
      in.close();
    }
    
    // Convert the query string into a proper Query
    Query query = Query.parse(queryBuf.toString(), conf);
    System.out.println("Summary: '" + s.getSummary(body.toString(), query) + "'");
  }
}