package info.ephyra.search.searchers;

import info.ephyra.querygeneration.Query;
import info.ephyra.search.Result;
import info.ephyra.search.Search;
import info.ephyra.util.HTMLConverter;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * <p>A <code>KnowledgeMiner</code> deploys a document retrieval system to
 * search an unstructured knowledge source, e.g. Google to search the World Wide
 * Web.</p>
 *
 * <p>It runs as a separate thread, so several queries can be performed in
 * parallel.</p>
 *
 * <p>This class extends the class <code>Searcher</code> and is abstract.</p>
 *
 * @author Nico Schlaefer
 * @version 2007-05-29
 */
public abstract class KnowledgeMiner extends Searcher {
    /** Matches the literal "..." that separates fragments in HTML snippets. */
    private static final Pattern ELLIPSIS = Pattern.compile("\\.\\.\\.");

    /** Matches a run of one or more whitespace characters. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s++");

    /** The hit position of the first result to be fetched. */
    protected int firstResult;

    /** The maximum number of results to be fetched. */
    protected int maxResults;

    /**
     * Returns the maximum total number of search results.
     *
     * @return maximum total number of search results
     */
    protected abstract int getMaxResultsTotal();

    /**
     * Returns the maximum number of search results per query.
     *
     * @return maximum number of search results per query
     */
    protected abstract int getMaxResultsPerQuery();

    /**
     * Creates <code>Result</code> objects from an array of text passages and
     * document IDs. No cache IDs are available, so every result gets a
     * <code>null</code> cache ID.
     *
     * @param passages text passages
     * @param docIDs IDs of the documents the text passages are from
     * @param isHtml flag indicating that the passages are HTML code
     * @return <code>Result</code> objects
     */
    protected Result[] getResults(String[] passages, String[] docIDs,
            boolean isHtml) {
        return getResults(passages, docIDs, new String[docIDs.length], isHtml);
    }

    /**
     * Creates <code>Result</code> objects from an array of text passages,
     * document IDs and IDs of cached documents.
     *
     * <p>HTML passages are converted to plain text and split around "...",
     * so one passage may yield several results; plain-text passages only have
     * their whitespace runs collapsed to single blanks. <code>null</code>
     * passages and fragments that are empty after trimming are skipped. The
     * arrays passed in are not modified.</p>
     *
     * @param passages text passages
     * @param docIDs IDs of the documents the text passages are from
     * @param cacheIDs IDs of the documents in the search engine cache
     * @param isHtml flag indicating that the passages are HTML code
     * @return <code>Result</code> objects
     */
    protected Result[] getResults(String[] passages, String[] docIDs,
            String[] cacheIDs, boolean isHtml) {
        List<Result> results = new ArrayList<Result>();

        for (int i = 0; i < passages.length; i++) {
            String raw = passages[i];
            if (raw == null) {
                continue;
            }

            String[] fragments;
            if (isHtml) {
                // convert to plain text and split around "..."
                fragments = ELLIPSIS.split(HTMLConverter.htmlsnippet2text(raw));
            } else {
                // replace sequences of whitespaces by single blanks
                fragments = new String[] {
                    WHITESPACE_RUN.matcher(raw).replaceAll(" ")
                };
            }

            for (String fragment : fragments) {
                String passage = fragment.trim();
                if (passage.length() == 0) {
                    continue;
                }

                // all fragments of passage i share the same hit position
                Result result = new Result(passage, query, docIDs[i],
                        i + firstResult - 1);
                result.setCacheID(cacheIDs[i]);
                // result is never returned by the QA engine but is only
                // used to derive other results
                result.setScore(Float.NEGATIVE_INFINITY);
                results.add(result);
            }
        }

        return results.toArray(new Result[results.size()]);
    }

    /**
     * <p>Sets the query, the hit position of the first result and the number of
     * results to be fetched and starts the thread.</p>
     *
     * <p>The number of results to fetch is the per-query maximum, capped so
     * that the total maximum is not exceeded.</p>
     *
     * <p>This method should be used instead of the inherited
     * <code>start()</code> method without arguments.</p>
     *
     * @param query <code>Query</code> object
     * @param firstResult hit position of the first result
     */
    protected void start(Query query, int firstResult) {
        this.query = query;
        this.firstResult = firstResult;
        this.maxResults = Math.min(getMaxResultsPerQuery(),
                getMaxResultsTotal() - firstResult + 1);

        // wait until there are fewer than MAX_PENDING pending queries
        Search.waitForPending();

        start();

        // one more pending query
        Search.incPending();
    }

    /**
     * <p>Returns a new instance of the <code>KnowledgeMiner</code>. A new
     * instance is created for each query.</p>
     *
     * <p>It does not necessarily return an exact copy of the current
     * instance.</p>
     *
     * @return new instance of the <code>KnowledgeMiner</code>
     */
    public abstract KnowledgeMiner getCopy();

    /**
     * <p>Creates one thread per page of results, i.e. the ceiling of
     * <code>MAX_RESULTS_TOTAL / MAX_RESULTS_PERQUERY</code> threads, that
     * together fetch up to <code>MAX_RESULTS_TOTAL</code> results. Each thread
     * is a fresh instance obtained from <code>getCopy()</code>.</p>
     *
     * <p>This method should be used instead of the inherited
     * <code>start()</code> method without arguments.</p>
     *
     * @param query <code>Query</code> object
     * @throws IllegalStateException if the maximum number of results per query
     *         is not positive while the total maximum is, since the hit
     *         position could then never advance
     */
    public void start(Query query) {
        int total = getMaxResultsTotal();
        int perQuery = getMaxResultsPerQuery();

        if (total < 1) {
            // nothing to fetch
            return;
        }
        if (perQuery < 1) {
            // a non-positive step would spawn threads forever
            throw new IllegalStateException(
                    "Maximum number of results per query must be positive: "
                    + perQuery);
        }

        // count in a long so that the position cannot wrap around when the
        // total is close to Integer.MAX_VALUE
        for (long first = 1; first <= total; first += perQuery) {
            getCopy().start(query, (int) first);
        }
    }
}