package focusedCrawler.seedfinder;

import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;

import focusedCrawler.crawler.async.HttpDownloader;
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.target.classifier.TargetClassifier;
import focusedCrawler.target.classifier.TargetRelevance;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.ParsedData;
import focusedCrawler.util.parser.BackLinkNeighborhood;
import focusedCrawler.util.parser.PaginaURL;

/**
 * Submits a query to a search engine, downloads the returned pages, and
 * classifies each one as relevant or irrelevant using the configured
 * {@link TargetClassifier}. Search-result pages are processed one at a time
 * until the precision drops below the configured minimum or no new documents
 * are returned.
 */
public class QueryProcessor {

    private int maxNumberOfIterations = 10;
    private double minimumPrecision = 0.25;
    private TargetClassifier classifier;
    private HttpDownloader downloader = new HttpDownloader();
    private SearchEngineApi searchEngine;

    private Set<String> usedUrls = new HashSet<>();

    public QueryProcessor(int maxNumberOfIterations, double minimumPrecision,
                          TargetClassifier classifier, SearchEngineApi searchEngine) {
        this.maxNumberOfIterations = maxNumberOfIterations;
        this.minimumPrecision = minimumPrecision;
        this.classifier = classifier;
        this.searchEngine = searchEngine;
    }

    public QueryResult processQuery(Query query) throws Exception {
        QueryResult searchResult = new QueryResult(0d);
        for (int i = 0; i < maxNumberOfIterations; i++) {
            System.out.println("Search page " + i);
            QueryResult result = processSingleQuery(query, i);
            searchResult.positivePages.addAll(result.positivePages);
            searchResult.negativePages.addAll(result.negativePages);
            // Stop paginating when the results become too imprecise or dry up
            if (result.precision() < minimumPrecision || result.getTotalNumOfDocs() == 0) {
                break;
            }
        }
        return searchResult;
    }

    public QueryResult processSingleQuery(Query query, int searchPage) throws Exception {

        List<BackLinkNeighborhood> searchResults = searchEngine.submitQuery(query.asString(), searchPage);
        List<BackLinkNeighborhood> unseenSearchResults = filterUsedUrls(searchResults);
        if (unseenSearchResults == null || unseenSearchResults.isEmpty()) {
            return new QueryResult(0d);
        }
        System.out.println("Unseen hits: " + unseenSearchResults.size());

        System.out.println("\nFetching " + unseenSearchResults.size() + " pages...");
        List<FetchedResult> fetchedPages = fetchPages(unseenSearchResults);
        if (fetchedPages == null || fetchedPages.isEmpty()) {
            return new QueryResult(0d);
        }
        System.out.println("\nFetched " + fetchedPages.size() + " pages.");

        System.out.println("\nProcessing page content...");
        QueryResult result = new QueryResult();
        if (!searchResults.isEmpty()) {
            // Use floating-point division; integer division would truncate to zero
            result.percentNewResults = (double) unseenSearchResults.size() / searchResults.size();
        } else {
            result.percentNewResults = 0;
        }

        for (FetchedResult fetchedResult : fetchedPages) {
            if (fetchedResult == null) {
                continue;
            }
            URL url = new URL(fetchedResult.getBaseUrl());
            String contentAsString = new String(fetchedResult.getContent());

            Page page = new Page(url, contentAsString);
            page.setParsedData(new ParsedData(new PaginaURL(page)));

            TargetRelevance relevance = classifier.classify(page);
            page.setTargetRelevance(relevance);

            if (relevance.isRelevant()) {
                result.positivePages.add(page);
            } else {
                result.negativePages.add(page);
            }
            System.out.println((relevance.isRelevant() ? " relevant -> " : "irrelevant -> ") + url);
        }

        return result;
    }

    private List<FetchedResult> fetchPages(List<BackLinkNeighborhood> newSearchResults)
            throws InterruptedException, ExecutionException {

        if (newSearchResults == null || newSearchResults.isEmpty()) {
            return new ArrayList<>();
        }

        // Dispatch all downloads asynchronously, then wait for each result
        List<Future<FetchedResult>> futures = new ArrayList<>();
        for (BackLinkNeighborhood result : newSearchResults) {
            try {
                futures.add(downloader.dipatchDownload(result.getLink()));
            } catch (IllegalArgumentException e) {
                // invalid URL, just continue to remaining URLs...
                System.out.println("Failed to dispatch download for: " + result.getLink());
            }
        }

        List<FetchedResult> fetchedPages = new ArrayList<>();
        for (Future<FetchedResult> future : futures) {
            FetchedResult fetchedResult = future.get();
            if (fetchedResult != null) {
                fetchedPages.add(fetchedResult);
            }
        }
        return fetchedPages;
    }

    private List<BackLinkNeighborhood> filterUsedUrls(List<BackLinkNeighborhood> searchResults) {
        if (searchResults == null || searchResults.isEmpty()) {
            return null;
        }
        List<BackLinkNeighborhood> filteredResult = new ArrayList<>();
        for (BackLinkNeighborhood link : searchResults) {
            // Keep only URLs not seen in previous queries and remember them
            if (!usedUrls.contains(link.getLink())) {
                filteredResult.add(link);
                usedUrls.add(link.getLink());
            }
        }
        return filteredResult;
    }

    public void close() {
        downloader.close();
    }

    public static class QueryResult {

        List<Page> positivePages = new ArrayList<>();
        List<Page> negativePages = new ArrayList<>();
        double percentNewResults;

        public QueryResult() {
        }

        public QueryResult(double percentNewResults) {
            this.percentNewResults = percentNewResults;
        }

        int getTotalNumOfDocs() {
            return positivePages.size() + negativePages.size();
        }

        double precision() {
            if (getTotalNumOfDocs() == 0) {
                return 0d;
            } else {
                return positivePages.size() / (double) getTotalNumOfDocs();
            }
        }
    }

}