package focusedCrawler.seedfinder; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import org.apache.commons.validator.routines.UrlValidator; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import focusedCrawler.crawler.crawlercommons.fetcher.BaseFetchException; import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult; import focusedCrawler.crawler.crawlercommons.fetcher.http.SimpleHttpFetcher; import focusedCrawler.util.TimeDelay; import focusedCrawler.util.parser.BackLinkNeighborhood; public class BingSearch implements SearchEngineApi { private final SimpleHttpFetcher fetcher; private int docsPerPage = 10; private UrlValidator urlValidator = new UrlValidator(); private TimeDelay timer = new TimeDelay(5000); public BingSearch(SimpleHttpFetcher fetcher) { this.fetcher = fetcher; } public List<BackLinkNeighborhood> submitQuery(String query, int page) throws IOException { timer.waitMinimumDelayIfNecesary(); // 21 -> max number allowed by google... decreases after String queryUrl = "https://www.bing.com/search?q=" + query + "&count="+docsPerPage + "&first="+(page*docsPerPage+1)+"&FORM=PORE"; try { FetchedResult result = fetcher.get(queryUrl); InputStream is = new ByteArrayInputStream(result.getContent()); Document doc = Jsoup.parse(is, "UTF-8", query); is.close(); Elements searchItems = doc.select("ol#b_results"); Elements linkHeaders = searchItems.select("h2"); Elements linksUrl = linkHeaders.select("a"); List<BackLinkNeighborhood> links = new ArrayList<>(); for (Element link : linksUrl) { String linkStr = link.attr("href"); if(urlValidator.isValid(linkStr)) { BackLinkNeighborhood bl = new BackLinkNeighborhood(); bl.setLink(linkStr); bl.setTitle(link.text()); links.add(bl); } } System.out.println(getClass().getSimpleName()+" hits: "+links.size()); return links; } catch (IOException | BaseFetchException e) { throw new IOException("Failed to download backlinks from Google.", e); } } }