package com.sample.crawler;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * A basic implementation of a web crawler that uses an in-memory map to
 * keep track of pages that have already been crawled.
 *
 * @author saifasif
 */
public class Crawler {

    public static void main(String[] args) {
        processPage("http://www.mit.edu");
    }

    public static void processPage(String url) {
        // Skip the page if it has already been crawled.
        if (ContentList.isContentInMap(url)) {
            return;
        }

        try {
            Document doc = Jsoup.connect(url).timeout(5000).get();

            // Record the page as crawled before inspecting it, so that pages
            // without a match are not revisited on every cycle in the link
            // graph (which would make the recursion non-terminating).
            ContentList.insertKey(url, url);

            // Report pages whose text matches the search term.
            if (doc.text().contains("research")) {
                System.out.println(url);
            }

            // Collect all links on the page and recursively crawl those that
            // stay within the mit.edu domain. Note that very deep link chains
            // can still exhaust the call stack; an explicit work queue would
            // avoid that.
            Elements links = doc.select("a[href]");
            for (Element link : links) {
                if (link.attr("href").contains("mit.edu")) {
                    processPage(link.attr("abs:href"));
                }
            }
        } catch (Exception e) {
            // Malformed URLs, timeouts, and non-HTML responses are skipped.
            System.out.println("skipping .... " + url);
        }
    }
}
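
/*
 * The crawler above delegates visited-page tracking to a ContentList helper
 * that is not included in this file. The class below is a minimal sketch of
 * what such a helper might look like, assuming it is a thin wrapper around an
 * in-memory ConcurrentHashMap. Only the two method names (isContentInMap,
 * insertKey) are taken from the calls in Crawler; the backing map and
 * everything else here is an assumption, not the original implementation.
 * Because the map lives only in memory, the visited set is lost when the JVM
 * exits; a persistent store would be needed for resumable crawls.
 */
class ContentList {

    // Maps a crawled URL to its stored value (Crawler passes the URL twice).
    private static final java.util.concurrent.ConcurrentHashMap<String, String> crawledPages =
            new java.util.concurrent.ConcurrentHashMap<>();

    /** Returns true if the given URL has already been recorded as crawled. */
    public static boolean isContentInMap(String url) {
        return crawledPages.containsKey(url);
    }

    /** Records a URL as crawled. */
    public static void insertKey(String key, String value) {
        crawledPages.put(key, value);
    }
}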