package com.knowledgebooks.info_spiders;
import net.htmlparser.jericho.*;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
/**
* This simple web spider returns a list of lists, each containing two
* strings: the page "URL" and the plain "text" extracted from that page.
* Links found on a page are followed, but they are not included in the results.
*/
/**
* Copyright Mark Watson 2008-2010. All Rights Reserved.
* License: LGPL version 3 (http://www.gnu.org/licenses/lgpl-3.0.txt)
*/
public class WebSpider {
    /** Pause, in milliseconds, taken after each newly recorded page so the target host is not hammered. */
    private static final long FETCH_DELAY_MILLIS = 500L;

    /**
     * Crawl results. Each inner list holds exactly two strings: index 0 is the
     * URL of a fetched page and index 1 is the plain text extracted from it.
     * Pages whose text is identical to an already recorded page are omitted.
     */
    public List<List<String>> url_content_lists = new ArrayList<List<String>>();

    /**
     * Crawls breadth-first from {@code root_url}, following anchor links, and
     * stores the (URL, text) pair of every distinct page in
     * {@link #url_content_lists}.
     *
     * <p>Only URLs whose text contains the host name of {@code root_url} are
     * visited, and any URL containing {@code "https:"} is skipped. Failures
     * while fetching or parsing a single page are printed and the crawl
     * continues with the next queued URL. If the calling thread is interrupted
     * during the delay between pages, the interrupt flag is restored and the
     * crawl stops.
     *
     * @param root_url           absolute URL at which the crawl starts
     * @param max_returned_pages upper bound on the number of pages fetched (and
     *                           therefore on the number of results); values of
     *                           zero or less fetch nothing
     * @throws Exception if {@code root_url} is not a well-formed URL
     */
    public WebSpider(String root_url, int max_returned_pages) throws Exception {
        // Lower-cased once so the comparison in isCrawlable is case-insensitive on both sides.
        String host = new URL(root_url).getHost().toLowerCase(Locale.ENGLISH);
        Queue<String> urls = new ArrayDeque<String>();
        Set<String> already_visited = new HashSet<String>();
        urls.add(root_url);
        int num_fetched = 0;
        // Strictly "<": the limit is a maximum, so fetching must stop once it is reached.
        while (num_fetched < max_returned_pages && !urls.isEmpty()) {
            try {
                String url_str = urls.remove();
                // Set.add returns false when the URL was seen before.
                if (!isCrawlable(url_str, host) || !already_visited.add(url_str)) {
                    continue;
                }
                URL url = new URL(url_str);
                Source source = fetch(url);
                num_fetched++;
                String text = new TextExtractor(source).toString();
                // Skip any pages where text on page is identical to existing
                // page (e.g., http://example.com and http://example.com/index.html)
                if (isDuplicateText(text)) {
                    continue;
                }
                enqueueLinks(source, url, urls);
                List<String> ls = new ArrayList<String>(2);
                ls.add(url_str);
                ls.add(text);
                url_content_lists.add(ls);
                try {
                    Thread.sleep(FETCH_DELAY_MILLIS);
                } catch (InterruptedException interrupted) {
                    // Preserve the interrupt for the caller and stop crawling.
                    Thread.currentThread().interrupt();
                    break;
                }
            } catch (Exception ex) {
                // Best effort: one bad page must not abort the whole crawl.
                System.out.println("Error: " + ex);
                ex.printStackTrace();
            }
        }
    }

    /** Returns true when the URL text mentions the crawl host and is not an https URL. */
    private static boolean isCrawlable(String url_str, String host) {
        return url_str.toLowerCase(Locale.ENGLISH).indexOf(host) > -1
                && url_str.indexOf("https:") == -1;
    }

    /** Loads and parses the page at the given URL, closing the network stream afterwards. */
    private static Source fetch(URL url) throws Exception {
        URLConnection connection = url.openConnection();
        connection.setAllowUserInteraction(false);
        // Read from the configured connection instead of opening a second one.
        InputStream ins = connection.getInputStream();
        try {
            return new Source(ins);
        } finally {
            try {
                ins.close();
            } catch (Exception ignored) {
                // Either the content is already loaded or the load failure is already propagating.
            }
        }
    }

    /** Returns true when a page with exactly this text has already been recorded. */
    private boolean isDuplicateText(String text) {
        for (List<String> ls : url_content_lists) {
            if (text.equals(ls.get(1))) {
                return true;
            }
        }
        return false;
    }

    /** Appends every resolvable href found in the page's anchor tags to the crawl queue. */
    private static void enqueueLinks(Source source, URL page_url, Queue<String> urls) {
        // NOTE(review): the tag name is passed with its trailing space exactly as in the
        // original code; presumably this targets "<a ...>" tags only - confirm against Jericho docs.
        List<StartTag> anchorTags = source.getAllStartTags("a ");
        for (StartTag anchor : anchorTags) {
            Attributes attr = anchor.parseAttributes();
            if (attr == null) {
                continue;
            }
            Attribute link = attr.get("href");
            if (link == null) {
                continue; // anchor without href, e.g. a named anchor
            }
            String link_str = link.getValue();
            if (link_str == null) {
                continue; // href attribute present but without a value
            }
            String resolved = resolveLink(page_url, link_str);
            if (resolved != null) {
                urls.add(resolved);
            }
        }
    }

    /**
     * Resolves an href against the URL of the page it appeared on.
     * Returns the absolute URL without its fragment, or null when the href is
     * empty, fragment-only, malformed, or not an http/https link.
     */
    private static String resolveLink(URL page_url, String link_str) {
        String spec = link_str.trim();
        int fragment = spec.indexOf('#');
        if (fragment > -1) {
            // A fragment addresses a position inside a page, not a different page.
            spec = spec.substring(0, fragment);
        }
        if (spec.length() == 0) {
            return null;
        }
        try {
            URL resolved = new URL(page_url, spec);
            String protocol = resolved.getProtocol();
            if (!"http".equals(protocol) && !"https".equals(protocol)) {
                return null; // mailto:, ftp:, file: and similar are not crawlable pages
            }
            return resolved.toExternalForm();
        } catch (java.net.MalformedURLException unresolvable) {
            return null; // e.g. javascript: pseudo-links
        }
    }
}