package edu.unc.ils.mrc.hive.util;
import java.io.ByteArrayInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpParams;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import difflib.Chunk;
import difflib.Delta;
import difflib.DiffUtils;
import difflib.Patch;
/**
* This simple web crawler creates a textual representation of a website
* with all HTML, javascript, and CSS removed. The crawler starts with an
* initial URL and traverses links up to a maximum specific number of "hops".
* The result is a text document that contains the contents of multiple web-pages
* in the order traversed.
*/
public class SimpleTextCrawler
{
/* Logger */
private static final Log logger = LogFactory.getLog(SimpleTextCrawler.class);
/* Link extraction pattern */
static final Pattern HREF_PATTERN = Pattern.compile("<a\\b[^>]*href=\"([^\"]*)\"[^>]*>");
/* Map of crawled URLs */
Map<String, Integer> retrievedURLs = new HashMap<String, Integer>();
private String proxyHost = null;
private int proxyPort = -1;
private List<Pattern> ignorePatterns = new ArrayList<Pattern>();
/* Used to include *html files during LC test collection generation */
private boolean saveHTML = false;
private static String htmlForURL = "";
/* Reads the following command line arguments:
* *.xls input file with URLs and associated subject headings
* output directory for generated files
* number of hops (optional, default is 0)
* differencing enabled (default is no)
* Example: -f c:\test\testdata.xls -o c:\testout\ -n 3 -d
* Invokes the text crawler for each URL and generates associated
* *.txt, *.key, and *.html files
*/
public static void main(String[] args) throws ParseException {
CommandLineParser parser = new BasicParser();
Options options = getOptions();
CommandLine commandLine = parser.parse(options, args);
if (commandLine.hasOption("h")) {
// Print the help message
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("java edu.unc.ils.mrc.hive.util.SimpleTextCrawler",
options);
} else {
String urlFileName = commandLine.getOptionValue("f");
String outputDir = commandLine.getOptionValue("o");
int numberOfHops = 0;
boolean differencingEnabled = false;
if (commandLine.hasOption("d"))
differencingEnabled = true;
try {
File fileObject = new File(outputDir);
if (!fileObject.exists()) {
fileObject.mkdirs();
}
if (commandLine.hasOption("n")) {
numberOfHops = Integer.parseInt(commandLine.getOptionValue("n"));
}
try {
HSSFWorkbook wb = readFile(urlFileName);
for (int k = 0; k < wb.getNumberOfSheets(); k++) {
HSSFSheet sheet = wb.getSheetAt(k);
int rows = sheet.getPhysicalNumberOfRows();
System.out.println("Input file=" + urlFileName + " and has " + rows + " rows.");
System.out.println("Output directory=" + outputDir +
", number of hops=" + numberOfHops +
", differencing " + (differencingEnabled ? "enabled" : "disabled"));
for (int r = 0; r < rows; r++) {
HSSFRow row = sheet.getRow(r);
if (row == null) {
continue;
}
int cells = row.getPhysicalNumberOfCells();
//System.out.println("\nROW " + row.getRowNum() + " has " + cells + " cell(s).");
PrintWriter outtxt = null;
PrintWriter outkey = null;
PrintWriter outhtml = null;
String txtFileName = null;
String keyFileName = null;
String htmlFileName = null;
for (int c = 0; c < cells; c++) {
HSSFCell cell = row.getCell(c);
String value = null;
String fileName = null;
value = cell.getStringCellValue();
try {
//System.out.println("CELL col=" + cell.getColumnIndex() + " VALUE="+ value);
SimpleTextCrawler sc = new SimpleTextCrawler();
if (c == 0) {
fileName = generateFileName(value);
txtFileName = fileName + ".txt";
keyFileName = fileName + ".key";
htmlFileName = fileName + ".html";
System.out.println("txtFileName = " + txtFileName);
outtxt = new PrintWriter(outputDir + txtFileName);
outkey = new PrintWriter(outputDir + keyFileName);
outhtml = new PrintWriter(outputDir + htmlFileName);
URL url = new URL(cell.getStringCellValue());
String text = sc.getTextAndHTML(url,numberOfHops,differencingEnabled);
outtxt.print(text);
outtxt.close();
outhtml.print(htmlForURL);
outhtml.close();
}
if (c > 0) {
outkey.println(cell.getStringCellValue().toUpperCase());
}
} catch (FileNotFoundException e) {
logger.error("Unable to create " + fileName+".txt" + " or " + fileName+".key");
break;}
catch (SAXException e) {
e.printStackTrace(); }
catch (TikaException e) {
e.printStackTrace(); }
}
if (outkey != null)
outkey.close();
}
}
} catch (IOException e) {
logger.error("Unable to read file " + urlFileName);
}
} catch (NumberFormatException e) {
logger.error("Number of hops must be an integer value. ");
}
catch (SecurityException e) {
logger.error("Unable to create directory " + outputDir);
}
}
}
/**
* Returns the CLI options
* @return
*/
public static Options getOptions() {
Options options = new Options();
Option urlFile = new Option("f", true, "Input file of URLS to be crawled");
urlFile.setRequired(true);
options.addOption(urlFile);
Option outputDir = new Option("o", true, "Output directory for *.txt and *.key files");
outputDir.setRequired(true);
options.addOption(outputDir);
options.addOption("h", false, "Print this help message");
options.addOption("n", true,
"Number of hops. (Default=0, first page only)");
options.addOption("d", false, "Enable differencing (Default=true)");
return options;
}
/* Generate a file name from the domain name by removing trailing slash (if present)
* and replacing dots with underscores.
*/
private static String generateFileName(String urlString) {
String fname = urlString.trim();
if (fname.endsWith("/"))
fname = fname.substring(0,fname.length() - 1);
int pos = fname.lastIndexOf("/");
if (pos > 0)
fname = fname.substring(pos+1);
fname = fname.replace(".","_");
return fname;
}
/**
* creates a HSSFWorkbook for the specified filename.
*/
private static HSSFWorkbook readFile(String filename) throws IOException {
return new HSSFWorkbook(new FileInputStream(filename));
}
public void setProxy(String host, int port) {
this.proxyHost = host;
this.proxyPort = port;
}
public void setIgnorePrefixes(List<String> prefixes) {
if (prefixes != null)
{
for (String pattern: prefixes)
{
Pattern p = Pattern.compile("(" + pattern + ").*?");
ignorePatterns.add(p);
}
}
}
/**
* Returns a text representation of the website at the specified URL by crawling
* all links up to the maximum number of "hops" from the initial URL.
*
* @param url Website to retrieve text for
* @param maxHops Maximum number of hops to crawl
* @param diff Only extract differences between base page and subsequent pages
* @return
* @throws ClientProtocolException
* @throws IOException
*/
public String getText(URL url, int maxHops, boolean diff) throws ClientProtocolException, IOException, TikaException, SAXException {
String baseText = null;
if (diff) {
String baseHTML = getHtml(url.toString());
baseText = getTextFromHtml(baseHTML);
}
return getText(url.toString(), url.toString(), maxHops, 0, baseText);
}
/* Same as getText(URL, int, boolean) above, but is used by the main method
* to generate LC test collection files: *.txt, *.key, and *.html
*/
public String getTextAndHTML(URL url, int maxHops, boolean diff) throws ClientProtocolException, IOException, TikaException, SAXException {
saveHTML = true;
htmlForURL = "";
String baseText = null;
if (diff) {
String baseHTML = getHtml(url.toString());
baseText = getTextFromHtml(baseHTML);
}
return getText(url.toString(), url.toString(), maxHops, 0, baseText);
}
/**
* Returns a text representation of the website at the specified URL by crawling
* all links up to the maximum number of "hops" from the initial URL limited
* by the base URL. All links must match the base URL to be traversed.
*
* @param url Website to retrieve text for
* @param baseURL Base URL used to filter links
* @param maxHops Maximum number of hops to crawl
* @return
* @throws ClientProtocolException
* @throws IOException
*/
public String getText(URL url, String baseURL, int maxHops, boolean diff) throws ClientProtocolException, IOException, TikaException, SAXException {
String baseText = null;
if (diff) {
String baseHTML = getHtml(url.toString());
baseText = getTextFromHtml(baseHTML);
}
return getText(url.toString(), baseURL, maxHops, 0, baseText);
}
public boolean isPartOf(String url, String baseUrl)
{
String newBaseUrl = stripPrefix(baseUrl);
String newUrl = stripPrefix(url);
return newUrl.startsWith(newBaseUrl);
/*
if (baseUrl.startsWith("http://webarchive.loc.gov"))
{
String archiveUrl = url.substring(url.lastIndexOf("http://"), url.length());
String archiveBase = baseUrl.substring(baseUrl.lastIndexOf("http://"), baseUrl.length());
return archiveUrl.startsWith(archiveBase);
}
else
return url.startsWith(baseUrl);
*/
}
private String stripPrefix(String url)
{
String newUrl = url;
for (Pattern p: ignorePatterns)
{
Matcher m = p.matcher(url);
if (m.matches())
{
String prefix = m.group(1);
// Ignore this prefix
newUrl = url.substring(prefix.length(), url.length());
break;
}
}
return newUrl;
}
public String stripLocPrefix(String url)
{
if (url.startsWith("http://webarchive.loc.gov"))
return url.substring(url.lastIndexOf("http://"), url.length());
else
return url;
}
private String getText(String url, String baseURL, int maxHops,
int currentHop) throws ClientProtocolException, IOException
{
String baseHTML = getHtml(url);
return getText(url, baseURL, maxHops, currentHop, baseHTML);
}
private String getHtml(String url) throws IOException
{
String html = "";
HttpClient client = new DefaultHttpClient();
HttpGet get = new HttpGet(url);
HttpParams params = new BasicHttpParams();
params.setParameter("http.protocol.handle-redirects", true);
get.setParams(params);
if (proxyHost != null)
{
HttpHost proxy = new HttpHost(proxyHost, proxyPort);
client.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
}
HttpResponse response = client.execute(get);
HttpEntity entity = response.getEntity();
if (entity != null) {
Header contentType = entity.getContentType();
try
{
// Only process links of type text/html
if (!contentType.getValue().contains("text/html"))
return getTextFromURL(url);
} catch (Exception e) {
logger.error(e);
}
// Read the response
InputStream is = entity.getContent();
StringWriter sw = new StringWriter();
int c;
while ((c = is.read()) != -1)
sw.write(c);
is.close();
sw.close();
// Get text of current page
html = sw.toString();
}
client.getConnectionManager().shutdown();
return html;
}
/**
* Internal method used to recursively traverse a website up to the maximum number of "hops"
*
* @param url Website to be crawled
* @param baseURL Base URL used as a filter
* @param maxHops Maximum number of hops to crawl
* @param currentHop Current hop
* @return
* @throws ClientProtocolException
* @throws IOException
*/
private String getText(String url, String baseURL, int maxHops,
int currentHop, String baseText) throws ClientProtocolException, IOException
{
logger.debug("getText " + url + "(" + maxHops + "," + currentHop + ")");
// Add this URL to the list of processed URLs
String tmpUrl = stripLocPrefix(url);
Integer tmpHop = retrievedURLs.get(tmpUrl);
if (tmpHop != null && tmpHop <= currentHop) {
logger.debug("Skipping " + tmpUrl + ", already seen");
return "";
}
if (!isPartOf(url, baseURL)) {
logger.debug("Skipping " + url + ", not part of current site.");
return "";
}
String html = getHtml(url);
String text = "";
if (saveHTML)
htmlForURL = htmlForURL + html;
try
{
// Get the text of the current page
String tmpText = getTextFromHtml(html);
String diffText = "";
if (currentHop > 0 && baseText != null)
{
diffText = getDiff(tmpText, baseText);
//diffText = diffText.replaceAll("\\s+", " ");
diffText = diffText.replaceAll(tmpUrl.toLowerCase(), "");
text += diffText;
}
else
text = tmpText;
// Add this URL to the list of processed URLs
if (tmpHop == null || tmpHop > currentHop)
retrievedURLs.put(tmpUrl, currentHop);
} catch (Exception e) {
logger.warn(e);
}
// Continue to process additional links
if (currentHop < maxHops)
{
logger.debug("Getting links from " + tmpUrl);
// Get links from the current page
List<String> links = getLinks(url, new StringBuffer(html));
for (String link : links) {
String tmpLink = stripLocPrefix(link);
// For each link, if not already processed, get text
tmpHop = retrievedURLs.get(tmpLink);
if (tmpHop == null || tmpHop > currentHop)
{
//if (!retrievedURLs.containsKey(tmpLink)) {
String tmp = getText(link, baseURL, maxHops, currentHop+1, baseText);
text += tmp;
}
}
}
return text;
}
/**
* Uses the Tika library to extract text from HTML
*
* @param html HTML to process
* @return
* @throws IOException
* @throws SAXException
* @throws TikaException
*/
protected String getTextFromHtml(String html) throws IOException, SAXException, TikaException
{
InputStream is = new ByteArrayInputStream(html.getBytes());
Metadata metadata = new Metadata();
Parser parser = new AutoDetectParser();
ContentHandler handler = new BodyContentHandler(-1);
parser.parse(is, handler, metadata);
is.close();
return handler.toString();
}
/**
* Uses the Tika library to extract text from a URL
*
* @param path URL to process
* @return
* @throws IOException
* @throws SAXException
* @throws TikaException
*/
protected String getTextFromURL(String path) throws IOException, SAXException, TikaException
{
URL url = new URL(path);
InputStream is = url.openStream();
Metadata metadata = new Metadata();
int slash = path.lastIndexOf('/');
String name = path.substring(slash + 1);
if (name.length() > 0) {
metadata.set(Metadata.RESOURCE_NAME_KEY, name);
}
Parser parser = new AutoDetectParser();
ContentHandler handler = new BodyContentHandler(-1);
parser.parse(is, handler, metadata);
is.close();
return handler.toString();
}
/**
* Returns a list of absolute URLs from the fetched page
* @param baseUrl Base URL
* @param sb Fetched HTML
* @return
*/
protected List<String> getLinks(String baseUrl, StringBuffer html)
{
List<String> links = new ArrayList<String>();
Matcher tagmatch = HREF_PATTERN.matcher(html.toString());
while (tagmatch.find())
{
String link = tagmatch.group(1);
if (valid(link)) {
links.add(makeAbsolute(baseUrl, link));
}
}
return links;
}
private String getDiff(String base, String current)
{
List<String> baseRows = Arrays.asList(base.split("\n"));
List<String> currentRows = Arrays.asList(current.split("\n"));
Patch patch = DiffUtils.diff(baseRows, currentRows);
List<Delta> deltas = patch.getDeltas();
String diff = "";
for (Delta delta: deltas) {
Chunk c= delta.getOriginal();
List<String> lines = (List<String>) c.getLines();
for (String line : lines)
diff += line + "\n";
}
return diff;
}
/**
* Returns true if the specified URL is a valid HTTP URL.
* @param s
* @return
*/
private boolean valid(String s) {
if (s.matches("javascript:.*|mailto:.*") || s.equals("#")) {
return false;
}
return true;
}
/**
* Create an absolute URL given a base URL and relative URL
* @param url Base URL
* @param link Relative URL
* @return
*/
protected static String makeAbsolute(String baseUrl, String relativeUrl)
{
String absoluteUrl = "";
try {
if (!baseUrl.endsWith("/"))
baseUrl += "/";
URI base = new URI(baseUrl);
absoluteUrl = base.resolve(relativeUrl).toString();
} catch (IllegalArgumentException e) {
absoluteUrl = "";
logger.warn(e);
} catch (URISyntaxException e) {
logger.warn(e);
}
return absoluteUrl;
}
}