/**
*
* Copyright 2013-2014 OpenSextant.org
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.opensextant.xtext.collectors.web;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.client.utils.DateUtils;
import org.opensextant.ConfigException;
import org.opensextant.util.FileUtility;
import org.opensextant.xtext.ConvertedDocument;
import org.opensextant.xtext.ExclusionFilter;
import org.opensextant.xtext.collectors.CollectionListener;
import org.opensextant.xtext.collectors.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A demonstration of how to use the WebClient to crawl a site and convert it as you go.
* An optional collection listener is settable to let you do something with each item collected/converted.
*
* @author ubaldino
*
*/
public class DefaultWebCrawl extends WebClient implements ExclusionFilter, Collector, CrawlFilter {
/**
 * A collection listener to consult as far as how to record the found and converted content
 * as well as to determine what is worth saving. May be null, in which case nothing is reported.
 */
protected CollectionListener listener = null;
/** Logger named after the runtime class, so subclasses log under their own name. */
private final Logger log = LoggerFactory.getLogger(getClass());
/** When true, links that are neither on the current site nor on the current host are skipped. */
private boolean allowCurrentSiteOnly = true;
/** When true, links for which {@code HyperLink.isCurrentPage()} is false are skipped. */
private boolean allowCurrentDirOnly = false;
/** Absolute URLs that answered with an HTTP status of 400 or above; they are not visited again. */
private final Set<String> errorPages = new HashSet<>();
/** Caller-supplied "allow" prefixes; when non-empty, a path must start with one of them. */
private final List<String> prefixFilters = new ArrayList<>();
/** Caller-supplied "deny" prefixes, applied to paths that passed the allow prefixes. */
private final List<String> prefixIgnore = new ArrayList<>();
/**
 * Creates a crawler for the given site. Both arguments are passed unchanged to the
 * superclass constructor; no other state is initialized here.
 *
 * @param srcSite
 * top level site
 * @param destFolder
 * output folder
 * @throws MalformedURLException
 * if srcSite is invalid format
 * @throws ConfigException
 * if other setup error
 */
public DefaultWebCrawl(String srcSite, String destFolder) throws MalformedURLException,
ConfigException {
super(srcSite, destFolder);
}
/**
 * Installs the collection listener. Set one if you want to see what was captured and to
 * optimize future harvests: the listener is asked whether an item was already harvested
 * and is notified about each item that is collected or converted.
 *
 * @param l
 *            listener to use; may be null to remove the current listener
 */
public void setListener(CollectionListener l) {
    this.listener = l;
}
/**
 * Registers one "allow" prefix. Null, empty and whitespace-only values are ignored.
 *
 * @param u
 *            prefix a link key must start with to be accepted
 */
public void addPrefixFilter(String u) {
    if (StringUtils.isBlank(u)) {
        return;
    }
    prefixFilters.add(u);
}
/**
 * Registers several "allow" prefixes at once. Each entry goes through
 * {@link #addPrefixFilter(String)}, so null or blank entries are skipped exactly as they are
 * for single additions. A null entry would otherwise make prefix matching throw, and an
 * empty prefix would match every path.
 *
 * @param arr
 *            prefixes to add; a null collection is ignored
 */
public void addPrefixFilters(Collection<String> arr) {
    if (arr == null) {
        return;
    }
    for (String prefix : arr) {
        addPrefixFilter(prefix);
    }
}
/**
 * Registers one "deny" prefix. Null, empty and whitespace-only values are ignored.
 *
 * @param u
 *            prefix that causes a link key to be rejected
 */
public void addIgnoreFilter(String u) {
    if (StringUtils.isBlank(u)) {
        return;
    }
    prefixIgnore.add(u);
}
/**
 * Registers several "deny" prefixes at once. Each entry goes through
 * {@link #addIgnoreFilter(String)}, so null or blank entries are skipped exactly as they are
 * for single additions. A null entry would otherwise make prefix matching throw, and an
 * empty prefix would cause every link to be rejected.
 *
 * @param arr
 *            prefixes to add; a null collection is ignored
 */
public void addIgnoreFilters(Collection<String> arr) {
    if (arr == null) {
        return;
    }
    for (String prefix : arr) {
        addIgnoreFilter(prefix);
    }
}
/**
 * For web crawl, this default crawler considers flash, video media, etc. to be out of scope.
 * Other HREF links, like mailto:xyz@me.com are also items to avoid.
 * Method is left open so you may override.
 *
 * <p>Matching is case-insensitive and uses {@link Locale#ROOT}, so the result does not depend
 * on the JVM default locale. A null path is treated as filtered out.
 *
 * @param path
 *            a url
 * @return true if the url is a mailto link, ends with .atom, .rss, .flv or .mp4,
 *         contains "xmlrpc", or is null
 */
@Override
public boolean filterOutFile(String path) {
    if (path == null) {
        // Nothing to fetch; previously this raised a NullPointerException.
        return true;
    }
    final String url = path.toLowerCase(Locale.ROOT);
    return url.startsWith("mailto:")
            || url.endsWith(".atom")
            || url.endsWith(".rss")
            || url.endsWith(".flv")
            || url.endsWith(".mp4")
            || url.contains("xmlrpc");
}
/**
 * Runs the collection, starting at the configured site.
 * Make sure you have set your converter and collection listener beforehand.
 * Combining a converter that has its own conversion listener with a collection listener
 * is not a supported arrangement; this crawler only demonstrates the collection-listener
 * mechanics. How collection-time actions differ from conversion-time actions is TBD.
 *
 * <p>A {@link NoSuchAlgorithmException} raised while crawling is logged and not rethrown.
 *
 * @throws IOException
 *             on collection err
 */
@Override
public void collect() throws IOException {
    final URL startAt = getSite();
    try {
        collectItems(null, startAt);
    } catch (NoSuchAlgorithmException hashErr) {
        log.error("Hashing error", hashErr);
    }
}
/**
 * Decides whether a link should be skipped. Override this if you have different ideas about
 * what URL patterns are of interest.
 * DEFAULT FILTER OUT: whatever {@link #filterOutFile(String)} rejects, plus page anchors.
 *
 * @param link
 *            a URL
 * @return true if link should be ignored.
 */
public boolean filterOut(HyperLink link) {
    if (!filterOutFile(link.getAbsoluteURL())) {
        if (!link.isPageAnchor()) {
            return false;
        }
        log.debug("Filter out anchor link {}", link);
    }
    return true;
}
/**
 * Recursive crawl step: fetches one page, archives its raw content and then hands the content
 * to {@link #collectItemsOnPage(String, URL, URL)}, where linked documents are downloaded,
 * converted and recorded.
 * As hashing algorithms are used in defining concise output paths, NoSuchAlgorithmException is thrown.
 *
 * <p>Pages already recorded as error pages in this session are not requested again. The crawl
 * depth is incremented for the duration of the page and is always restored on exit, even when
 * processing the page fails.
 *
 * @param _link
 *            a URL; when null, {@code startingSite} is used instead
 * @param startingSite
 *            the top level site; only consulted when {@code _link} is null
 * @throws IOException
 *             on err, including a response that carries no content
 * @throws NoSuchAlgorithmException
 *             error that never happens
 */
public void collectItems(String _link, URL startingSite) throws IOException,
        NoSuchAlgorithmException {
    final String link = (_link != null) ? _link : startingSite.toString();
    HyperLink thisLink = new HyperLink(link, new URL(link), getSite());
    if (errorPages.contains(thisLink.getAbsoluteURL())) {
        log.debug("Do not visit error pages tracked in this session; link: {}", link);
        return;
    }
    HttpResponse page = getPage(prepURL(link));
    /*
     * As of XText 1.4, this HTTP header does not appear to be available often using this http API.
     * The parsed value is currently not used any further.
     */
    Header lastModStr = page.getFirstHeader("Last-Modified");
    Date lastMod = null;
    if (lastModStr != null) {
        lastMod = DateUtils.parseDate(lastModStr.getValue());
    }
    /*
     * 1. Capture the page content represented by the requested link.
     * It is saved to FILE.html
     */
    if (page.getEntity() == null) {
        throw new IOException("No content returned for " + link);
    }
    String rawData;
    // Closing the content stream releases the underlying connection; a second close is harmless.
    try (InputStream content = page.getEntity().getContent()) {
        rawData = WebClient.readTextStream(content);
    }
    String thisPath = thisLink.getNormalPath();
    if (thisLink.isDynamic() && !thisPath.endsWith("html")) {
        thisPath = thisPath + ".html";
    }
    File thisPage = createArchiveFile(thisPath, thisLink.isFolder());
    // An archive file that already exists is kept; it is not overwritten.
    if (!thisPage.exists()) {
        FileUtility.writeFile(rawData, thisPage.getAbsolutePath());
    }
    log.info("Starting in on {} from {} @ depth={}", link, site, depth);
    pause();
    final int depthOnEntry = depth;
    ++depth;
    try {
        collectItemsOnPage(rawData, thisLink.getURL(), getSite());
    } finally {
        // collectItemsOnPage decrements depth on normal completion; restoring the saved value
        // gives the same result and also covers the case where it exits with an exception.
        depth = depthOnEntry;
    }
}
/**
 * Crawls starting from a locally stored page: the file is read as UTF-8 text and its links are
 * processed as if the page had been fetched from the configured site.
 *
 * <p>{@link #collectItemsOnPage(String, URL, URL)} decrements the crawl depth when it finishes.
 * Since this entry point does not increment it first, the depth is saved before and restored
 * afterwards, so that repeated calls do not drive the counter downwards.
 *
 * @param f file object
 * @throws IOException on err
 */
public void collect(File f) throws IOException {
    String pageContent = FileUtility.readFile(f, "UTF-8");
    final int depthOnEntry = depth;
    try {
        collectItemsOnPage(pageContent, getSite(), getSite());
    } finally {
        depth = depthOnEntry;
    }
}
/**
 * Applies the caller-supplied prefix rules to a path. User filters are likely to be a general
 * ALLOW list with more specific DENY entries inside what is allowed.
 *
 * <p>With at least one allow prefix registered, the path must start with one of them. A path
 * that passes is still rejected when it starts with any ignore prefix.
 *
 * @param path filepath
 * @return if user options filter out the given path
 */
protected boolean userFilteredOut(final String path) {
    // Filter in: only enforced when the caller registered allow prefixes.
    if (!prefixFilters.isEmpty()) {
        boolean admitted = false;
        for (String allowed : prefixFilters) {
            if (path.startsWith(allowed)) {
                admitted = true;
                break;
            }
        }
        if (!admitted) {
            return true;
        }
    }
    // Filter out: the path was allowed, but it may still match a pattern to be ignored.
    for (String denied : prefixIgnore) {
        if (path.startsWith(denied)) {
            return true;
        }
    }
    return false;
}
/**
 * Internal method for parsing and harvesting from a single page and then crawling deeper, if
 * instructed to do so.
 *
 * <p>For every link parsed from the page, the link is skipped when it is filtered out, is
 * off-site or off-directory (if so configured), was already found, is rejected by the user
 * prefix rules, or was already saved. Remaining file and web-page links are downloaded into
 * the archive, converted, and, for web pages within the depth limit, crawled recursively.
 * Failures on a single item are logged and do not stop the loop.
 *
 * <p>The crawl depth is decremented when this method exits. That now happens in a
 * {@code finally} block, so a runtime failure (for example while parsing the page) can no
 * longer leave the depth counter permanently too high.
 *
 * @param pageContent raw HTML
 * @param url url for HTML
 * @param site top level url for site
 */
protected void collectItemsOnPage(String pageContent, URL url, URL site) {
    try {
        Collection<HyperLink> items = parseContentPage(pageContent, url, site);
        /*
         * 2. Collect items on this page.
         */
        for (HyperLink l : items) {
            if (filterOut(l)) {
                continue;
            }
            if (this.isAllowCurrentSiteOnly() && !(l.isCurrentSite() || l.isCurrentHost())) {
                // Page represented by link, l, is on another website.
                log.debug("Not on current site: {}", l);
                continue;
            }
            if (this.isAllowCurrentDirOnly() && !l.isCurrentPage()) {
                // Page represented by link, l, is in another directory on the same site.
                log.debug("Not on current directory: {}", l);
                continue;
            }
            /* TODO: fix "key", as it represents not just path, but unique URLs
             * different URLs with same path would collide.
             * TODO: in general fix the ability to crawl off requested site.
             * If that is really needed, this is not the crawling capability you want.
             */
            String key = l.getNormalPath();
            if (key == null) {
                key = l.getAbsoluteURL();
            }
            if (found.containsKey(key)) {
                // We already did this.
                continue;
            }
            if (userFilteredOut(key)) {
                // We don't want to do this.
                log.debug("Filtered Out by User: {}", key);
                continue;
            }
            found.put(key, l);
            // B. Drop files in archive mirroring the original
            if (saved.contains(l.getId())) {
                // In theory this item resolved to an item that was already saved. Ignore.
                continue;
            }
            // Download artifacts: only files and web pages are of interest.
            if (!(l.isFile() || l.isWebPage())) {
                continue;
            }
            pause();
            log.info("Pulling page {}", l);
            try {
                // The default document ID will be an MD5 hash ID of the URL.
                // This may differ for other collectors/harvesters/listeners
                try {
                    if (listener != null && listener.exists(l.getId())) {
                        // You already collected this. So it will be ignored.
                        continue;
                    }
                } catch (Exception err1) {
                    log.error("Collection Listener error", err1);
                    continue;
                }
                // Create URL for link and download artifact.
                HttpResponse itemPage = getPage(l.getURL());
                if (itemPage.getStatusLine().getStatusCode() >= 400) {
                    // Remember the failure so the page is not requested again this session.
                    this.errorPages.add(l.getAbsoluteURL());
                    log.error("Failing on this request, HTTP status>=400, LINK={}", l.getURL());
                    continue;
                }
                /*
                 * Identify the correct type of file this item is, from HTTP headers & MIME,
                 * not just the link.
                 */
                Header contentType = itemPage.getEntity().getContentType();
                if (contentType != null) {
                    l.setMIMEType(contentType.getValue());
                }
                /*
                 * Create a non-trivial path for the item; dynamic pages get an .html suffix.
                 */
                String fpath = l.getNormalPath();
                if (l.isDynamic() && !fpath.endsWith(".html")) {
                    fpath = fpath + ".html";
                }
                File itemSaved = createArchiveFile(fpath, false);
                File dir = new File(itemSaved.getParentFile().getAbsolutePath());
                FileUtility.makeDirectory(dir);
                l.setFilepath(itemSaved);
                // CACHE the identity of this URL.
                saved.add(l.getId());
                WebClient.downloadFile(itemPage.getEntity(), itemSaved.getAbsolutePath());
                convertContent(itemSaved, l);
                // Continue to crawl deeper...
                if (l.isWebPage() && depth <= MAX_DEPTH) {
                    collectItems(l.getAbsoluteURL(), site);
                }
            } catch (Exception fileErr) {
                log.error("Item for URL {} was not saved due to a net or IO issue.",
                        l.getAbsoluteURL(), fileErr);
            }
        }
    } finally {
        // Undo the increment made by the caller, no matter how this page finished.
        --depth;
    }
}
/**
 * Convert and record a downloaded item, given the item and its source URL.
 *
 * <p>Without a converter the raw file is reported to the listener, if there is one, and nothing
 * else happens; previously a missing converter together with a missing listener ended in a
 * NullPointerException. With a converter, an existing file is converted, tagged with the
 * link's ID and source URL, its text buffer is saved, and the result is reported to the
 * listener. A file that does not exist is silently skipped.
 *
 * @param item
 *            item to convert
 * @param link
 *            link representing the original/source
 * @throws IOException
 *             on err, including a null item or link
 * @throws ConfigException
 *             on err
 * @throws NoSuchAlgorithmException
 *             an error that never happens
 */
protected void convertContent(File item, HyperLink link)
        throws IOException, ConfigException, NoSuchAlgorithmException {
    if (item == null || link == null) {
        throw new IOException("Bad data - null values for file and link...");
    }
    if (converter == null) {
        // No conversion possible: report the raw download, if anyone is listening.
        if (listener != null) {
            log.debug("Link {} was saved to {}", link.getAbsoluteURL(), item.getAbsolutePath());
            listener.collected(item);
        }
        return;
    }
    if (!item.exists()) {
        return;
    }
    // Convert the item and record it with a success state.
    ConvertedDocument doc = converter.convert(item);
    if (doc == null) {
        log.error("Document was not converted, FILE={}", item);
        return;
    }
    if (doc.textpath == null) {
        log.error("Expecting the content to be non-null for {}", doc.getFilepath());
        return;
    }
    doc.setId(link.getId());
    doc.addSourceURL(link.getAbsoluteURL(), link.getReferrer());
    // This path must already exist
    doc.saveBuffer(new File(doc.textpath));
    if (listener != null) {
        listener.collected(doc, item.getAbsolutePath());
    }
}
/**
 * Reports whether the crawl is limited to the current directory. When true, links for which
 * {@code HyperLink.isCurrentPage()} is false are skipped while collecting items on a page.
 *
 * @return the current setting; false unless changed through the setter
 * @see org.opensextant.xtext.collectors.web.CrawlFilter#isAllowCurrentDirOnly()
 */
@Override
public boolean isAllowCurrentDirOnly() {
return allowCurrentDirOnly;
}
/**
 * Turns the current-directory restriction on or off.
 *
 * @param allowCurrentDirOnly true to skip links for which {@code HyperLink.isCurrentPage()} is false
 * @see org.opensextant.xtext.collectors.web.CrawlFilter#setAllowCurrentDirOnly(boolean)
 */
@Override
public void setAllowCurrentDirOnly(boolean allowCurrentDirOnly) {
this.allowCurrentDirOnly = allowCurrentDirOnly;
}
/**
 * Reports whether the crawl is limited to the current site. When true, links that are neither
 * on the current site nor on the current host are skipped while collecting items on a page.
 *
 * @return the current setting; true unless changed through the setter
 * @see org.opensextant.xtext.collectors.web.CrawlFilter#isAllowCurrentSiteOnly()
 */
@Override
public boolean isAllowCurrentSiteOnly() {
return allowCurrentSiteOnly;
}
/**
 * Turns the current-site restriction on or off.
 *
 * @param allowCurrentSiteOnly true to skip links that are neither on the current site nor on the current host
 * @see org.opensextant.xtext.collectors.web.CrawlFilter#setAllowCurrentSiteOnly(boolean)
 */
@Override
public void setAllowCurrentSiteOnly(boolean allowCurrentSiteOnly) {
this.allowCurrentSiteOnly = allowCurrentSiteOnly;
}
}