/**
*
* Copyright 2013-2014 OpenSextant.org
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.opensextant.xtext.collectors.sharepoint;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.NoSuchAlgorithmException;
import java.util.Collection;
import java.util.Locale;
import org.apache.http.HttpResponse;
import org.opensextant.ConfigException;
import org.opensextant.util.TextUtils;
import org.opensextant.xtext.ConvertedDocument;
import org.opensextant.xtext.ExclusionFilter;
import org.opensextant.xtext.collectors.CollectionListener;
import org.opensextant.xtext.collectors.Collector;
import org.opensextant.xtext.collectors.web.CrawlFilter;
import org.opensextant.xtext.collectors.web.HyperLink;
import org.opensextant.xtext.collectors.web.WebClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* TODO: generalize this so there is a single web crawler and the site implementation might be
* sharepoint, generic HTTP, or other. The objective of this crawler is to collect documents from
* sharepoint. Landing pages, e.g. HTML for sites and sub-sites, themselves will not be harvested.
*
* @author ubaldino
*
*/
public class DefaultSharepointCrawl extends SharepointClient implements ExclusionFilter, Collector,
        CrawlFilter {

    /**
     * A collection listener to consult as far as how to record the found &amp; converted content
     * as well as to determine what is worth saving. May be null, in which case nothing is
     * reported and no item is skipped as "already collected".
     */
    protected CollectionListener listener = null;

    private final Logger log = LoggerFactory.getLogger(getClass());

    /**
     * When true (the default), links that are neither on the current site nor on the current
     * host are skipped by {@link #collectItems(URL)}.
     */
    private boolean allowCurrentSiteOnly = true;

    /**
     * Stored and exposed through the {@link CrawlFilter} accessors; this class does not consult
     * the flag in its own crawl logic.
     */
    private boolean allowCurrentDirOnly = false;

    /**
     * Instantiates a new default sharepoint crawl. All arguments are handed to the parent
     * client unchanged.
     *
     * @param srcSite site url
     * @param destFolder output folder
     * @param u user ID
     * @param p password
     * @param dom domain
     * @throws MalformedURLException on err
     * @throws ConfigException on err
     */
    public DefaultSharepointCrawl(String srcSite, String destFolder, String u, String p, String dom)
            throws MalformedURLException, ConfigException {
        super(srcSite, destFolder, u, p, dom);
    }

    /**
     * Important that you set a listener if you want to see what was captured,
     * as well as to optimize future harvests. The listener tells the collector whether the
     * item in question was harvested already or not.
     *
     * @param l listener to use
     */
    public void setListener(CollectionListener l) {
        listener = l;
    }

    /**
     * For web crawl, this default crawler considers flash and video media to be out of scope.
     * Other HREF links, like mailto:xyz@me.com are also items to avoid.
     * Matching is case-insensitive and independent of the JVM default locale.
     * Method is left open so you may override.
     *
     * @param path a url; a null path is treated as something to filter out
     * @return true if the path should be ignored: it is null, ends with ".flv" or ".mp4",
     *         or starts with "mailto:"
     */
    @Override
    public boolean filterOutFile(String path) {
        if (path == null) {
            // Nothing can be fetched for a missing URL; skip it rather than fail the crawl.
            return true;
        }
        // Locale.ROOT: with a default locale such as Turkish, "MAILTO:" would lower-case to a
        // dotless-i form and slip past the prefix test.
        String url = path.toLowerCase(Locale.ROOT);
        return url.endsWith(".flv") || url.endsWith(".mp4") || url.startsWith("mailto:");
    }

    /**
     * Run the collection, starting at the configured site URL.
     * Make sure you have set your converter and collection listener.
     * If you have a converter that also has a conversion listener, whoa!! good luck.
     * This web crawl example is meant to provide the mechanics of the conversion listener
     * as implemented by the collection listener.
     * The details on how actions at collection time differ from conversion time are TBD.
     *
     * @throws IOException on err
     */
    @Override
    public void collect() throws IOException {
        collectItems(this.getSite());
    }

    /**
     * Override this if you have different ideas about what URL patterns are of interest.
     * DEFAULT FILTER OUT: whatever {@link #filterOutFile(String)} rejects (video files and
     * mailto links). Page anchors are NOT filtered here; that check is currently disabled.
     *
     * @param link found link
     * @return true if link should be ignored.
     */
    public boolean filterOut(HyperLink link) {
        return filterOutFile(link.getAbsoluteURL());
    }

    /**
     * Recursive folder crawl through sharepoint site. This is where docs are
     * converted and recorded. The page at {@code link} is fetched and parsed into links;
     * file links are downloaded first, then folder links are crawled recursively.
     * Recursion stops once the shared depth counter reaches {@code MAX_DEPTH}.
     * TODO: test more completely the depths of recursive folders this supports.
     *
     * @param link URL to collect
     * @throws IOException on err, including a response that carries no content
     */
    public void collectItems(URL link) throws IOException {
        if (depth >= MAX_DEPTH) {
            log.info("Maximum Depth reached with link: {}", link);
            return;
        }

        HttpResponse page = getPage(link);
        if (page == null || page.getEntity() == null) {
            // A response is allowed to have no entity; report that as an I/O failure
            // instead of letting a NullPointerException escape.
            throw new IOException("No content returned for URL " + link);
        }
        String rawData = WebClient.readTextStream(page.getEntity().getContent());
        Collection<SPLink> items = parseContentPage(rawData, link);

        ++depth;
        try {
            // Files at this level.
            for (SPLink l : items) {
                if (filterOut(l)) {
                    log.debug("Filtering out {}", l);
                    continue;
                }
                if (this.isAllowCurrentSiteOnly() && !(l.isCurrentSite() || l.isCurrentHost())) {
                    // Page represented by link, l, is on another website.
                    log.info("Not on current site: {}", l);
                    continue;
                }
                if (l.isFile()) {
                    collectFile(l);
                }
            }

            // Folders at this level, recursively. As before, the link filters above are
            // not applied to folder links.
            for (SPLink l : items) {
                if (l.isSharepointFolder()) {
                    collectFolder(l);
                }
            }
        } finally {
            // Always restore the depth counter, even if an unexpected runtime error escapes,
            // so that later crawls are not cut short by a stale depth value.
            --depth;
        }
    }

    /**
     * Downloads a single file link into the archive and hands it to
     * {@link #convertContent(File, HyperLink)}. Items the listener reports as existing are
     * skipped. Failures are logged and do not stop the crawl.
     *
     * @param item file link to download
     * @throws IOException declared for the benefit of the inherited pause hook
     */
    private void collectFile(SPLink item) throws IOException {
        // NOTE(review): pause() runs before the duplicate check, so items the listener already
        // knows about still incur whatever delay pause() imposes; confirm this is intended.
        pause();
        try {
            String oid = TextUtils.text_id(item.getAbsoluteURL());
            try {
                if (listener != null && listener.exists(oid)) {
                    return;
                }
            } catch (Exception err1) {
                log.error("Collection Listener error", err1);
                return;
            }

            // Retrieve the artifact, then drop it in the archive mirroring the original
            // Sharepoint site structure.
            HttpResponse itemPage = getPage(item.getURL());
            File itemSaved = createArchiveFile(item.getNormalPath(), false /*not dir*/);
            WebClient.downloadFile(itemPage.getEntity(), itemSaved.getAbsolutePath());
            convertContent(itemSaved, item);
        } catch (Exception fileErr) {
            log.error("Item for URL {} was not saved due to a net or IO issue.",
                    item.getAbsoluteURL(), fileErr);
        }
    }

    /**
     * Crawls one folder link recursively. Failures are logged and do not stop the crawl.
     *
     * @param item folder link to descend into
     * @throws IOException declared for symmetry with the file helper; failures are logged here
     */
    private void collectFolder(SPLink item) throws IOException {
        URL folderUrl = null;
        try {
            folderUrl = item.getSimplifiedFolderURL();
            collectItems(folderUrl);
        } catch (Exception folderErr) {
            // Do not re-derive the folder URL here: if deriving it is what failed, doing so
            // again would throw from inside the error handler. Fall back to the raw link.
            Object failed = (folderUrl != null) ? folderUrl : item.getAbsoluteURL();
            log.error("Folder URL {} was not saved due to a net or IO issue.", failed, folderErr);
        }
    }

    /**
     * TODO: redesign so both Web crawl and Sharepoint crawl share this common routine:
     * copy copy copy -- see DefaultWebCrawl
     *
     * Convert and record a downloaded item, given the item and its source URL.
     * Without a converter the raw file is only reported to the listener (if any).
     * With a converter, an existing file is converted, tagged with its source URL and
     * referrer, saved at the document's text path and reported to the listener (if any).
     *
     * @param item item
     * @param link original URL where item was found
     * @throws IOException on err, or if either argument is null
     * @throws ConfigException on err
     * @throws NoSuchAlgorithmException on err
     */
    protected void convertContent(File item, HyperLink link) throws IOException, ConfigException,
            NoSuchAlgorithmException {
        if (item == null || link == null) {
            throw new IOException("Bad data - null values for file and link...");
        }

        if (converter == null) {
            // No conversion possible. Previously a missing listener fell through to
            // converter.convert() and raised a NullPointerException.
            log.debug("Link {} was saved to {}", link.getAbsoluteURL(), item.getAbsolutePath());
            if (listener != null) {
                listener.collected(item);
            }
            return;
        }

        if (!item.exists()) {
            return;
        }

        ConvertedDocument doc = converter.convert(item);
        if (doc == null) {
            log.error("Document was not converted, FILE={}", item);
            return;
        }

        // Record with a success state.
        doc.setDefaultID();
        doc.addSourceURL(link.getAbsoluteURL(), link.getReferrer());
        // This path must already exist
        doc.saveBuffer(new File(doc.textpath));
        if (listener != null) {
            listener.collected(doc, item.getAbsolutePath());
        }
    }

    /* (non-Javadoc)
     * @see org.opensextant.xtext.collectors.web.CrawlFilter#isAllowCurrentDirOnly()
     */
    @Override
    public boolean isAllowCurrentDirOnly() {
        return allowCurrentDirOnly;
    }

    /* (non-Javadoc)
     * @see org.opensextant.xtext.collectors.web.CrawlFilter#setAllowCurrentDirOnly(boolean)
     */
    @Override
    public void setAllowCurrentDirOnly(boolean allowCurrentDirOnly) {
        this.allowCurrentDirOnly = allowCurrentDirOnly;
    }

    /* (non-Javadoc)
     * @see org.opensextant.xtext.collectors.web.CrawlFilter#isAllowCurrentSiteOnly()
     */
    @Override
    public boolean isAllowCurrentSiteOnly() {
        return allowCurrentSiteOnly;
    }

    /* (non-Javadoc)
     * @see org.opensextant.xtext.collectors.web.CrawlFilter#setAllowCurrentSiteOnly(boolean)
     */
    @Override
    public void setAllowCurrentSiteOnly(boolean allowCurrentSiteOnly) {
        this.allowCurrentSiteOnly = allowCurrentSiteOnly;
    }
}