/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.bolt;
import static com.digitalpebble.stormcrawler.Constants.StatusStreamName;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.slf4j.LoggerFactory;
import com.digitalpebble.stormcrawler.Constants;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.parse.Outlink;
import com.digitalpebble.stormcrawler.parse.ParseData;
import com.digitalpebble.stormcrawler.parse.ParseFilter;
import com.digitalpebble.stormcrawler.parse.ParseFilters;
import com.digitalpebble.stormcrawler.parse.ParseResult;
import com.digitalpebble.stormcrawler.persistence.Status;
import com.digitalpebble.stormcrawler.protocol.HttpHeaders;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.google.common.primitives.Bytes;
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.SiteMapURL.ChangeFrequency;
import crawlercommons.sitemaps.UnknownFormatException;
/**
* Extracts URLs from sitemap files. The parsing is triggered by the presence of
* 'isSitemap=true' in the metadata. Any tuple which does not have this
* key/value in the metadata is simply passed on to the default stream, whereas
* any URLs extracted from the sitemaps are sent to the 'status' stream.
*/
@SuppressWarnings("serial")
public class SiteMapParserBolt extends StatusEmitterBolt {

    /** Metadata key whose value 'true' marks a document as a sitemap. */
    public static final String isSitemapKey = "isSitemap";

    private static final org.slf4j.Logger LOG = LoggerFactory
            .getLogger(SiteMapParserBolt.class);

    /**
     * Metadata key under which the last modification date found in the sitemap
     * is attached to each outlink.
     */
    private static final String LAST_MODIFIED_KEY = "sitemap.lastModified";

    /**
     * Byte sequence looked for at the beginning of a document when sniffing
     * for sitemaps. Encoded once, with an explicit charset, so that the result
     * does not depend on the platform default encoding.
     */
    private static final byte[] SITEMAP_NAMESPACE_CLUE = "http://www.sitemaps.org/schemas/sitemap/0.9"
            .getBytes(StandardCharsets.UTF_8);

    /**
     * Passed to the crawler-commons SiteMapParser constructor; this class never
     * changes it from its default.
     */
    private boolean strictMode = false;

    /** Whether to inspect the content of documents lacking the sitemap key. */
    private boolean sniffWhenNoSMKey = false;

    private ParseFilter parseFilters;

    /**
     * Entries modified more than this number of hours ago are skipped; -1
     * disables the filtering.
     */
    private int filterHoursSinceModified = -1;

    /**
     * Maximum number of leading bytes inspected when sniffing; a value of 0 or
     * less means the whole content is inspected.
     */
    private int maxOffsetGuess = 300;

    /**
     * Parses the tuple content as a sitemap when the metadata flags it as one
     * (or when content sniffing recognises it) and emits the extracted URLs on
     * the status stream. Any other tuple is re-emitted unchanged on the default
     * stream. The input tuple is acked in every case.
     */
    @Override
    public void execute(Tuple tuple) {
        Metadata metadata = (Metadata) tuple.getValueByField("metadata");
        // TODO check that we have the right number of fields?
        byte[] content = tuple.getBinaryByField("content");
        String url = tuple.getStringByField("url");

        boolean flaggedAsSitemap = Boolean.parseBoolean(metadata
                .getFirstValue(isSitemapKey));

        // neither flagged in the metadata nor recognised from its content:
        // not a sitemap file, just pass it on
        if (!flaggedAsSitemap && !looksLikeSitemap(url, content)) {
            this.collector.emit(tuple, tuple.getValues());
            this.collector.ack(tuple);
            return;
        }

        // it is a sitemap
        String ct = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
        List<Outlink> outlinks;
        try {
            outlinks = parseSiteMap(url, content, ct, metadata);
        } catch (Exception e) {
            // send to status stream in case another component wants to update
            // its status
            emitError(tuple, url, metadata, "sitemap parsing",
                    "Exception while parsing " + url + ": " + e);
            return;
        }

        // apply the parse filters if any to the current document
        try {
            ParseResult parse = new ParseResult();
            parse.setOutlinks(outlinks);
            ParseData parseData = parse.get(url);
            parseData.setMetadata(metadata);
            parseFilters.filter(url, content, null, parse);
        } catch (RuntimeException e) {
            emitError(tuple, url, metadata, "content filtering",
                    "Exception while running parse filters on " + url + ": "
                            + e);
            return;
        }

        // send the extracted URLs to the status stream
        for (Outlink ol : outlinks) {
            Values v = new Values(ol.getTargetURL(), ol.getMetadata(),
                    Status.DISCOVERED);
            collector.emit(StatusStreamName, tuple, v);
        }

        // parsing and filtering both succeeded (failures returned above with
        // an ERROR status): mark the sitemap itself as fetched
        collector.emit(StatusStreamName, tuple, new Values(url, metadata,
                Status.FETCHED));
        collector.ack(tuple);
    }

    /**
     * Checks whether the beginning of the content contains the sitemap
     * namespace. Works for XML and non-compressed documents only.
     *
     * @param url
     *            URL of the document, used for logging
     * @param content
     *            raw document content, may be null
     * @return true if sniffing is enabled and the namespace was found within
     *         the inspected bytes
     */
    private boolean looksLikeSitemap(String url, byte[] content) {
        // a missing content cannot be sniffed; treat it as a regular document
        if (!sniffWhenNoSMKey || content == null) {
            return false;
        }
        byte[] beginning = content;
        if (maxOffsetGuess > 0 && content.length > maxOffsetGuess) {
            beginning = Arrays.copyOfRange(content, 0, maxOffsetGuess);
        }
        if (Bytes.indexOf(beginning, SITEMAP_NAMESPACE_CLUE) == -1) {
            return false;
        }
        LOG.info("{} detected as sitemap based on content", url);
        return true;
    }

    /**
     * Logs the error, records its source and message in the metadata, emits the
     * URL with an ERROR status on the status stream and acks the tuple.
     */
    private void emitError(Tuple tuple, String url, Metadata metadata,
            String errorSource, String errorMessage) {
        LOG.error(errorMessage);
        metadata.setValue(Constants.STATUS_ERROR_SOURCE, errorSource);
        metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
        collector.emit(StatusStreamName, tuple, new Values(url, metadata,
                Status.ERROR));
        collector.ack(tuple);
    }

    /**
     * Parses the content with the crawler-commons sitemap parser and converts
     * its entries into outlinks. Entries of a sitemap index are flagged with
     * isSitemap=true, entries of a regular sitemap with isSitemap=false.
     *
     * @param url
     *            URL of the sitemap
     * @param content
     *            raw content of the sitemap
     * @param contentType
     *            content type reported for the document; when blank or
     *            containing 'octet-stream' the parser guesses the format
     * @param parentMetadata
     *            metadata of the sitemap, handed to filterOutlink
     * @return the outlinks which were neither too old nor rejected by
     *         filterOutlink
     * @throws UnknownFormatException
     *             propagated from the crawler-commons parser
     * @throws IOException
     *             if the URL is malformed or propagated from the parser
     */
    private List<Outlink> parseSiteMap(String url, byte[] content,
            String contentType, Metadata parentMetadata)
            throws UnknownFormatException, IOException {
        crawlercommons.sitemaps.SiteMapParser parser = new crawlercommons.sitemaps.SiteMapParser(
                strictMode);
        URL sURL = new URL(url);
        AbstractSiteMap siteMap;
        // let the parser guess what the mimetype is
        if (StringUtils.isBlank(contentType)
                || contentType.contains("octet-stream")) {
            siteMap = parser.parseSiteMap(content, sURL);
        } else {
            siteMap = parser.parseSiteMap(contentType, content, sURL);
        }

        // the cutoff is the same for every entry: compute it once
        Date cutoff = modificationCutoff();
        List<Outlink> links = new ArrayList<>();

        if (siteMap.isIndex()) {
            // keep the subsitemaps as outlinks
            // they will be fetched and parsed in the following steps
            for (AbstractSiteMap asm : ((SiteMapIndex) siteMap).getSitemaps()) {
                String target = asm.getUrl().toExternalForm();
                Outlink ol = toOutlink(sURL, target, asm.getLastModified(),
                        cutoff, parentMetadata, "true");
                if (ol != null) {
                    links.add(ol);
                    LOG.debug("{} : [sitemap] {}", url, target);
                }
            }
        }
        // sitemap files
        else {
            // TODO see what we can do with the LastModified info
            // TODO handle priority in metadata
            // TODO convert the frequency into a numerical value and handle
            // it in metadata
            for (SiteMapURL smurl : ((SiteMap) siteMap).getSiteMapUrls()) {
                String target = smurl.getUrl().toExternalForm();
                Outlink ol = toOutlink(sURL, target, smurl.getLastModified(),
                        cutoff, parentMetadata, "false");
                if (ol != null) {
                    links.add(ol);
                    LOG.debug("{} : [sitemap] {}", url, target);
                }
            }
        }
        return links;
    }

    /**
     * Returns the date before which entries are considered too old, or null if
     * the filtering on the modification date is disabled.
     */
    private Date modificationCutoff() {
        if (filterHoursSinceModified == -1) {
            return null;
        }
        Calendar cutoff = Calendar.getInstance();
        cutoff.add(Calendar.HOUR, -filterHoursSinceModified);
        return cutoff.getTime();
    }

    /**
     * Builds the outlink for a sitemap entry.
     *
     * @param sourceURL
     *            URL of the sitemap the entry comes from
     * @param target
     *            URL of the entry
     * @param lastModified
     *            modification date of the entry, may be null
     * @param cutoff
     *            entries modified before this date are skipped; null disables
     *            the check
     * @param parentMetadata
     *            metadata of the sitemap
     * @param isSitemapValue
     *            value stored under the isSitemap key of the outlink
     * @return the outlink, or null if the entry is too old or was rejected by
     *         filterOutlink
     */
    private Outlink toOutlink(URL sourceURL, String target, Date lastModified,
            Date cutoff, Metadata parentMetadata, String isSitemapValue) {
        String lastModifiedValue = "";
        if (lastModified != null) {
            // filter based on the published date
            if (cutoff != null && lastModified.before(cutoff)) {
                LOG.info(
                        "{} has a modified date {} which is more than {} hours old",
                        target, lastModified.toString(),
                        filterHoursSinceModified);
                return null;
            }
            lastModifiedValue = lastModified.toString();
        }
        return filterOutlink(sourceURL, target, parentMetadata, isSitemapKey,
                isSitemapValue, LAST_MODIFIED_KEY, lastModifiedValue);
    }

    /**
     * Reads the configuration of the bolt: 'sitemap.sniffContent' (default
     * false), 'sitemap.filter.hours.since.modified' (default -1),
     * 'sitemap.offset.guess' (default 300) and the parse filters.
     */
    @Override
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public void prepare(Map stormConf, TopologyContext context,
            OutputCollector collector) {
        super.prepare(stormConf, context, collector);
        sniffWhenNoSMKey = ConfUtils.getBoolean(stormConf,
                "sitemap.sniffContent", false);
        filterHoursSinceModified = ConfUtils.getInt(stormConf,
                "sitemap.filter.hours.since.modified", -1);
        parseFilters = ParseFilters.fromConf(stormConf);
        maxOffsetGuess = ConfUtils.getInt(stormConf, "sitemap.offset.guess",
                300);
    }

    /**
     * Declares the default stream with the fields url, content and metadata,
     * in addition to whatever the parent class declares.
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        super.declareOutputFields(declarer);
        declarer.declare(new Fields("url", "content", "metadata"));
    }
}