/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.parse.filter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.filtering.URLFilters;
import com.digitalpebble.stormcrawler.parse.Outlink;
import com.digitalpebble.stormcrawler.parse.ParseData;
import com.digitalpebble.stormcrawler.parse.ParseResult;
import com.digitalpebble.stormcrawler.util.MetadataTransfer;
import com.digitalpebble.stormcrawler.util.URLUtil;
import com.fasterxml.jackson.databind.JsonNode;
/**
* ParseFilter to extract additional links with Xpath can be configured with
* e.g.
*
* <pre>
* {@code
* {
* "class": "com.digitalpebble.stormcrawler.parse.filter.LinkParseFilter",
* "name": "LinkParseFilter",
* "params": {
* "pattern": "//IMG[@src]",
* "pattern2": "//VIDEO/SOURCE[@src]"
* }
* }
* }
* </pre>
**/
public class LinkParseFilter extends XPathFilter {
private static final Logger LOG = LoggerFactory
.getLogger(LinkParseFilter.class);
private MetadataTransfer metadataTransfer;
private URLFilters urlFilters;
@Override
public void filter(String URL, byte[] content, DocumentFragment doc,
ParseResult parse) {
ParseData parseData = parse.get(URL);
Metadata metadata = parseData.getMetadata();
Map<String, Outlink> dedup = new HashMap<String, Outlink>();
for (Outlink o : parse.getOutlinks()) {
dedup.put(o.getTargetURL(), o);
}
java.net.URL sourceUrl;
try {
sourceUrl = new URL(URL);
} catch (MalformedURLException e1) {
// we would have known by now as previous components check whether
// the URL is valid
LOG.error("MalformedURLException on {}", URL);
return;
}
// applies the XPATH expression in the order in which they are produced
java.util.Iterator<List<LabelledExpression>> iter = expressions
.values().iterator();
while (iter.hasNext()) {
List<LabelledExpression> leList = iter.next();
for (LabelledExpression le : leList) {
try {
List<String> values = le.evaluate(doc);
if (values == null || values.isEmpty()) {
continue;
}
for (String target : values) {
// resolve URL
target = URLUtil.resolveURL(sourceUrl, target)
.toExternalForm();
// apply filtering
target = urlFilters.filter(sourceUrl, metadata, target);
if (target == null) {
continue;
}
// check whether we already have this link
if (dedup.containsKey(target)) {
continue;
}
// create oulink
Outlink ol = new Outlink(target);
// get the metadata for the outlink from the parent one
Metadata metadataOL = metadataTransfer
.getMetaForOutlink(target, URL, metadata);
ol.setMetadata(metadataOL);
dedup.put(ol.getTargetURL(), ol);
}
} catch (Exception e) {
LOG.error("Error evaluating {}: {}", le.key, e);
}
}
}
parse.setOutlinks(new ArrayList(dedup.values()));
}
@SuppressWarnings("rawtypes")
@Override
public void configure(Map stormConf, JsonNode filterParams) {
super.configure(stormConf, filterParams);
this.metadataTransfer = MetadataTransfer.getInstance(stormConf);
this.urlFilters = URLFilters.fromConf(stormConf);
}
}