/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.filtering;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.slf4j.LoggerFactory;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.NullNode;
/**
 * Wrapper for the URLFilters defined in a JSON configuration. The wrapped
 * filters are applied one after the other, in the order in which they were
 * loaded from the configuration.
 */
public class URLFilters implements URLFilter {

    // Declared before emptyURLFilters so that the logger is already
    // initialised if the private constructor ever needs it.
    private static final org.slf4j.Logger LOG = LoggerFactory
            .getLogger(URLFilters.class);

    /** Shared instance which wraps no filter at all. */
    public static final URLFilters emptyURLFilters = new URLFilters();

    /** Filters to apply, in order. Never null once constructed. */
    private URLFilter[] filters;

    /** Builds an instance without any filter. */
    private URLFilters() {
        filters = new URLFilter[0];
    }

    /**
     * Loads and configures the URLFilters based on the storm config if there
     * is one, otherwise returns {@link #emptyURLFilters}.
     *
     * @param stormConf
     *            configuration from which the key
     *            <code>urlfilters.config.file</code> is read
     * @return the URLFilters built from the file named in the configuration,
     *         or {@link #emptyURLFilters} if that value is blank
     * @throws RuntimeException
     *             wrapping the IOException raised while loading the file
     */
    public static URLFilters fromConf(Map stormConf) {
        String urlconfigfile = ConfUtils.getString(stormConf,
                "urlfilters.config.file");
        if (StringUtils.isBlank(urlconfigfile)) {
            return URLFilters.emptyURLFilters;
        }
        try {
            return new URLFilters(stormConf, urlconfigfile);
        } catch (IOException e) {
            String message = "Exception caught while loading the URLFilters from "
                    + urlconfigfile;
            LOG.error(message);
            throw new RuntimeException(message, e);
        }
    }

    /**
     * Loads the filters from a JSON configuration file found via the class
     * loader of this class.
     *
     * @param stormConf
     *            configuration handed over to every filter being configured
     * @param configFile
     *            name of the JSON resource to load
     * @throws IOException
     *             if the resource can't be found or can't be parsed as JSON
     */
    public URLFilters(Map stormConf, String configFile) throws IOException {
        JsonNode confNode;
        try (InputStream confStream = getClass().getClassLoader()
                .getResourceAsStream(configFile)) {
            // getResourceAsStream returns null when the resource is missing;
            // fail with an explicit message rather than an obscure parser
            // error
            if (confStream == null) {
                throw new IOException("Resource not found on the classpath: "
                        + configFile);
            }
            ObjectMapper mapper = new ObjectMapper();
            confNode = mapper.readValue(confStream, JsonNode.class);
        } catch (Exception e) {
            throw new IOException("Unable to build JSON object from file "
                    + configFile, e);
        }
        configure(stormConf, confNode);
    }

    /**
     * Passes the URL through every filter in turn, each filter receiving the
     * output of the previous one. Stops as soon as a filter returns null.
     * If a filter throws an exception, the exception is logged and the value
     * obtained before that filter was called is returned.
     *
     * @return the value produced by the last filter applied, possibly null,
     *         or <code>urlToFilter</code> itself if there are no filters
     */
    @Override
    public String filter(URL sourceUrl, Metadata sourceMetadata,
            String urlToFilter) {
        String normalizedURL = urlToFilter;
        try {
            for (URLFilter filter : filters) {
                long start = System.currentTimeMillis();
                normalizedURL = filter.filter(sourceUrl, sourceMetadata,
                        normalizedURL);
                long end = System.currentTimeMillis();
                LOG.debug("URLFilter {} took {} msec", filter.getClass()
                        .getName(), end - start);
                if (normalizedURL == null) {
                    break;
                }
            }
        } catch (Exception e) {
            LOG.error("URL filtering threw exception", e);
        }
        return normalizedURL;
    }

    /**
     * Instantiates and configures the filters listed in the JSON under the
     * field named after the canonical name of this class. Entries which
     * can't be loaded are logged and skipped.
     *
     * @param stormConf
     *            configuration handed over to every filter being configured
     * @param jsonNode
     *            root of the JSON configuration; a null value is handled
     *            like a configuration without the expected field
     */
    @Override
    public void configure(Map stormConf, JsonNode jsonNode) {
        // get the filters part
        String name = getClass().getCanonicalName();
        JsonNode filtersNode = (jsonNode == null) ? null : jsonNode.get(name);
        if (filtersNode == null) {
            LOG.info("No field {} in JSON config. Skipping", name);
            filters = new URLFilter[0];
            return;
        }

        // conf node contains a list of objects
        List<URLFilter> filterList = new ArrayList<>();
        Iterator<JsonNode> filterIter = filtersNode.elements();
        while (filterIter.hasNext()) {
            URLFilter filterInstance = loadFilter(stormConf,
                    filterIter.next());
            if (filterInstance != null) {
                filterList.add(filterInstance);
            }
        }
        filters = filterList.toArray(new URLFilter[filterList.size()]);
    }

    /**
     * Builds and configures the filter described by a single JSON entry.
     *
     * @return the configured filter, or null if the entry is invalid or the
     *         filter could not be set up (the reason is logged)
     */
    private static URLFilter loadFilter(Map stormConf, JsonNode filterNode) {
        String filterName = "<unnamed>";
        JsonNode nameNode = filterNode.get("name");
        // textValue() is null for non-textual nodes: keep the placeholder
        if (nameNode != null && nameNode.textValue() != null) {
            filterName = nameNode.textValue();
        }

        JsonNode classNode = filterNode.get("class");
        if (classNode == null) {
            LOG.error("Filter {} doesn't specified a 'class' attribute",
                    filterName);
            return null;
        }
        String className = classNode.textValue();
        if (className == null) {
            // e.g. a number or an object instead of a string
            LOG.error("Filter {} has a 'class' attribute which is not a string",
                    filterName);
            return null;
        }
        className = className.trim();
        filterName += '[' + className + ']';

        // check that it is available and implements the interface URLFilter
        try {
            Class<?> filterClass = Class.forName(className);
            if (!URLFilter.class.isAssignableFrom(filterClass)) {
                LOG.error("Class {} does not implement URLFilter", className);
                return null;
            }
            URLFilter filterInstance = (URLFilter) filterClass
                    .getDeclaredConstructor().newInstance();
            JsonNode paramNode = filterNode.get("params");
            if (paramNode == null) {
                // filters always get a node, even without 'params'
                paramNode = NullNode.getInstance();
            }
            filterInstance.configure(stormConf, paramNode);
            LOG.info("Loaded instance of class {}", className);
            return filterInstance;
        } catch (Exception e) {
            // exception passed as the trailing argument so that the stack
            // trace gets logged
            LOG.error("Can't setup {}", filterName, e);
            return null;
        }
    }
}