/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.filtering; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.slf4j.LoggerFactory; import com.digitalpebble.stormcrawler.Metadata; import com.digitalpebble.stormcrawler.util.ConfUtils; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.NullNode; /** * Wrapper for the URLFilters defined in a JSON configuration */ public class URLFilters implements URLFilter { public static final URLFilters emptyURLFilters = new URLFilters(); private static final org.slf4j.Logger LOG = LoggerFactory .getLogger(URLFilters.class); private URLFilter[] filters; private URLFilters() { filters = new URLFilters[0]; } /** * Loads and configure the URLFilters based on the storm config if there is * one otherwise returns an empty URLFilter. **/ public static URLFilters fromConf(Map stormConf) { String urlconfigfile = ConfUtils.getString(stormConf, "urlfilters.config.file"); if (StringUtils.isNotBlank(urlconfigfile)) { try { return new URLFilters(stormConf, urlconfigfile); } catch (IOException e) { String message = "Exception caught while loading the URLFilters from " + urlconfigfile; LOG.error(message); throw new RuntimeException(message, e); } } return URLFilters.emptyURLFilters; } /** * Loads the filters from a JSON configuration file * * @throws IOException */ public URLFilters(Map stormConf, String configFile) throws IOException { // load the JSON configFile // build a JSON object out of it JsonNode confNode; try (InputStream confStream = getClass().getClassLoader() .getResourceAsStream(configFile)) { ObjectMapper mapper = new ObjectMapper(); confNode = mapper.readValue(confStream, JsonNode.class); } catch (Exception e) { throw new IOException("Unable to build JSON object from file", e); } configure(stormConf, confNode); } @Override public String filter(URL sourceUrl, Metadata sourceMetadata, String urlToFilter) { String normalizedURL = urlToFilter; try { for (URLFilter filter : filters) { long start = System.currentTimeMillis(); normalizedURL = filter.filter(sourceUrl, sourceMetadata, normalizedURL); long end = System.currentTimeMillis(); LOG.debug("URLFilter {} took {} msec", filter.getClass() .getName(), end - start); if (normalizedURL == null) break; } } catch (Exception e) { LOG.error("URL filtering threw exception", e); } return normalizedURL; } @Override public void configure(Map stormConf, JsonNode jsonNode) { // initialises the filters List<URLFilter> filterLists = new ArrayList<>(); // get the filters part String name = getClass().getCanonicalName(); jsonNode = jsonNode.get(name); if (jsonNode == null) { LOG.info("No field {} in JSON config. Skipping", name); filters = new URLFilter[0]; return; } // conf node contains a list of objects Iterator<JsonNode> filterIter = jsonNode.elements(); while (filterIter.hasNext()) { JsonNode afilterNode = filterIter.next(); String filterName = "<unnamed>"; JsonNode nameNode = afilterNode.get("name"); if (nameNode != null) { filterName = nameNode.textValue(); } JsonNode classNode = afilterNode.get("class"); if (classNode == null) { LOG.error("Filter {} doesn't specified a 'class' attribute", filterName); continue; } String className = classNode.textValue().trim(); filterName += '[' + className + ']'; // check that it is available and implements the interface URLFilter try { Class<?> filterClass = Class.forName(className); boolean interfaceOK = URLFilter.class .isAssignableFrom(filterClass); if (!interfaceOK) { LOG.error("Class {} does not implement URLFilter", className); continue; } URLFilter filterInstance = (URLFilter) filterClass .newInstance(); JsonNode paramNode = afilterNode.get("params"); if (paramNode != null) { filterInstance.configure(stormConf, paramNode); } else { filterInstance.configure(stormConf, NullNode.getInstance()); } filterLists.add(filterInstance); LOG.info("Loaded instance of class {}", className); } catch (Exception e) { LOG.error("Can't setup {}: {}", filterName, e); continue; } } filters = filterLists.toArray(new URLFilter[filterLists.size()]); } }