/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.parse; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.slf4j.LoggerFactory; import org.w3c.dom.DocumentFragment; import com.digitalpebble.stormcrawler.util.ConfUtils; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.NullNode; /** * Wrapper for the ParseFilters defined in a JSON configuration */ public class ParseFilters extends ParseFilter { public static final ParseFilters emptyParseFilter = new ParseFilters(); private static final org.slf4j.Logger LOG = LoggerFactory .getLogger(ParseFilters.class); private ParseFilter[] filters; private ParseFilters() { filters = new ParseFilter[0]; } /** * Loads and configure the ParseFilters based on the storm config if there * is one otherwise returns an emptyParseFilter. **/ @SuppressWarnings("rawtypes") public static ParseFilters fromConf(Map stormConf) { String parseconfigfile = ConfUtils.getString(stormConf, "parsefilters.config.file"); if (StringUtils.isNotBlank(parseconfigfile)) { try { return new ParseFilters(stormConf, parseconfigfile); } catch (IOException e) { String message = "Exception caught while loading the ParseFilters from " + parseconfigfile; LOG.error(message); throw new RuntimeException(message, e); } } return ParseFilters.emptyParseFilter; } /** * loads the filters from a JSON configuration file * * @throws IOException */ @SuppressWarnings("rawtypes") public ParseFilters(Map stormConf, String configFile) throws IOException { // load the JSON configFile // build a JSON object out of it JsonNode confNode = null; InputStream confStream = null; try { confStream = getClass().getClassLoader().getResourceAsStream( configFile); ObjectMapper mapper = new ObjectMapper(); confNode = mapper.readValue(confStream, JsonNode.class); } catch (Exception e) { throw new IOException("Unable to build JSON object from file", e); } finally { if (confStream != null) { confStream.close(); } } configure(stormConf, confNode); } @SuppressWarnings("rawtypes") @Override public void configure(Map stormConf, JsonNode filtersConf) { // initialises the filters List<ParseFilter> filterLists = new ArrayList<>(); // get the filters part String name = getClass().getCanonicalName(); filtersConf = filtersConf.get(name); if (filtersConf == null) { LOG.info("No field {} in JSON config. Skipping", name); filters = new ParseFilter[0]; return; } // conf node contains a list of objects Iterator<JsonNode> filterIter = filtersConf.elements(); while (filterIter.hasNext()) { JsonNode afilterConf = filterIter.next(); String filterName = "<unnamed>"; JsonNode nameNode = afilterConf.get("name"); if (nameNode != null) { filterName = nameNode.textValue(); } JsonNode classNode = afilterConf.get("class"); if (classNode == null) { LOG.error("Filter {} doesn't specified a 'class' attribute", filterName); continue; } String className = classNode.textValue().trim(); filterName += '[' + className + ']'; // check that it is available and implements the interface // ParseFilter try { Class<?> filterClass = Class.forName(className); boolean subClassOK = ParseFilter.class .isAssignableFrom(filterClass); if (!subClassOK) { LOG.error("Filter {} does not extend ParseFilter", filterName); continue; } ParseFilter filterInstance = (ParseFilter) filterClass .newInstance(); JsonNode paramNode = afilterConf.get("params"); if (paramNode != null) { filterInstance.configure(stormConf, paramNode); } else { // Pass in a nullNode if missing filterInstance.configure(stormConf, NullNode.getInstance()); } filterLists.add(filterInstance); LOG.info("Setup {}", filterName); } catch (Exception e) { LOG.error("Can't setup {}: {}", filterName, e); throw new RuntimeException("Can't setup " + filterName, e); } } filters = filterLists.toArray(new ParseFilter[filterLists.size()]); } @Override public boolean needsDOM() { for (ParseFilter filter : filters) { boolean needsDOM = filter.needsDOM(); if (needsDOM) { return true; } } return false; } @Override public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) { for (ParseFilter filter : filters) { long start = System.currentTimeMillis(); if (doc == null && filter.needsDOM()) { LOG.info( "ParseFilter {} needs DOM but has none to work on - skip : {}", filter.getClass().getName(), URL); continue; } filter.filter(URL, content, doc, parse); long end = System.currentTimeMillis(); LOG.debug("ParseFilter {} took {} msec", filter.getClass() .getName(), end - start); } } }