/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.filtering.regex;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.filtering.URLFilter;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;

/**
 * The RegexURLNormalizer is a URL filter that normalizes URLs by matching a
 * regular expression and inserting a replacement string.
 *
 * <p>
 * Rules are taken either from the {@code urlNormalizers} JSON array of the
 * filter parameters (objects with a {@code pattern} and an optional
 * {@code substitution} field) or, when that array is absent, from an XML
 * resource on the classpath named by {@code regexNormalizerFile} (defaulting
 * to {@code default-regex-normalizers.xml}). Rules are applied in the order
 * in which they were declared.
 * </p>
 *
 * Adapted from Apache Nutch 1.9.
 */
public class RegexURLNormalizer implements URLFilter {

    private static final Logger LOG = LoggerFactory
            .getLogger(RegexURLNormalizer.class);

    /** Classpath resource used when no rules file is configured. */
    private static final String DEFAULT_RULES_FILE = "default-regex-normalizers.xml";

    /**
     * Class which holds a compiled pattern and its corresponding substitution
     * string.
     */
    private static class Rule {
        final Pattern pattern;
        final String substitution;

        Rule(Pattern pattern, String substitution) {
            this.pattern = pattern;
            this.substitution = substitution;
        }
    }

    private static final List<Rule> EMPTY_RULES = Collections.emptyList();

    /**
     * Active rules; starts out empty so that {@link #filter} is safe to call
     * even if {@link #configure} has not been invoked yet.
     */
    private List<Rule> rules = EMPTY_RULES;

    /**
     * Loads the normalization rules.
     *
     * @param stormConf
     *            the topology configuration (not used by this filter)
     * @param paramNode
     *            the filter parameters; {@code urlNormalizers} (a JSON array)
     *            takes precedence over {@code regexNormalizerFile}
     */
    @Override
    public void configure(Map stormConf, JsonNode paramNode) {
        JsonNode node = paramNode.get("urlNormalizers");
        if (node != null && node.isArray()) {
            rules = readRules((ArrayNode) node);
            return;
        }

        JsonNode filenameNode = paramNode.get("regexNormalizerFile");
        String rulesFileName = DEFAULT_RULES_FILE;
        if (filenameNode != null) {
            // textValue() yields null for a non-textual node; readRules
            // reports that case and falls back to an empty rule list
            rulesFileName = filenameNode.textValue();
        }
        rules = readRules(rulesFileName);
    }

    /**
     * This function does the replacements by iterating through all the regex
     * patterns. It accepts a string url as input and returns the altered
     * string. If the normalized url is an empty string, the function will
     * return null.
     *
     * @param sourceUrl
     *            the URL of the page the link was found on (not used here)
     * @param sourceMetadata
     *            the metadata of the source page (not used here)
     * @param urlString
     *            the URL to normalize
     * @return the normalized URL, or {@code null} if the input was
     *         {@code null} or the result of the substitutions is empty
     */
    @Override
    public String filter(URL sourceUrl, Metadata sourceMetadata,
            String urlString) {
        if (urlString == null) {
            return null;
        }

        for (Rule rule : rules) {
            Matcher matcher = rule.pattern.matcher(urlString);
            urlString = matcher.replaceAll(rule.substitution);
        }

        return urlString.isEmpty() ? null : urlString;
    }

    /**
     * Populates a List of Rules off of JsonNode.
     *
     * Entries that are null, have no or a blank {@code pattern}, or whose
     * pattern does not compile are skipped. A missing {@code substitution}
     * is treated as the empty string.
     */
    private List<Rule> readRules(ArrayNode rulesList) {
        List<Rule> parsed = new ArrayList<>();

        for (JsonNode regexNode : rulesList) {
            if (regexNode == null || regexNode.isNull()) {
                LOG.warn("bad config: 'regex' element is null");
                continue;
            }

            JsonNode patternNode = regexNode.get("pattern");
            if (patternNode == null
                    || StringUtils.isBlank(patternNode.asText())) {
                continue;
            }

            JsonNode substitutionNode = regexNode.get("substitution");
            String substitutionValue = substitutionNode != null ? substitutionNode
                    .asText() : "";

            Rule rule = createRule(patternNode.asText(), substitutionValue);
            if (rule != null) {
                parsed.add(rule);
            }
        }

        return parsed.isEmpty() ? EMPTY_RULES : parsed;
    }

    /**
     * Reads the configuration file and populates a List of Rules.
     *
     * The file is looked up as a classpath resource and decoded as UTF-8.
     * Any failure is logged and results in an empty rule list.
     */
    private List<Rule> readRules(String rulesFile) {
        if (rulesFile == null) {
            LOG.error("Error loading rules from file: no file name configured");
            return EMPTY_RULES;
        }

        // try-with-resources: the stream was previously never closed
        try (InputStream regexStream = getClass().getClassLoader()
                .getResourceAsStream(rulesFile)) {
            if (regexStream == null) {
                LOG.error(
                        "Error loading rules from file: {} not found on classpath",
                        rulesFile);
                return EMPTY_RULES;
            }
            Reader reader = new InputStreamReader(regexStream,
                    StandardCharsets.UTF_8);
            return readConfiguration(reader);
        } catch (Exception e) {
            LOG.error("Error loading rules from file: {}", rulesFile, e);
            return EMPTY_RULES;
        }
    }

    /**
     * Parses the XML rules document. The expected layout is a
     * {@code <regex-normalize>} root containing {@code <regex>} elements,
     * each with a {@code <pattern>} and a {@code <substitution>} child.
     * Any parsing error is logged and discards all rules.
     */
    private List<Rule> readConfiguration(Reader reader) {
        List<Rule> parsed = new ArrayList<>();
        try {
            // borrowed heavily from code in Configuration.java
            Document doc = DocumentBuilderFactory.newInstance()
                    .newDocumentBuilder().parse(new InputSource(reader));
            Element root = doc.getDocumentElement();
            if (!"regex-normalize".equals(root.getTagName())) {
                LOG.error("bad conf file: top-level element not <regex-normalize>");
            }

            NodeList regexes = root.getChildNodes();
            for (int i = 0; i < regexes.getLength(); i++) {
                Node regexNode = regexes.item(i);
                if (!(regexNode instanceof Element)) {
                    continue;
                }
                Rule rule = readRule((Element) regexNode);
                // createRule returns null for an invalid pattern; storing it
                // would make filter() fail with a NullPointerException
                if (rule != null) {
                    parsed.add(rule);
                }
            }
        } catch (Exception e) {
            LOG.error("error parsing conf file", e);
            return EMPTY_RULES;
        }

        return parsed.isEmpty() ? EMPTY_RULES : parsed;
    }

    /**
     * Builds a Rule from one {@code <regex>} element, or returns null when
     * the element lacks a pattern or a substitution, or when the pattern is
     * not a valid regular expression.
     */
    private Rule readRule(Element regex) {
        if (!"regex".equals(regex.getTagName())) {
            LOG.warn("bad conf file: element not <regex>");
        }

        String patternValue = null;
        String subValue = null;

        NodeList fields = regex.getChildNodes();
        for (int j = 0; j < fields.getLength(); j++) {
            Node fieldNode = fields.item(j);
            if (!(fieldNode instanceof Element)) {
                continue;
            }
            Element field = (Element) fieldNode;
            String tagName = field.getTagName();
            boolean hasContent = field.hasChildNodes();

            if ("pattern".equals(tagName) && hasContent) {
                patternValue = ((Text) field.getFirstChild()).getData();
            }
            if ("substitution".equals(tagName) && hasContent) {
                subValue = ((Text) field.getFirstChild()).getData();
            }
            // NOTE: any empty child element (not only <substitution/>)
            // resets the substitution to the empty string; kept as-is for
            // backward compatibility with existing rule files
            if (!hasContent) {
                subValue = "";
            }
        }

        if (patternValue == null || subValue == null) {
            return null;
        }
        return createRule(patternValue, subValue);
    }

    /**
     * Compiles a pattern and pairs it with its substitution.
     *
     * @return the rule, or {@code null} (after logging an error) when
     *         {@code patternValue} is not a valid regular expression
     */
    private Rule createRule(String patternValue, String subValue) {
        try {
            return new Rule(Pattern.compile(patternValue), subValue);
        } catch (PatternSyntaxException e) {
            // pattern and substitution are passed as arguments, never
            // concatenated into the format string
            LOG.error(
                    "skipped rule: {} -> {} : invalid regular expression pattern",
                    patternValue, subValue, e);
            return null;
        }
    }
}