/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.filtering.regex;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.filtering.URLFilter;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;
/**
 * An abstract class for implementing Regex URL filtering. Adapted from Apache
 * Nutch 1.9.
 *
 * <p>
 * Rules are evaluated in the order in which they were declared; the first rule
 * matching a URL decides whether the URL is kept or discarded. A URL that
 * matches no rule is discarded.
 *
 * <p>
 * Each rule is a single line whose first character is <code>+</code> (include)
 * or <code>-</code> (exclude), followed by the regular expression. Lines
 * starting with a space, a newline or <code>#</code> are ignored.
 */
public abstract class RegexURLFilterBase implements URLFilter {

    private static final Logger LOG = LoggerFactory
            .getLogger(RegexURLFilterBase.class);

    /** Classpath resource read when the configuration names no rules file. */
    private static final String DEFAULT_RULES_FILE = "default-regex-filters.txt";

    /**
     * A list of applicable rules. Starts out empty so that {@link #filter}
     * can be called safely before {@link #configure}.
     */
    private List<RegexRule> rules = new ArrayList<>();

    /**
     * Loads the filtering rules.
     *
     * <p>
     * If <code>paramNode</code> has an array field <code>urlFilters</code>,
     * each of its elements is parsed as one rule. Otherwise the rules are read
     * from the classpath resource named by the field
     * <code>regexFilterFile</code>, or from
     * <code>default-regex-filters.txt</code> when that field is absent.
     *
     * @param stormConf
     *            the topology configuration (not used by this class).
     * @param paramNode
     *            the JSON parameters of this filter; <code>null</code> is
     *            treated as an empty configuration.
     */
    @Override
    public void configure(Map stormConf, JsonNode paramNode) {
        JsonNode node = paramNode == null ? null : paramNode.get("urlFilters");
        if (node != null && node.isArray()) {
            rules = readRules((ArrayNode) node);
            return;
        }

        JsonNode filenameNode = paramNode == null ? null : paramNode
                .get("regexFilterFile");
        // textValue() yields null for a non-textual node; readRules(String)
        // reports that case instead of failing with a NullPointerException.
        String rulesFileName = filenameNode != null ? filenameNode.textValue()
                : DEFAULT_RULES_FILE;
        rules = readRules(rulesFileName);
    }

    /**
     * Populates a List of Rules off of JsonNode. An invalid entry is logged
     * and skipped; the remaining entries are still processed.
     */
    private List<RegexRule> readRules(ArrayNode rulesList) {
        List<RegexRule> parsed = new ArrayList<>();
        for (JsonNode urlFilterNode : rulesList) {
            try {
                RegexRule rule = createRule(urlFilterNode.asText());
                if (rule != null) {
                    parsed.add(rule);
                }
            } catch (IOException e) {
                LOG.error("There was an error reading regex filter {}",
                        urlFilterNode.asText(), e);
            }
        }
        return parsed;
    }

    /**
     * Populates a List of Rules off of a classpath resource, one rule per
     * line. Reading stops at the first I/O error or invalid line; the rules
     * parsed up to that point are returned.
     *
     * @param rulesFile
     *            name of the classpath resource holding the rules.
     * @return the rules read; empty if the resource cannot be found.
     */
    private List<RegexRule> readRules(String rulesFile) {
        List<RegexRule> parsed = new ArrayList<>();

        if (rulesFile == null) {
            LOG.error("No regex filter file name specified");
            return parsed;
        }

        // getResourceAsStream returns null (it does not throw) when the
        // resource is missing from the classpath.
        InputStream regexStream = getClass().getClassLoader()
                .getResourceAsStream(rulesFile);
        if (regexStream == null) {
            LOG.error("Regex filter file {} not found on the classpath",
                    rulesFile);
            return parsed;
        }

        // Closing the BufferedReader also closes the underlying stream.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                regexStream, StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                if (line.length() == 0) {
                    continue;
                }
                RegexRule rule = createRule(line);
                if (rule != null) {
                    parsed.add(rule);
                }
            }
        } catch (IOException e) {
            LOG.error("There was an error reading the regex filter file {}",
                    rulesFile, e);
        }
        return parsed;
    }

    /**
     * Parses a single rule line.
     *
     * @param line
     *            the textual rule: a sign character followed by the regular
     *            expression.
     * @return the rule, or <code>null</code> if the line is null, empty, or a
     *         blank/comment line.
     * @throws IOException
     *             if the first character is not one of the recognised ones.
     */
    private RegexRule createRule(String line) throws IOException {
        // Guard against charAt(0) on an empty string, e.g. an empty entry
        // in the JSON array.
        if (line == null || line.isEmpty()) {
            return null;
        }

        boolean sign;
        switch (line.charAt(0)) {
        case '+':
            sign = true;
            break;
        case '-':
            sign = false;
            break;
        case ' ':
        case '\n':
        case '#': // skip blank & comment lines
            return null;
        default:
            throw new IOException("Invalid first character: " + line);
        }

        String regex = line.substring(1);
        LOG.trace("Adding rule [{}]", regex);
        return createRule(sign, regex);
    }

    /**
     * Creates a new {@link RegexRule}.
     *
     * @param sign
     *            of the regular expression. A <code>true</code> value means
     *            that any URL matching this rule must be included, whereas a
     *            <code>false</code> value means that any URL matching this rule
     *            must be excluded.
     * @param regex
     *            is the regular expression associated to this rule.
     * @return the rule built from the sign and the regular expression.
     */
    protected abstract RegexRule createRule(boolean sign, String regex);

    /*
     * -------------------------- * <implementation:URLFilter> *
     * --------------------------
     */

    /**
     * Applies the rules, in order, to <code>url</code>; the first rule that
     * matches decides the outcome.
     *
     * @param pageUrl
     *            not used by this filter.
     * @param sourceMetadata
     *            not used by this filter.
     * @param url
     *            the URL to test against the rules.
     * @return <code>url</code> if the first matching rule accepts it;
     *         <code>null</code> if that rule rejects it or if no rule matches.
     */
    @Override
    public String filter(URL pageUrl, Metadata sourceMetadata, String url) {
        for (RegexRule rule : rules) {
            if (rule.match(url)) {
                return rule.accept() ? url : null;
            }
        }
        return null;
    }
}