/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.parse.filter;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import javax.xml.namespace.QName;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.lang.StringUtils;
import org.apache.xml.serialize.Method;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.parse.ParseData;
import com.digitalpebble.stormcrawler.parse.ParseFilter;
import com.digitalpebble.stormcrawler.parse.ParseResult;
import com.fasterxml.jackson.databind.JsonNode;
/**
 * ParseFilter which evaluates XPath expressions read from the filter
 * configuration against the DOM of a document and stores the extracted
 * values in the metadata of the parsed document.
 * <p>
 * Each configuration key is the metadata key under which values are stored.
 * Its value is either a single XPath expression or an array of expressions;
 * with an array, the expressions are tried in order and the first one
 * yielding at least one value wins.
 * <p>
 * Two pseudo-functions are recognised at the start of an expression:
 * <ul>
 * <li>{@code string(...)}: the expression is evaluated as a string and the
 * stripped result is stored;</li>
 * <li>{@code serialize(...)}: the wrapped expression is evaluated as a node
 * set and every matching node is serialised as XHTML.</li>
 * </ul>
 * Any other expression is evaluated as a node set and the stripped text
 * content of every matching node is stored.
 */
public class XPathFilter extends ParseFilter {

    /** How the result of an expression is turned into metadata values. */
    private enum EvalFunction {
        NONE, STRING, SERIALIZE;

        /** Returns the XPath result type to request when evaluating. */
        public QName getReturnType() {
            switch (this) {
            case STRING:
                return XPathConstants.STRING;
            default:
                return XPathConstants.NODESET;
            }
        }
    }

    private static final Logger LOG = LoggerFactory
            .getLogger(XPathFilter.class);

    /** Prefix marking an expression to be evaluated as a string. */
    private static final String STRING_PREFIX = "string(";

    /** Prefix marking an expression whose matching nodes are serialised. */
    private static final String SERIALIZE_PREFIX = "serialize(";

    private final XPathFactory factory = XPathFactory.newInstance();

    private final XPath xpath = factory.newXPath();

    /**
     * Compiled expressions grouped by metadata key. A LinkedHashMap keeps
     * the keys in the order in which they were configured, so that
     * {@link #filter} applies them in that order.
     */
    protected final Map<String, List<LabelledExpression>> expressions = new java.util.LinkedHashMap<>();

    /** A compiled XPath expression together with its metadata key. */
    class LabelledExpression {

        String key;

        private EvalFunction evalFunction;

        private XPathExpression expression;

        /**
         * Compiles the expression and determines how its result must be
         * interpreted.
         *
         * @param key
         *            metadata key under which the values are stored
         * @param expression
         *            XPath expression, optionally wrapped in
         *            {@code serialize(...)} or starting with
         *            {@code string(}
         * @throws XPathExpressionException
         *             if the expression cannot be compiled or a
         *             {@code serialize(} wrapper is not closed
         */
        private LabelledExpression(String key, String expression)
                throws XPathExpressionException {
            this.key = key;
            if (expression.startsWith(STRING_PREFIX)) {
                evalFunction = EvalFunction.STRING;
            } else if (expression.startsWith(SERIALIZE_PREFIX)) {
                // serialize() is not an XPath function: unwrap it, but only
                // if the wrapper is well formed. Blindly dropping the last
                // character would silently compile a truncated expression.
                if (!expression.endsWith(")")) {
                    throw new XPathExpressionException(
                            "Missing closing parenthesis in serialize() expression: "
                                    + expression);
                }
                expression = expression.substring(
                        SERIALIZE_PREFIX.length(), expression.length() - 1);
                evalFunction = EvalFunction.SERIALIZE;
            } else {
                evalFunction = EvalFunction.NONE;
            }
            this.expression = xpath.compile(expression);
        }

        /**
         * Evaluates the expression against the document.
         *
         * @param doc
         *            DOM of the document
         * @return the extracted values, possibly empty but never null
         * @throws XPathExpressionException
         *             if the evaluation fails
         * @throws IOException
         *             if a node cannot be serialised
         */
        List<String> evaluate(DocumentFragment doc)
                throws XPathExpressionException, IOException {
            Object evalResult = expression.evaluate(doc,
                    evalFunction.getReturnType());
            List<String> values = new ArrayList<>();
            switch (evalFunction) {
            case STRING:
                // NOTE(review): an empty string is still recorded here,
                // which makes filter() stop at this expression and never
                // try the fallbacks configured for the same key - confirm
                // this is intended.
                if (evalResult != null) {
                    values.add(StringUtils.strip((String) evalResult));
                }
                break;
            case SERIALIZE:
                serializeNodes((NodeList) evalResult, values);
                break;
            default:
                NodeList nodes = (NodeList) evalResult;
                for (int i = 0; i < nodes.getLength(); i++) {
                    Node node = nodes.item(i);
                    values.add(StringUtils.strip(node.getTextContent()));
                }
            }
            return values;
        }

        /**
         * Serialises every node of the list as XHTML and appends the
         * non-empty results to the values. Text nodes are added as they
         * are, without going through the serialiser.
         */
        private void serializeNodes(NodeList nodesToSerialize,
                List<String> values) throws IOException {
            StringWriter out = new StringWriter();
            OutputFormat format = new OutputFormat(Method.XHTML, null, false);
            format.setOmitXMLDeclaration(true);
            XMLSerializer serializer = new XMLSerializer(out, format);
            for (int i = 0; i < nodesToSerialize.getLength(); i++) {
                Node node = nodesToSerialize.item(i);
                switch (node.getNodeType()) {
                case Node.ELEMENT_NODE:
                    serializer.serialize((Element) node);
                    break;
                case Node.DOCUMENT_NODE:
                    serializer.serialize((Document) node);
                    break;
                case Node.DOCUMENT_FRAGMENT_NODE:
                    serializer.serialize((DocumentFragment) node);
                    break;
                case Node.TEXT_NODE:
                    String text = node.getTextContent();
                    if (text.length() > 0) {
                        values.add(text);
                    }
                    // nothing was written to the serialiser output for a
                    // text node, so skip reading it back
                    continue;
                }
                String serializedValue = out.toString();
                if (serializedValue.length() > 0) {
                    values.add(serializedValue);
                }
                // the writer is shared by all the nodes: empty it before
                // serialising the next one
                out.getBuffer().setLength(0);
            }
        }
    }

    /**
     * Applies the configured expressions to the DOM and adds the extracted
     * values to the metadata of the document. For each key, only the values
     * of the first expression returning a non-empty result are kept.
     * Evaluation errors are logged and the next expression is tried.
     *
     * @param URL
     *            URL of the document, used to look up its ParseData
     * @param content
     *            binary content of the document, not used by this filter
     * @param doc
     *            DOM of the document
     * @param parse
     *            parse result holding the metadata to enrich
     */
    @Override
    public void filter(String URL, byte[] content, DocumentFragment doc,
            ParseResult parse) {
        ParseData parseData = parse.get(URL);
        Metadata metadata = parseData.getMetadata();

        // applies the XPath expressions in the order in which they were
        // configured
        for (List<LabelledExpression> leList : expressions.values()) {
            for (LabelledExpression le : leList) {
                try {
                    List<String> values = le.evaluate(doc);
                    if (values != null && !values.isEmpty()) {
                        metadata.addValues(le.key, values);
                        // first expression with a result wins for this key
                        break;
                    }
                } catch (XPathExpressionException | IOException e) {
                    // the exception is passed as the trailing argument so
                    // that the logger reports it with its stack trace
                    LOG.error("Error evaluating {}", le.key, e);
                }
            }
        }
    }

    /**
     * Reads the expressions from the filter parameters. Every field of the
     * JSON object is a metadata key whose value is either one expression or
     * an array of expressions.
     *
     * @param stormConf
     *            topology configuration, not used by this filter
     * @param filterParams
     *            JSON parameters of the filter; nothing is configured if
     *            null
     * @throws RuntimeException
     *             if an expression cannot be compiled
     */
    @SuppressWarnings("rawtypes")
    @Override
    public void configure(Map stormConf, JsonNode filterParams) {
        if (filterParams == null) {
            return;
        }
        java.util.Iterator<Entry<String, JsonNode>> iter = filterParams
                .fields();
        while (iter.hasNext()) {
            Entry<String, JsonNode> entry = iter.next();
            String key = entry.getKey();
            JsonNode node = entry.getValue();
            if (node.isArray()) {
                for (JsonNode expression : node) {
                    addExpression(key, expression);
                }
            } else {
                addExpression(key, node);
            }
        }
    }

    /**
     * Compiles the expression and appends it to the list kept for the key.
     *
     * @throws RuntimeException
     *             wrapping the XPathExpressionException if the expression
     *             cannot be compiled
     */
    private void addExpression(String key, JsonNode expression) {
        String xpathvalue = expression.asText();
        try {
            List<LabelledExpression> lexpressionList = expressions.get(key);
            if (lexpressionList == null) {
                lexpressionList = new ArrayList<>();
                expressions.put(key, lexpressionList);
            }
            lexpressionList.add(new LabelledExpression(key, xpathvalue));
        } catch (XPathExpressionException e) {
            throw new RuntimeException("Can't compile expression : "
                    + xpathvalue, e);
        }
    }

    /** This filter works on the DOM of the document. */
    @Override
    public boolean needsDOM() {
        return true;
    }
}