/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.parse.filter; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.DocumentFragment; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import com.digitalpebble.stormcrawler.parse.ParseData; import com.digitalpebble.stormcrawler.parse.ParseFilter; import com.digitalpebble.stormcrawler.parse.ParseResult; import com.fasterxml.jackson.databind.JsonNode; /** * Restricts the text of the main document based on the text value of an Xpath * expression (e.g. <div id='maincontent'>). This is useful when dealing * with a known format to get rid of the boilerplate HTML code. **/ public class ContentFilter extends ParseFilter { private static final Logger LOG = LoggerFactory .getLogger(ContentFilter.class); private XPath xpath = XPathFactory.newInstance().newXPath(); private List<XPathExpression> expressions; @Override public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) { ParseData pd = parse.get(URL); // TODO determine how to restrict the expressions e.g. regexp on URL // or value in metadata // iterates on the expressions - stops at the first that matches for (XPathExpression expression : expressions) { try { NodeList evalResults = (NodeList) expression.evaluate(doc, XPathConstants.NODESET); if (evalResults.getLength() == 0) { continue; } StringBuilder newText = new StringBuilder(); for (int i = 0; i < evalResults.getLength(); i++) { Node node = evalResults.item(i); newText.append(node.getTextContent()).append("\n"); } // ignore if no text captured if (StringUtils.isBlank(newText.toString())) { LOG.debug( "Found match for doc {} but empty text extracted - skipping", URL); continue; } // give the doc its new text value LOG.debug( "Restricted text for doc {}. Text size was {} and is now {}", URL, pd.getText().length(), newText.length()); pd.setText(newText.toString()); return; } catch (XPathExpressionException e) { LOG.error("Caught XPath expression", e); } } } @SuppressWarnings("rawtypes") @Override public void configure(Map stormConf, JsonNode filterParams) { expressions = new ArrayList<>(); java.util.Iterator<Entry<String, JsonNode>> iter = filterParams .fields(); while (iter.hasNext()) { Entry<String, JsonNode> entry = iter.next(); String key = entry.getKey(); String xpathvalue = entry.getValue().asText(); try { expressions.add(xpath.compile(xpathvalue)); } catch (XPathExpressionException e) { throw new RuntimeException("Can't compile expression : " + xpathvalue, e); } } } @Override public boolean needsDOM() { return true; } }