/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.parse.filter;

import java.io.ByteArrayInputStream;
import java.io.InputStream;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;

import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import com.digitalpebble.stormcrawler.parse.ParseData;
import com.digitalpebble.stormcrawler.parse.ParseFilter;
import com.digitalpebble.stormcrawler.parse.ParseResult;

/**
 * Parse filter that re-parses the raw content as XML and registers one
 * sub-document in the {@link ParseResult} for every <code>url</code> element
 * it finds (sitemap entries). The text of the entry's <code>loc</code> child
 * is used as the sub-document key, and every child element of the entry is
 * copied into that sub-document as a name/text pair.
 */
public class SubDocumentsParseFilter extends ParseFilter {

    private static final org.slf4j.Logger LOG = LoggerFactory
            .getLogger(SubDocumentsParseFilter.class);

    /**
     * Extracts the <code>url</code> entries from the XML held in
     * <code>content</code> and stores them as sub-documents of
     * <code>parse</code>. Any failure while parsing or evaluating is logged
     * and swallowed; nothing is thrown to the caller.
     *
     * @param URL
     *            address the content was fetched from; only used in log
     *            messages
     * @param content
     *            raw bytes of the document; null or empty content is ignored
     * @param doc
     *            DOM supplied by the caller; not read by this filter, which
     *            parses <code>content</code> itself
     * @param parse
     *            result object receiving one entry per <code>url</code>
     *            element that has a <code>loc</code> child
     */
    @Override
    public void filter(String URL, byte[] content, DocumentFragment doc,
            ParseResult parse) {

        // nothing to parse; also avoids an NPE from ByteArrayInputStream
        if (content == null || content.length == 0) {
            LOG.debug("No content to process for {}", URL);
            return;
        }

        InputStream stream = new ByteArrayInputStream(content);

        try {
            Document document = newHardenedFactory().newDocumentBuilder()
                    .parse(stream);
            Element root = document.getDocumentElement();

            XPath xPath = XPathFactory.newInstance().newXPath();
            XPathExpression urlExpression = xPath.compile("//url");
            // compiled once, evaluated relative to each url element
            XPathExpression locExpression = xPath.compile("loc");

            NodeList urlNodes = (NodeList) urlExpression.evaluate(root,
                    XPathConstants.NODESET);

            for (int i = 0; i < urlNodes.getLength(); i++) {
                Node urlNode = urlNodes.item(i);

                Node loc = (Node) locExpression.evaluate(urlNode,
                        XPathConstants.NODE);
                if (loc == null) {
                    // a single malformed entry must not discard the others
                    LOG.debug("Skipping url entry without loc in {}", URL);
                    continue;
                }

                // create a subdocument for each url found in the sitemap
                ParseData parseData = parse.get(loc.getTextContent());

                NodeList children = urlNode.getChildNodes();
                for (int j = 0; j < children.getLength(); j++) {
                    Node child = children.item(j);
                    // ignore whitespace text nodes, comments, etc. which
                    // would otherwise be stored under "#text" / "#comment"
                    if (child.getNodeType() != Node.ELEMENT_NODE) {
                        continue;
                    }
                    parseData.put(child.getNodeName(), child.getTextContent());
                }
            }
        } catch (Exception e) {
            // the trailing Throwable is treated by SLF4J as the exception to
            // log, so it must not have a placeholder of its own
            LOG.error("Error processing sitemap from {}", URL, e);
        }
    }

    /**
     * Builds a {@link DocumentBuilderFactory} that does not resolve external
     * entities or external DTDs, since the content being parsed comes from
     * fetched documents.
     *
     * @return a factory with secure processing enabled and external entity
     *         resolution disabled
     * @throws ParserConfigurationException
     *             if the underlying parser does not support one of the
     *             features
     */
    private static DocumentBuilderFactory newHardenedFactory()
            throws ParserConfigurationException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
        factory.setFeature(
                "http://xml.org/sax/features/external-general-entities", false);
        factory.setFeature(
                "http://xml.org/sax/features/external-parameter-entities",
                false);
        factory.setFeature(
                "http://apache.org/xml/features/nonvalidating/load-external-dtd",
                false);
        return factory;
    }

    /**
     * Always returns <code>true</code>.
     * NOTE(review): {@link #filter} never reads its DOM argument, so this
     * may be unnecessary - confirm against the ParseFilter contract.
     *
     * @return <code>true</code>
     */
    @Override
    public boolean needsDOM() {
        return true;
    }

}