/* * Copyright 2007 T-Rank AS * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package no.trank.openpipe.step; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.xpath.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import no.trank.openpipe.api.BasePipelineStep; import no.trank.openpipe.api.PipelineException; import no.trank.openpipe.api.PipelineStepStatus; import no.trank.openpipe.api.document.Document; import no.trank.openpipe.api.document.DomRawData; import no.trank.openpipe.config.annotation.NotEmpty; /** * Uses XPath to extract fields form a xml formatted field. * <p/> * Typical usage would be to set the fieldName(setFieldName()) to where the xml formatted text. Supply the * XPath -> field map in setXPathToFieldName(...). * <p/> * This would put the content of the fields that matches the XPath(s) into new fields with the supplied names. * * @version $Revision$ */ public class ParseXMLXPath extends BasePipelineStep { private static final Logger log = LoggerFactory.getLogger(ParseXML.class); private static final Pattern WS_PATTERN = Pattern.compile("\\s+"); @NotEmpty private String fieldName; @NotEmpty private Map<String, String> xPathToFieldName = Collections.emptyMap(); private List<XPathFieldName> xPaths; private boolean failOnXMLError = true; private DocumentBuilder builder; private XPath xPath; @Override public PipelineStepStatus execute(Document doc) throws PipelineException { if (fieldName != null) return execute_field(doc); else return execute_Dom(doc); } private PipelineStepStatus execute_Dom(Document doc) throws PipelineException { DomRawData domRawData = (DomRawData) doc.getRawData(); try { evalXPaths(doc, domRawData.getDom()); } catch (XPathExpressionException e) { handleException("(dom)", e);//TODO put dom in string? } return PipelineStepStatus.DEFAULT; } private PipelineStepStatus execute_field(Document doc) throws PipelineException { final List<String> list = doc.getFieldValues(fieldName); for (String text : list) { try { final Node reader = builder.parse(new InputSource(new StringReader(text))); evalXPaths(doc, reader); } catch (IOException e) { handleException(text, e); } catch (SAXException e) { handleException(text, e); } catch (XPathExpressionException e) { handleException(text, e); } } return PipelineStepStatus.DEFAULT; } private void handleException(String text, Exception e) throws PipelineException { if (failOnXMLError) { log.debug("Failed parsing of: {}", text); throw new PipelineException("Could not parse XML in field '" + fieldName + "'", e); } else { log.error("{}: Could not parse XML in field '" + fieldName + "'", e); } } private void evalXPaths(Document doc, Node node) throws XPathExpressionException { for (XPathFieldName e : xPaths) { final NodeList nl = (NodeList) e.getXPathExpression().evaluate(node, XPathConstants.NODESET); if (nl != null && nl.getLength() > 0) { buildNodeValue(nl, doc, e.getFieldname()); } } } private static void buildNodeValue(NodeList nl, Document doc, String fieldName) { final StringBuilder buf = new StringBuilder(64); final int length = nl.getLength(); for (int i = 0; i < length; i++) { buildNodeValue(nl.item(i), buf); if (buf.length() > 0) { final String value = buf.substring(0, buf.length() - 1); if (!isBlank(value)) { doc.addFieldValue(fieldName, value); } buf.setLength(0); } } } private static void buildNodeValue(Node n, StringBuilder buf) { if (!isBlank(n.getNodeValue())) { buf.append(n.getNodeValue()); buf.append(' '); } if (n.hasChildNodes()) { final NodeList nl = n.getChildNodes(); for (int i = 0; i < nl.getLength(); i++) { buildNodeValue(nl.item(i), buf); } } } private static boolean isBlank(String data) { return data == null || WS_PATTERN.matcher(data).matches(); } @Override public void prepare() throws PipelineException { super.prepare(); if (builder == null) { try { builder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); } catch (ParserConfigurationException e) { throw new PipelineException(e); } } if (xPath == null) { xPath = XPathFactory.newInstance().newXPath(); } try { compileXPaths(); } catch (XPathExpressionException e) { throw new PipelineException(e); } } private void compileXPaths() throws XPathExpressionException { xPaths = new ArrayList<XPathFieldName>(xPathToFieldName.size()); for (Map.Entry<String, String> e : xPathToFieldName.entrySet()) { xPaths.add(new XPathFieldName(xPath.compile(e.getKey()), e.getValue())); } } @Override public String getRevision() { return "$Revision$"; } /** * Gets the field name where the source xml is stored in the document. * * @return the field name where the source xml is stored in the document. */ public String getFieldName() { return fieldName; } /** * Sets the field name where the source xml is stored in the document. * * @param fieldName the field name where the source xml is stored in the document. */ public void setFieldName(String fieldName) { this.fieldName = fieldName; } /** * Gets the XPath to field name mappings. * * @return a map of XPath to field names */ public Map<String, String> getXPathToFieldName() { return xPathToFieldName; } /** * Sets the XPath to field name mappings. * * @param xPathToFieldName a map of XPath to field names */ public void setXPathToFieldName(Map<String, String> xPathToFieldName) { this.xPathToFieldName = xPathToFieldName; } /** * Gets the XPath instance used for looking up XPath matches. * * @return XPath instance used for looking up XPath matches */ public XPath getXPath() { return xPath; } /** * Sets the XPath instance used for looking up XPath matches. * <p/> * I none are set this class will construct one using: XPathFactory.newInstance().newXPath(); * * @param xPath the XPath instance used for looking up XPath matches. */ public void setXPath(XPath xPath) { this.xPath = xPath; } /** * Gets if this step should fail if an xml parser error occurs. * * @return true if this step should fail if an xml parser error occurs. */ public boolean isFailOnXMLError() { return failOnXMLError; } /** * Sets if this step should fail if an xml parser error occurs. * <p/> * Default is true * * @param failOnXMLError true if this step should fail if an xml parser error occurs. */ public void setFailOnXMLError(boolean failOnXMLError) { this.failOnXMLError = failOnXMLError; } /** * Gets the xml DocumentBuilder instance to use for xml parsing. * * @return the xml DocumentBuilder instance to use for xml parsing. */ public DocumentBuilder getBuilder() { return builder; } /** * Sets the xml DocumentBuilder instance to use for xml parsing. * <p/> * If this is not set, the step will construct one using: DocumentBuilderFactory.newInstance().newDocumentBuilder(); * * @param builder the xml DocumentBuilder instance to use for xml parsing. */ public void setBuilder(DocumentBuilder builder) { this.builder = builder; } private static final class XPathFieldName { private final XPathExpression xPathExpression; private final String fieldname; private XPathFieldName(XPathExpression xPathExpression, String fieldname) { this.xPathExpression = xPathExpression; this.fieldname = fieldname; } public XPathExpression getXPathExpression() { return xPathExpression; } public String getFieldname() { return fieldname; } } }