/*
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
* NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
package org.apache.pig.piggybank.evaluation.xml;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.apache.pig.EvalFunc;
import org.apache.pig.FuncSpec;
import org.apache.pig.PigWarning;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
/**
* XPathAll is a function that allows for Tuple extraction from xml
*/
public class XPathAll extends EvalFunc<Tuple> {
private javax.xml.xpath.XPath xmlPath = null;
private String xml = null;
private Document document;
/**
* Caching of the xpath & xml in case the next call to xpath() is feeding
* the same xml document The reason for this is because creating an xpath
* object is costly.
*/
private static boolean cache = true;
private static boolean ignoreNamespace = true;
private static TupleFactory tupleFactory = TupleFactory.getInstance();
public static enum ARGUMENTS {
XML_FILE(0), XPATH(1), CACHE(2), IGNORE_NAMESPACE(3);
private int argument;
ARGUMENTS(int argument) {
this.argument = argument;
}
int getPosition() {
return this.argument;
}
}
public static final String EMPTY_STRING = "";
/**
* input should contain: 1) xml 2) xpath 3) optional cache xml doc flag 4)
* optional ignore namespace flag
*
* The optional fourth parameter (IGNORE_NAMESPACE), if set true will remove
* the namespace from xPath For example xpath /html:body/html:div will be
* considered as /body/div
*
* Usage: 1) XPathAll(xml, xpath)
* 2) XPathAll(xml, xpath, false)
* 3) XPathAll(xml, xpath, false, false)
*
* @param input
* 1st element should to be the xml 2nd element should be the xpath
* 3rd optional boolean cache flag (default true)
* 4th optional boolean ignore namespace flag(default true)
*
* This UDF will cache the last xml document. This is helpful when
* multiple consecutive xpathAll calls are made for the same xml
* document. Caching can be turned off to ensure that the UDF's
* recreates the internal javax.xml.xpath.XPathAll for every call
*
* This UDF will also support ignoring the namespace in the xml tags.
* This will help to search xpath items by ignoring its namespace.
* Ignoring of the namespace can be turned off for special cases using
* a fourth argument in the UDF.
*
*
* @return Tuple result or null if no match
*/
@Override
public Tuple exec(final Tuple input) throws IOException {
if (!isArgsValid(input)) { // Validate arguments
return null;
}
try {
final String xml = (String) input.get(ARGUMENTS.XML_FILE.getPosition());
if (xml == null) {
warn("Error processing input, invalid parameter" + input, PigWarning.UDF_WARNING_1);
return null;
}
if (input.size() > 2) {
cache = (Boolean) input.get(ARGUMENTS.CACHE.getPosition());
}
if (input.size() > 3) {
ignoreNamespace = (Boolean) input.get(ARGUMENTS.IGNORE_NAMESPACE.getPosition());
}
// Process XML
if (!cache || xmlPath == null || !xml.equals(this.xml)) { // Cache verification
final InputSource source = new InputSource(new StringReader(xml));
this.xml = xml; // track the xml for subsequent calls to this udf
final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
final DocumentBuilder db = dbf.newDocumentBuilder();
this.document = db.parse(source);
final XPathFactory xpathFactory = XPathFactory.newInstance();
this.xmlPath = xpathFactory.newXPath();
}
String xpathString = (String) input.get(ARGUMENTS.XPATH.getPosition());
if (ignoreNamespace) {
xpathString = createNameSpaceIgnoreXpathString(xpathString);
}
final NodeList nodeEntries = (NodeList) xmlPath.compile(xpathString).evaluate(document,
XPathConstants.NODESET);
if (nodeEntries == null) {
return null;
}
Tuple resultTuple = tupleFactory.newTuple(nodeEntries.getLength());
for (int nodeEntryIndex = 0; nodeEntryIndex < nodeEntries.getLength(); nodeEntryIndex++) {
final String ELEMENT_NODE_SEPARATOR = ", ";
Node node = nodeEntries.item(nodeEntryIndex);
// Parse the Node
final NodeList childNodes = node.getChildNodes();
if (childNodes == null) {
continue;
}
String nodeData = "";
boolean dataFlag = false;
for (int i = 0; i < childNodes.getLength(); i++) {
try {
Node subNode = childNodes.item(i);
if (subNode.getNodeType() == Node.ELEMENT_NODE) {
if (subNode.getFirstChild().getNodeValue() == null) {
// If There is no direct element, return blank
nodeData = nodeData.concat(ELEMENT_NODE_SEPARATOR);
nodeData = nodeData.concat(EMPTY_STRING);
dataFlag = true;
continue;
}
nodeData = nodeData.concat(ELEMENT_NODE_SEPARATOR);
nodeData = nodeData.concat(subNode.getFirstChild().getNodeValue());
dataFlag = true;
} else if (subNode.getNodeType() == Node.TEXT_NODE
|| subNode.getNodeType() == Node.ATTRIBUTE_NODE) {
nodeData = nodeData.concat(ELEMENT_NODE_SEPARATOR);
nodeData = nodeData.concat(subNode.getNodeValue());
dataFlag = true;
}
} catch (Exception ex) {
continue;
}
}
if (dataFlag) {
nodeData = nodeData.replaceFirst(ELEMENT_NODE_SEPARATOR, EMPTY_STRING);
resultTuple.set(nodeEntryIndex, nodeData);
}
}
return resultTuple;
} catch (Exception e) {
warn("Error processing input " + input.getType(0), PigWarning.UDF_WARNING_1);
return null;
}
}
/**
* Validates values of the input parameters.
*
* @param Tuple
* @return boolean
*/
private boolean isArgsValid(final Tuple input) {
if (input == null || input.size() <= 1) {
warn("Error processing input, not enough parameters or null input" + input, PigWarning.UDF_WARNING_1);
return false;
}
if (input.size() > 4) {
warn("Error processing input, too many parameters" + input, PigWarning.UDF_WARNING_1);
return false;
}
try {
// 3rd Parameter - CACHE
if (input.size() > 2 && !(input.get(ARGUMENTS.CACHE.getPosition()) instanceof Boolean)) {
warn("Error processing input, invalid value in 3rd parameter" + input, PigWarning.UDF_WARNING_1);
return false;
}
// 4rd Parameter IGNORE_NAMESPACE
if (input.size() > 3 && !(input.get(ARGUMENTS.IGNORE_NAMESPACE.getPosition()) instanceof Boolean)) {
warn("Error processing input, invalid value in 4th parameter" + input, PigWarning.UDF_WARNING_1);
return false;
}
} catch (Exception ex) {
return false;
}
return true;
}
/**
* Returns a new the xPathString by adding additional parameters
* in the existing xPathString for ignoring the namespace during compilation.
*
* @param String xpathString
* @return String modified xpathString
*/
private String createNameSpaceIgnoreXpathString(final String xpathString) {
final String QUERY_PREFIX = "//*";
final String LOCAL_PREFIX = "[local-name()='";
final String LOCAL_POSTFIX = "']";
final String SPLITTER = "/";
try {
String xpathStringWithLocalName = EMPTY_STRING;
String[] individualNodes = xpathString.split(SPLITTER);
for (String node : individualNodes) {
xpathStringWithLocalName = xpathStringWithLocalName.concat(QUERY_PREFIX + LOCAL_PREFIX + node
+ LOCAL_POSTFIX);
}
return xpathStringWithLocalName;
} catch (Exception ex) {
return xpathString;
}
}
/**
* Returns argument schemas of the UDF.
*
* @return List
*/
@Override
public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
final List<FuncSpec> funcList = new ArrayList<FuncSpec>();
/* either two chararray arguments */
List<FieldSchema> fields = new ArrayList<FieldSchema>();
fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
Schema twoArgInSchema = new Schema(fields);
funcList.add(new FuncSpec(this.getClass().getName(), twoArgInSchema));
/* or two chararray and a boolean argument */
fields = new ArrayList<FieldSchema>();
fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
Schema threeArgInSchema = new Schema(fields);
funcList.add(new FuncSpec(this.getClass().getName(), threeArgInSchema));
/* or two chararray and two boolean arguments */
fields = new ArrayList<FieldSchema>();
fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
Schema fourArgInSchema = new Schema(fields);
funcList.add(new FuncSpec(this.getClass().getName(), fourArgInSchema));
return funcList;
}
}