/*
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
* NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
package org.apache.pig.piggybank.evaluation.xml;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPathFactory;
import org.apache.pig.EvalFunc;
import org.apache.pig.FuncSpec;
import org.apache.pig.PigWarning;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
/**
* XPath is a function that allows for text extraction from xml
*/
public class XPath extends EvalFunc<String> {
/** Hold onto last xpath & xml in case the next call to xpath() is feeding the same xml document
* The reason for this is because creating an xpath object is costly. */
private javax.xml.xpath.XPath xpath = null;
private String xml = null;
private Document document;
private static boolean cache = true;
/**
* input should contain: 1) xml 2) xpath 3) optional cache xml doc flag
*
* Usage:
* 1) XPath(xml, xpath)
* 2) XPath(xml, xpath, false)
*
* @param 1st element should to be the xml
* 2nd element should be the xpath
* 3rd optional boolean cache flag (default true)
*
* This UDF will cache the last xml document. This is helpful when multiple consecutive xpath calls are made for the same xml document.
* Caching can be turned off to ensure that the UDF's recreates the internal javax.xml.xpath.XPath for every call
*
* @return chararrary result or null if no match
*/
@Override
public String exec(final Tuple input) throws IOException {
if (input == null || input.size() <= 1) {
warn("Error processing input, not enough parameters or null input" + input,
PigWarning.UDF_WARNING_1);
return null;
}
if (input.size() > 3) {
warn("Error processing input, too many parameters" + input,
PigWarning.UDF_WARNING_1);
return null;
}
try {
final String xml = (String) input.get(0);
if (xml == null) {
return null;
}
if(input.size() > 2)
cache = (Boolean) input.get(2);
if(!cache || xpath == null || !xml.equals(this.xml))
{
final InputSource source = new InputSource(new StringReader(xml));
this.xml = xml; //track the xml for subsequent calls to this udf
final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
final DocumentBuilder db = dbf.newDocumentBuilder();
this.document = db.parse(source);
final XPathFactory xpathFactory = XPathFactory.newInstance();
this.xpath = xpathFactory.newXPath();
}
final String xpathString = (String) input.get(1);
final String value = xpath.evaluate(xpathString, document);
return value;
} catch (Exception e) {
warn("Error processing input " + input.getType(0),
PigWarning.UDF_WARNING_1);
return null;
}
}
@Override
public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
final List<FuncSpec> funcList = new ArrayList<FuncSpec>();
/*either two chararray arguments*/
List<FieldSchema> fields = new ArrayList<FieldSchema>();
fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
Schema twoArgInSchema = new Schema(fields);
funcList.add(new FuncSpec(this.getClass().getName(), twoArgInSchema));
/*or two chararray and a boolean argument*/
fields = new ArrayList<FieldSchema>();
fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
Schema threeArgInSchema = new Schema(fields);
funcList.add(new FuncSpec(this.getClass().getName(), threeArgInSchema));
return funcList;
}
}