/**
* Copyright (C) 2014 Stratio (http://stratio.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.stratio.ingestion.deserializer.xmlxpath;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Map.Entry;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.conf.ConfigurationException;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.serialization.EventDeserializer;
import org.apache.flume.serialization.ResettableInputStream;
import org.apache.flume.serialization.Seekable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
//@formatter:off
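/**
* <p>XML XPath deserializer. Reads the whole InputStream as an XML document, evaluates the configured XPath
* expression against it and creates one event for each node matched by that expression. Depending on
* <em>outputBody</em>, the serialized node is emitted either as the event body or in a header of an event
* whose body is the whole XML document.</p>
* <ul>
* <li><em>expression</em>: XPath expression selecting the nodes to emit.</li>
* <li><em>outputBody</em>: If true, each matched node is emitted as the event body. If false, the event body is
* the whole XML document (serialized without XML declaration, with line breaks and tabs removed) and each
* matched node is put in the <em>outputHeader</em> header. Default: true.</li>
* <li><em>outputHeader</em>: Name of the header where each matched node is put. Required when
* <em>outputBody</em> is false.</li>
* </ul>
*
* <p>A special option is the chance to evaluate an XPath expression for each event and add the result in a
* header. For example:</p>
* <ul>
* <li><code>headers.author = &lt;XPathExpression&gt;</code> will put the result of the expression in the
* <em>author</em> field of the header.</li>
* </ul>
* <p>Note: the constructor of this class only reads <em>expression</em>, <em>outputBody</em> and
* <em>outputHeader</em>; verify that <code>headers.*</code> properties are wired in before relying on them.</p>
*/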
//@formatter:on
public class XmlXpathDeserializer implements EventDeserializer {

  private static final Logger log = LoggerFactory.getLogger(XmlXpathDeserializer.class);

  private static final String CONF_XPATH_EXPRESSION = "expression";
  private static final String CONF_OUTPUT_HEADER = "outputHeader";
  private static final String CONF_OUTPUT_BODY = "outputBody";
  private static final boolean DEFAULT_OUTPUT_BODY = true;

  /**
   * Standard SAX features that control resolution of external entities. They are switched off
   * before parsing so that an input document cannot make the parser read arbitrary local files
   * or URLs (XXE).
   */
  private static final String[] EXTERNAL_ENTITY_FEATURES = {
    "http://xml.org/sax/features/external-general-entities",
    "http://xml.org/sax/features/external-parameter-entities"
  };

  /** True once the constructor has completed and until {@link #close()} is called. */
  private boolean isOpen;
  /** Name of the header that receives the matched node; only set when outputBody is false. */
  private String outputHeader;
  /** If true the matched node is the event body, otherwise it goes into {@link #outputHeader}. */
  private boolean outputBody;
  /** Whole document as produced by {@link #documentToString(Document)}. */
  private String body;
  private final XPath xpath;
  private Document doc = null;
  /** Serialized form of every node matched by the configured XPath expression. */
  private List<String> list = null;
  /** Positions in {@link #list}: the last marked position and the current read position. */
  private ListIterator<String> markIt, currentIt;

  /**
   * Parses the whole input stream as XML, evaluates the configured XPath expression and keeps
   * the serialized matching nodes in memory. The input stream is always closed before this
   * constructor returns, whether it succeeds or fails.
   *
   * @param context configuration; reads "expression", "outputBody" (default true) and
   *        "outputHeader"
   * @param in stream containing the XML document
   * @throws IOException if the parser cannot be created, the input cannot be read or parsed,
   *         the document cannot be serialized, or the XPath expression cannot be applied
   * @throws ConfigurationException if outputBody is false and outputHeader is not defined
   */
  XmlXpathDeserializer(Context context, ResettableInputStream in) throws IOException {
    try {
      final String expression = context.getString(CONF_XPATH_EXPRESSION);
      outputBody = context.getBoolean(CONF_OUTPUT_BODY, DEFAULT_OUTPUT_BODY);
      if (!outputBody) {
        if (!context.containsKey(CONF_OUTPUT_HEADER)) {
          // This branch is reached when outputBody is false and no header name was given,
          // so the valid alternatives are "outputBody true" or "outputHeader defined".
          throw new ConfigurationException(
              String.format("Either %s must be true or %s must be defined", CONF_OUTPUT_BODY, CONF_OUTPUT_HEADER));
        }
        outputHeader = context.getString(CONF_OUTPUT_HEADER);
      }

      xpath = XPathFactory.newInstance().newXPath();

      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
      disableExternalEntities(factory);
      DocumentBuilder docBuilder;
      try {
        docBuilder = factory.newDocumentBuilder();
      } catch (ParserConfigurationException e) {
        throw new IOException("Creating DocumentBuilder failed", e);
      }

      try {
        doc = docBuilder.parse(new ResettableInputStreamInputStream(in));
      } catch (SAXException e) {
        throw new IOException("Cannot parse body", e);
      }

      // Extract full xml to body
      try {
        body = documentToString(doc);
      } catch (TransformerException e) {
        throw new IOException("Cannot serialize XML", e);
      }

      NodeList nodeList;
      try {
        final XPathExpression expr = xpath.compile(expression);
        nodeList = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
        list = new ArrayList<String>(nodeList.getLength());
        log.debug("XPath expression matched {} elements", nodeList.getLength());
      } catch (XPathExpressionException e) {
        throw new IOException("Applying XPath expression failed", e);
      }

      for (int i = 0; i < nodeList.getLength(); i++) {
        Node node = nodeList.item(i);
        String eventSt = nodeToString(node);
        list.add(eventSt);
      }

      markIt = list.listIterator();
      currentIt = list.listIterator();
      // Everything needed to serve events is in memory now.
      isOpen = true;
    } finally {
      try {
        in.close();
      } catch (IOException ex) {
        log.warn("Error while closing input stream", ex);
      }
    }
  }

  /**
   * Turns off resolution of external general and parameter entities on the given factory.
   * A parser that does not support one of the features is left as it is and a warning is logged.
   */
  private static void disableExternalEntities(DocumentBuilderFactory factory) {
    for (String feature : EXTERNAL_ENTITY_FEATURES) {
      try {
        factory.setFeature(feature, false);
      } catch (ParserConfigurationException e) {
        log.warn("XML parser does not support disabling feature " + feature, e);
      }
    }
  }

  /**
   * Returns the next event, or null when every matched node has been consumed.
   *
   * <p>When outputBody is true the event body is the serialized node. Otherwise the body is the
   * whole document and the serialized node is stored in the configured output header.</p>
   *
   * @throws IllegalStateException if the deserializer has been closed
   */
  @Override
  public Event readEvent() throws IOException {
    ensureOpen();
    if (!currentIt.hasNext()) {
      return null;
    }
    final String node = currentIt.next();
    if (outputBody) {
      return EventBuilder.withBody(node, Charsets.UTF_8);
    }
    final Event event = EventBuilder.withBody(body, Charsets.UTF_8);
    event.getHeaders().put(outputHeader, node);
    return event;
  }

  /**
   * Reads up to {@code numEvents} events.
   *
   * @param numEvents maximum number of events to return
   * @return the events read, possibly fewer than requested (empty when none are left)
   * @throws IllegalStateException if the deserializer has been closed
   */
  @Override
  public List<Event> readEvents(int numEvents) throws IOException {
    ensureOpen();
    List<Event> events = Lists.newArrayList();
    for (int i = 0; i < numEvents; i++) {
      Event event = readEvent();
      if (event == null) {
        // Exhausted: further calls would only return null again.
        break;
      }
      events.add(event);
    }
    return events;
  }

  /**
   * Remembers the current read position so that {@link #reset()} can return to it.
   *
   * @throws IllegalStateException if the deserializer has been closed
   */
  @Override
  public void mark() throws IOException {
    ensureOpen();
    // Copy the position directly; deriving it from previousIndex() plus an extra next()
    // moved the mark to index 1 when nothing had been read yet.
    markIt = list.listIterator(currentIt.nextIndex());
  }

  /**
   * Moves the read position back to the last marked position, or to the first element if
   * {@link #mark()} has never been called.
   *
   * @throws IllegalStateException if the deserializer has been closed
   */
  @Override
  public void reset() throws IOException {
    ensureOpen();
    currentIt = list.listIterator(markIt.nextIndex());
  }

  /** Marks this deserializer as closed; later read, mark and reset calls will fail. */
  @Override
  public void close() throws IOException {
    isOpen = false;
  }

  /** Throws IllegalStateException if {@link #close()} has already been called. */
  private void ensureOpen() {
    if (!isOpen) {
      throw new IllegalStateException("Serializer has been closed");
    }
  }

  /**
   * Serializes a DOM node to a string without XML declaration.
   *
   * <p>Serialization is best effort: if the transformation fails the error is logged and
   * whatever was written so far (possibly an empty string) is returned.</p>
   *
   * @param node node to serialize
   * @return the serialized node
   */
  public String nodeToString(Node node) {
    StringWriter writer = new StringWriter();
    try {
      Transformer xform = TransformerFactory.newInstance().newTransformer();
      xform.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
      Source src = new DOMSource(node);
      Result result = new StreamResult(writer);
      xform.transform(src, result);
    } catch (TransformerException e) {
      log.error("Cannot serialize XML node", e);
    }
    return writer.toString();
  }

  /**
   * Serializes a whole document to a single-line string: the XML declaration is omitted and
   * every line feed, carriage return and tab character is removed from the output.
   *
   * @param document document to serialize
   * @return the serialized document
   * @throws TransformerException if the transformer cannot be created or the transformation fails
   */
  public String documentToString(Document document) throws TransformerException {
    TransformerFactory tf = TransformerFactory.newInstance();
    Transformer transformer = tf.newTransformer();
    transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
    StringWriter writer = new StringWriter();
    transformer.transform(new DOMSource(document), new StreamResult(writer));
    return writer.getBuffer().toString().replaceAll("\n|\r|\t", "");
  }

  /**
   * Evaluates every value of the given properties as an XPath expression against the parsed
   * document and returns the string results under the same keys. Entries whose expression
   * cannot be compiled or evaluated are logged and left out of the result.
   *
   * @param properties map from header name to XPath expression
   * @return map from header name to the string result of its expression
   */
  private Map<String, String> evaluateStaticFields(ImmutableMap<String, String> properties) {
    Map<String, String> headers = new HashMap<String, String>();
    for (Entry<String, String> entry : properties.entrySet()) {
      try {
        XPathExpression expression = xpath.compile(entry.getValue());
        String value = (String) expression.evaluate(doc, XPathConstants.STRING);
        headers.put(entry.getKey(), value);
      } catch (XPathExpressionException e) {
        log.warn("Cannot evaluate XPath expression for header " + entry.getKey(), e);
      }
    }
    return headers;
  }

  /** Builder that creates {@link XmlXpathDeserializer} instances. */
  public static class Builder implements EventDeserializer.Builder {

    /**
     * Builds a deserializer for the given stream.
     *
     * @throws IllegalArgumentException if the stream is not {@link Seekable}
     * @throws RuntimeException wrapping any IOException raised while reading or parsing the input
     */
    @Override
    public EventDeserializer build(Context context, ResettableInputStream in) {
      if (!(in instanceof Seekable)) {
        throw new IllegalArgumentException(
            "Cannot use this deserializer without a Seekable input stream");
      }
      try {
        return new XmlXpathDeserializer(context, in);
      } catch (IOException ex) {
        throw new RuntimeException(ex);
      }
    }
  }
}