package org.myrobotlab.document.transformer;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.myrobotlab.document.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * This stage loads a config file that contains a field name to xpath
 * expression mapping. The xpaths are applied to the xml data found on the
 * document, and the extracted values are added to the mapped fields of that
 * same document.
 *
 * <p>
 * Config file format: one mapping per line, written as
 * {@code fieldName,xpathExpression}. Only the first comma separates the two
 * parts, so the xpath itself may contain commas. Blank lines and lines that
 * start with {@code #} are ignored.
 *
 * @author kwatters
 *
 */
public class XPathExtractor extends AbstractStage {

  /**
   * Parser features switched off (best effort) so that xml taken from document
   * fields cannot pull in external entities or external DTDs.
   */
  private static final String[] EXTERNAL_ENTITY_FEATURES = {
      "http://xml.org/sax/features/external-general-entities",
      "http://xml.org/sax/features/external-parameter-entities",
      "http://apache.org/xml/features/nonvalidating/load-external-dtd" };

  protected String xmlField = "xml";
  protected String configFile = "config/xpaths.txt";
  // mapping of each compiled xpath to the field names that receive the values
  // it extracts
  protected HashMap<XPathExpression, ArrayList<String>> xpaths = new HashMap<XPathExpression, ArrayList<String>>();
  protected boolean useNamespaces = true;

  private DocumentBuilderFactory factory;
  private DocumentBuilder builder;
  private XPathFactory xpathFactory;
  private XPath xpath;
  // TODO: move this to the base class.
  private boolean debug = false;

  /**
   * Reads the stage properties ({@code xmlField}, {@code configFile},
   * {@code useNamespaces}), creates the xml parser and the xpath compiler and
   * loads the xpath config file.
   *
   * @param config
   *          the stage configuration; when null the current field values are
   *          used as they are
   */
  @Override
  public void startStage(StageConfiguration config) {
    if (config != null) {
      xmlField = config.getProperty("xmlField", "xml");
      configFile = config.getProperty("configFile", "config/xpaths.txt");
      useNamespaces = Boolean.valueOf(config.getProperty("useNamespaces", "true"));
    }
    factory = DocumentBuilderFactory.newInstance();
    // TODO: do we really care about name spaces (they can be a pain sometimes)
    factory.setNamespaceAware(useNamespaces);
    disableExternalEntities(factory);
    try {
      builder = factory.newDocumentBuilder();
    } catch (ParserConfigurationException e) {
      // without a parser processDocument() has nothing to work with; it checks
      // for a missing builder and skips the document.
      System.out.println("XPATH Extractor could not create an xml parser.");
      e.printStackTrace();
    }
    xpathFactory = XPathFactory.newInstance();
    xpath = xpathFactory.newXPath();
    try {
      HashMap<XPathExpression, ArrayList<String>> loaded = loadConfig(configFile);
      // an overriding loadConfig() could still hand back null; never store it.
      xpaths = (loaded != null) ? loaded : new HashMap<XPathExpression, ArrayList<String>>();
    } catch (XPathExpressionException e) {
      // the previously held mapping stays in place when the config is invalid
      System.out.println("XPATH Extractor config file contains an invalid xpath: " + configFile);
      e.printStackTrace();
    }
  }

  /**
   * Runs every configured xpath against each String value of the xml field and
   * adds the extracted text to the mapped fields of the same document. Values
   * that fail to parse or evaluate are reported and skipped.
   *
   * @param doc
   *          the document to enrich
   * @return always null
   */
  @Override
  public List<Document> processDocument(Document doc) {
    // NOTE(review): getField is assumed to return null for an absent field;
    // the guard is harmless if it never does.
    if (builder == null || doc.getField(xmlField) == null) {
      return null;
    }
    for (Object o : doc.getField(xmlField)) {
      if (!(o instanceof String)) {
        // null or non-String values cannot be parsed as xml text
        continue;
      }
      try {
        processXml((String) o, doc);
      } catch (XPathExpressionException | SAXException | IOException e) {
        // report and carry on with the next value
        e.printStackTrace();
      }
    }
    return null;
  }

  /**
   * Parses one xml string and, for every node selected by a configured xpath,
   * adds the node's text content to each field mapped to that xpath.
   *
   * @param xml
   *          the xml text to parse
   * @param doc
   *          the document that receives the extracted values
   * @throws SAXException
   *           when the xml cannot be parsed
   * @throws IOException
   *           when reading the xml fails
   * @throws XPathExpressionException
   *           when an xpath cannot be evaluated to a node set
   */
  private void processXml(String xml, Document doc) throws SAXException, IOException, XPathExpressionException {
    // The text is already decoded, so it is handed to the parser as a
    // character stream: the parser then disregards any encoding named in the
    // xml declaration instead of decoding the data a second time.
    String text = xml;
    if (!text.isEmpty() && text.charAt(0) == '\uFEFF') {
      // a byte order mark that survived decoding is not valid in the prolog
      text = text.substring(1);
    }
    org.w3c.dom.Document xmldoc = builder.parse(new InputSource(new StringReader(text)));
    for (Map.Entry<XPathExpression, ArrayList<String>> mapping : xpaths.entrySet()) {
      NodeList nodes = (NodeList) mapping.getKey().evaluate(xmldoc, XPathConstants.NODESET);
      for (int i = 0; i < nodes.getLength(); i++) {
        String value = nodes.item(i).getTextContent();
        for (String fieldName : mapping.getValue()) {
          // add the evaluated xpath to the fields that this xpath maps to.
          doc.addToField(fieldName, value);
        }
      }
    }
  }

  /**
   * Loads the field name to xpath mapping from a config file. The file is read
   * as UTF-8. Field name and xpath are trimmed; lines without a field name or
   * without an xpath are reported and skipped. Lines that use the same xpath
   * text share one compiled expression, and the returned map iterates in the
   * order in which the xpaths first appear in the file.
   *
   * @param filename
   *          path of the config file
   * @return the compiled xpaths with the field names each one feeds; empty
   *         (never null) when the file is missing, and partially filled when
   *         reading fails half way through
   * @throws XPathExpressionException
   *           when a line holds an xpath that does not compile
   */
  protected HashMap<XPathExpression, ArrayList<String>> loadConfig(String filename) throws XPathExpressionException {
    HashMap<XPathExpression, ArrayList<String>> configMap = new LinkedHashMap<XPathExpression, ArrayList<String>>();
    // every compile() call yields a new object, so the expressions are looked
    // up by their source text to let repeated xpaths land in the same entry.
    Map<String, XPathExpression> compiled = new HashMap<String, XPathExpression>();
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filename), StandardCharsets.UTF_8))) {
      String strLine;
      int lineNumber = 0;
      // Read File Line By Line
      while ((strLine = br.readLine()) != null) {
        lineNumber++;
        strLine = strLine.trim();
        // skip blank lines and commented out lines
        if (strLine.isEmpty() || strLine.startsWith("#")) {
          continue;
        }
        // only the first comma is a separator, the xpath may hold more commas
        int comma = strLine.indexOf(',');
        String fieldName = (comma < 0) ? "" : strLine.substring(0, comma).trim();
        String strXPath = (comma < 0) ? "" : strLine.substring(comma + 1).trim();
        if (fieldName.isEmpty() || strXPath.isEmpty()) {
          System.out.println("XPATH Extractor skipping malformed config line " + lineNumber + " in " + filename + ": " + strLine);
          continue;
        }
        XPathExpression expression = compiled.get(strXPath);
        if (expression == null) {
          expression = xpath.compile(strXPath);
          compiled.put(strXPath, expression);
        }
        if (debug) {
          System.out.println("Adding XPATH " + strXPath + " Maps To : " + fieldName);
        }
        ArrayList<String> fields = configMap.get(expression);
        if (fields == null) {
          fields = new ArrayList<String>();
          configMap.put(expression, fields);
        }
        fields.add(fieldName);
      }
    } catch (FileNotFoundException e) {
      System.out.println("XPATH Extractor config file not found: " + filename);
      e.printStackTrace();
    } catch (IOException e) {
      System.out.println("IO Exception reading from file " + filename);
      e.printStackTrace();
      // return what we can...
    }
    return configMap;
  }

  /**
   * Turns off external entity and external DTD loading on the given factory.
   * Features that the parser implementation does not know are left alone.
   */
  private static void disableExternalEntities(DocumentBuilderFactory parserFactory) {
    for (String feature : EXTERNAL_ENTITY_FEATURES) {
      try {
        parserFactory.setFeature(feature, false);
      } catch (ParserConfigurationException ignored) {
        // best effort: this parser implementation does not support the feature
      }
    }
  }

  /** Nothing is done when the stage stops. */
  @Override
  public void stopStage() {
    // TODO Auto-generated method stub
  }

  /** Nothing is batched in this transformer, so there is nothing to flush. */
  @Override
  public void flush() {
    // no batching in this transformer. no need to flush?
  }

  /** @return the name of the document field that holds the xml text */
  public String getXmlField() {
    return xmlField;
  }

  /**
   * @param xmlField
   *          the name of the document field that holds the xml text
   */
  public void setXmlField(String xmlField) {
    this.xmlField = xmlField;
  }

  /** @return the path of the xpath config file */
  public String getConfigFile() {
    return configFile;
  }

  /**
   * @param configFile
   *          the path of the xpath config file; the file is only read by
   *          {@link #startStage(StageConfiguration)}
   */
  public void setConfigFile(String configFile) {
    this.configFile = configFile;
  }

  /** @return whether the xml parser is set up namespace aware */
  public boolean isUseNamespaces() {
    return useNamespaces;
  }

  /**
   * @param useNamespaces
   *          whether the xml parser should be namespace aware; the parser is
   *          only configured by {@link #startStage(StageConfiguration)}
   */
  public void setUseNamespaces(boolean useNamespaces) {
    this.useNamespaces = useNamespaces;
  }
}