/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.xml;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Queue;
import java.util.Set;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.tools.ant.types.resources.FileResource;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import de.tudarmstadt.ukp.dkpro.core.api.io.FileSetCollectionReaderBase;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field;
/**
* A component reader for XML files implemented with XPath.
* <p>
* This is currently optimized for the TREC format, i.e. the style in which TREC topics are
* presented. The XPath expression you provide should select the <i>parent</i> nodes; the child
* nodes of each parent node are then stored together in a CAS of their own (one CAS per parent).
* <p>
* If your expression evaluates to leaf nodes, empty CASes will be created.
*/
@MimeTypeCapability({MimeTypes.APPLICATION_XML, MimeTypes.TEXT_XML})
@TypeCapability(
        outputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
                "de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field" })
public class XmlXPathReader
    extends FileSetCollectionReaderBase
{
    /**
     * Specifies the XPath expression selecting the nodes to be processed. Every node matched by
     * this expression is turned into a separate CAS; the text of its child elements becomes the
     * document text, one {@link Field} annotation per child element.
     */
    public static final String PARAM_XPATH_EXPRESSION = "rootXPath";
    @ConfigurationParameter(name = PARAM_XPATH_EXPRESSION, mandatory = true)
    private String rootXPath;

    /**
     * Tags which should be worked on. If empty then all tags will be processed.
     * <p>
     * If this and PARAM_EXCLUDE_TAGS are both provided, tags in set PARAM_INCLUDE_TAGS -
     * PARAM_EXCLUDE_TAGS will be processed.
     */
    public static final String PARAM_INCLUDE_TAGS = "includeTags";
    @ConfigurationParameter(name = PARAM_INCLUDE_TAGS, mandatory = true, defaultValue = {})
    private Set<String> includeTags;

    /**
     * Tags which should be ignored. If empty then no tag is ignored.
     * <p>
     * If this and PARAM_INCLUDE_TAGS are both provided, tags in set PARAM_INCLUDE_TAGS -
     * PARAM_EXCLUDE_TAGS will be processed.
     */
    public static final String PARAM_EXCLUDE_TAGS = "excludeTags";
    @ConfigurationParameter(name = PARAM_EXCLUDE_TAGS, mandatory = true, defaultValue = {})
    private Set<String> excludeTags;

    /**
     * Language of the documents. If given, it will be set in each CAS.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    private String language;

    /**
     * Specify to substitute tag names in CAS.
     * <p>
     * Please give the substitutions each in before - after order. For example to substitute "foo"
     * with "bar", and "hey" with "ho", you can provide { "foo", "bar", "hey", "ho" }.
     */
    // NOTE(review): the parameter value "workingDir" looks like a copy-paste leftover (one would
    // expect "substituteTags"). It is kept unchanged here because existing descriptors may refer
    // to the parameter by this literal name - confirm before renaming.
    public static final String PARAM_SUBSTITUTE_TAGS = "workingDir";
    @ConfigurationParameter(name = PARAM_SUBSTITUTE_TAGS, mandatory = false)
    private String[] substituteTags;

    /**
     * XPath expression (evaluated relative to each node selected by the root expression) which
     * locates the docId. If it is given, it will be ensured that within the same document there
     * is exactly one id node and that it is not empty.
     */
    public static final String PARAM_DOC_ID_TAG = "docIdTag";
    @ConfigurationParameter(name = PARAM_DOC_ID_TAG, mandatory = false)
    private String docIdTag;

    private Iterator<FileResource> fileIterator;
    private FileResource currentFileResource;
    private XPathExpression compiledRootXPath;
    private XPathExpression compiledIdXPath;

    // Nodes of the current file which still have to be turned into CASes
    private Queue<Node> nodes;

    // Substitution
    boolean useSubstitution = false;
    private HashMap<String, String> substitution;

    /**
     * Compiles the configured XPath expressions, builds the tag substitution table and loads the
     * nodes of the first file.
     *
     * @throws IllegalArgumentException
     *             if an XPath expression is blank or cannot be compiled, or if the substitution
     *             array has an odd number of elements.
     */
    @Override
    public void initialize(UimaContext arg0)
        throws ResourceInitializationException
    {
        super.initialize(arg0);

        fileIterator = getFileSetIterator();
        XPath xpath = XPathFactory.newInstance().newXPath();
        nodes = new ArrayDeque<Node>();

        if (StringUtils.isBlank(rootXPath)) {
            throw new IllegalArgumentException(
                    "Illegal root XPath expression. Please provide a valid one.");
        }
        try {
            compiledRootXPath = xpath.compile(rootXPath);
        }
        catch (XPathExpressionException e) {
            // Keep the cause so that the actual syntax problem is visible to the user
            throw new IllegalArgumentException(
                    "Illegal root XPath expression. Please provide a valid one.", e);
        }

        if (docIdTag != null) {
            if (StringUtils.isWhitespace(docIdTag)) {
                throw new IllegalArgumentException(
                        "Illegal ID XPath expression. Please provide a valid one.");
            }
            try {
                compiledIdXPath = xpath.compile(docIdTag);
            }
            catch (XPathExpressionException e) {
                throw new IllegalArgumentException(
                        "Illegal ID XPath expression. Please provide a valid one.", e);
            }
        }

        // Substitution
        if (substituteTags != null && substituteTags.length > 0) {
            if (substituteTags.length % 2 != 0) {
                throw new IllegalArgumentException(
                        "Parameter substitute tags must "
                                + "be given in an array of even number of elements, in 'before, after' order");
            }

            useSubstitution = true;
            substitution = new HashMap<String, String>(substituteTags.length);
            for (int i = 0; i < substituteTags.length; i += 2) {
                substitution.put(substituteTags[i], substituteTags[i + 1]);
            }
        }

        processNextFile();
    }

    /**
     * Read in next file and store the nodes which satisfy the given XPath expression in the queue
     * for further process. Does nothing if there is no further file.
     *
     * @throws RuntimeException
     *             if the file cannot be opened or the root XPath expression cannot be evaluated
     *             on it (e.g. because the file is not well-formed XML).
     */
    private void processNextFile()
    {
        if (!fileIterator.hasNext()) {
            return;
        }

        currentFileResource = fileIterator.next();
        File currentFile = currentFileResource.getFile();

        // NOTE(review): the XML is parsed with whatever defaults the XPath implementation uses;
        // if input files may be untrusted, consider hardening against external entities (XXE).
        FileInputStream inputStream = null;
        NodeList nodeList;
        try {
            inputStream = new FileInputStream(currentFile);
            InputSource inputSource = new InputSource(inputStream);
            nodeList = (NodeList) compiledRootXPath.evaluate(inputSource,
                    XPathConstants.NODESET);
        }
        catch (FileNotFoundException e) {
            // Should not happen
            throw new RuntimeException("File not found: " + currentFile.getAbsolutePath(), e);
        }
        catch (XPathExpressionException e) {
            throw new RuntimeException("Unable to evaluate root XPath expression on file: "
                    + currentFile.getAbsolutePath(), e);
        }
        finally {
            IOUtils.closeQuietly(inputStream);
        }

        // Add nodes to the queue
        if (nodeList != null) {
            for (int i = 0; i < nodeList.getLength(); i++) {
                nodes.add(nodeList.item(i));
            }
        }
    }

    /**
     * Check whether there is still nodes to be processed.
     * <p>
     * After all nodes from current file get processed, read in nodes from the next file. Files
     * which do not contain any node matching the root XPath expression are skipped.
     *
     * @return true if there is still nodes to process <br>
     *         false iff there is neither nodes nor files remaining
     */
    @Override
    public boolean hasNext()
        throws IOException, CollectionException
    {
        // A file may yield no matching node at all, so keep loading files until the queue is
        // filled or the file set is exhausted.
        while (nodes.isEmpty() && fileIterator.hasNext()) {
            processNextFile();
        }
        return !nodes.isEmpty();
    }

    /**
     * Fills the given CAS from the next queued node: sets the document meta data, the language
     * (if configured), the document text and the {@link Field} annotations. If no text could be
     * extracted, the document text is set to "[Parse error]".
     */
    @Override
    public void getNext(CAS cas)
        throws IOException
    {
        // Initialize CAS with document meta data
        initCas(cas, currentFileResource, null);

        // language is optional, so it may be null as well as blank
        if (StringUtils.isNotBlank(language)) {
            cas.setDocumentLanguage(language);
        }

        // The buffer where document text is to be stored
        StringBuilder documentText = new StringBuilder();

        Node node = nodes.poll();
        if (node != null) {
            processNode(cas, node, documentText);
        }

        // Set document text in cas or error if nothing gets parsed out
        String documentTextString = documentText.toString();
        if (StringUtils.isWhitespace(documentTextString)) {
            cas.setDocumentText("[Parse error]");
        }
        else {
            cas.setDocumentText(documentTextString);
        }
    }

    /**
     * Add the text of the child elements of the given node to the document text buffer, and
     * create and add to index a Field annotation for each of them. This usually processes a
     * document.
     *
     * @param cas
     *            the CAS receiving the Field annotations.
     * @param node
     *            the node whose children are processed; nodes without children are ignored.
     * @param documentText
     *            buffer to which the text of each included child is appended, followed by a
     *            line break.
     */
    private void processNode(CAS cas, Node node, StringBuilder documentText)
    {
        if (!node.hasChildNodes()) {
            return;
        }

        if (docIdTag != null) {
            ensureIdValidity(node);
        }

        NodeList docFields = node.getChildNodes();
        for (int i = 0; i < docFields.getLength(); i++) {
            Node field = docFields.item(i);
            String nodeTag = field.getLocalName();
            // getLocalName() is null for non-element children such as text nodes
            if (nodeTag == null || !isIncluded(nodeTag)) {
                continue;
            }

            int begin = documentText.length();
            documentText.append(field.getTextContent());
            int end = documentText.length();
            // The separator is not part of the annotated span
            documentText.append("\n");

            // Substitute tag if specified
            if (useSubstitution && substitution.containsKey(nodeTag)) {
                nodeTag = substitution.get(nodeTag);
            }

            createFieldAnnotation(cas, nodeTag, begin, end);
        }
    }

    /**
     * Ensures that the ID XPath expression selects exactly one, non-empty node below the given
     * node.
     *
     * @throws IllegalStateException
     *             if no ID node, more than one ID node or an empty ID node is found.
     */
    private void ensureIdValidity(Node node)
    {
        NodeList idNodes = null;
        try {
            idNodes = (NodeList) compiledIdXPath.evaluate(node, XPathConstants.NODESET);
        }
        catch (XPathExpressionException e) {
            // Already checked in initialize(), should not happen.
            getUimaContext().getLogger().log(Level.WARNING, e.getLocalizedMessage());
            return;
        }

        if (idNodes.getLength() == 0) {
            // DocID not found
            throw new IllegalStateException("DocID tag \"" + docIdTag + "\" not found: "
                    + currentFileResource.getFile().getAbsolutePath());
        }
        else if (idNodes.getLength() != 1) {
            // DocID not unique (two id elements in one doc)
            throw new IllegalStateException("DocID tag \"" + docIdTag
                    + "\" has multiple occurences: "
                    + currentFileResource.getFile().getAbsolutePath());
        }

        Node idNode = idNodes.item(0);
        String id = idNode.getTextContent();
        if (StringUtils.isEmpty(id)) {
            // Empty DocID (e.g. <num></num>)
            throw new IllegalStateException("Emtpy DocID tag \"" + docIdTag + "\" in file: "
                    + currentFileResource.getFile().getAbsolutePath());
        }

        getUimaContext().getLogger().log(Level.INFO, "ID '" + id + "' found");
    }

    /**
     * Decides whether a tag is to be processed: it must be listed in the include set (or the
     * include set must be empty) and it must not be listed in the exclude set.
     */
    private boolean isIncluded(final String tagName)
    {
        boolean included = includeTags.isEmpty() || includeTags.contains(tagName);
        return included && !excludeTags.contains(tagName);
    }

    /**
     * Create and add to index a Field annotation with the given data
     */
    private void createFieldAnnotation(CAS cas, String nodeTag, int begin, int end)
    {
        JCas jcas = null;
        try {
            jcas = cas.getJCas();
        }
        catch (CASException e) {
            // Should not happen
            throw new RuntimeException(e);
        }

        Field field = new Field(jcas, begin, end);
        field.setName(nodeTag);
        field.addToIndexes();
    }

    /**
     * Simple holder pairing a file resource with a queue of nodes. It is not used by the reader
     * itself.
     */
    public static class XmlNodes
    {
        public FileResource fileResource;
        public Queue<Node> nodes;

        public XmlNodes(FileResource fileResource, Queue<Node> nodes)
        {
            this.fileResource = fileResource;
            this.nodes = nodes;
        }
    }
}