/* * Copyright 2010 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.xml; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayDeque; import java.util.HashMap; import java.util.Iterator; import java.util.Queue; import java.util.Set; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.apache.tools.ant.types.resources.FileResource; import org.apache.uima.UimaContext; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Level; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import de.tudarmstadt.ukp.dkpro.core.api.io.FileSetCollectionReaderBase; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field; /** * A component reader for XML files implemented with XPath. * <p> * This is currently optimized for TREC format, which means the style topics are presented in. You * should provide the parameter XPath expression that of the <i>parent</i> node And the child nodes * of each parent node will be stored separately in its own CAS. * <p> * If your expression evaluates to leaf nodes, empty CASes will be created. */ @MimeTypeCapability({MimeTypes.APPLICATION_XML, MimeTypes.TEXT_XML}) @TypeCapability( outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field" }) public class XmlXPathReader extends FileSetCollectionReaderBase { /** * Specifies the XPath expression to all nodes to be processed. Different segments will be * separated via PARAM_ID_TAG, and each segment will be stored in a separate CAS. */ public static final String PARAM_XPATH_EXPRESSION = "rootXPath"; @ConfigurationParameter(name = PARAM_XPATH_EXPRESSION, mandatory = true) private String rootXPath; /** * Tags which should be worked on. If empty then all tags will be processed. * <p> * * If this and PARAM_EXCLUDE_TAGS are both provided, tags in set PARAM_INCLUDE_TAGS - * PARAM_EXCLUDE_TAGS will be processed. */ public static final String PARAM_INCLUDE_TAGS = "includeTags"; @ConfigurationParameter(name = PARAM_INCLUDE_TAGS, mandatory = true, defaultValue = {}) private Set<String> includeTags; /** * Tags which should be ignored. If empty then all tags will be processed. * <p> * * If this and PARAM_INCLUDE_TAGS are both provided, tags in set PARAM_INCLUDE_TAGS - * PARAM_EXCLUDE_TAGS will be processed. */ public static final String PARAM_EXCLUDE_TAGS = "excludeTags"; @ConfigurationParameter(name = PARAM_EXCLUDE_TAGS, mandatory = true, defaultValue = {}) private Set<String> excludeTags; /** * Language of the documents. If given, it will be set in each CAS. */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) private String language; /** * Specify to substitute tag names in CAS. * <p> * Please give the substitutions each in before - after order. For example to substitute "foo" * with "bar", and "hey" with "ho", you can provide { "foo", "bar", "hey", "ho" }. * */ public static final String PARAM_SUBSTITUTE_TAGS = "workingDir"; @ConfigurationParameter(name = PARAM_SUBSTITUTE_TAGS, mandatory = false) private String[] substituteTags; /** * Tag which contains the docId. If it is given, it will be ensured that within the same * document there is only one id tag and it is not empty */ public static final String PARAM_DOC_ID_TAG = "docIdTag"; @ConfigurationParameter(name = PARAM_DOC_ID_TAG, mandatory = false) private String docIdTag; private Iterator<FileResource> fileIterator; private FileResource currentFileResource; private XPathExpression compiledRootXPath; private XPathExpression compiledIdXPath; private ArrayDeque<Node> nodes; // Stores nodes // Substitution boolean useSubstitution = false; private HashMap<String, String> substitution; @Override public void initialize(UimaContext arg0) throws ResourceInitializationException { super.initialize(arg0); fileIterator = getFileSetIterator(); XPath xpath = XPathFactory.newInstance().newXPath(); nodes = new ArrayDeque<Node>(); if (StringUtils.isWhitespace(rootXPath)) { throw new IllegalArgumentException( "Illegal root XPath expression. Please provide a valid one."); } try { compiledRootXPath = xpath.compile(rootXPath); } catch (XPathExpressionException e) { throw new IllegalArgumentException( "Illegal root XPath expression. Please provide a valid one."); } if (docIdTag != null) { if (StringUtils.isWhitespace(docIdTag)) { throw new IllegalArgumentException( "Illegal ID XPath expression. Please provide a valid one."); } try { compiledIdXPath = xpath.compile(docIdTag); } catch (XPathExpressionException e) { throw new IllegalArgumentException( "Illegal ID XPath expression. Please provide a valid one."); } } // Substitution if (substituteTags != null && substituteTags.length > 0) { if (substituteTags.length % 2 != 0) { throw new IllegalArgumentException( "Parameter substitute tags must " + "be given in an array of even number of elements, in 'before, after' order"); } useSubstitution = true; substitution = new HashMap<String, String>(substituteTags.length); for (int i = 0; i < substituteTags.length; i += 2) { substitution.put(substituteTags[i], substituteTags[i + 1]); } } processNextFile(); } /** * Read in next file and store the nodes which satisfy the given XPath expression in the queue * for further process. */ private void processNextFile() { if (fileIterator.hasNext()) { currentFileResource = fileIterator.next(); File currentFile = currentFileResource.getFile(); FileInputStream inputStream = null; NodeList nodeList = null; try { inputStream = new FileInputStream(currentFile); InputSource inputSource = new InputSource(inputStream); nodeList = (NodeList) compiledRootXPath.evaluate(inputSource, XPathConstants.NODESET); } catch (FileNotFoundException e) { // Should not happen new RuntimeException(e); } catch (XPathExpressionException e) { new RuntimeException(e); } finally { IOUtils.closeQuietly(inputStream); } // Add nodes to the queue if (nodeList != null) { for (int i = 0; i < nodeList.getLength(); i++) { nodes.add(nodeList.item(i)); } } } } /** * Check whether there is still nodes to be processed. * <p> * After all nodes from current file get processed, read in nodes from the next file * * @return true if there is still nodes to process <br> * false iff there is neither nodes nor files remaining */ @Override public boolean hasNext() throws IOException, CollectionException { if (nodes.isEmpty()) { if (fileIterator.hasNext()) { processNextFile(); } else { return false; } } return true; } @Override public void getNext(CAS cas) throws IOException { // Initialize CAS with document meta data initCas(cas, currentFileResource, null); if (!StringUtils.isWhitespace(language)) { cas.setDocumentLanguage(language); } // The buffer where document text is to be stored StringBuilder documentText = new StringBuilder(); Node node = nodes.poll(); if (node != null) { processNode(cas, node, documentText); } // Set document text in cas or error if nothing gets parsed out String documentTextString = documentText.toString(); if (StringUtils.isWhitespace(documentTextString)) { cas.setDocumentText("[Parse error]"); } else { cas.setDocumentText(documentTextString); } } /** * Add the text in current node to document text buffer, create and add to index a Field * annotation out of the text. This usually processes a document. */ private void processNode(CAS cas, Node node, StringBuilder documentText) { if (node.hasChildNodes()) { if (docIdTag != null) { ensureIdValidity(node); } NodeList docFields = node.getChildNodes(); for (int i = 0; i < docFields.getLength(); i++) { Node field = docFields.item(i); int begin = documentText.length(); String nodeTag = field.getLocalName(); if (nodeTag != null && isIncluded(nodeTag)) { String nodeText = field.getTextContent(); documentText = documentText.append(nodeText); int end = documentText.length(); documentText = documentText.append("\n"); // Substitue tag if specified if (useSubstitution && substitution.containsKey(nodeTag)) { nodeTag = substitution.get(nodeTag); } createFieldAnnotation(cas, nodeTag, begin, end); } } } } private void ensureIdValidity(Node node) { NodeList idNodes = null; try { idNodes = (NodeList) compiledIdXPath.evaluate(node, XPathConstants.NODESET); } catch (XPathExpressionException e) { // Already checked in initialize(), should not happen. getUimaContext().getLogger().log(Level.WARNING, e.getLocalizedMessage()); return; } if (idNodes.getLength() == 0) { // DocID not found throw new IllegalStateException("DocID tag \"" + docIdTag + "\" not found: " + currentFileResource.getFile().getAbsolutePath()); } else if (idNodes.getLength() != 1) { // DocID not unique (two id elements in one doc) throw new IllegalStateException("DocID tag \"" + docIdTag + "\" has multiple occurences: " + currentFileResource.getFile().getAbsolutePath()); } Node idNode = idNodes.item(0); String id = idNode.getTextContent(); if (StringUtils.isEmpty(id)) { // Empty DocID (e.g. <num></num>) throw new IllegalStateException("Emtpy DocID tag \"" + docIdTag + "\" in file: " + currentFileResource.getFile().getAbsolutePath()); } getUimaContext().getLogger().log(Level.INFO, "ID '" + id + "' found"); } private boolean isIncluded(final String tagName) { boolean needToBeParsed = (includeTags.size() == 0) || includeTags.contains(tagName); if (excludeTags.size() > 0 && excludeTags.contains(tagName)) { needToBeParsed = false; } return needToBeParsed; } /** * Create and add to index a Field annotation with the given data */ private void createFieldAnnotation(CAS cas, String nodeTag, int begin, int end) { JCas jcas = null; try { jcas = cas.getJCas(); } catch (CASException e) { // Should not happen throw new RuntimeException(e); } Field field = new Field(jcas, begin, end); field.setName(nodeTag); field.addToIndexes(); } public static class XmlNodes { public FileResource fileResource; public Queue<Node> nodes; public XmlNodes(FileResource fileResource, Queue<Node> nodes) { this.fileResource = fileResource; this.nodes = nodes; } } }