/* * Copyright (c) 2013, University of Toronto. * * Licensed under the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. You may obtain * a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. */ package edu.toronto.cs.xml2rdf.xml; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import org.apache.xml.serialize.OutputFormat; import org.apache.xml.serialize.XMLSerializer; import org.w3c.dom.Attr; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.w3c.dom.Text; import org.xml.sax.InputSource; import org.xml.sax.SAXException; /** * @author Soheil Hassas Yeganeh <soheil@cs.toronto.edu> */ public class XMLUtils { static boolean debug = true; static XPathFactory factory = XPathFactory.newInstance(); // ekzhu: These *ByPath functions would not work for path containing namespaces. public static NodeList getNodesByPath(String path, Element localElement, Document doc) throws XPathExpressionException { // Note: if using absolute path, then the root element must also be specified, // that is, it should be like "/clinical_studies/clinical_study/..." XPath xpath = factory.newXPath(); Object element = path.startsWith("/") || localElement == null ? doc : localElement; NodeList nodeList = (NodeList) xpath.evaluate(path, element, XPathConstants.NODESET); return nodeList; } public static boolean getBooleanPath(String path, Element localElement, Document doc) throws XPathExpressionException { // Note: if using absolute path, then the root element must also be specified, // that is, it should be like "/clinical_studies/clinical_study/..." XPath xpath = factory.newXPath(); Object element = path.startsWith("/") || localElement == null ? doc : localElement; boolean res = (Boolean) xpath.evaluate(path, element, XPathConstants.BOOLEAN); return res; } public static String getStringByPath(String path, Element localElement, Document doc) throws XPathExpressionException { // Note the difference between this function and function "getStringsByPath" // The path for this function should be like "/clinical_studies/clinical_study/brief_title", // which returns ONLY ONE string of the first matched element "brief_title" XPath xpath = factory.newXPath(); Object element = path.startsWith("/") || localElement == null ? doc : localElement; return (String) xpath.evaluate(path, element, XPathConstants.STRING); } public static Set<String> getStringsByPath(String path, Element localElement, Document doc) throws XPathExpressionException { // Note the difference between this function and function "getStringByPath" // The path for this function should be like "/clinical_studies/clinical_study/brief_title/text()", // with the extra "/text()" at the end, and it returns ALL strings of ALL matching element "brief_title" Set<String> ret = new HashSet<String>(); NodeList nl = getNodesByPath(path, localElement, doc); for (int i = 0; i < nl.getLength(); i++) { if (nl.item(i) instanceof Text) { ret.add(((Text) nl.item(i)).getTextContent().trim()); } } return ret; } public static Document parse(String path, int maxElement) throws SAXException, IOException, ParserConfigurationException { // File Parser #1 DocumentBuilder builder = createNsAwareDocumentBuilder(); Document doc = builder.parse(path); doc = pruneDocument(doc, maxElement); return doc; } private static Document pruneDocument(Document doc, int maxElement) throws ParserConfigurationException { if (maxElement == -1) { return doc; } Document newDoc = (Document) doc.cloneNode(false); Element newRoot = (Element) doc.getDocumentElement().cloneNode(false); newDoc.adoptNode(newRoot); newDoc.appendChild(newRoot); NodeList nl = doc.getDocumentElement().getChildNodes(); System.out.println("XML Child#: " + nl.getLength()); for (int i = 0; i < maxElement && i < nl.getLength(); i++) { if (!(nl.item(i) instanceof Element)) { maxElement++; continue; } Node item = nl.item(i).cloneNode(true); newDoc.adoptNode(item); newDoc.getDocumentElement().appendChild(item); } if (debug) { System.out.println("Creating document of " + newDoc.getDocumentElement().getChildNodes().getLength()); } return newDoc; } public static Document parse(InputStream is, int maxElement) throws SAXException, IOException, ParserConfigurationException { // File Parser #2 DocumentBuilder builder = createNsAwareDocumentBuilder(); Document doc = builder.parse(is); doc = pruneDocument(doc, maxElement); return doc; } public static Document parse(Reader reader, int maxElement) throws SAXException, IOException, ParserConfigurationException { // File Parser #3 DocumentBuilder builder = createNsAwareDocumentBuilder(); Document doc = builder.parse(new InputSource(reader)); doc = pruneDocument(doc, maxElement); return doc; } public static DocumentBuilder createNsAwareDocumentBuilder() throws ParserConfigurationException { DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance(); builderFactory.setNamespaceAware(true); return builderFactory.newDocumentBuilder(); } public static boolean isLeaf(Node node) { NodeList nodeList = node.getChildNodes(); if (nodeList.getLength() == 0) { return true; } for (int i = 0; i < nodeList.getLength(); i++) { if (nodeList.item(i) instanceof Element) { // if the node contains child element it is not // a leaf node return false; } } return true; } public static List<String> getAllLeaves(Element element) { // Get a list of strings representing the relative path // (including the current element) to all the leaf elements // under the current element // Eric: Why return a List? Returning a Set seems to make // more sense. if (element == null) { return null; } List<String> ret = new LinkedList<String>(); if (isLeaf(element)) { ret.add(element.getNodeName()); } else { NodeList nl = element.getChildNodes(); for (int i = 0; i < nl.getLength(); i++) { Node n = nl.item(i); if (n instanceof Element) { Element childElement = (Element) n; for (String childNodeName : getAllLeaves(childElement)) { ret.add(element.getNodeName() + "/" + childNodeName); } } } } return ret; } public static List<String> getAllLeaveValues(Element element) throws XPathExpressionException { if (element == null) { return null; } List<String> ret = new LinkedList<String>(); if (isLeaf(element)) { ret.add(element.getTextContent()); } else { NodeList nl = element.getChildNodes(); for (int i = 0; i < nl.getLength(); i++) { Node n = nl.item(i); if (n instanceof Element) { Element childElement = (Element) n; for (String childText : getAllLeaveValues(childElement)) { ret.add(childText); } } } } return ret; } public static byte[] asByteArray(Element element) throws IOException { ByteArrayOutputStream bis = new ByteArrayOutputStream(); OutputFormat format = new OutputFormat(element.getOwnerDocument()); XMLSerializer serializer = new XMLSerializer( bis, format); serializer.asDOMSerializer(); serializer.serialize(element); return bis.toByteArray(); } public static String asString(Element element) throws IOException { return new String(asByteArray(element)); } public static Document attributize(Document doc) throws ParserConfigurationException { Element root = doc.getDocumentElement(); attributize(root); return doc; } private static void attributize(Element root) { NamedNodeMap attributeMap = root.getAttributes(); for (int i = 0; i < attributeMap.getLength(); i++) { org.w3c.dom.Attr attr = (Attr) attributeMap.item(i); Element attrElement = root.getOwnerDocument().createElement(attr.getName()); attrElement.setTextContent(attr.getValue()); root.appendChild(attrElement); } NodeList children = root.getChildNodes(); for (int i = 0; i < children.getLength(); i++) { if (children.item(i) instanceof Element) { attributize((Element) children.item(i)); } } } public static Document addRoot(Document dataDoc, String elementName) { Element oldRoot = dataDoc.getDocumentElement(); Element newRoot = dataDoc.createElement(elementName); dataDoc.removeChild(oldRoot); newRoot.appendChild(oldRoot); dataDoc.appendChild(newRoot); return dataDoc; } }