/* * Copyright (c) 2013, University of Toronto. * * Licensed under the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. You may obtain * a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. */ package edu.toronto.cs.xcurator.common; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.util.ArrayList; import java.util.List; import javax.xml.XMLConstants; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.ParserConfigurationException; import org.w3c.dom.Attr; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import org.xml.sax.SAXException; public class XmlParser { private final boolean debug; public XmlParser() { this.debug = false; } public XmlParser(boolean debug) { this.debug = debug; } public Document parse(String path, int maxElement) throws SAXException, IOException, ParserConfigurationException { DocumentBuilder builder = XMLUtils.createNsAwareDocumentBuilder(); Document doc = builder.parse(path); doc = pruneDocument(doc, maxElement); return doc; } public Document parse(InputStream is, int maxElement) throws SAXException, IOException, ParserConfigurationException { DocumentBuilder builder = XMLUtils.createNsAwareDocumentBuilder(); Document doc = builder.parse(is); doc = pruneDocument(doc, maxElement); return doc; } public Document parse(Reader reader, int maxElement) throws SAXException, IOException, ParserConfigurationException { DocumentBuilder builder = XMLUtils.createNsAwareDocumentBuilder(); Document doc = builder.parse(new InputSource(reader)); doc = pruneDocument(doc, maxElement); return doc; } private Document pruneDocument(Document doc, int maxElement) throws ParserConfigurationException { if (maxElement == -1) { return doc; } Document newDoc = (Document) doc.cloneNode(false); Element newRoot = (Element) doc.getDocumentElement().cloneNode(false); newDoc.adoptNode(newRoot); newDoc.appendChild(newRoot); NodeList nl = doc.getDocumentElement().getChildNodes(); for (int i = 0; i < maxElement && i < nl.getLength(); i++) { if (!(nl.item(i) instanceof Element)) { maxElement++; continue; } Node item = nl.item(i).cloneNode(true); newDoc.adoptNode(item); newDoc.getDocumentElement().appendChild(item); } if (debug) { System.out.println("Creating document of " + newDoc.getDocumentElement().getChildNodes().getLength()); } return newDoc; } /** * Get immediate (1st level) children that are leaves. * * @param root * @return */ public List<Element> getLeafChildElements(Element root) { List<Element> leaves = new ArrayList<>(); if (isLeaf(root)) { return leaves; } NodeList nl = root.getChildNodes(); for (int i = 0; i < nl.getLength(); i++) { Node n = nl.item(i); if (n instanceof Element && isLeaf(n)) { leaves.add((Element) n); } } return leaves; } /** * Get all attributes as a list from the element, ignoring namespace * definitions. * * @param element * @return */ public List<Attr> getAttributes(Element element) { List<Attr> attrList = new ArrayList<>(); NamedNodeMap attributeMap = element.getAttributes(); for (int i = 0; i < attributeMap.getLength(); i++) { Attr attr = (Attr) attributeMap.item(i); if (isNamespaceDef(attr)) { continue; } attrList.add(attr); } return attrList; } /** * Check if the attribute node is a namespace definition. * * @param attr * @return */ public boolean isNamespaceDef(Attr attr) { String prefix = attr.getPrefix(); return (prefix != null && prefix.equals(XMLConstants.XMLNS_ATTRIBUTE)) || attr.getNodeName().equals(XMLConstants.XMLNS_ATTRIBUTE); } /** * Check if the node is a leaf node (with no child elements). * * @param node * @return */ public boolean isLeaf(Node node) { NodeList nodeList = node.getChildNodes(); if (nodeList.getLength() == 0) { return true; } for (int i = 0; i < nodeList.getLength(); i++) { if (nodeList.item(i) instanceof Element) { // if the node contains child element it is not // a leaf node return false; } } return true; } }