/** Copyright 2013, 2014 hbz, Pascal Christoph.
 * Licensed under the Eclipse Public License 1.0 **/
package org.lobid.lodmill;

import java.util.HashSet;

import org.apache.commons.lang.StringEscapeUtils;
import org.culturegraph.mf.framework.DefaultXmlPipe;
import org.culturegraph.mf.framework.StreamReceiver;
import org.culturegraph.mf.framework.XmlReceiver;
import org.culturegraph.mf.framework.annotations.Description;
import org.culturegraph.mf.framework.annotations.In;
import org.culturegraph.mf.framework.annotations.Out;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/**
 * An XML entity splitter.
 *
 * <p>
 * Listens to SAX events and re-serializes every element whose local name
 * equals the configured entity name (including everything nested in it) into
 * a string. Each such string is emitted as one record containing a single
 * literal named {@code entity}. The record id is a counter starting at 0.
 * Optionally the serialized entity is prefixed with an XML declaration and
 * wrapped into a top level element carrying all prefixed namespace
 * declarations seen so far.
 *
 * @author Pascal Christoph (dr0i)
 *
 */
@Description("Splits all entities (aka records) residing in one XML document into multiple single XML documents.")
@In(XmlReceiver.class)
@Out(StreamReceiver.class)
public final class XmlEntitySplitter extends DefaultXmlPipe<StreamReceiver> {

    /** Local name of the elements which are split off as own documents. */
    private String entity;
    /** Buffer holding the serialization of the entity currently processed. */
    private StringBuilder builder = new StringBuilder();
    /** Ready-to-append {@code xmlns:prefix="uri"} attribute strings. */
    private final HashSet<String> namespaces = new HashSet<>();
    private boolean inEntity = false;
    private int recordCnt = 0;
    /** Name of the wrapping top level element; null if none is wanted. */
    private String root;
    /** Start tag of the top level element WITHOUT the closing '>'. */
    private String rootStart = "";
    private String rootEnd = "";
    private String xmlDeclaration =
            "<?xml version = \"1.0\" encoding = \"UTF-8\"?>";
    /** Nesting level of entity elements within the current entity. */
    private int entityDepth = 0;

    /**
     * default constructor
     */
    public XmlEntitySplitter() {
    }

    /**
     * enriched constructor setting the top level element and the entity name
     *
     * @param aTopLevelElement the name of the top level XML tag
     * @param aEntityName the name of the tag defining a new entity to be split
     */
    public XmlEntitySplitter(String aTopLevelElement, String aEntityName) {
        setTopLevelElement(aTopLevelElement);
        setEntityName(aEntityName);
    }

    /**
     * Sets the name of the entity. All these entities in the XML stream will be
     * XML documents on their own.
     *
     * @param name Identifies the entities
     */
    public void setEntityName(final String name) {
        this.entity = name;
    }

    /**
     * Sets the top-level XML document element.
     *
     * @param root the top level element. Don't set it (or pass null or an
     *          empty string) to omit the top level element.
     */
    public void setTopLevelElement(final String root) {
        if (root == null || root.isEmpty()) {
            // Without this guard a null name would yield the literal tags
            // "<null" and "</null>", an empty name "<" and "</>".
            this.root = null;
            this.rootStart = "";
            this.rootEnd = "";
            return;
        }
        this.root = root;
        this.rootStart = "<" + root;
        this.rootEnd = "</" + root + ">";
    }

    /**
     * Sets the XML declaration.
     *
     * @param xmlDeclaration the xml declaration. Default is
     *          {@code <?xml version = "1.0" encoding = "UTF-8"?>}. If an empty
     *          value or null is given, the xml declaration is skipped.
     */
    public void setXmlDeclaration(final String xmlDeclaration) {
        this.xmlDeclaration = xmlDeclaration;
    }

    /**
     * Remembers every prefixed namespace declaration so that it can be
     * re-declared on the top level element of each emitted document. Nothing
     * is collected when no top level element is configured, and the default
     * (unprefixed) namespace is not collected.
     */
    @Override
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
        super.startPrefixMapping(prefix, uri);
        if (root != null && !prefix.isEmpty() && uri != null) {
            // The URI ends up inside an attribute value, so escape it just
            // like ordinary attribute values are escaped.
            namespaces.add(" xmlns:" + prefix + "=\""
                    + StringEscapeUtils.escapeXml(uri) + "\"");
        }
    }

    /**
     * Starts a new record when the entity element is entered and serializes
     * the start tag of every element located inside an entity.
     */
    @Override
    public void startElement(final String uri, final String localName,
            final String qName, final Attributes attributes)
            throws SAXException {
        final boolean isEntityTag = entity.equals(localName);
        if (!inEntity) {
            if (!isEntityTag) {
                return; // outside of any entity: nothing to collect
            }
            builder = new StringBuilder();
            getReceiver().startRecord(String.valueOf(this.recordCnt++));
            inEntity = true;
        }
        if (isEntityTag) {
            // Track nested elements carrying the entity name so that only the
            // outermost end tag terminates the record.
            entityDepth++;
        }
        appendValuesToEntity(qName, attributes);
    }

    /**
     * Appends the start tag {@code <qName attr="value" ...>} to the buffer.
     *
     * @param qName the qualified name of the element
     * @param attributes the attributes of the element; values are XML escaped
     */
    private void appendValuesToEntity(final String qName,
            final Attributes attributes) {
        builder.append('<').append(qName);
        for (int i = 0; i < attributes.getLength(); i++) {
            builder.append(' ').append(attributes.getQName(i)).append("=\"")
                    .append(StringEscapeUtils.escapeXml(attributes.getValue(i)))
                    .append('"');
        }
        builder.append('>');
    }

    /**
     * Serializes the end tag of every element inside an entity. When the
     * outermost entity element is closed, the buffered document is emitted as
     * literal {@code entity}, the record is ended and the state is reset.
     */
    @Override
    public void endElement(final String uri, final String localName,
            final String qName) throws SAXException {
        if (!inEntity) {
            return;
        }
        builder.append("</").append(qName).append('>');
        if (!entity.equals(localName)) {
            return;
        }
        if (entityDepth > 1) {
            // A nested element with the entity name was closed, go on.
            entityDepth--;
            return;
        }
        builder.insert(0, buildDocumentStart()).append(rootEnd);
        getReceiver().literal("entity", builder.toString());
        getReceiver().endRecord();
        reset();
    }

    /**
     * Builds everything preceding the entity in an emitted document: the XML
     * declaration (if any) and the complete start tag of the top level element
     * (if any) including the collected namespace declarations.
     *
     * @return the document prefix, possibly empty
     */
    private String buildDocumentStart() {
        final StringBuilder sb = new StringBuilder();
        if (xmlDeclaration != null) {
            sb.append(xmlDeclaration);
        }
        if (root != null) {
            sb.append(rootStart);
            for (final String ns : namespaces) {
                sb.append(ns);
            }
            // Always close the start tag. It was formerly closed only if at
            // least one namespace had been collected, which produced
            // malformed output like "<root<entity>" otherwise.
            sb.append('>');
        }
        return sb.toString();
    }

    /**
     * Appends XML escaped character data to the buffer of the current entity.
     * Character data outside of an entity is ignored.
     */
    @Override
    public void characters(final char[] chars, final int start,
            final int length) throws SAXException {
        if (!inEntity) {
            return; // would be discarded anyway when the next entity starts
        }
        try {
            builder.append(
                    StringEscapeUtils.escapeXml(new String(chars, start, length)));
        } catch (Exception e) {
            // Best effort: drop the entity in progress.
            // NOTE(review): the record started for this entity is not ended
            // here - confirm that downstream receivers cope with that.
            reset();
        }
    }

    /** Drops the buffered entity and leaves the "inside an entity" state. */
    private void reset() {
        inEntity = false;
        builder = new StringBuilder();
        entityDepth = 0;
    }

    /**
     * Returns the XML declaration.
     *
     * @return the XML declaration
     */
    public String getXmlDeclaration() {
        return xmlDeclaration;
    }

    @Override
    public void onResetStream() {
        reset();
    }
}