/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nifi.processors.standard; import java.io.InputStream; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.TreeMap; import java.util.UUID; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.apache.commons.lang3.StringEscapeUtils; import org.apache.nifi.annotation.behavior.EventDriven; import org.apache.nifi.annotation.behavior.InputRequirement; import org.apache.nifi.annotation.behavior.InputRequirement.Requirement; import org.apache.nifi.annotation.behavior.SideEffectFree; import org.apache.nifi.annotation.behavior.SupportsBatching; import org.apache.nifi.annotation.behavior.WritesAttribute; import org.apache.nifi.annotation.behavior.WritesAttributes; import org.apache.nifi.annotation.documentation.CapabilityDescription; import org.apache.nifi.annotation.documentation.Tags; import org.apache.nifi.components.PropertyDescriptor; import org.apache.nifi.flowfile.FlowFile; import org.apache.nifi.flowfile.attributes.CoreAttributes; import org.apache.nifi.logging.ComponentLog; import org.apache.nifi.processor.AbstractProcessor; import org.apache.nifi.processor.ProcessContext; import org.apache.nifi.processor.ProcessSession; import org.apache.nifi.processor.ProcessorInitializationContext; import org.apache.nifi.processor.Relationship; import org.apache.nifi.processor.util.StandardValidators; import org.apache.nifi.processors.standard.util.XmlElementNotifier; import org.apache.nifi.stream.io.BufferedInputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import static org.apache.nifi.flowfile.attributes.FragmentAttributes.FRAGMENT_COUNT; import static org.apache.nifi.flowfile.attributes.FragmentAttributes.FRAGMENT_ID; import static org.apache.nifi.flowfile.attributes.FragmentAttributes.FRAGMENT_INDEX; import static org.apache.nifi.flowfile.attributes.FragmentAttributes.SEGMENT_ORIGINAL_FILENAME; import static org.apache.nifi.flowfile.attributes.FragmentAttributes.copyAttributesToOriginal; @EventDriven @SideEffectFree @SupportsBatching @Tags({"xml", "split"}) @InputRequirement(Requirement.INPUT_REQUIRED) @CapabilityDescription("Splits an XML File into multiple separate FlowFiles, each comprising a child or descendant of the original root element") @WritesAttributes({ @WritesAttribute(attribute = "fragment.identifier", description = "All split FlowFiles produced from the same parent FlowFile will have the same randomly generated UUID added for this attribute"), @WritesAttribute(attribute = "fragment.index", description = "A one-up number that indicates the ordering of the split FlowFiles that were created from a single parent FlowFile"), @WritesAttribute(attribute = "fragment.count", description = "The number of split FlowFiles generated from the parent FlowFile"), @WritesAttribute(attribute = "segment.original.filename ", description = "The filename of the parent FlowFile") }) public class SplitXml extends AbstractProcessor { public static final PropertyDescriptor SPLIT_DEPTH = new PropertyDescriptor.Builder() .name("Split Depth") .description("Indicates the XML-nesting depth to start splitting XML fragments. A depth of 1 means split the root's children, whereas a depth of" + " 2 means split the root's children's children and so forth.") .required(true) .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR) .defaultValue("1") .build(); public static final Relationship REL_ORIGINAL = new Relationship.Builder() .name("original") .description("The original FlowFile that was split into segments. If the FlowFile fails processing, nothing will be sent to this relationship") .build(); public static final Relationship REL_SPLIT = new Relationship.Builder() .name("split") .description("All segments of the original FlowFile will be routed to this relationship") .build(); public static final Relationship REL_FAILURE = new Relationship.Builder() .name("failure") .description("If a FlowFile fails processing for any reason (for example, the FlowFile is not valid XML), it will be routed to this relationship") .build(); private List<PropertyDescriptor> properties; private Set<Relationship> relationships; private static final String FEATURE_PREFIX = "http://xml.org/sax/features/"; public static final String ENABLE_NAMESPACES_FEATURE = FEATURE_PREFIX + "namespaces"; public static final String ENABLE_NAMESPACE_PREFIXES_FEATURE = FEATURE_PREFIX + "namespace-prefixes"; private static final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); static { saxParserFactory.setNamespaceAware(true); try { saxParserFactory.setFeature(ENABLE_NAMESPACES_FEATURE, true); saxParserFactory.setFeature(ENABLE_NAMESPACE_PREFIXES_FEATURE, true); } catch (Exception e) { final Logger staticLogger = LoggerFactory.getLogger(SplitXml.class); staticLogger.warn("Unable to configure SAX Parser to make namespaces available", e); } } @Override protected void init(final ProcessorInitializationContext context) { final List<PropertyDescriptor> properties = new ArrayList<>(); properties.add(SPLIT_DEPTH); this.properties = Collections.unmodifiableList(properties); final Set<Relationship> relationships = new HashSet<>(); relationships.add(REL_ORIGINAL); relationships.add(REL_SPLIT); relationships.add(REL_FAILURE); this.relationships = Collections.unmodifiableSet(relationships); } @Override public Set<Relationship> getRelationships() { return relationships; } @Override protected List<PropertyDescriptor> getSupportedPropertyDescriptors() { return properties; } @Override public void onTrigger(final ProcessContext context, final ProcessSession session) { final FlowFile original = session.get(); if (original == null) { return; } final int depth = context.getProperty(SPLIT_DEPTH).asInteger(); final ComponentLog logger = getLogger(); final List<FlowFile> splits = new ArrayList<>(); final String fragmentIdentifier = UUID.randomUUID().toString(); final AtomicInteger numberOfRecords = new AtomicInteger(0); final XmlSplitterSaxParser parser = new XmlSplitterSaxParser(xmlTree -> { FlowFile split = session.create(original); split = session.write(split, out -> out.write(xmlTree.getBytes("UTF-8"))); split = session.putAttribute(split, FRAGMENT_ID.key(), fragmentIdentifier); split = session.putAttribute(split, FRAGMENT_INDEX.key(), Integer.toString(numberOfRecords.getAndIncrement())); split = session.putAttribute(split, SEGMENT_ORIGINAL_FILENAME.key(), split.getAttribute(CoreAttributes.FILENAME.key())); splits.add(split); }, depth); final AtomicBoolean failed = new AtomicBoolean(false); session.read(original, rawIn -> { try (final InputStream in = new BufferedInputStream(rawIn)) { SAXParser saxParser = null; try { saxParser = saxParserFactory.newSAXParser(); final XMLReader reader = saxParser.getXMLReader(); reader.setContentHandler(parser); reader.parse(new InputSource(in)); } catch (final ParserConfigurationException | SAXException e) { logger.error("Unable to parse {} due to {}", new Object[]{original, e}); failed.set(true); } } }); if (failed.get()) { session.transfer(original, REL_FAILURE); session.remove(splits); } else { splits.forEach((split) -> { split = session.putAttribute(split, FRAGMENT_COUNT.key(), Integer.toString(numberOfRecords.get())); session.transfer(split, REL_SPLIT); }); final FlowFile originalToTransfer = copyAttributesToOriginal(session, original, fragmentIdentifier, numberOfRecords.get()); session.transfer(originalToTransfer, REL_ORIGINAL); logger.info("Split {} into {} FlowFiles", new Object[]{originalToTransfer, splits.size()}); } } private static class XmlSplitterSaxParser implements ContentHandler { private static final String XML_PROLOGUE = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"; private final XmlElementNotifier notifier; private final int splitDepth; private final StringBuilder sb = new StringBuilder(XML_PROLOGUE); private int depth = 0; private Map<String, String> prefixMap = new TreeMap<>(); public XmlSplitterSaxParser(XmlElementNotifier notifier, int splitDepth) { this.notifier = notifier; this.splitDepth = splitDepth; } @Override public void characters(char[] ch, int start, int length) throws SAXException { // if we're not at a level where we care about capturing text, then return if (depth <= splitDepth) { return; } // capture text for (int i = start; i < start + length; i++) { char c = ch[i]; switch (c) { case '<': sb.append("<"); break; case '>': sb.append(">"); break; case '&': sb.append("&"); break; case '\'': sb.append("'"); break; case '"': sb.append("""); break; default: sb.append(c); break; } } } @Override public void endDocument() throws SAXException { } @Override public void endElement(String uri, String localName, String qName) throws SAXException { // We have finished processing this element. Decrement the depth. int newDepth = --depth; // if we're at a level where we care about capturing text, then add the closing element if (newDepth >= splitDepth) { // Add the element end tag. sb.append("</").append(qName).append(">"); } // If we have now returned to level 1, we have finished processing // a 2nd-level element. Send notification with the XML text and // erase the String Builder so that we can start // processing a new 2nd-level element. if (newDepth == splitDepth) { String elementTree = sb.toString(); notifier.onXmlElementFound(elementTree); // Reset the StringBuilder to just the XML prolog. sb.setLength(XML_PROLOGUE.length()); } } @Override public void endPrefixMapping(String prefix) throws SAXException { prefixMap.remove(prefixToNamespace(prefix)); } @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { } @Override public void processingInstruction(String target, String data) throws SAXException { } @Override public void setDocumentLocator(Locator locator) { } @Override public void skippedEntity(String name) throws SAXException { } @Override public void startDocument() throws SAXException { } @Override public void startElement(final String uri, final String localName, final String qName, final Attributes atts) throws SAXException { // Increment the current depth because start a new XML element. int newDepth = ++depth; // Output the element and its attributes if it is // not the root element. if (newDepth > splitDepth) { sb.append("<"); sb.append(qName); final Set<String> attributeNames = new HashSet<>(); int attCount = atts.getLength(); for (int i = 0; i < attCount; i++) { String attName = atts.getQName(i); attributeNames.add(attName); String attValue = StringEscapeUtils.escapeXml10(atts.getValue(i)); sb.append(" ").append(attName).append("=").append("\"").append(attValue).append("\""); } // If this is the first node we're outputting write out // any additional namespace declarations that are required if (splitDepth == newDepth - 1) { for (Entry<String, String> entry : prefixMap.entrySet()) { // If we've already added this namespace as an attribute then continue if (attributeNames.contains(entry.getKey())) { continue; } sb.append(" "); sb.append(entry.getKey()); sb.append("=\""); sb.append(entry.getValue()); sb.append("\""); } } sb.append(">"); } } @Override public void startPrefixMapping(String prefix, String uri) throws SAXException { final String ns = prefixToNamespace(prefix); prefixMap.put(ns, uri); } private String prefixToNamespace(String prefix) { final String ns; if (prefix.length() == 0) { ns = "xmlns"; } else { ns="xmlns:"+prefix; } return ns; } } }