/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.handler.dataimport; import javax.xml.stream.XMLInputFactory; import static javax.xml.stream.XMLStreamConstants.*; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import java.io.IOException; import java.io.Reader; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * <p> * A streaming xpath parser which uses StAX for XML parsing. It supports only * a subset of xpath syntax. * </p><pre> * /a/b/subject[@qualifier='fullTitle'] * /a/b/subject[@qualifier=]/subtag * /a/b/subject/@qualifier * //a * //a/b... * /a//b * /a//b... * /a/b/c * </pre> * A record is a Map<String,Object> . The key is the provided name * and the value is a String or a List<String> * * This class is thread-safe for parsing xml. But adding fields is not * thread-safe. The recommended usage is to addField() in one thread and * then share the instance across threads. * </p> * <p/> * <b>This API is experimental and may change in the future.</b> * <p> * @version $Id: XPathRecordReader.java 822154 2009-10-06 07:42:28Z noble $ * @since solr 1.3 */ public class XPathRecordReader { private Node rootNode = new Node("/", null); /** * The FLATTEN flag indicates that all text and cdata under a specific * tag should be recursivly fetched and appended to the current Node's * value. */ public static final int FLATTEN = 1; /** * A constructor called with a '|' seperated list of Xpath expressions * which define sub sections of the XML stream that are to be emitted as * seperate records. * * @param forEachXpath The XPATH for which a record is emitted. Once the * xpath tag is encountered, the Node.parse method starts collecting wanted * fields and at the close of the tag, a record is emitted containing all * fields collected since the tag start. Once * emitted the collected fields are cleared. Any fields collected in the * parent tag or above will also be included in the record, but these are * not cleared after emitting the record. * * It uses the ' | ' syntax of XPATH to pass in multiple xpaths. */ public XPathRecordReader(String forEachXpath) { String[] splits = forEachXpath.split("\\|"); for (String split : splits) { split = split.trim(); if (split.startsWith("//")) throw new RuntimeException("forEach cannot start with '//': " + split); if (split.length() == 0) continue; // The created Node has a name set to the full forEach attribute xpath addField0(split, split, false, true, 0); } } /** * A wrapper around {@link #addField0 addField0()} to create a series of * Nodes based on the supplied Xpath and a given fieldName. The created * nodes are inserted into a Node tree. * * @param name The name for this field in the emitted record * @param xpath The xpath expression for this field * @param multiValued If 'true' then the emitted record will have values in * a List<String> */ public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued) { addField0(xpath, name, multiValued, false, 0); return this; } /** * A wrapper around {@link #addField0 addField0()} to create a series of * Nodes based on the supplied Xpath and a given fieldName. The created * nodes are inserted into a Node tree. * * @param name The name for this field in the emitted record * @param xpath The xpath expression for this field * @param multiValued If 'true' then the emitted record will have values in * a List<String> * @param flags FLATTEN: Recursivly combine text from all child XML elements */ public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued, int flags) { addField0(xpath, name, multiValued, false, flags); return this; } /** * Splits the XPATH into a List of xpath segments and calls build() to * construct a tree of Nodes representing xpath segments. The resulting * tree structure ends up describing all the Xpaths we are interested in. * * @param xpath The xpath expression for this field * @param name The name for this field in the emitted record * @param multiValued If 'true' then the emitted record will have values in * a List<String> * @param isRecord Flags that this XPATH is from a forEach statement * @param flags The only supported flag is 'FLATTEN' */ private void addField0(String xpath, String name, boolean multiValued, boolean isRecord, int flags) { if (!xpath.startsWith("/")) throw new RuntimeException("xpath must start with '/' : " + xpath); List<String> paths = splitEscapeQuote(xpath); // deal with how split behaves when seperator starts a string! if ("".equals(paths.get(0).trim())) paths.remove(0); rootNode.build(paths, name, multiValued, isRecord, flags); rootNode.buildOptimise(null); } /** * Uses {@link #streamRecords streamRecords} to parse the XML source but with * a handler that collects all the emitted records into a single List which * is returned upon completion. * * @param r the stream reader * @return results a List of emitted records */ public List<Map<String, Object>> getAllRecords(Reader r) { final List<Map<String, Object>> results = new ArrayList<Map<String, Object>>(); streamRecords(r, new Handler() { public void handle(Map<String, Object> record, String s) { results.add(record); } }); return results; } /** * Creates an XML stream reader on top of whatever reader has been * configured. Then calls parse() with a handler which is * invoked forEach record emitted. * * @param r the stream reader * @param handler The callback instance */ public void streamRecords(Reader r, Handler handler) { try { XMLStreamReader parser = factory.createXMLStreamReader(r); rootNode.parse(parser, handler, new HashMap<String, Object>(), new Stack<Set<String>>(), false); } catch (Exception e) { throw new RuntimeException(e); } } /** * For each node/leaf in the Node tree there is one object of this class. * This tree of objects represents all the XPaths we are interested in. * For each Xpath segment of interest we create a node. In most cases the * node (branch) is rather basic , but for the final portion (leaf) of any * Xpath we add more information to the Node. When parsing the XML document * we step though this tree as we stream records from the reader. If the XML * document departs from this tree we skip start tags till we are back on * the tree. */ private static class Node { String name; // genrally: segment of the Xpath represented by this Node String fieldName; // the fieldname in the emitted record (key of the map) String xpathName; // the segment of the Xpath represented by this Node String forEachPath; // the full Xpath from the forEach entity attribute List<Node> attributes; // List of attribute Nodes associated with this Node List<Node> childNodes; // List of immediate child Nodes of this node List<Node> wildCardNodes; // List of '//' style decendants of this Node List<Map.Entry<String, String>> attribAndValues; Node wildAncestor; // ancestor Node containing '//' style decendants Node parent; // parent Node in the tree boolean hasText=false; // flag: store/emit streamed text for this node boolean multiValued=false; //flag: this fields values are returned as a List boolean isRecord=false; //flag: this Node starts a new record private boolean flatten; //flag: child text is also to be emitted public Node(String name, Node p) { // Create a basic Node, suitable for the mid portions of any Xpath. // Node.xpathName and Node.name are set to same value. xpathName = this.name = name; parent = p; } public Node(String name, String fieldName, boolean multiValued) { // This is only called from build() when describing an attribute. this.name = name; // a segment from the Xpath this.fieldName = fieldName; // name to store collected values against this.multiValued = multiValued; // return collected values in a List } /** * This is the method where all the XML parsing happens. For each * tag/subtag read from the source, this method is called recursively. * */ private void parse(XMLStreamReader parser, Handler handler, Map<String, Object> values, Stack<Set<String>> stack, // lists of values to purge boolean recordStarted ) throws IOException, XMLStreamException { Set<String> valuesAddedinThisFrame = null; if (isRecord) { // This Node is a match for an XPATH from a forEach attribute, // prepare for the clean up that will occurr when the record // is emitted after its END_ELEMENT is matched recordStarted = true; valuesAddedinThisFrame = new HashSet<String>(); stack.push(valuesAddedinThisFrame); } else if (recordStarted) { // This node is a child of some parent which matched against forEach // attribute. Continue to add values to an existing record. valuesAddedinThisFrame = stack.peek(); } try { /* The input stream has deposited us at this Node in our tree of * intresting nodes. Depending on how this node is of interest, * process further tokens from the input stream and decide what * we do next */ if (attributes != null) { // we interested in storing attributes from the input stream for (Node node : attributes) { String value = parser.getAttributeValue(null, node.name); if (value != null || (recordStarted && !isRecord)) { putText(values, value, node.fieldName, node.multiValued); valuesAddedinThisFrame.add(node.fieldName); } } } Set<Node> childrenFound = new HashSet<Node>(); int event = -1; int flattenedStarts=0; // our tag depth when flattening elements StringBuilder text = new StringBuilder(); while (true) { event = parser.next(); if (event == END_ELEMENT) { if (flattenedStarts > 0) flattenedStarts--; else { if (text.length() > 0 && valuesAddedinThisFrame != null) { valuesAddedinThisFrame.add(fieldName); putText(values, text.toString(), fieldName, multiValued); } if (isRecord) handler.handle(getDeepCopy(values), forEachPath); if (childNodes != null && recordStarted && !isRecord && !childrenFound.containsAll(childNodes)) { // nonReccord nodes where we have not collected text for ALL // the child nodes. for (Node n : childNodes) { // For the multivalue child nodes where we could have, but // didnt, collect text. Push a null string into values. if (!childrenFound.contains(n)) n.putNulls(values); } } return; } } else if (hasText && (event==CDATA || event==CHARACTERS || event==SPACE)) { text.append(parser.getText()); } else if (event == START_ELEMENT) { if ( flatten ) flattenedStarts++; else handleStartElement(parser, childrenFound, handler, values, stack, recordStarted); } // END_DOCUMENT is least likely to appear and should be // last in if-then-else skip chain else if (event == END_DOCUMENT) return; } }finally { if ((isRecord || !recordStarted) && !stack.empty()) { Set<String> cleanThis = stack.pop(); if (cleanThis != null) { for (String fld : cleanThis) values.remove(fld); } } } } /** * If a new tag is encountered, check if it is of interest or not by seeing * if it matches against our node tree. If we have deperted from the node * tree then walk back though the tree's ancestor nodes checking to see if * any // expressions exist for the node and compare them against the new * tag. If matched then "jump" to that node, otherwise ignore the tag. * * Note, the list of // expressions found while walking back up the tree * is chached in the HashMap decends. Then if the new tag is to be skipped, * any inner chil tags are compared against the cache and jumped to if * matched. */ private void handleStartElement(XMLStreamReader parser, Set<Node> childrenFound, Handler handler, Map<String, Object> values, Stack<Set<String>> stack, boolean recordStarted) throws IOException, XMLStreamException { Node n = getMatchingNode(parser,childNodes); Map<String, Object> decends=new HashMap<String, Object>(); if (n != null) { childrenFound.add(n); n.parse(parser, handler, values, stack, recordStarted); return; } // The stream has diverged from the tree of interesting elements, but // are there any wildCardNodes ... anywhere in our path from the root? Node dn = this; // checking our Node first! do { if (dn.wildCardNodes != null) { // Check to see if the streams tag matches one of the "//" all // decendents type expressions for this node. n = getMatchingNode(parser, dn.wildCardNodes); if (n != null) { childrenFound.add(n); n.parse(parser, handler, values, stack, recordStarted); break; } // add the list of this nodes wild decendents to the cache for (Node nn : dn.wildCardNodes) decends.put(nn.name, nn); } dn = dn.wildAncestor; // leap back along the tree toward root } while (dn != null) ; if (n == null) { // we have a START_ELEMENT which is not within the tree of // interesting nodes. Skip over the contents of this element // but recursivly repeat the above for any START_ELEMENTs // found within this element. int count = 1; // we have had our first START_ELEMENT while (count != 0) { int token = parser.next(); if (token == START_ELEMENT) { Node nn = (Node) decends.get(parser.getLocalName()); if (nn != null) { // We have a //Node which matches the stream's parser.localName childrenFound.add(nn); // Parse the contents of this stream element nn.parse(parser, handler, values, stack, recordStarted); } else count++; } else if (token == END_ELEMENT) count--; } } } /** * Check if the current tag is to be parsed or not. We step through the * supplied List "searchList" looking for a match. If matched, return the * Node object. */ private Node getMatchingNode(XMLStreamReader parser,List<Node> searchL){ if (searchL == null) return null; String localName = parser.getLocalName(); for (Node n : searchL) { if (n.name.equals(localName)) { if (n.attribAndValues == null) return n; if (checkForAttributes(parser, n.attribAndValues)) return n; } } return null; } private boolean checkForAttributes(XMLStreamReader parser, List<Map.Entry<String, String>> attrs) { for (Map.Entry<String, String> e : attrs) { String val = parser.getAttributeValue(null, e.getKey()); if (val == null) return false; if (e.getValue() != null && !e.getValue().equals(val)) return false; } return true; } /** * A recursive routine that walks the Node tree from a supplied start * pushing a null string onto every multiValued fieldName's List of values * where a value has not been provided from the stream. */ private void putNulls(Map<String, Object> values) { if (attributes != null) { for (Node n : attributes) { if (n.multiValued) putText(values, null, n.fieldName, true); } } if (hasText && multiValued) putText(values, null, fieldName, true); if (childNodes != null) { for (Node childNode : childNodes) childNode.putNulls(values); } } /** * Add the field name and text into the values Map. If it is a non * multivalued field, then the text is simply placed in the object * portion of the Map. If it is a multivalued field then the text is * pushed onto a List which is the object portion of the Map. */ @SuppressWarnings("unchecked") private void putText(Map<String, Object> values, String value, String fieldName, boolean multiValued) { if (multiValued) { List<String> v = (List<String>) values.get(fieldName); if (v == null) { v = new ArrayList<String>(); values.put(fieldName, v); } v.add(value); } else { values.put(fieldName, value); } } /** * Walk the Node tree propagating any wildDescentant information to * child nodes. This allows us to optimise the performance of the * main parse method. */ private void buildOptimise(Node wa) { wildAncestor=wa; if ( wildCardNodes != null ) wa = this; if ( childNodes != null ) for ( Node n : childNodes ) n.buildOptimise(wa); } /** * Build a Node tree structure representing all Xpaths of intrest to us. * This must be done before parsing of the XML stream starts. Each node * holds one portion of an Xpath. Taking each Xpath segment in turn this * method walks the Node tree and finds where the new segment should be * inserted. It creates a Node representing a field's name, XPATH and * some flags and inserts the Node into the Node tree. */ private void build( List<String> paths, // a List of segments from the split xpaths String fieldName, // the fieldName assoc with this Xpath boolean multiValued, // flag if this fieldName is multiValued or not boolean record, // is this xpath a record or a field int flags // are we to flatten matching xpaths ) { // recursivly walk the paths Lists adding new Nodes as required String xpseg = paths.remove(0); // shift out next Xpath segment if (paths.isEmpty() && xpseg.startsWith("@")) { // we have reached end of element portion of Xpath and can now only // have an element attribute. Add it to this nodes list of attributes if (attributes == null) { attributes = new ArrayList<Node>(); } xpseg = xpseg.substring(1); // strip the '@' attributes.add(new Node(xpseg, fieldName, multiValued)); } else if ( xpseg.length() == 0) { // we have a '//' selector for all decendents of the current nodes xpseg = paths.remove(0); // shift out next Xpath segment if (wildCardNodes == null) wildCardNodes = new ArrayList<Node>(); Node n = getOrAddNode(xpseg, wildCardNodes); if (paths.isEmpty()) { // We are current a leaf node. // xpath with content we want to store and return n.hasText = true; // we have to store text found here n.fieldName = fieldName; // name to store collected text against n.multiValued = multiValued; // true: text be stored in a List n.flatten = flags == FLATTEN; // true: store text from child tags } else { // recurse to handle next paths segment n.build(paths, fieldName, multiValued, record, flags); } } else { if (childNodes == null) childNodes = new ArrayList<Node>(); // does this "name" already exist as a child node. Node n = getOrAddNode(xpseg,childNodes); if (paths.isEmpty()) { // We have emptied paths, we are for the moment a leaf of the tree. // When parsing the actual input we have traversed to a position // where we actutally have to do something. getOrAddNode() will // have created and returned a new minimal Node with name and // xpathName already populated. We need to add more information. if (record) { // forEach attribute n.isRecord = true; // flag: forEach attribute, prepare to emit rec n.forEachPath = fieldName; // the full forEach attribute xpath } else { // xpath with content we want to store and return n.hasText = true; // we have to store text found here n.fieldName = fieldName; // name to store collected text against n.multiValued = multiValued; // true: text be stored in a List n.flatten = flags == FLATTEN; // true: store text from child tags } } else { // recurse to handle next paths segment n.build(paths, fieldName, multiValued, record, flags); } } } private Node getOrAddNode(String xpathName, List<Node> searchList ) { for (Node n : searchList) if (n.xpathName.equals(xpathName)) return n; // new territory! add a new node for this Xpath bitty Node n = new Node(xpathName, this); // a minimal Node initalization Matcher m = ATTRIB_PRESENT_WITHVAL.matcher(xpathName); if (m.find()) { n.name = m.group(1); int start = m.start(2); while (true) { HashMap<String, String> attribs = new HashMap<String, String>(); if (!m.find(start)) break; attribs.put(m.group(3), m.group(5)); start = m.end(6); if (n.attribAndValues == null) n.attribAndValues = new ArrayList<Map.Entry<String, String>>(); n.attribAndValues.addAll(attribs.entrySet()); } } searchList.add(n); return n; } /** * Copies a supplied Map to a new Map which is returned. Used to copy a * records values. If a fields value is a List then they have to be * deep-copied for thread safety */ private static Map<String, Object> getDeepCopy(Map<String, Object> values) { Map<String, Object> result = new HashMap<String, Object>(); for (Map.Entry<String, Object> entry : values.entrySet()) { if (entry.getValue() instanceof List) { result.put(entry.getKey(), new ArrayList((List) entry.getValue())); } else { result.put(entry.getKey(), entry.getValue()); } } return result; } } // end of class Node /** * The Xpath is split into segments using the '/' as a seperator. However * this method deals with special cases where there is a slash '/' character * inside the attribute value e.g. x/@html='text/html'. We split by '/' but * then reassemble things were the '/' appears within a quoted sub-string. * * We have already enforced that the string must begin with a seperator. This * method depends heavily on how split behaves if the string starts with the * seperator or if a sequence of multiple seperator's appear. */ private static List<String> splitEscapeQuote(String str) { List<String> result = new LinkedList<String>(); String[] ss = str.split("/"); for (int i=0; i<ss.length; i++) { // i=1: skip seperator at start of string StringBuilder sb = new StringBuilder(); int quoteCount = 0; while (true) { sb.append(ss[i]); for (int j=0; j<ss[i].length(); j++) if (ss[i].charAt(j) == '\'') quoteCount++; // have we got a split inside quoted sub-string? if ((quoteCount % 2) == 0) break; // yes!; replace the '/' and loop to concat next token i++; sb.append("/"); } result.add(sb.toString()); } return result; } static XMLInputFactory factory = XMLInputFactory.newInstance(); static{ factory.setProperty(XMLInputFactory.IS_VALIDATING , Boolean.FALSE); factory.setProperty(XMLInputFactory.SUPPORT_DTD , Boolean.FALSE); } /**Implement this interface to stream records as and when one is found. * */ public static interface Handler { /** * @param record The record map. The key is the field name as provided in * the addField() methods. The value can be a single String (for single * valued fields) or a List<String> (for multiValued). * @param xpath The forEach XPATH for which this record is being emitted * If there is any change all parsing will be aborted and the Exception * is propogated up */ public void handle(Map<String, Object> record, String xpath); } private static final Pattern ATTRIB_PRESENT_WITHVAL = Pattern .compile("(\\S*?)?(\\[@)(\\S*?)(='(.*?)')?(\\])"); }