/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.handler.dataimport; import static org.apache.solr.handler.dataimport.DataImportHandlerException.SEVERE; import static org.apache.solr.handler.dataimport.DataImportHandlerException.wrapAndThrow; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.xml.transform.Source; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; import java.io.CharArrayReader; import java.io.CharArrayWriter; import java.io.Reader; import java.util.*; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; /** * <p> An implementation of EntityProcessor which uses a streaming xpath parser to extract values out of XML documents. * It is typically used in conjunction with HttpDataSource or FileDataSource. </p> <p/> <p> Refer to <a * href="http://wiki.apache.org/solr/DataImportHandler">http://wiki.apache.org/solr/DataImportHandler</a> for more * details. </p> * <p/> * <b>This API is experimental and may change in the future.</b> * * @version $Id: XPathEntityProcessor.java 887875 2009-12-07 10:27:47Z noble $ * @see XPathRecordReader * @since solr 1.3 */ public class XPathEntityProcessor extends EntityProcessorBase { private static final Logger LOG = LoggerFactory.getLogger(XPathEntityProcessor.class); private static final Map<String, Object> END_MARKER = new HashMap<String, Object>(); protected List<String> placeHolderVariables; protected List<String> commonFields; private String pk; private XPathRecordReader xpathReader; protected DataSource<Reader> dataSource; protected javax.xml.transform.Transformer xslTransformer; protected boolean useSolrAddXml = false; protected boolean streamRows = false; // Amount of time to block reading/writing to queue when streaming protected int blockingQueueTimeOut = 10; // Units for pumpTimeOut protected TimeUnit blockingQueueTimeOutUnits = TimeUnit.SECONDS; // Number of rows to queue for asynchronous processing protected int blockingQueueSize = 1000; protected Thread publisherThread; @SuppressWarnings("unchecked") public void init(Context context) { super.init(context); if (xpathReader == null) initXpathReader(); pk = context.getEntityAttribute("pk"); dataSource = context.getDataSource(); rowIterator = null; } private void initXpathReader() { useSolrAddXml = Boolean.parseBoolean(context .getEntityAttribute(USE_SOLR_ADD_SCHEMA)); streamRows = Boolean.parseBoolean(context .getEntityAttribute(STREAM)); if (context.getResolvedEntityAttribute("batchSize") != null) { blockingQueueSize = Integer.parseInt(context.getEntityAttribute("batchSize")); } if (context.getResolvedEntityAttribute("readTimeOut") != null) { blockingQueueTimeOut = Integer.parseInt(context.getEntityAttribute("readTimeOut")); } String xslt = context.getEntityAttribute(XSL); if (xslt != null) { xslt = context.replaceTokens(xslt); try { Source xsltSource = new StreamSource(xslt); // create an instance of TransformerFactory TransformerFactory transFact = TransformerFactory.newInstance(); xslTransformer = transFact.newTransformer(xsltSource); LOG .info("Using xslTransformer: " + xslTransformer.getClass().getName()); } catch (Exception e) { throw new DataImportHandlerException(SEVERE, "Error initializing XSL ", e); } } if (useSolrAddXml) { // Support solr add documents xpathReader = new XPathRecordReader("/add/doc"); xpathReader.addField("name", "/add/doc/field/@name", true); xpathReader.addField("value", "/add/doc/field", true); } else { String forEachXpath = context.getEntityAttribute(FOR_EACH); if (forEachXpath == null) throw new DataImportHandlerException(SEVERE, "Entity : " + context.getEntityAttribute("name") + " must have a 'forEach' attribute"); try { xpathReader = new XPathRecordReader(forEachXpath); for (Map<String, String> field : context.getAllEntityFields()) { if (field.get(XPATH) == null) continue; int flags = 0; if ("true".equals(field.get("flatten"))) { flags = XPathRecordReader.FLATTEN; } String xpath = field.get(XPATH); xpath = context.replaceTokens(xpath); xpathReader.addField(field.get(DataImporter.COLUMN), xpath, Boolean.parseBoolean(field.get(DataImporter.MULTI_VALUED)), flags); } } catch (RuntimeException e) { throw new DataImportHandlerException(SEVERE, "Exception while reading xpaths for fields", e); } } String url = context.getEntityAttribute(URL); List<String> l = url == null ? Collections.EMPTY_LIST : TemplateString.getVariables(url); for (String s : l) { if (s.startsWith(entityName + ".")) { if (placeHolderVariables == null) placeHolderVariables = new ArrayList<String>(); placeHolderVariables.add(s.substring(entityName.length() + 1)); } } for (Map<String, String> fld : context.getAllEntityFields()) { if (fld.get(COMMON_FIELD) != null && "true".equals(fld.get(COMMON_FIELD))) { if (commonFields == null) commonFields = new ArrayList<String>(); commonFields.add(fld.get(DataImporter.COLUMN)); } } } public Map<String, Object> nextRow() { Map<String, Object> result; if (!context.isRootEntity()) return fetchNextRow(); while (true) { result = fetchNextRow(); if (result == null) return null; if (pk == null || result.get(pk) != null) return result; } } @Override public void postTransform(Map<String, Object> r) { readUsefulVars(r); } @SuppressWarnings("unchecked") private Map<String, Object> fetchNextRow() { Map<String, Object> r = null; while (true) { if (rowIterator == null) initQuery(context.replaceTokens(context.getEntityAttribute(URL))); r = getNext(); if (r == null) { Object hasMore = context.getSessionAttribute(HAS_MORE, Context.SCOPE_ENTITY); try { if ("true".equals(hasMore) || Boolean.TRUE.equals(hasMore)) { String url = (String) context.getSessionAttribute(NEXT_URL, Context.SCOPE_ENTITY); if (url == null) url = context.getEntityAttribute(URL); addNamespace(); initQuery(context.replaceTokens(url)); r = getNext(); if (r == null) return null; } else { return null; } } finally { context.setSessionAttribute(HAS_MORE,null,Context.SCOPE_ENTITY); context.setSessionAttribute(NEXT_URL,null,Context.SCOPE_ENTITY); } } addCommonFields(r); return r; } } private void addNamespace() { Map<String, Object> namespace = new HashMap<String, Object>(); Set<String> allNames = new HashSet<String>(); if (commonFields != null) allNames.addAll(commonFields); if (placeHolderVariables != null) allNames.addAll(placeHolderVariables); if(allNames.isEmpty()) return; for (String name : allNames) { Object val = context.getSessionAttribute(name, Context.SCOPE_ENTITY); if (val != null) namespace.put(name, val); } ((VariableResolverImpl)context.getVariableResolver()).addNamespace(entityName, namespace); } private void addCommonFields(Map<String, Object> r) { if(commonFields != null){ for (String commonField : commonFields) { if(r.get(commonField) == null) { Object val = context.getSessionAttribute(commonField, Context.SCOPE_ENTITY); if(val != null) r.put(commonField, val); } } } } private void initQuery(String s) { Reader data = null; try { final List<Map<String, Object>> rows = new ArrayList<Map<String, Object>>(); try { data = dataSource.getData(s); } catch (Exception e) { if (ABORT.equals(onError)) { wrapAndThrow(SEVERE, e); } else if (SKIP.equals(onError)) { if (LOG.isDebugEnabled()) LOG.debug("Skipping url : " + s, e); wrapAndThrow(DataImportHandlerException.SKIP, e); } else { LOG.warn("Failed for url : " + s, e); rowIterator = Collections.EMPTY_LIST.iterator(); return; } } if (xslTransformer != null) { try { SimpleCharArrayReader caw = new SimpleCharArrayReader(); xslTransformer.transform(new StreamSource(data), new StreamResult(caw)); data = caw.getReader(); } catch (TransformerException e) { if (ABORT.equals(onError)) { wrapAndThrow(SEVERE, e, "Exception in applying XSL Transformeation"); } else if (SKIP.equals(onError)) { wrapAndThrow(DataImportHandlerException.SKIP, e); } else { LOG.warn("Failed for url : " + s, e); rowIterator = Collections.EMPTY_LIST.iterator(); return; } } } if (streamRows) { rowIterator = getRowIterator(data, s); } else { try { xpathReader.streamRecords(data, new XPathRecordReader.Handler() { @SuppressWarnings("unchecked") public void handle(Map<String, Object> record, String xpath) { rows.add(readRow(record, xpath)); } }); } catch (Exception e) { String msg = "Parsing failed for xml, url:" + s + " rows processed:" + rows.size(); if (rows.size() > 0) msg += " last row: " + rows.get(rows.size() - 1); if (ABORT.equals(onError)) { wrapAndThrow(SEVERE, e, msg); } else if (SKIP.equals(onError)) { LOG.warn(msg, e); Map<String, Object> map = new HashMap<String, Object>(); map.put(SKIP_DOC, Boolean.TRUE); rows.add(map); } else if (CONTINUE.equals(onError)) { LOG.warn(msg, e); } } rowIterator = rows.iterator(); } } finally { if (!streamRows) { closeIt(data); } } } private void closeIt(Reader data) { try { data.close(); } catch (Exception e) { /* Ignore */ } } protected Map<String, Object> readRow(Map<String, Object> record, String xpath) { if (useSolrAddXml) { List<String> names = (List<String>) record.get("name"); List<String> values = (List<String>) record.get("value"); Map<String, Object> row = new HashMap<String, Object>(); for (int i = 0; i < names.size() && i < values.size(); i++) { if (row.containsKey(names.get(i))) { Object existing = row.get(names.get(i)); if (existing instanceof List) { List list = (List) existing; list.add(values.get(i)); } else { List list = new ArrayList(); list.add(existing); list.add(values.get(i)); row.put(names.get(i), list); } } else { row.put(names.get(i), values.get(i)); } } return row; } else { record.put(XPATH_FIELD_NAME, xpath); return record; } } private static class SimpleCharArrayReader extends CharArrayWriter { public Reader getReader() { return new CharArrayReader(super.buf, 0, super.count); } } @SuppressWarnings("unchecked") private Map<String, Object> readUsefulVars(Map<String, Object> r) { Object val = r.get(HAS_MORE); if (val != null) context.setSessionAttribute(HAS_MORE, val,Context.SCOPE_ENTITY); val = r.get(NEXT_URL); if (val != null) context.setSessionAttribute(NEXT_URL, val,Context.SCOPE_ENTITY); if (placeHolderVariables != null) { for (String s : placeHolderVariables) { val = r.get(s); context.setSessionAttribute(s, val,Context.SCOPE_ENTITY); } } if (commonFields != null) { for (String s : commonFields) { Object commonVal = r.get(s); if (commonVal != null) { context.setSessionAttribute(s, commonVal,Context.SCOPE_ENTITY); } } } return r; } private Iterator<Map<String, Object>> getRowIterator(final Reader data, final String s) { //nothing atomic about it. I just needed a StongReference final AtomicReference<Exception> exp = new AtomicReference<Exception>(); final BlockingQueue<Map<String, Object>> blockingQueue = new ArrayBlockingQueue<Map<String, Object>>(blockingQueueSize); final AtomicBoolean isEnd = new AtomicBoolean(false); final AtomicBoolean throwExp = new AtomicBoolean(true); publisherThread = new Thread() { public void run() { try { xpathReader.streamRecords(data, new XPathRecordReader.Handler() { @SuppressWarnings("unchecked") public void handle(Map<String, Object> record, String xpath) { if (isEnd.get()) { throwExp.set(false); //To end the streaming . otherwise the parsing will go on forever //though consumer has gone away throw new RuntimeException("BREAK"); } Map<String, Object> row; try { row = readRow(record, xpath); } catch (Exception e) { isEnd.set(true); return; } offer(row); } }); } catch (Exception e) { if(throwExp.get()) exp.set(e); } finally { closeIt(data); if (!isEnd.get()) { offer(END_MARKER); } } } private void offer(Map<String, Object> row) { try { while (!blockingQueue.offer(row, blockingQueueTimeOut, blockingQueueTimeOutUnits)) { if (isEnd.get()) return; LOG.debug("Timeout elapsed writing records. Perhaps buffer size should be increased."); } } catch (InterruptedException e) { return; } finally { synchronized (this) { notifyAll(); } } } }; publisherThread.start(); return new Iterator<Map<String, Object>>() { private Map<String, Object> lastRow; int count = 0; public boolean hasNext() { return !isEnd.get(); } public Map<String, Object> next() { Map<String, Object> row; do { try { row = blockingQueue.poll(blockingQueueTimeOut, blockingQueueTimeOutUnits); if (row == null) { LOG.debug("Timeout elapsed reading records."); } } catch (InterruptedException e) { LOG.debug("Caught InterruptedException while waiting for row. Aborting."); isEnd.set(true); return null; } } while (row == null); if (row == END_MARKER) { isEnd.set(true); if (exp.get() != null) { String msg = "Parsing failed for xml, url:" + s + " rows processed in this xml:" + count; if (lastRow != null) msg += " last row in this xml:" + lastRow; if (ABORT.equals(onError)) { wrapAndThrow(SEVERE, exp.get(), msg); } else if (SKIP.equals(onError)) { wrapAndThrow(DataImportHandlerException.SKIP, exp.get()); } else { LOG.warn(msg, exp.get()); } } return null; } count++; return lastRow = row; } public void remove() { /*no op*/ } }; } public static final String URL = "url"; public static final String HAS_MORE = "$hasMore"; public static final String NEXT_URL = "$nextUrl"; public static final String XPATH_FIELD_NAME = "$forEach"; public static final String FOR_EACH = "forEach"; public static final String XPATH = "xpath"; public static final String COMMON_FIELD = "commonField"; public static final String USE_SOLR_ADD_SCHEMA = "useSolrAddSchema"; public static final String XSL = "xsl"; public static final String STREAM = "stream"; }