Producer.java example

Explorer
recordloader-master
- src
  - java
    - com
      - marklogic
        ps
        Crypto.java
        JEncryption.java
        PasswordDecrypter.java
        PasswordEncrypter.java
        PropertyClientInterface.java
        RecordLoader.java
        SimpleLogger.java
        TestCipher.java
        TestCrypto.java
        Utilities.java
        timing
        TimedEvent.java
        TimedEventDurationComparator.java
        TimedEventTests.java
        Timer.java
        recordloader
        AbstractConfiguration.java
        AbstractContent.java
        AbstractInputHandler.java
        AbstractLoader.java
        Configuration.java
        ContentFactory.java
        ContentInterface.java
        DefaultInputHandler.java
        FatalException.java
        FileLoader.java
        InputHandlerInterface.java
        Loader.java
        LoaderException.java
        LoaderFactory.java
        LoaderInterface.java
        Monitor.java
        Producer.java
        ProducerFactory.java
        TranscodingFileLoader.java
        TranscodingLoader.java
        ZipReference.java
        http
        HttpModuleContent.java
        HttpModuleContentFactory.java
        junit
        ProducerTest.java
        svn
        SvnInputHandler.java
        xcc
        DelimitedDataConfiguration.java
        DelimitedDataLoader.java
        XccAbstractContent.java
        XccAbstractContentFactory.java
        XccConfiguration.java
        XccContent.java
        XccContentFactory.java
        XccModuleContent.java
        XccModuleContentFactory.java
/**
 * Copyright (c) 2006-2011 Mark Logic Corporation. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * The use of the Apache License does not indicate that this project is
 * affiliated with the Apache Software Foundation.
 */
package com.marklogic.recordloader;

import java.io.IOException;
import java.io.InputStream;

import org.xmlpull.v1.XmlPullParser;
import org.xmlpull.v1.XmlPullParserException;

import com.marklogic.ps.SimpleLogger;
import com.marklogic.ps.Utilities;

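/**
 * An InputStream that pulls the XML events of a single record from an
 * XmlPullParser, buffers their text, and serves that text as bytes encoded
 * with Configuration.OUTPUT_ENCODING_DEFAULT.
 *
 * @author Michael Blakeley, michael.blakeley@marklogic.com
 * 
 */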
public class Producer extends InputStream {

    /** Charset name used when the buffered text is encoded into bytes. */
    private String outputEncoding = Configuration.OUTPUT_ENCODING_DEFAULT;

    /** Logger taken from the configuration in the constructor. */
    protected SimpleLogger logger;

    /** Pull parser that supplies the XML events of the record. */
    protected XmlPullParser xpp;

    /** Element name that marks the root of a record. */
    private String recordName;

    /** Namespace the record root element must have. */
    private String recordNamespace;

    /** Parser depth of the open record root; 0 while no record is open. */
    private int recordDepth = 0;

    /** Text collected by write() that has not yet been fully read. */
    private StringBuilder buffer;

    /** Encoded form of buffer, from which the read methods serve bytes. */
    private byte[] byteBuffer;

    /** Configuration supplying id handling and error-handling settings. */
    private Configuration config;

    /**
     * Name of the node that carries the record id; a leading '@' means an
     * attribute of the record root element.
     */
    private String idName;

    /** While true, write() discards everything it is given. */
    private boolean skippingRecord = false;

    /** Number of bytes handed out by the read methods so far. */
    private long bytesRead = 0;

    /** Id of the current record, or null while it has not been found. */
    protected String currentId = null;

    /** Position of the next unread byte in byteBuffer. */
    private int byteIndex = 0;

    /** Set to false once processEndElement() reports the end of the record. */
    private boolean keepGoing = true;

    /** Whether in-scope namespace declarations are copied onto the record root. */
    private boolean copyNamespaceDeclarations = true;

    /**
     * True until processNext() has handled its first event, which is taken
     * to be the start tag the parser is already positioned on.
     */
    protected boolean startOfRecord = true;

    /**
     * Creates a producer that reads from the given parser.
     *
     * The id node name, record namespace, record name, the
     * copy-namespace-declarations flag and the logger are all read from the
     * configuration once, here.
     *
     * @param _config
     *            configuration that supplies the record settings and logger
     * @param _xpp
     *            pull parser that provides the XML events
     */
    public Producer(Configuration _config, XmlPullParser _xpp) {
        this.config = _config;
        this.xpp = _xpp;

        // cache the settings that are consulted for every element
        this.idName = _config.getIdNodeName();
        this.recordNamespace = _config.getRecordNamespace();
        this.recordName = _config.getRecordName();
        this.copyNamespaceDeclarations = _config
                .isCopyNamespaceDeclarations();

        this.logger = _config.getLogger();
        this.logger.fine("recordName=" + this.recordName);
    }

    /**
     * Records the depth of a newly opened record root and, when the id does
     * not come from a child element, determines the record id right away.
     *
     * The id is taken from the configuration's automatic sequence, from the
     * already-set currentId (filename ids), or from an attribute of the
     * record root when idName starts with '@'. In every other case the id is
     * left for processStartElement() to find.
     *
     * @throws XmlPullParserException
     *             if an attribute id is configured but the record root has
     *             no attributes or lacks the id attribute
     */
    private void handleRecordStart() throws XmlPullParserException {
        // already inside a record: this is only a nested element that
        // happens to share the record name
        if (recordDepth > 0) {
            return;
        }

        recordDepth = xpp.getDepth();
        logger.finest("recordDepth = " + recordDepth);

        final boolean autoIds = config.isUseAutomaticIds();
        logger.fine("useAutomaticIds=" + autoIds);
        final boolean fileNameIds = config.isUseFilenameIds();
        logger.fine("useFileNameIds=" + fileNameIds);

        final String resolvedId;
        if (autoIds) {
            // the configuration hands out the automatic sequence
            // TODO change to be basefile/entry/sequence ???
            resolvedId = config.getAutoId();
            logger.fine("automatic document id " + resolvedId);
        } else if (fileNameIds) {
            // currentId must have been set before parsing started;
            // note that skipping won't work for this case
            if (currentId == null) {
                throw new FatalException(
                        "Cannot use filename ids unless the constructor sets currentId");
            }
            logger.fine("using filename id " + currentId);
            resolvedId = currentId;
        } else if (idName.startsWith("@")) {
            // the id is an attribute of the record root
            if (xpp.getAttributeCount() < 1) {
                throw new XmlPullParserException(
                        "found no attributes for recordName = "
                                + recordName + ", idName=" + idName
                                + " at " + xpp.getPositionDescription());
            }
            final String attributeName = idName.substring(1);
            // look first in no namespace, then in the record namespace
            String attributeValue = xpp.getAttributeValue("", attributeName);
            if (null == attributeValue) {
                attributeValue = xpp.getAttributeValue(recordNamespace,
                        attributeName);
            }
            if (null == attributeValue) {
                throw new XmlPullParserException("null id " + idName
                        + " at " + xpp.getPositionDescription());
            }
            logger.fine("found id " + idName + " = " + attributeValue);
            resolvedId = attributeValue;
        } else {
            // the id comes from a child element, found later
            return;
        }

        setCurrentId(resolvedId);
    }

    /**
     * Handles the START_TAG the parser is currently positioned on.
     *
     * Detects the record root (delegating to handleRecordStart()), captures
     * the record id when this is the first id element seen, and buffers the
     * tag text. Empty elements are not written here: they are left for the
     * matching END_TAG event. When this is the record root and namespace
     * copying is enabled, namespace declarations from enclosing elements are
     * appended to the tag.
     *
     * @throws IOException
     *             declared for overriding implementations
     * @throws XmlPullParserException
     *             if the id element is not a simple text node, or the parser
     *             fails
     */
    protected void processStartElement() throws IOException,
            XmlPullParserException {
        String name = xpp.getName();
        String namespace = xpp.getNamespace();
        String prefix = xpp.getPrefix();
        boolean isEmpty = xpp.isEmptyElementTag();

        logger.finest("name = " + name);
        String text = xpp.getText();
        logger.finest("text = " + text);
        // guard against and work around a known bug; the parser may also
        // supply no tag text at all (null), which must not cause an NPE
        if (null == text || !text.contains(name)) {
            logger.warning("working around xpp3 bug 249: name = " + name
                    + ", text = " + text);
            text = rebuildStartTag(prefix, name, isEmpty);
        }

        // TODO use startOfRecord field?
        boolean isRecordRoot = false;

        if (name.equals(recordName) && namespace.equals(recordNamespace)) {
            isRecordRoot = true;
            handleRecordStart();
        }

        // allow for repeated idName elements: first one wins
        // NOTE: idName is namespace-insensitive
        if (null == currentId && name.equals(idName)) {
            // TODO support idNameAttribute or similar

            // pick out the contents and use it for the uri
            if (xpp.nextToken() != XmlPullParser.TEXT) {
                throw new XmlPullParserException("badly formed xml or "
                        + idName + " is not a simple node: at "
                        + xpp.getPositionDescription());
            }

            String newId = xpp.getText();
            logger.fine("found id " + idName + " = " + newId);

            // now we can set currentId
            setCurrentId(newId);

            // now we know that we'll use this content and id
            write(text);
            write(currentId);

            // advance xpp to the END_ELEMENT - brittle?
            if (xpp.nextToken() != XmlPullParser.END_TAG) {
                throw new XmlPullParserException(
                        "badly formed xml: no END_TAG after id text at "
                                + xpp.getPositionDescription());
            }
            text = xpp.getText();
            logger.finest("END_TAG = " + text);
            write(text);
            return;
        }

        // when skipping, do as little work as possible:
        // this avoids OutOfMemory errors, too
        if (skippingRecord) {
            logger.finest("skipping record");
            return;
        }

        // this seems to be the only way to handle empty elements:
        // write it as a end-element, only.
        // note that attributes are still ok in this case
        if (isEmpty) {
            logger.finest("empty element");
            return;
        }

        if (copyNamespaceDeclarations && isRecordRoot) {
            text = appendNamespaceDeclarations(text);
        }

        logger.finest("writing text = " + text);
        write(text);
    }

    /**
     * Rebuilds the text of the current start tag from the parser's element
     * and attribute accessors. Namespace declarations of the element itself
     * are not reconstructed.
     */
    private String rebuildStartTag(String prefix, String name,
            boolean isEmpty) {
        StringBuilder tag = new StringBuilder("<");
        if (null != prefix) {
            tag.append(prefix).append(':');
        }
        tag.append(name);
        int attributeCount = xpp.getAttributeCount();
        for (int i = 0; i < attributeCount; i++) {
            String aPrefix = xpp.getAttributePrefix(i);
            tag.append(' ');
            if (null != aPrefix) {
                tag.append(aPrefix).append(':');
            }
            tag.append(xpp.getAttributeName(i));
            tag.append("=\"");
            tag.append(Utilities.escapeXml(xpp.getAttributeValue(i), true));
            tag.append('"');
        }
        tag.append(isEmpty ? "/>" : ">");
        return tag.toString();
    }

    /**
     * Returns the given start tag text with the namespace declarations of
     * the enclosing elements inserted before its closing '>'. The text is
     * returned unchanged when there is nothing to copy or when it does not
     * end with '>'.
     */
    private String appendNamespaceDeclarations(String text)
            throws XmlPullParserException {
        int depth = xpp.getDepth();
        if (depth < 1) {
            logger.finer("no namespace declarations to copy at " + depth);
            return text;
        }

        int stop = xpp.getNamespaceCount((depth > 1) ? depth - 1 : 1);
        if (stop < 1) {
            logger.finer("no namespace declarations to copy");
            return text;
        }

        logger.finer("checking namespace declarations");
        StringBuilder decl = new StringBuilder();
        for (int i = 0; i < stop; i++) {
            String nsDeclPrefix = xpp.getNamespacePrefix(i);
            String nsDeclUri = xpp.getNamespaceUri(i);
            logger.finest("found namespace declaration " + nsDeclPrefix
                    + " = " + nsDeclUri);
            decl.append(" xmlns");
            if (nsDeclPrefix != null) {
                decl.append(":");
                decl.append(nsDeclPrefix);
            }
            decl.append("=\"");
            decl.append(nsDeclUri);
            decl.append("\"");
        }

        // copy the namespace decls to the end of the tag, using plain
        // string handling: a regex replacement would misread any '$' or
        // '\' inside a namespace uri
        logger.finer("copying namespace declarations");
        if (!text.endsWith(">")) {
            return text;
        }
        return text.substring(0, text.length() - 1) + decl + ">";
    }

    /**
     * Handles the END_TAG the parser is currently positioned on: buffers its
     * text and decides whether the record is finished.
     *
     * @return true while more events belong to the record; false once the
     *         end tag of the record root has been processed
     * @throws XmlPullParserException
     *             if the record ends, was not being skipped, and no id was
     *             found for it
     */
    protected boolean processEndElement() throws XmlPullParserException {
        // NOTE: must return false when the record end-element is found

        String name = xpp.getName();
        String namespace = xpp.getNamespace();
        logger.finest("name = " + name);

        // record the element text
        if (!skippingRecord) {
            write(xpp.getText());
        }

        // null-safe on both sides: a missing record name or namespace can
        // never match, just as in processStartElement()
        boolean isRecordEnd = null != recordName
                && recordName.equals(name)
                && null != recordNamespace
                && recordNamespace.equals(namespace)
                && recordDepth == xpp.getDepth();
        if (!isRecordEnd) {
            // not the end of the record: go look for more nodes
            return true;
        }

        // reset recordDepth
        recordDepth = 0;

        // end of record: were we skipping?
        if (skippingRecord) {
            logger.fine("reached the end of skipped record");
            return false;
        }

        // did something go wrong?
        if (null == currentId) {
            throw new XmlPullParserException("end of record element "
                    + name + " with no id found: "
                    + Configuration.ID_NAME_KEY + "=" + idName);
        }

        // end of record
        logger.fine("end of record");
        return false;
    }

    /**
     * Appends text to the record buffer, creating the buffer on first use.
     * Nothing is stored while the record is being skipped, which keeps
     * memory use down for skipped records.
     *
     * @param string
     *            text to append
     */
    private void write(String string) {
        if (!skippingRecord) {
            if (null == buffer) {
                buffer = new StringBuilder();
            }
            buffer.append(string);
        }
    }

    /**
     * @return the number of bytes the read methods have handed out so far
     */
    public long getBytesRead() {
        return this.bytesRead;
    }

    /**
     * Returns the id of the current record, parsing and buffering content
     * until the id is known or the record ends.
     *
     * @return the record id, or null if the record ended without one
     * @throws XmlPullParserException
     *             if parsing fails
     * @throws IOException
     *             if processing an event fails
     */
    public String getCurrentId() throws XmlPullParserException,
            IOException {
        if (null == currentId) {
            // keep buffering content until the id node turns up
            logger.finer("parsing for id");
            while (null == currentId && keepGoing) {
                processNext();
            }
        }

        logger.fine(currentId);
        return currentId;
    }

    /**
     * Sets the id of the current record.
     *
     * @param _id
     *            the new record id
     */
    public void setCurrentId(String _id) {
        this.currentId = _id;
    }

    /**
     * @return true if the content of the current record is being discarded
     */
    public boolean isSkippingRecord() {
        return this.skippingRecord;
    }

    /**
     * Makes sure byteBuffer holds unread bytes, parsing more of the record
     * when the previous bytes have all been consumed.
     *
     * @param _readSize
     *            number of buffered characters to aim for before encoding;
     *            parsing stops earlier if the record ends
     * @return the number of unread bytes in byteBuffer, or -1 when the
     *         record is exhausted
     * @throws IOException
     *             if processing fails; a parser exception is wrapped as the
     *             cause
     */
    private int readByteBuffer(int _readSize) throws IOException {
        // do we have something ready to read?
        if (byteBuffer != null) {
            if (byteIndex < byteBuffer.length) {
                // log the position only: decoding the whole buffer on every
                // call makes byte-at-a-time reads quadratic
                logger.finer("existing = " + byteIndex + "/"
                        + byteBuffer.length);
                return byteBuffer.length - byteIndex;
            }
            // everything was consumed: drop both forms of the content
            byteBuffer = null;
            buffer = null;
        }

        if (buffer == null) {
            logger.fine("buffer is null");
            byteBuffer = null;
            // must wrap any non-IOException in an IOException
            try {
                while (keepGoing
                        && (buffer == null || buffer.length() < _readSize)) {
                    processNext();
                }
            } catch (XmlPullParserException e) {
                // keep the parser's message visible on the wrapper
                IOException ioe = new IOException(e.getMessage());
                ioe.initCause(e);
                throw ioe;
            }
        }

        if (buffer == null) {
            // indicate EOF
            logger.fine("EOF");
            return -1;
        }

        if (byteBuffer == null) {
            // get more bytes
            byteBuffer = buffer.toString().getBytes(outputEncoding);
            byteIndex = 0;
        }

        return byteBuffer.length - byteIndex;
    }

    /**
     * Reads the next byte of the record.
     *
     * @return the next byte as a value from 0 to 255, or -1 at the end of
     *         the record
     * @throws IOException
     *             if the record cannot be processed
     * @see java.io.InputStream#read()
     */
    @Override
    public int read() throws IOException {
        int available = readByteBuffer(1);
        // an empty buffer holds nothing to return: ask again, which
        // discards it and either parses more or reports the end
        while (available == 0) {
            available = readByteBuffer(1);
        }

        if (available < 0) {
            return -1;
        }

        bytesRead++;
        // mask to 0-255: a raw byte is signed, so values of 0x80 and above
        // would come back negative and 0xFF would look like end of stream
        return byteBuffer[byteIndex++] & 0xff;
    }

    /**
     * Reads up to len bytes of the record into b, starting at off.
     *
     * @param b
     *            target array
     * @param off
     *            index in b of the first byte to write
     * @param len
     *            maximum number of bytes to copy
     * @return the number of bytes copied, 0 if len is 0, or -1 at the end
     *         of the record
     * @throws IOException
     *             if the record cannot be processed
     * @throws NullPointerException
     *             if b is null
     * @throws IndexOutOfBoundsException
     *             if off or len is negative, or len exceeds the space in b
     *             after off
     * @see java.io.InputStream#read(byte[], int, int)
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        // validate before touching the parser, so that a bad call neither
        // consumes input nor returns a negative len that looks like EOF
        if (null == b) {
            throw new NullPointerException("b");
        }
        if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException("off = " + off
                    + ", len = " + len + ", b.length = " + b.length);
        }
        if (len == 0) {
            return 0;
        }

        int available = readByteBuffer(len - 1);
        // never report 0 bytes for a non-empty request: an empty buffer is
        // discarded by the next call, which parses more or reports the end
        while (available == 0) {
            available = readByteBuffer(len - 1);
        }

        if (available < 0) {
            return -1;
        }

        // copy byte buffer into target buffer
        int copyLen = Math.min(available, len);
        System.arraycopy(byteBuffer, byteIndex, b, off, copyLen);
        byteIndex += copyLen;
        bytesRead += copyLen;

        return copyLen;
    }

    /**
     * Processes one parser event and buffers the XML text it stands for.
     *
     * The first call handles the start tag the parser is taken to be
     * positioned on; every later call advances the parser with nextToken().
     * Once an END_TAG closes the record, keepGoing is cleared and further
     * calls return without doing anything.
     *
     * A parser error whose message mentions "quotation or apostrophe" is
     * logged and ignored unless the configuration asks for fatal errors;
     * every other parser error is rethrown.
     *
     * @throws XmlPullParserException
     *             on a parser error that is not ignored, or on an
     *             unexpected start or end of document
     * @throws IOException
     *             if processing a start tag fails
     */
    private void processNext() throws XmlPullParserException, IOException {
        if (!keepGoing) {
            return;
        }

        if (startOfRecord) {
            // this is the start of the record
            // by definition, we are at the start of an element
            logger.fine("processing start of record");
            processStartElement();
            startOfRecord = false;
            return;
        }

        int eventType;
        try {
            // NOTE: next() skips comments, ignorable-whitespace, etc.
            // to catch these, use nextToken() instead.
            eventType = xpp.nextToken();
            switch (eventType) {
            case XmlPullParser.START_TAG:
                logger.finest("eventType = START_TAG");
                processStartElement();
                break;
            case XmlPullParser.TEXT:
                logger.finest("eventType = TEXT: " + xpp.getText());
                write(Utilities.escapeXml(xpp.getText()));
                break;
            case XmlPullParser.CDSECT:
                logger.finest("eventType = CDSECT");
                // round-trip it
                write("<![CDATA[");
                write(xpp.getText());
                write("]]>");
                break;
            case XmlPullParser.IGNORABLE_WHITESPACE:
                logger.finest("eventType = IGNORABLE_WHITESPACE");
                write(xpp.getText());
                break;
            case XmlPullParser.ENTITY_REF:
                logger.finest("eventType = ENTITY_REF");
                write("&");
                write(xpp.getName());
                write(";");
                break;
            case XmlPullParser.DOCDECL:
                logger.finest("eventType = DOCDECL");
                write("<!DOCTYPE");
                write(xpp.getText());
                write(">");
                break;
            case XmlPullParser.PROCESSING_INSTRUCTION:
                logger.finest("eventType = PROCESSING_INSTRUCTION");
                write("<?");
                write(xpp.getText());
                write("?>");
                break;
            case XmlPullParser.COMMENT:
                logger.finest("eventType = COMMENT");
                write("<!--");
                write(xpp.getText());
                write("-->");
                break;
            case XmlPullParser.END_TAG:
                logger.finest("eventType = END_TAG");
                keepGoing = processEndElement();
                break;
            case XmlPullParser.START_DOCUMENT:
                logger.finest("eventType = START_DOCUMENT");
                throw new XmlPullParserException(
                        "unexpected start of document within record!\n"
                                + "recordName = " + recordName
                                + ", recordNamespace = "
                                + recordNamespace + " at "
                                + xpp.getPositionDescription());
            case XmlPullParser.END_DOCUMENT:
                logger.finest("eventType = END_DOCUMENT");
                throw new XmlPullParserException(
                        "end of document before end of current record!\n"
                                + "recordName = " + recordName
                                + ", recordNamespace = "
                                + recordNamespace + " at "
                                + xpp.getPositionDescription());
            default:
                throw new XmlPullParserException("UNIMPLEMENTED: "
                        + eventType);
            }
        } catch (XmlPullParserException e) {
            logger.warning(e.getClass().getSimpleName() + " at "
                    + xpp.getPositionDescription());
            // the message may be null: a NullPointerException here would
            // hide the parser error that actually happened
            String message = e.getMessage();
            boolean attributeError = null != message
                    && message.contains("quotation or apostrophe");
            if (attributeError && !config.isFatalErrors()) {
                // messed-up attribute? skip it?
                logger.warning("attribute error: " + message);
                // all we can do is ignore it, apparently
            } else {
                throw e;
            }
        }
    }

    /**
     * Turns skipping of the current record on or off. When skipping is
     * turned on, the remaining events of the record are consumed before
     * this method returns.
     *
     * @param b
     *            true to discard the rest of the current record
     * @throws XmlPullParserException
     *             if parsing fails while the record is consumed
     * @throws IOException
     *             if processing an event fails
     */
    public void setSkippingRecord(boolean b)
            throws XmlPullParserException, IOException {
        this.skippingRecord = b;
        logger.finest("skippingRecord = " + this.skippingRecord);

        // drain whatever is left of the record
        while (this.keepGoing && this.skippingRecord) {
            processNext();
        }
    }

    /**
     * Describes the read position and content of the byte buffer, for
     * logging and debugging.
     *
     * @return the read index, the buffer length and the buffer content
     *         decoded as text, or a note that the buffer is empty
     */
    public String getByteBufferDescription() {
        if (byteBuffer == null) {
            return "" + byteIndex + " in empty byteBuffer";
        }

        // decode with the charset the bytes were encoded in, not with the
        // platform default, so that the description shows the real text
        String content;
        try {
            content = new String(byteBuffer, outputEncoding);
        } catch (java.io.UnsupportedEncodingException e) {
            // fall back to the platform default rather than fail a
            // diagnostic call
            content = new String(byteBuffer);
        }
        return "" + byteIndex + "/" + byteBuffer.length + " of " + content;
    }

    /**
     * @return the text buffered so far, or null if nothing is buffered
     */
    public String getBuffer() {
        if (buffer == null) {
            return null;
        }
        return buffer.toString();
    }

}