/**
* Copyright (c) 2006-2011 Mark Logic Corporation. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* The use of the Apache License does not indicate that this project is
* affiliated with the Apache Software Foundation.
*/
package com.marklogic.recordloader;
import java.io.IOException;
import java.io.InputStream;
import org.xmlpull.v1.XmlPullParser;
import org.xmlpull.v1.XmlPullParserException;
import com.marklogic.ps.SimpleLogger;
import com.marklogic.ps.Utilities;
/**
 * Re-serializes a single XML record, pulled token by token from an
 * {@link XmlPullParser}, and exposes the result as an {@link InputStream}.
 *
 * <p>
 * Tokens are appended to an in-memory character buffer as they are parsed.
 * When a caller reads, the buffer is encoded into bytes and handed out.
 * While parsing, the producer also looks for the record id, which may come
 * from an automatic sequence, from a value set on {@link #currentId} before
 * parsing, from an attribute of the record element (id names starting with
 * "@"), or from the text of the first element named like the id node.
 * </p>
 *
 * @author Michael Blakeley, michael.blakeley@marklogic.com
 *
 */
public class Producer extends InputStream {

    /** Character encoding used when turning buffered text into bytes. */
    private final String outputEncoding = Configuration.OUTPUT_ENCODING_DEFAULT;

    protected SimpleLogger logger;

    /** Parser positioned on the record; all tokens are pulled from here. */
    protected XmlPullParser xpp;

    /** Local name and namespace that identify a record element. */
    private final String recordName;

    private final String recordNamespace;

    /** Parser depth of the current record element, or 0 when not in one. */
    private int recordDepth = 0;

    /** Re-serialized XML text not yet converted to bytes. */
    private StringBuilder buffer;

    /** Encoded form of {@link #buffer} currently being served to readers. */
    private byte[] byteBuffer;

    private final Configuration config;

    /** Name of the id node; a leading "@" means an attribute of the record. */
    private final String idName;

    /** When true, tokens are consumed but nothing is buffered. */
    private boolean skippingRecord = false;

    /** Total number of bytes handed out through the read methods. */
    private long bytesRead = 0;

    protected String currentId = null;

    /** Next unread position within {@link #byteBuffer}. */
    private int byteIndex = 0;

    /** Cleared once the end of the record element has been processed. */
    private boolean keepGoing = true;

    private final boolean copyNamespaceDeclarations;

    /** True until the element the parser starts on has been processed. */
    protected boolean startOfRecord = true;

    /**
     * @param _config
     *            source of the record name, record namespace, id node name,
     *            namespace-copying flag and logger
     * @param _xpp
     *            parser to pull the record from; the first call that
     *            processes input treats its current event as a start tag
     */
    public Producer(Configuration _config, XmlPullParser _xpp) {
        config = _config;
        xpp = _xpp;
        idName = config.getIdNodeName();
        recordNamespace = config.getRecordNamespace();
        recordName = config.getRecordName();
        copyNamespaceDeclarations = config.isCopyNamespaceDeclarations();
        logger = _config.getLogger();
        logger.fine("recordName=" + recordName);
    }

    /**
     * Notes the depth of the record element and, when the id does not come
     * from a child element, determines the id right away.
     *
     * @throws XmlPullParserException
     *             if an attribute id is configured but the record element
     *             has no attributes or lacks that attribute
     */
    private void handleRecordStart() throws XmlPullParserException {
        if (recordDepth > 0) {
            // already inside a record: this is a nested element that
            // merely shares the record name
            return;
        }
        recordDepth = xpp.getDepth();
        logger.finest("recordDepth = " + recordDepth);

        boolean useAutomaticIds = config.isUseAutomaticIds();
        logger.fine("useAutomaticIds=" + useAutomaticIds);
        boolean useFileNameIds = config.isUseFilenameIds();
        logger.fine("useFileNameIds=" + useFileNameIds);
        // a missing id name simply means "no attribute id"; the end of the
        // record reports the absent id
        boolean useAttributeId = null != idName && idName.startsWith("@");
        if (!(useAutomaticIds || useFileNameIds || useAttributeId)) {
            // the id will be found later, in a child element
            return;
        }

        String newId;
        if (useAutomaticIds) {
            // TODO change to be basefile/entry/sequence ???
            newId = config.getAutoId();
            logger.fine("automatic document id " + newId);
        } else if (useFileNameIds) {
            // the id must have been supplied before parsing began;
            // note that skipping won't work for this case
            if (null == currentId) {
                throw new FatalException(
                        "Cannot use filename ids unless the constructor sets currentId");
            }
            logger.fine("using filename id " + currentId);
            newId = currentId;
        } else {
            // the id is an attribute of the record element
            if (xpp.getAttributeCount() < 1) {
                throw new XmlPullParserException(
                        "found no attributes for recordName = "
                                + recordName + ", idName=" + idName
                                + " at " + xpp.getPositionDescription());
            }
            String attributeName = idName.substring(1);
            // try without a namespace first, then with the record's
            newId = xpp.getAttributeValue("", attributeName);
            if (newId == null) {
                newId = xpp.getAttributeValue(recordNamespace,
                        attributeName);
            }
            if (newId == null) {
                throw new XmlPullParserException("null id " + idName
                        + " at " + xpp.getPositionDescription());
            }
            logger.fine("found id " + idName + " = " + newId);
        }
        setCurrentId(newId);
    }

    /**
     * Handles the start tag the parser is currently positioned on: detects
     * the record root, captures an element-based id, and buffers the tag
     * text unless the record is being skipped or the element is empty.
     *
     * @throws IOException
     * @throws XmlPullParserException
     *             if the id element does not contain exactly one text node
     */
    protected void processStartElement() throws IOException,
            XmlPullParserException {
        String name = xpp.getName();
        String namespace = xpp.getNamespace();
        String prefix = xpp.getPrefix();
        boolean isEmpty = xpp.isEmptyElementTag();
        logger.finest("name = " + name);
        String text = xpp.getText();
        logger.finest("text = " + text);

        // guard against and work around a known bug; a parser that
        // supplies no tag text at all is handled the same way
        if (null == text || !text.contains(name)) {
            logger.warning("working around xpp3 bug 249: name = " + name
                    + ", text = " + text);
            text = rebuildStartTag(name, prefix, isEmpty);
        }

        // TODO use startOfRecord field?
        boolean isRecordRoot = name.equals(recordName)
                && namespace.equals(recordNamespace);
        if (isRecordRoot) {
            handleRecordStart();
        }

        // allow for repeated idName elements: first one wins
        // NOTE: idName is namespace-insensitive
        if (null == currentId && name.equals(idName)) {
            // TODO support idNameAttribute or similar
            if (xpp.nextToken() != XmlPullParser.TEXT) {
                throw new XmlPullParserException("badly formed xml or "
                        + idName + " is not a simple node: at"
                        + xpp.getPositionDescription());
            }
            String newId = xpp.getText();
            logger.fine("found id " + idName + " = " + newId);
            setCurrentId(newId);
            // now we know that we'll use this content and id
            write(text);
            write(currentId);
            // advance xpp to the END_ELEMENT - brittle?
            if (xpp.nextToken() != XmlPullParser.END_TAG) {
                throw new XmlPullParserException(
                        "badly formed xml: no END_TAG after id text"
                                + xpp.getPositionDescription());
            }
            text = xpp.getText();
            logger.finest("END_TAG = " + text);
            write(text);
            return;
        }

        // when skipping, do as little work as possible;
        // this avoids OutOfMemory errors, too
        if (skippingRecord) {
            logger.finest("skipping record");
            return;
        }

        // an empty element is written once, when its END_TAG event
        // arrives; attributes are still ok in this case
        if (isEmpty) {
            logger.finest("empty element");
            return;
        }

        if (copyNamespaceDeclarations && isRecordRoot) {
            String declarations = collectNamespaceDeclarations();
            // splice the declarations in front of the closing '>' without
            // regex replacement, so '$' and '\' in a URI stay literal
            if (declarations.length() > 0 && text.endsWith(">")) {
                text = text.substring(0, text.length() - 1) + declarations
                        + ">";
            }
        }

        logger.finest("writing text = " + text);
        write(text);
    }

    /**
     * Reconstructs the current start tag from the parser's name, prefix
     * and attributes, for use when the parser's own tag text is unusable.
     */
    private String rebuildStartTag(String name, String prefix,
            boolean isEmpty) {
        StringBuilder tag = new StringBuilder("<");
        if (null != prefix) {
            tag.append(prefix).append(":");
        }
        tag.append(name);
        int attributeCount = xpp.getAttributeCount();
        for (int i = 0; i < attributeCount; i++) {
            String attributePrefix = xpp.getAttributePrefix(i);
            tag.append(" ");
            if (null != attributePrefix) {
                tag.append(attributePrefix).append(":");
            }
            tag.append(xpp.getAttributeName(i));
            tag.append("=\"");
            tag.append(Utilities.escapeXml(xpp.getAttributeValue(i), true));
            tag.append("\"");
        }
        tag.append(isEmpty ? "/>" : ">");
        return tag.toString();
    }

    /**
     * Builds the xmlns attributes to add to the record root, from the
     * namespace declarations the parser reports for the parent depth (or
     * for depth 1 when the record is the outermost element).
     *
     * @return the attributes, each with a leading space, or an empty
     *         string when there is nothing to copy
     */
    private String collectNamespaceDeclarations()
            throws XmlPullParserException {
        int depth = xpp.getDepth();
        if (depth <= 0) {
            logger.finer("no namespace declarations to copy at " + depth);
            return "";
        }
        int stop = xpp.getNamespaceCount((depth > 1) ? depth - 1 : 1);
        if (stop <= 0) {
            logger.finer("no namespace declarations to copy");
            return "";
        }
        // NOTE(review): a prefix that is also declared on the record
        // element itself is not filtered out here - confirm whether
        // duplicate declarations can occur in practice
        logger.finer("checking namespace declarations");
        StringBuilder declarations = new StringBuilder();
        for (int i = 0; i < stop; i++) {
            String nsDeclPrefix = xpp.getNamespacePrefix(i);
            String nsDeclUri = xpp.getNamespaceUri(i);
            logger.finest("found namespace declaration " + nsDeclPrefix
                    + " = " + nsDeclUri);
            declarations.append(" xmlns");
            if (nsDeclPrefix != null) {
                declarations.append(":");
                declarations.append(nsDeclPrefix);
            }
            declarations.append("=\"");
            // the URI lands inside an attribute value, so escape it the
            // same way as the attribute values of a rebuilt start tag
            declarations.append(Utilities.escapeXml(nsDeclUri, true));
            declarations.append("\"");
        }
        logger.finer("copying namespace declarations");
        return declarations.toString();
    }

    /**
     * Handles the end tag the parser is currently positioned on.
     *
     * @return false when this tag closes the record, true when more of
     *         the record remains to be parsed
     * @throws XmlPullParserException
     *             if the record ends, was not being skipped, and no id
     *             has been found
     */
    protected boolean processEndElement() throws XmlPullParserException {
        String name = xpp.getName();
        String namespace = xpp.getNamespace();
        logger.finest("name = " + name);

        // record the element text
        if (!skippingRecord) {
            write(xpp.getText());
        }

        boolean isRecordEnd = recordName.equals(name)
                && recordNamespace.equals(namespace)
                && recordDepth == xpp.getDepth();
        if (!isRecordEnd) {
            // not the end of the record: go look for more nodes
            return true;
        }

        recordDepth = 0;

        if (skippingRecord) {
            logger.fine("reached the end of skipped record");
            return false;
        }

        if (null == currentId) {
            throw new XmlPullParserException("end of record element "
                    + name + " with no id found: "
                    + Configuration.ID_NAME_KEY + "=" + idName);
        }

        logger.fine("end of record");
        return false;
    }

    /**
     * Appends text to the character buffer, creating it on first use.
     * Does nothing while the record is being skipped.
     *
     * @param string
     *            text to buffer
     */
    private void write(String string) {
        // skipping: avoid the work, and OutOfMemory too
        if (skippingRecord) {
            return;
        }
        if (buffer == null) {
            buffer = new StringBuilder();
        }
        buffer.append(string);
    }

    /**
     * @return the number of bytes handed out by the read methods so far
     */
    public long getBytesRead() {
        return bytesRead;
    }

    /**
     * Returns the record id, parsing (and buffering) more of the record
     * until the id is known or the record ends.
     *
     * @return the id, or null if the record ended without one being set
     * @throws XmlPullParserException
     * @throws IOException
     */
    public String getCurrentId() throws XmlPullParserException,
            IOException {
        if (currentId == null) {
            logger.finer("parsing for id");
            while (keepGoing && currentId == null) {
                processNext();
            }
        }
        logger.fine(currentId);
        return currentId;
    }

    /**
     * @param _id
     *            the id to use for the current record
     */
    public void setCurrentId(String _id) {
        currentId = _id;
    }

    /**
     * @return true if the current record is being consumed without being
     *         buffered
     */
    public boolean isSkippingRecord() {
        return skippingRecord;
    }

    /**
     * Makes sure encoded bytes are ready to be read, parsing more of the
     * record when the previous bytes have been used up.
     *
     * @param _readSize
     *            parsing continues until at least this many characters are
     *            buffered or the record ends
     * @return the number of unread bytes in the byte buffer (possibly 0),
     *         or -1 at the end of the record
     * @throws IOException
     *             also wraps any parser exception
     */
    private int readByteBuffer(int _readSize) throws IOException {
        if (byteBuffer != null) {
            if (byteIndex < byteBuffer.length) {
                logger.finer("existing = " + getByteBufferDescription());
                return byteBuffer.length - byteIndex;
            }
            // everything encoded so far has been consumed
            byteBuffer = null;
            buffer = null;
        }

        if (buffer == null) {
            logger.fine("buffer is null");
            byteBuffer = null;
            // must wrap any non-IOException in an IOException
            try {
                while (keepGoing
                        && (buffer == null || buffer.length() < _readSize)) {
                    processNext();
                }
            } catch (XmlPullParserException e) {
                // keep the parser's description in the message, too
                IOException ioe = new IOException(e.toString());
                ioe.initCause(e);
                throw ioe;
            }
        }

        if (buffer == null) {
            // nothing was produced and nothing more will be: EOF
            logger.fine("EOF");
            return -1;
        }

        if (byteBuffer == null) {
            byteBuffer = buffer.toString().getBytes(outputEncoding);
            byteIndex = 0;
        }
        return byteBuffer.length - byteIndex;
    }

    /**
     * Calls {@link #readByteBuffer(int)} until it reports at least one
     * unread byte or the end of the record. An empty intermediate buffer
     * is discarded by the following call, so this loop always advances.
     */
    private int awaitBytes(int _readSize) throws IOException {
        int available;
        do {
            available = readByteBuffer(_readSize);
        } while (available == 0);
        return available;
    }

    /**
     * Reads the next byte of the re-serialized record.
     *
     * @return the byte as a value from 0 to 255, or -1 at the end of the
     *         record
     * @see java.io.InputStream#read()
     */
    @Override
    public int read() throws IOException {
        int available = awaitBytes(1);
        if (available < 0) {
            return -1;
        }
        bytesRead++;
        // mask to an unsigned value: a raw negative byte would be
        // mistaken for EOF or violate the InputStream contract
        return byteBuffer[byteIndex++] & 0xFF;
    }

    /**
     * Reads up to len bytes of the re-serialized record into b.
     *
     * @return the number of bytes copied, 0 if len is 0, or -1 at the end
     *         of the record
     * @throws IndexOutOfBoundsException
     *             if off or len is negative, or len exceeds the space left
     *             in b after off
     * @see java.io.InputStream#read(byte[], int, int)
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if (b == null) {
            throw new NullPointerException();
        }
        if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
        }
        if (len == 0) {
            return 0;
        }

        // NOTE(review): the requested buffer size is len - 1, as in the
        // original code; the reason is not evident from this file
        int available = awaitBytes(len - 1);
        if (available < 0) {
            return -1;
        }

        int copyLen = Math.min(available, len);
        System.arraycopy(byteBuffer, byteIndex, b, off, copyLen);
        byteIndex += copyLen;
        bytesRead += copyLen;
        return copyLen;
    }

    /**
     * Consumes one parser event and buffers its re-serialized form. The
     * very first call processes the start tag the parser is already on.
     *
     * @throws XmlPullParserException
     *             on unexpected events, or on parser errors other than the
     *             attribute-quoting error that is tolerated when errors
     *             are not configured as fatal
     * @throws IOException
     */
    private void processNext() throws XmlPullParserException, IOException {
        if (!keepGoing) {
            return;
        }

        if (startOfRecord) {
            // by definition, we are at the start of an element
            logger.fine("processing start of record");
            processStartElement();
            startOfRecord = false;
            return;
        }

        int eventType;
        try {
            // NOTE: next() skips comments, ignorable-whitespace, etc.
            // to catch these, use nextToken() instead.
            eventType = xpp.nextToken();
            switch (eventType) {
            case XmlPullParser.START_TAG:
                logger.finest("eventType = START_TAG");
                processStartElement();
                break;
            case XmlPullParser.TEXT:
                logger.finest("eventType = TEXT: " + xpp.getText());
                write(Utilities.escapeXml(xpp.getText()));
                break;
            case XmlPullParser.CDSECT:
                logger.finest("eventType = CDSECT");
                // round-trip it
                write("<![CDATA[");
                write(xpp.getText());
                write("]]>");
                break;
            case XmlPullParser.IGNORABLE_WHITESPACE:
                logger.finest("eventType = IGNORABLE_WHITESPACE");
                write(xpp.getText());
                break;
            case XmlPullParser.ENTITY_REF:
                logger.finest("eventType = ENTITY_REF");
                write("&");
                write(xpp.getName());
                write(";");
                break;
            case XmlPullParser.DOCDECL:
                logger.finest("eventType = DOCDECL");
                write("<!DOCTYPE");
                write(xpp.getText());
                write(">");
                break;
            case XmlPullParser.PROCESSING_INSTRUCTION:
                logger.finest("eventType = PROCESSING_INSTRUCTION");
                write("<?");
                write(xpp.getText());
                write("?>");
                break;
            case XmlPullParser.COMMENT:
                logger.finest("eventType = COMMENT");
                write("<!--");
                write(xpp.getText());
                write("-->");
                break;
            case XmlPullParser.END_TAG:
                logger.finest("eventType = END_TAG");
                keepGoing = processEndElement();
                break;
            case XmlPullParser.START_DOCUMENT:
                logger.finest("eventType = START_DOCUMENT");
                throw new XmlPullParserException(
                        "unexpected start of document within record!\n"
                                + "recordName = " + recordName
                                + ", recordNamespace = "
                                + recordNamespace + " at "
                                + xpp.getPositionDescription());
            case XmlPullParser.END_DOCUMENT:
                logger.finest("eventType = END_DOCUMENT");
                throw new XmlPullParserException(
                        "end of document before end of current record!\n"
                                + "recordName = " + recordName
                                + ", recordNamespace = "
                                + recordNamespace + " at "
                                + xpp.getPositionDescription());
            default:
                throw new XmlPullParserException("UNIMPLEMENTED: "
                        + eventType);
            }
        } catch (XmlPullParserException e) {
            logger.warning(e.getClass().getSimpleName() + " at "
                    + xpp.getPositionDescription());
            // a parser exception may carry no message at all
            String message = e.getMessage();
            if (null != message
                    && message.contains("quotation or apostrophe")
                    && !config.isFatalErrors()) {
                // messed-up attribute? skip it?
                logger.warning("attribute error: " + message);
                // all we can do is ignore it, apparently
            } else {
                throw e;
            }
        }
    }

    /**
     * Turns skipping on or off. When turned on, the rest of the record is
     * consumed immediately without being buffered.
     *
     * @param b
     *            true to skip the current record
     * @throws XmlPullParserException
     * @throws IOException
     */
    public void setSkippingRecord(boolean b)
            throws XmlPullParserException, IOException {
        skippingRecord = b;
        logger.finest("skippingRecord = " + skippingRecord);
        // spool out the rest of the record
        while (skippingRecord && keepGoing) {
            processNext();
        }
    }

    /**
     * @return a description of the read position and the contents of the
     *         byte buffer, intended for log messages
     */
    public String getByteBufferDescription() {
        if (byteBuffer == null) {
            return "" + byteIndex + " in empty byteBuffer";
        }
        String content;
        try {
            // decode with the encoding that produced the bytes
            content = new String(byteBuffer, outputEncoding);
        } catch (java.io.UnsupportedEncodingException e) {
            // not expected: the byte buffer is only ever created by
            // encoding with this same charset name
            content = new String(byteBuffer);
        }
        return "" + byteIndex + "/" + byteBuffer.length + " of " + content;
    }

    /**
     * @return the text buffered so far, or null if nothing is buffered
     */
    public String getBuffer() {
        return (null != buffer) ? buffer.toString() : null;
    }
}