/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2017 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.trans.steps.xmlinputstream;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.Namespace;
import javax.xml.stream.events.XMLEvent;
import org.apache.commons.vfs2.FileSystemException;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.ResultFile;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleStepException;
import org.pentaho.di.core.exception.KettleValueException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.util.Utils;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
/**
* Use a StAX parser to read XML in a flexible and fast way.
*
* @author Jens Bleuel
* @since 2011-01-13
*/
// TODO black box testing
public class XMLInputStream extends BaseStep implements StepInterface {
// Anchor class handed to BaseMessages.getString(...) for every message lookup in this step.
private static Class<?> PKG = XMLInputStream.class; // for i18n purposes, needed by Translator2!!
// Length of the per-level bookkeeping arrays created in resetElementCounters(); processEvent() rejects
// documents whose nesting level would reach this value.
private static int PARENT_ID_ALLOCATE_SIZE = 1000; // max. number of nested elements, we may let the user configure
// this
// Step settings and runtime state; both are (re)assigned from the arguments of init() and dispose().
private XMLInputStreamMeta meta;
private XMLInputStreamData data;
// Position of meta.sourceFieldName in the incoming row; resolved on the first row when XML is read from a field.
private int inputFieldIndex;
// Human readable event names, indexed by the numeric event type reported by the StAX event.
// Index 0 ("UNKNOWN") is used as the prefix for event types that have no entry in this table.
static final String[] eventDescription = { "UNKNOWN", "START_ELEMENT", "END_ELEMENT", "PROCESSING_INSTRUCTION",
"CHARACTERS", "COMMENT", "SPACE", "START_DOCUMENT", "END_DOCUMENT", "ENTITY_REFERENCE", "ATTRIBUTE", "DTD",
"CDATA", "NAMESPACE", "NOTATION_DECLARATION", "ENTITY_DECLARATION" };
/**
 * Creates the step instance. All arguments are handed unchanged to the {@link BaseStep} constructor; no state of
 * this class is initialised here (that happens in {@code init}).
 */
public XMLInputStream( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
Trans trans ) {
super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
}
/**
 * Emits at most one "main" output row per call (attribute, namespace and end-element rows may additionally be
 * written by the helpers called from here).
 * <p>
 * Two modes exist:
 * <ul>
 * <li>file mode ({@code meta.sourceFromInput == false}): the file names come from the step settings or from the
 * previous step; files are read one after the other;</li>
 * <li>field mode ({@code meta.sourceFromInput == true}): every incoming row carries a complete XML document in
 * the field {@code meta.sourceFieldName}.</li>
 * </ul>
 *
 * @param smi
 *          step meta (not used directly, {@code init} already stored it)
 * @param sdi
 *          step data (not used directly, {@code init} already stored it)
 * @return {@code true} if the method should be called again, {@code false} once all input is consumed or the row
 *         limit is reached
 * @throws KettleException
 *           if no input rows/field are available in field mode, or if the XML source cannot be opened or parsed
 */
@Override
public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
  if ( first && !meta.sourceFromInput ) {
    first = false;
    if ( data.filenames == null ) {
      getFilenamesFromPreviousSteps();
    }
    if ( !openNextFile() ) {
      // There is not a single file to read (e.g. the previous step delivered no rows). Finish cleanly instead
      // of continuing with a reader that was never created.
      setOutputDone(); // signal end to receiver(s)
      return false;
    }
    resetElementCounters();
  }

  Object[] outputRowData;
  if ( meta.sourceFromInput ) {
    Object[] row = null;
    if ( first ) {
      first = false;
      row = getRow();
      // get input field index
      if ( getInputRowMeta() == null ) {
        throw new KettleException( BaseMessages.getString( PKG, "XMLInputStream.NoIncomingRowsFound" ) );
      }
      inputFieldIndex = getInputRowMeta().indexOfValue( meta.sourceFieldName );
      if ( inputFieldIndex < 0 ) {
        throw new KettleException( BaseMessages.getString( PKG, "XMLInputStream.FilenameFieldNotFound",
          meta.sourceFieldName ) );
      }
    }

    // A null reader means the previous document is exhausted (or none was started yet): fetch the next row.
    if ( data.xmlEventReader == null ) {
      if ( row == null ) {
        row = getRow();
      }
      if ( row == null ) {
        setOutputDone(); // signal end to receiver(s)
        return false; // This is the end of this step.
      }
      String xml = getInputRowMeta().getString( row, inputFieldIndex );
      if ( xml == null ) {
        // A null field value holds no document; StringReader would fail with a NullPointerException.
        // Skip this row and continue with the next one on the following call.
        return true;
      }
      try {
        data.xmlEventReader = data.staxInstance.createXMLEventReader( new StringReader( xml ) );
      } catch ( XMLStreamException e ) {
        throw new KettleException( e );
      }
      resetElementCounters();
    }

    outputRowData = getRowFromXML();
    if ( outputRowData == null ) {
      // current document is finished, the next call picks up the next incoming row
      data.xmlEventReader = null;
      return true;
    }
  } else {
    outputRowData = getRowFromXML();
    if ( outputRowData == null ) {
      if ( openNextFile() ) {
        resetElementCounters();
        return true;
      } else {
        setOutputDone(); // signal end to receiver(s)
        return false; // This is the end of this step.
      }
    }
  }

  putRowOut( outputRowData );

  // limit has been reached: stop now. (not exact science since some attributes could be mixed within the last row)
  if ( data.rowLimit > 0 && data.rowNumber >= data.rowLimit ) {
    setOutputDone();
    return false;
  }
  return true;
}
/**
 * Closes whatever is currently open and opens the file at position {@code data.filenr} of
 * {@code data.filenames}, creating a new XML event reader on it with the configured encoding.
 *
 * @return {@code false} if all files have been processed already, {@code true} if a new file was opened
 * @throws KettleException
 *           if the file cannot be accessed or the XML reader cannot be created
 */
private boolean openNextFile() throws KettleException {
  try {
    closeFile();

    boolean allFilesDone = data.filenr >= data.filenames.length;
    if ( allFilesDone ) {
      return false;
    }

    String nextFilename = data.filenames[data.filenr];
    data.fileObject = KettleVFS.getFileObject( nextFilename, getTransMeta() );
    data.inputStream = KettleVFS.getInputStream( data.fileObject );
    data.xmlEventReader = data.staxInstance.createXMLEventReader( data.inputStream, data.encoding );
  } catch ( XMLStreamException xmlProblem ) {
    throw new KettleException( xmlProblem );
  } catch ( IOException ioProblem ) {
    throw new KettleException( ioProblem );
  }

  // from here on "data.filenr - 1" addresses the file that is currently open
  data.filenr++;

  if ( !meta.isAddResultFile() ) {
    return true;
  }

  // Add this to the result file names...
  ResultFile readFile =
    new ResultFile( ResultFile.FILE_TYPE_GENERAL, data.fileObject, getTransMeta().getName(), getStepname() );
  readFile.setComment( BaseMessages.getString( PKG, "XMLInputStream.Log.ResultFileWasRead" ) );
  addResultFile( readFile );
  return true;
}
/**
 * Closes the XML event reader, the input stream and the file object, if present. Closing is best effort: a
 * failure is only logged (basic level) and the remaining resources are still closed.
 * <p>
 * Each reference is cleared after the close attempt so that a later call (for example from {@code dispose})
 * does not close the same resource a second time.
 */
private void closeFile() {
  // The name is only used for the log messages. data.filenames is null when the XML comes from an input field
  // and data.filenr is 0 as long as no file has been opened, so the lookup has to be guarded.
  String currentName = "";
  if ( data.filenames != null && data.filenr > 0 && data.filenr <= data.filenames.length ) {
    currentName = data.filenames[data.filenr - 1];
  }

  if ( data.xmlEventReader != null ) {
    try {
      data.xmlEventReader.close();
    } catch ( XMLStreamException e ) {
      if ( log.isBasic() ) {
        log.logBasic( BaseMessages.getString( PKG, "XMLInputStream.Log.UnableToCloseFile", currentName ), e );
      }
    } finally {
      data.xmlEventReader = null;
    }
  }

  if ( data.inputStream != null ) {
    try {
      data.inputStream.close();
    } catch ( IOException e ) {
      if ( log.isBasic() ) {
        log.logBasic( BaseMessages.getString( PKG, "XMLInputStream.Log.UnableToCloseFile", currentName ), e );
      }
    } finally {
      data.inputStream = null;
    }
  }

  if ( data.fileObject != null ) {
    try {
      data.fileObject.close();
    } catch ( FileSystemException e ) {
      if ( log.isBasic() ) {
        log.logBasic( BaseMessages.getString( PKG, "XMLInputStream.Log.UnableToCloseFile", currentName ), e );
      }
    } finally {
      data.fileObject = null;
    }
  }
}
/**
 * Reads all rows from the previous step and collects the file names found in the field named by
 * {@code meta.getFilename()} (after variable substitution) into {@code data.filenames}.
 * <p>
 * If the previous step delivers no rows at all, {@code data.filenames} becomes an empty array; the field lookup
 * is skipped in that case because no input row layout is available to look the field up in.
 *
 * @throws KettleException
 *           if rows arrive but the file name field is not part of them
 */
private void getFilenamesFromPreviousSteps() throws KettleException {
  List<String> filenames = new ArrayList<String>();

  Object[] row = getRow();
  if ( row != null ) {
    // Get the filename field index...
    //
    String filenameField = environmentSubstitute( meta.getFilename() );
    int index = getInputRowMeta().indexOfValue( filenameField );
    if ( index < 0 ) {
      throw new KettleException( BaseMessages.getString(
        PKG, "XMLInputStream.FilenameFieldNotFound", filenameField ) );
    }

    while ( row != null ) {
      String filename = getInputRowMeta().getString( row, index );
      filenames.add( filename ); // add it to the list...
      row = getRow(); // Grab another row...
    }
  }

  data.filenames = filenames.toArray( new String[filenames.size()] );
  logDetailed( BaseMessages.getString( PKG, "XMLInputStream.Log.ReadingFromNrFiles", Integer
    .toString( data.filenames.length ) ) );
}
/**
 * Completes a row with the bookkeeping columns (file name, row number, element id/level, parent id, path and
 * parent path) and sends it to the next step. Used for normal rows as well as attribute/namespace rows.
 * <p>
 * The row counter is incremented for every call, also for rows that are dropped because they fall into the
 * "rows to skip" range.
 *
 * @param r
 *          the row to complete and send; it is modified in place
 * @throws KettleStepException
 *           if the row cannot be passed on
 * @throws KettleValueException
 *           if the row cannot be converted to text for row level logging
 */
private void putRowOut( Object[] r ) throws KettleStepException, KettleValueException {
  data.rowNumber++;

  // The file name is only known in file mode after a file has been opened; data.filenames is null when the XML
  // is read from an input field. The column is left empty in that case.
  if ( data.pos_xml_filename != -1 && data.filenames != null && data.filenr > 0
    && data.filenr <= data.filenames.length ) {
    r[data.pos_xml_filename] = data.filenames[data.filenr - 1];
  }
  if ( data.pos_xml_row_number != -1 ) {
    r[data.pos_xml_row_number] = Long.valueOf( data.rowNumber );
  }
  if ( data.pos_xml_element_id != -1 ) {
    r[data.pos_xml_element_id] = data.elementLevelID[data.elementLevel];
  }
  if ( data.pos_xml_element_level != -1 ) {
    r[data.pos_xml_element_level] = Long.valueOf( data.elementLevel );
  }
  if ( data.pos_xml_parent_element_id != -1 ) {
    r[data.pos_xml_parent_element_id] = data.elementParentID[data.elementLevel];
  }
  if ( data.pos_xml_path != -1 ) {
    r[data.pos_xml_path] = data.elementPath[data.elementLevel];
  }
  // level 0 has no parent, so there is no parent path to report
  if ( data.pos_xml_parent_path != -1 && data.elementLevel > 0 ) {
    r[data.pos_xml_parent_path] = data.elementPath[data.elementLevel - 1];
  }

  // We could think of adding an option to filter Start_end Document / Elements, RegEx?
  // We could think of adding columns identifying Element-Blocks

  // Skip rows? (not exact science since some attributes could be mixed within the last row)
  if ( data.nrRowsToSkip == 0 || data.rowNumber > data.nrRowsToSkip ) {
    if ( log.isRowLevel() ) {
      logRowlevel( "Read row: " + data.outputRowMeta.getString( r ) );
    }
    putRow( data.outputRowMeta, r );
  }
}
/**
 * Consumes XML events from the current reader until one of them yields a row, the reader has no more events or
 * the step is stopped. Every consumed event counts as one input line.
 *
 * @return the row built from the last consumed event, or {@code null} if none of the consumed events produced one
 * @throws KettleException
 *           if an event cannot be read or processed
 */
private Object[] getRowFromXML() throws KettleException {
  Object[] row = null;

  // loop until significant data is there and more data is there
  for ( ;; ) {
    if ( !data.xmlEventReader.hasNext() ) {
      break;
    }
    if ( row != null ) {
      break;
    }
    if ( isStopped() ) {
      break;
    }

    row = processEvent();

    // log all events (but no attributes sent by the EventReader)
    incrementLinesInput();
    boolean feedbackDue = checkFeedback( getLinesInput() );
    if ( feedbackDue && isBasic() ) {
      String lineNumber = Long.toString( getLinesInput() );
      logBasic( BaseMessages.getString( PKG, "XMLInputStream.Log.LineNumber", lineNumber ) );
    }
  }

  return row;
}
/**
 * Reads the next event from the XML reader and converts it into an output row.
 * <p>
 * Besides building the row this method maintains the element bookkeeping in {@code data}: nesting level, element
 * ids, parent ids, element names and paths. Rows for end elements (and, via the helpers, for the element itself
 * when it carries attributes or namespaces) are sent from within this method.
 *
 * @return the row for this event, or {@code null} if the event produces no row for the caller to send (white
 *         space, comments, processing instructions, entity references, empty text, end elements, unknown events)
 * @throws KettleException
 *           if the event cannot be read, the nesting is deeper than supported or a row cannot be sent
 */
private Object[] processEvent() throws KettleException {
  Object[] outputRowData = RowDataUtil.allocateRowData( data.outputRowMeta.size() );

  XMLEvent e;
  try {
    e = data.xmlEventReader.nextEvent();
  } catch ( XMLStreamException ex ) {
    throw new KettleException( ex );
  }

  int eventType = e.getEventType();
  if ( data.pos_xml_data_type_numeric != -1 ) {
    outputRowData[data.pos_xml_data_type_numeric] = Long.valueOf( eventType );
  }
  if ( data.pos_xml_data_type_description != -1 ) {
    // Valid table indexes are 1 .. length - 1; everything else (including eventType == length and negative
    // values) is reported as unknown instead of running past the end of the table.
    if ( eventType <= 0 || eventType >= eventDescription.length ) {
      // unknown eventType
      outputRowData[data.pos_xml_data_type_description] = eventDescription[0] + "(" + eventType + ")";
    } else {
      outputRowData[data.pos_xml_data_type_description] = eventDescription[eventType];
    }
  }
  if ( data.pos_xml_location_line != -1 ) {
    outputRowData[data.pos_xml_location_line] = Long.valueOf( e.getLocation().getLineNumber() );
  }
  if ( data.pos_xml_location_column != -1 ) {
    outputRowData[data.pos_xml_location_column] = Long.valueOf( e.getLocation().getColumnNumber() );
  }

  switch ( eventType ) {
    case XMLStreamConstants.START_ELEMENT: {
      data.elementLevel++;
      if ( data.elementLevel > PARENT_ID_ALLOCATE_SIZE - 1 ) {
        throw new KettleException(
          BaseMessages.getString( PKG, "XMLInputStream.Log.TooManyNestedElements", PARENT_ID_ALLOCATE_SIZE ) );
      }
      // The first element opened on this level below the current parent records the parent's id (the id of the
      // most recently started element); later siblings keep that value until the parent is closed.
      if ( data.elementParentID[data.elementLevel] == null ) {
        data.elementParentID[data.elementLevel] = data.elementID;
      }
      data.elementID++;
      data.elementLevelID[data.elementLevel] = data.elementID;

      String xml_data_name;
      if ( meta.isEnableNamespaces() ) {
        // prefix:localPart, or just localPart when there is no prefix
        xml_data_name =
          getName( e.asStartElement().getName().getPrefix(), e.asStartElement().getName().getLocalPart() );
      } else {
        xml_data_name = e.asStartElement().getName().getLocalPart();
      }
      if ( data.pos_xml_data_name >= 0 ) {
        outputRowData[data.pos_xml_data_name] = xml_data_name;
      }
      // store the name
      data.elementName[data.elementLevel] = xml_data_name;
      // store simple path
      data.elementPath[data.elementLevel] = data.elementPath[data.elementLevel - 1] + "/" + xml_data_name;

      // write Namespaces out
      if ( meta.isEnableNamespaces() ) {
        outputRowData = parseNamespaces( outputRowData, e );
      }
      // write Attributes out
      outputRowData = parseAttributes( outputRowData, e );
      break;
    }

    case XMLStreamConstants.END_ELEMENT:
      parseEndElement( outputRowData, e.asEndElement() );
      putRowOut( outputRowData );
      // Forget the parent id recorded for the children of the element that is being closed, so that the children
      // of the next element on this level get their own parent id. At the deepest supported level there is no
      // child slot, hence the bounds check.
      if ( data.elementLevel + 1 < data.elementParentID.length ) {
        data.elementParentID[data.elementLevel + 1] = null;
      }
      data.elementLevel--;
      outputRowData = null; // continue
      break;

    case XMLStreamConstants.CHARACTERS:
    case XMLStreamConstants.CDATA: {
      if ( data.pos_xml_data_name >= 0 ) {
        outputRowData[data.pos_xml_data_name] = data.elementName[data.elementLevel];
      }
      String xml_data_value = e.asCharacters().getData();
      if ( data.pos_xml_data_value >= 0 ) {
        if ( meta.isEnableTrim() ) {
          // optional trim is also eliminating white spaces, tab, cr, lf
          xml_data_value = Const.trim( xml_data_value );
        }
        outputRowData[data.pos_xml_data_value] = xml_data_value;
      }
      // text without a value column, or empty text, does not produce a row
      if ( data.pos_xml_data_value < 0 || Utils.isEmpty( (String) outputRowData[data.pos_xml_data_value] ) ) {
        outputRowData = null; // ignore & continue
      }
      break;
    }

    case XMLStreamConstants.SPACE:
    case XMLStreamConstants.PROCESSING_INSTRUCTION: // TODO test if possible
    case XMLStreamConstants.COMMENT: // TODO test if possible
    case XMLStreamConstants.ENTITY_REFERENCE: // should be resolved by default
      outputRowData = null; // ignore & continue
      break;

    case XMLStreamConstants.START_DOCUMENT:
    case XMLStreamConstants.END_DOCUMENT:
      // just get this information out
      break;

    default:
      logBasic( "Event:" + eventType );
      outputRowData = null; // ignore & continue
  }

  return outputRowData;
}
/**
 * Writes the name of a closing element into the data name column, if that column is part of the output.
 *
 * @param outputRowData
 *          the row to fill
 * @param el
 *          the end element event
 */
private void parseEndElement( Object[] outputRowData, EndElement el ) {
  if ( data.pos_xml_data_name < 0 ) {
    // name column not requested
    return;
  }
  outputRowData[data.pos_xml_data_name] = getEndElementName( el, meta.isEnableNamespaces() );
}
/**
 * Builds the name reported for an end element.
 *
 * @param el
 *          an EndElement event
 * @param enabledNamespaces
 *          {@code true} to include the namespace prefix (if the element has one), {@code false} to return the
 *          local part only
 * @return the element name, qualified with its prefix when requested and present
 */
private String getEndElementName( EndElement el, boolean enabledNamespaces ) {
  if ( enabledNamespaces ) {
    return getName( el.getName().getPrefix(), el.getName().getLocalPart() );
  }
  return el.getName().getLocalPart();
}
/**
 * Namespaces: puts an extra row out for each namespace declared on a start element.
 * <p>
 * If the element declares at least one namespace, the element row itself is sent first; then one row per
 * namespace is produced with the data type switched to NAMESPACE, the prefix in the name column and the URI in
 * the value column. All of them are sent here except the last one, which is returned so that the caller sends it.
 * Name and value are only written when the respective column is part of the output.
 *
 * @param outputRowData
 *          the row built for the start element
 * @param e
 *          the start element event
 * @return {@code outputRowData} unchanged if there are no namespaces, otherwise the row of the last namespace
 * @throws KettleValueException
 *           if a row cannot be cloned
 * @throws KettleStepException
 *           if a row cannot be sent
 */
@SuppressWarnings( "unchecked" )
private Object[] parseNamespaces( Object[] outputRowData, XMLEvent e ) throws KettleValueException,
  KettleStepException {
  Iterator<Namespace> iter = e.asStartElement().getNamespaces();
  if ( !iter.hasNext() ) {
    return outputRowData;
  }

  // first put the element name info out
  putRowOut( data.outputRowMeta.cloneRow( outputRowData ) );

  // change data_type to NAMESPACE
  if ( data.pos_xml_data_type_numeric != -1 ) {
    outputRowData[data.pos_xml_data_type_numeric] = Long.valueOf( XMLStreamConstants.NAMESPACE );
  }
  if ( data.pos_xml_data_type_description != -1 ) {
    outputRowData[data.pos_xml_data_type_description] = eventDescription[XMLStreamConstants.NAMESPACE];
  }

  Object[] lastNamespaceRow = outputRowData;
  while ( iter.hasNext() ) {
    Object[] outputRowDataNamespace = data.outputRowMeta.cloneRow( outputRowData );
    Namespace n = iter.next();
    // indexOfValue() yields -1 for columns that are not part of the output; writing to them would fail
    if ( data.pos_xml_data_name != -1 ) {
      outputRowDataNamespace[data.pos_xml_data_name] = n.getPrefix();
    }
    if ( data.pos_xml_data_value != -1 ) {
      outputRowDataNamespace[data.pos_xml_data_value] = n.getNamespaceURI();
    }
    if ( iter.hasNext() ) {
      // send out the Namespace row
      putRowOut( outputRowDataNamespace );
    } else {
      // last row: this will be sent out by the outer loop
      lastNamespaceRow = outputRowDataNamespace;
    }
  }
  return lastNamespaceRow;
}
/**
 * Attributes: puts an extra row out for each attribute of a start element.
 * <p>
 * Without attributes the given row is returned untouched. Otherwise the incoming row (element or namespace
 * info) is sent first, the data type of the template row is switched to ATTRIBUTE and one row per attribute is
 * created from it. Every attribute row but the last is sent here; the last one is returned to the caller.
 *
 * @param outputRowData
 *          the row built so far for the start element
 * @param e
 *          the start element event
 * @return the row the caller still has to send
 * @throws KettleValueException
 *           if a row cannot be cloned
 * @throws KettleStepException
 *           if a row cannot be sent
 */
@SuppressWarnings( "unchecked" )
private Object[] parseAttributes( Object[] outputRowData, XMLEvent e ) throws KettleValueException,
  KettleStepException {
  Iterator<Attribute> attributes = e.asStartElement().getAttributes();
  if ( !attributes.hasNext() ) {
    return outputRowData;
  }

  // first put the element name (or namespace) info out
  Object[] elementRow = data.outputRowMeta.cloneRow( outputRowData );
  putRowOut( elementRow );

  // change data_type to ATTRIBUTE
  if ( data.pos_xml_data_type_numeric != -1 ) {
    outputRowData[data.pos_xml_data_type_numeric] = Long.valueOf( XMLStreamConstants.ATTRIBUTE );
  }
  if ( data.pos_xml_data_type_description != -1 ) {
    outputRowData[data.pos_xml_data_type_description] = eventDescription[XMLStreamConstants.ATTRIBUTE];
  }

  // "pending" always holds the most recently built attribute row; it is only sent once a further attribute
  // turns up, so the final one is left for the caller (the outer loop) to send
  Object[] pending = null;
  while ( attributes.hasNext() ) {
    if ( pending != null ) {
      putRowOut( pending );
    }
    pending = data.outputRowMeta.cloneRow( outputRowData );
    Attribute current = attributes.next();
    parseAttribute( pending, current, meta.isEnableNamespaces() );
  }
  return pending;
}
/**
 * Copies name and value of one attribute into the row; each is written only if its column is part of the output.
 *
 * @param outputRowDataAttribute
 *          the row to fill
 * @param a
 *          the attribute
 * @param enabledNamespaces
 *          whether the attribute name should carry its namespace prefix
 */
private void parseAttribute( Object[] outputRowDataAttribute, Attribute a, boolean enabledNamespaces ) {
  int namePos = data.pos_xml_data_name;
  if ( namePos != -1 ) {
    outputRowDataAttribute[namePos] = getAttributeName( a, enabledNamespaces );
  }

  int valuePos = data.pos_xml_data_value;
  if ( valuePos != -1 ) {
    outputRowDataAttribute[valuePos] = a.getValue();
  }
}
/**
 * Builds the name reported for an attribute.
 *
 * @param a
 *          an attribute event
 * @param enabledNamespaces
 *          {@code true} to include the namespace prefix (if the attribute has one), {@code false} to return the
 *          local part only
 * @return the attribute name, qualified with its prefix when requested and present
 */
private String getAttributeName( Attribute a, boolean enabledNamespaces ) {
  if ( enabledNamespaces ) {
    return getName( a.getName().getPrefix(), a.getName().getLocalPart() );
  }
  return a.getName().getLocalPart();
}
/**
 * Joins prefix and local part into <code>prefix:localPart</code>; when the prefix is empty (as decided by
 * <code>Utils.isEmpty</code>) the local part is returned on its own.
 *
 * @param prefix
 *          the namespace prefix part of the qualified name
 * @param localPart
 *          the local part of the qualified name
 * @return the qualified name
 */
private String getName( String prefix, String localPart ) {
  if ( Utils.isEmpty( prefix ) ) {
    return localPart;
  }
  return prefix + ":" + localPart;
}
/**
 * Resets the per-document bookkeeping: row number, nesting level and element id start at 0 again and the
 * per-level arrays (ids, parent ids, names, paths) are replaced by fresh ones of PARENT_ID_ALLOCATE_SIZE entries.
 */
private void resetElementCounters() {
  // counters
  data.rowNumber = Long.valueOf( 0L );
  data.elementLevel = 0;
  data.elementID = Long.valueOf( 0L ); // init value, could be parameterized later on

  // per-level arrays
  data.elementLevelID = new Long[PARENT_ID_ALLOCATE_SIZE];
  data.elementParentID = new Long[PARENT_ID_ALLOCATE_SIZE];
  data.elementName = new String[PARENT_ID_ALLOCATE_SIZE];
  data.elementPath = new String[PARENT_ID_ALLOCATE_SIZE];

  // level 0 stands for the document itself
  data.elementLevelID[0] = data.elementID; // inital id for level 0
  data.elementPath[0] = ""; // initial empty
}
/**
 * Prepares the step for execution: creates the StAX factory, decides where the file names come from, resolves the
 * settings that may contain variables (rows to skip, row limit, encoding), builds the output row layout and
 * caches the position of every output column.
 *
 * @param smi
 *          the step settings, expected to be an {@code XMLInputStreamMeta}
 * @param sdi
 *          the step runtime data, expected to be an {@code XMLInputStreamData}
 * @return {@code true} if the step is ready to run; {@code false} if the base initialisation failed or a file
 *         name is required but empty
 */
@Override
public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
  meta = (XMLInputStreamMeta) smi;
  data = (XMLInputStreamData) sdi;

  if ( !super.init( smi, sdi ) ) {
    return false;
  }

  // NOTE(review): the factory is used with its default settings; DTD / external entity handling is not
  // configured here - confirm this is acceptable for the XML sources being read.
  data.staxInstance = XMLInputFactory.newInstance(); // could select the parser later on
  data.filenr = 0;

  boolean hasPreviousSteps = getTransMeta().findNrPrevSteps( getStepMeta() ) != 0;
  if ( hasPreviousSteps || meta.sourceFromInput ) {
    // file names (or the XML itself) arrive with the incoming rows
    data.filenames = null;
  } else {
    String filename = environmentSubstitute( meta.getFilename() );
    if ( Utils.isEmpty( filename ) ) {
      logError( BaseMessages.getString( PKG, "XMLInputStream.MissingFilename.Message" ) );
      return false;
    }
    data.filenames = new String[] { filename };
  }

  // settings that may contain variables; non-numeric values fall back to 0
  data.nrRowsToSkip = Const.toLong( this.environmentSubstitute( meta.getNrRowsToSkip() ), 0 );
  data.rowLimit = Const.toLong( this.environmentSubstitute( meta.getRowLimit() ), 0 );
  data.encoding = this.environmentSubstitute( meta.getEncoding() );

  data.outputRowMeta = new RowMeta();
  meta.getFields( data.outputRowMeta, getStepname(), null, null, this, repository, metaStore );

  // get and save field positions
  // file / row bookkeeping
  data.pos_xml_filename = data.outputRowMeta.indexOfValue( meta.getFilenameField() );
  data.pos_xml_row_number = data.outputRowMeta.indexOfValue( meta.getRowNumberField() );
  // event type and location
  data.pos_xml_data_type_numeric = data.outputRowMeta.indexOfValue( meta.getXmlDataTypeNumericField() );
  data.pos_xml_data_type_description = data.outputRowMeta.indexOfValue( meta.getXmlDataTypeDescriptionField() );
  data.pos_xml_location_line = data.outputRowMeta.indexOfValue( meta.getXmlLocationLineField() );
  data.pos_xml_location_column = data.outputRowMeta.indexOfValue( meta.getXmlLocationColumnField() );
  // element hierarchy
  data.pos_xml_element_id = data.outputRowMeta.indexOfValue( meta.getXmlElementIDField() );
  data.pos_xml_parent_element_id = data.outputRowMeta.indexOfValue( meta.getXmlParentElementIDField() );
  data.pos_xml_element_level = data.outputRowMeta.indexOfValue( meta.getXmlElementLevelField() );
  data.pos_xml_path = data.outputRowMeta.indexOfValue( meta.getXmlPathField() );
  data.pos_xml_parent_path = data.outputRowMeta.indexOfValue( meta.getXmlParentPathField() );
  // payload
  data.pos_xml_data_name = data.outputRowMeta.indexOfValue( meta.getXmlDataNameField() );
  data.pos_xml_data_value = data.outputRowMeta.indexOfValue( meta.getXmlDataValueField() );

  return true;
}
/**
 * Releases the resources held by this step: closes the currently open reader/stream/file via closeFile(), drops
 * the StAX factory reference and then delegates to the base class.
 *
 * @param smi
 *          the step settings, cast to XMLInputStreamMeta
 * @param sdi
 *          the step runtime data, cast to XMLInputStreamData
 */
@Override
public void dispose( StepMetaInterface smi, StepDataInterface sdi ) {
meta = (XMLInputStreamMeta) smi;
data = (XMLInputStreamData) sdi;
// free resources
closeFile();
data.staxInstance = null;
super.dispose( smi, sdi );
}
}