/*! ****************************************************************************** * * Pentaho Data Integration * * Copyright (C) 2002-2016 by Pentaho : http://www.pentaho.com * ******************************************************************************* * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ******************************************************************************/ package org.pentaho.di.trans.steps.getxmldata; import java.io.InputStream; import java.io.StringReader; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.zip.GZIPInputStream; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.vfs2.FileObject; import org.apache.commons.vfs2.FileSystemException; import org.dom4j.Element; import org.dom4j.ElementHandler; import org.dom4j.ElementPath; import org.dom4j.Namespace; import org.dom4j.Node; import org.dom4j.XPath; import org.dom4j.io.SAXReader; import org.dom4j.tree.AbstractNode; import org.pentaho.di.core.Const; import org.pentaho.di.core.util.Utils; import org.pentaho.di.core.ResultFile; import org.pentaho.di.core.exception.KettleException; import org.pentaho.di.core.fileinput.FileInputList; import org.pentaho.di.core.row.RowDataUtil; import org.pentaho.di.core.row.RowMeta; import org.pentaho.di.core.row.RowMetaInterface; import 
org.pentaho.di.core.row.ValueMetaInterface; import org.pentaho.di.core.row.value.ValueMetaFactory; import org.pentaho.di.core.vfs.KettleVFS; import org.pentaho.di.core.xml.XMLParserFactoryProducer; import org.pentaho.di.i18n.BaseMessages; import org.pentaho.di.trans.Trans; import org.pentaho.di.trans.TransMeta; import org.pentaho.di.trans.step.BaseStep; import org.pentaho.di.trans.step.StepDataInterface; import org.pentaho.di.trans.step.StepInterface; import org.pentaho.di.trans.step.StepMeta; import org.pentaho.di.trans.step.StepMetaInterface; /** * Read XML files, parse them and convert them to rows and writes these to one or more output streams. * * @author Samatar,Brahim * @since 20-06-2007 */ public class GetXMLData extends BaseStep implements StepInterface { private static Class<?> PKG = GetXMLDataMeta.class; // for i18n purposes, needed by Translator2!! private GetXMLDataMeta meta; private GetXMLDataData data; private Object[] prevRow = null; // A pre-allocated spot for the previous row public GetXMLData( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta, Trans trans ) { super( stepMeta, stepDataInterface, copyNr, transMeta, trans ); } protected boolean setDocument( String StringXML, FileObject file, boolean IsInXMLField, boolean readurl ) throws KettleException { this.prevRow = buildEmptyRow(); // pre-allocate previous row try { SAXReader reader = XMLParserFactoryProducer.getSAXReader( null ); data.stopPruning = false; // Validate XML against specified schema? if ( meta.isValidating() ) { reader.setValidation( true ); reader.setFeature( "http://apache.org/xml/features/validation/schema", true ); } else { // Ignore DTD declarations reader.setEntityResolver( new IgnoreDTDEntityResolver() ); } // Ignore comments? 
if ( meta.isIgnoreComments() ) { reader.setIgnoreComments( true ); } if ( data.prunePath != null ) { // when pruning is on: reader.read() below will wait until all is processed in the handler if ( log.isDetailed() ) { logDetailed( BaseMessages.getString( PKG, "GetXMLData.Log.StreamingMode.Activated" ) ); } if ( data.PathValue.equals( data.prunePath ) ) { // Edge case, but if true, there will only ever be one item in the list data.an = new ArrayList<AbstractNode>( 1 ); // pre-allocate array and sizes data.an.add( null ); } reader.addHandler( data.prunePath, new ElementHandler() { public void onStart( ElementPath path ) { // do nothing here... } public void onEnd( ElementPath path ) { if ( isStopped() ) { // when a large file is processed and it should be stopped it is still reading the hole thing // the only solution I see is to prune / detach the document and this will lead into a // NPE or other errors depending on the parsing location - this will be treated in the catch part below // any better idea is welcome if ( log.isBasic() ) { logBasic( BaseMessages.getString( PKG, "GetXMLData.Log.StreamingMode.Stopped" ) ); } data.stopPruning = true; path.getCurrent().getDocument().detach(); // trick to stop reader return; } // process a ROW element if ( log.isDebug() ) { logDebug( BaseMessages.getString( PKG, "GetXMLData.Log.StreamingMode.StartProcessing" ) ); } Element row = path.getCurrent(); try { // Pass over the row instead of just the document. If // if there's only one row, there's no need to // go back to the whole document. processStreaming( row ); } catch ( Exception e ) { // catch the KettleException or others and forward to caller, e.g. 
when applyXPath() has a problem throw new RuntimeException( e ); } // prune the tree row.detach(); if ( log.isDebug() ) { logDebug( BaseMessages.getString( PKG, "GetXMLData.Log.StreamingMode.EndProcessing" ) ); } } } ); } if ( IsInXMLField ) { // read string to parse data.document = reader.read( new StringReader( StringXML ) ); } else if ( readurl ) { // read url as source HttpClient client = new HttpClient(); HttpMethod method = new GetMethod( StringXML ); method.addRequestHeader( "Accept-Encoding", "gzip" ); client.executeMethod( method ); Header contentEncoding = method.getResponseHeader( "Content-Encoding" ); if ( contentEncoding != null ) { String acceptEncodingValue = contentEncoding.getValue(); if ( acceptEncodingValue.indexOf( "gzip" ) != -1 ) { GZIPInputStream in = new GZIPInputStream( method.getResponseBodyAsStream() ); data.document = reader.read( in ); } } else { data.document = reader.read( method.getResponseBodyAsStream() ); } } else { // get encoding. By default UTF-8 String encoding = "UTF-8"; if ( !Utils.isEmpty( meta.getEncoding() ) ) { encoding = meta.getEncoding(); } InputStream is = KettleVFS.getInputStream( file ); try { data.document = reader.read( is, encoding ); } finally { BaseStep.closeQuietly( is ); } } if ( meta.isNamespaceAware() ) { prepareNSMap( data.document.getRootElement() ); } } catch ( Exception e ) { if ( data.stopPruning ) { // ignore error when pruning return false; } else { throw new KettleException( e ); } } return true; } /** * Process chunk of data in streaming mode. Called only by the handler when pruning is true. Not allowed in * combination with meta.getIsInFields(), but could be redesigned later on. 
* */ private void processStreaming( Element row ) throws KettleException { data.document = row.getDocument(); if ( meta.isNamespaceAware() ) { prepareNSMap( data.document.getRootElement() ); } if ( log.isDebug() ) { logDebug( BaseMessages.getString( PKG, "GetXMLData.Log.StreamingMode.ApplyXPath" ) ); } // If the prune path and the path are the same, then // we're processing one row at a time through here. if ( data.PathValue.equals( data.prunePath ) ) { data.an.set( 0, (AbstractNode) row ); data.nodesize = 1; // it's always just one row. data.nodenr = 0; if ( log.isDebug() ) { logDebug( BaseMessages.getString( PKG, "GetXMLData.Log.StreamingMode.ProcessingRows" ) ); } Object[] r = getXMLRowPutRowWithErrorhandling(); if ( !data.errorInRowButContinue ) { // do not put out the row but continue putRowOut( r ); // false when limit is reached, functionality is there but we can not stop reading the hole file // (slow but works) } data.nodesize = 0; data.nodenr = 0; return; } else { if ( !applyXPath() ) { throw new KettleException( BaseMessages.getString( PKG, "GetXMLData.Log.UnableApplyXPath" ) ); } } // main loop through the data until limit is reached or transformation is stopped // similar functionality like in BaseStep.runStepThread if ( log.isDebug() ) { logDebug( BaseMessages.getString( PKG, "GetXMLData.Log.StreamingMode.ProcessingRows" ) ); } boolean cont = true; while ( data.nodenr < data.nodesize && cont && !isStopped() ) { Object[] r = getXMLRowPutRowWithErrorhandling(); if ( data.errorInRowButContinue ) { continue; // do not put out the row but continue } cont = putRowOut( r ); // false when limit is reached, functionality is there but we can not stop reading the hole // file (slow but works) } if ( log.isDebug() ) { logDebug( BaseMessages.getString( PKG, "GetXMLData.Log.StreamingMode.FreeMemory" ) ); } // free allocated memory data.an.clear(); data.nodesize = data.an.size(); data.nodenr = 0; } public void prepareNSMap( Element l ) { @SuppressWarnings( 
"unchecked" ) List<Namespace> namespacesList = l.declaredNamespaces(); for ( Namespace ns : namespacesList ) { if ( ns.getPrefix().trim().length() == 0 ) { data.NAMESPACE.put( "pre" + data.NSPath.size(), ns.getURI() ); String path = ""; Element element = l; while ( element != null ) { if ( element.getNamespacePrefix() != null && element.getNamespacePrefix().length() > 0 ) { path = GetXMLDataMeta.N0DE_SEPARATOR + element.getNamespacePrefix() + ":" + element.getName() + path; } else { path = GetXMLDataMeta.N0DE_SEPARATOR + element.getName() + path; } element = element.getParent(); } data.NSPath.add( path ); } else { data.NAMESPACE.put( ns.getPrefix(), ns.getURI() ); } } @SuppressWarnings( "unchecked" ) List<Element> elementsList = l.elements(); for ( Element e : elementsList ) { prepareNSMap( e ); } } /** * Build an empty row based on the meta-data. * * @return empty row built */ private Object[] buildEmptyRow() { return RowDataUtil.allocateRowData( data.outputRowMeta.size() ); } private void handleMissingFiles() throws KettleException { List<FileObject> nonExistantFiles = data.files.getNonExistantFiles(); if ( nonExistantFiles.size() != 0 ) { String message = FileInputList.getRequiredFilesDescription( nonExistantFiles ); logError( BaseMessages.getString( PKG, "GetXMLData.Log.RequiredFilesTitle" ), BaseMessages.getString( PKG, "GetXMLData.Log.RequiredFiles", message ) ); throw new KettleException( BaseMessages.getString( PKG, "GetXMLData.Log.RequiredFilesMissing", message ) ); } List<FileObject> nonAccessibleFiles = data.files.getNonAccessibleFiles(); if ( nonAccessibleFiles.size() != 0 ) { String message = FileInputList.getRequiredFilesDescription( nonAccessibleFiles ); logError( BaseMessages.getString( PKG, "GetXMLData.Log.RequiredFilesTitle" ), BaseMessages.getString( PKG, "GetXMLData.Log.RequiredNotAccessibleFiles", message ) ); throw new KettleException( BaseMessages.getString( PKG, "GetXMLData.Log.RequiredNotAccessibleFilesMissing", message ) ); } } private 
boolean ReadNextString() { try { // Grab another row ... data.readrow = getRow(); if ( data.readrow == null ) { // finished processing! if ( log.isDetailed() ) { logDetailed( BaseMessages.getString( PKG, "GetXMLData.Log.FinishedProcessing" ) ); } return false; } if ( first ) { first = false; data.nrReadRow = getInputRowMeta().size(); data.inputRowMeta = getInputRowMeta(); data.outputRowMeta = data.inputRowMeta.clone(); meta.getFields( data.outputRowMeta, getStepname(), null, null, this, repository, metaStore ); // Get total previous fields data.totalpreviousfields = data.inputRowMeta.size(); // Create convert meta-data objects that will contain Date & Number formatters data.convertRowMeta = new RowMeta(); for ( ValueMetaInterface valueMeta : data.convertRowMeta.getValueMetaList() ) { data.convertRowMeta .addValueMeta( ValueMetaFactory.cloneValueMeta( valueMeta, ValueMetaInterface.TYPE_STRING ) ); } // For String to <type> conversions, we allocate a conversion meta data row as well... // data.convertRowMeta = data.outputRowMeta.cloneToType( ValueMetaInterface.TYPE_STRING ); // Check is XML field is provided if ( Utils.isEmpty( meta.getXMLField() ) ) { logError( BaseMessages.getString( PKG, "GetXMLData.Log.NoField" ) ); throw new KettleException( BaseMessages.getString( PKG, "GetXMLData.Log.NoField" ) ); } // cache the position of the field if ( data.indexOfXmlField < 0 ) { data.indexOfXmlField = getInputRowMeta().indexOfValue( meta.getXMLField() ); if ( data.indexOfXmlField < 0 ) { // The field is unreachable ! 
            logError( BaseMessages.getString( PKG, "GetXMLData.Log.ErrorFindingField", meta.getXMLField() ) );
            throw new KettleException( BaseMessages.getString( PKG, "GetXMLData.Exception.CouldnotFindField", meta
                .getXMLField() ) );
          }
        }
      }

      if ( meta.isInFields() ) {
        // get XML field value
        String Fieldvalue = getInputRowMeta().getString( data.readrow, data.indexOfXmlField );

        if ( log.isDetailed() ) {
          logDetailed( BaseMessages.getString( PKG, "GetXMLData.Log.XMLStream", meta.getXMLField(), Fieldvalue ) );
        }

        if ( meta.getIsAFile() ) {
          // The field value is a file name: open it via VFS and parse it.
          FileObject file = null;
          try {
            // XML source is a file.
            file = KettleVFS.getFileObject( Fieldvalue, getTransMeta() );
            if ( meta.isIgnoreEmptyFile() && file.getContent().getSize() == 0 ) {
              // empty file: skip it and recurse to the next incoming row
              logBasic( BaseMessages.getString( PKG, "GetXMLData.Error.FileSizeZero", "" + file.getName() ) );
              return ReadNextString();
            }

            // Open the XML document
            if ( !setDocument( null, file, false, false ) ) {
              throw new KettleException( BaseMessages.getString( PKG, "GetXMLData.Log.UnableCreateDocument" ) );
            }

            if ( !applyXPath() ) {
              throw new KettleException( BaseMessages.getString( PKG, "GetXMLData.Log.UnableApplyXPath" ) );
            }

            addFileToResultFilesname( file );

            if ( log.isDetailed() ) {
              logDetailed( BaseMessages.getString( PKG, "GetXMLData.Log.LoopFileOccurences", "" + data.nodesize, file
                  .getName().getBaseName() ) );
            }
          } catch ( Exception e ) {
            throw new KettleException( e );
          } finally {
            try {
              if ( file != null ) {
                file.close();
              }
            } catch ( Exception e ) {
              // Ignore close errors
            }
          }
        } else {
          // The field value is either raw XML text or a URL; the two flags are mutually exclusive.
          boolean url = false;
          boolean xmltring = true;
          if ( meta.isReadUrl() ) {
            url = true;
            xmltring = false;
          }

          // Open the XML document
          if ( !setDocument( Fieldvalue, null, xmltring, url ) ) {
            throw new KettleException( BaseMessages.getString( PKG, "GetXMLData.Log.UnableCreateDocument" ) );
          }

          // Apply XPath and set node list
          if ( !applyXPath() ) {
            throw new KettleException( BaseMessages.getString( PKG, "GetXMLData.Log.UnableApplyXPath" ) );
          }
          if ( log.isDetailed() ) {
            logDetailed( BaseMessages.getString( PKG, "GetXMLData.Log.LoopFileOccurences", "" + data.nodesize ) );
          }
        }
      }
    } catch ( Exception e ) {
      logError( BaseMessages.getString( PKG, "GetXMLData.Log.UnexpectedError", e.toString() ) );
      stopAll();
      logError( Const.getStackTracker( e ) );
      setErrors( 1 );
      return false;
    }
    return true;
  }

  /**
   * Register the given file in the transformation result file list, when the step is configured
   * to do so.
   *
   * @param file the processed file to register
   * @throws Exception passed through from result file handling
   */
  private void addFileToResultFilesname( FileObject file ) throws Exception {
    if ( meta.addResultFile() ) {
      // Add this to the result file names...
      ResultFile resultFile =
          new ResultFile( ResultFile.FILE_TYPE_GENERAL, file, getTransMeta().getName(), getStepname() );
      resultFile.setComment( BaseMessages.getString( PKG, "GetXMLData.Log.FileAddedResult" ) );
      addResultFile( resultFile );
    }
  }

  /**
   * Rewrite an XPath expression so that path steps living in a default (prefix-less) namespace get
   * the generated "preN" prefix registered by prepareNSMap(). Steps that already carry a prefix,
   * contain a '.' or start with the attribute marker are left untouched.
   *
   * @param path     XPath (possibly relative, may contain "..") to rewrite
   * @param loopPath the loop XPath used to resolve relative steps into a full path
   * @return the prefixed XPath, or {@code path} unchanged when no default namespaces were recorded
   */
  public String addNSPrefix( String path, String loopPath ) {
    if ( data.NSPath.size() > 0 ) {
      // Resolve "path" against "loopPath" into an absolute path, honouring ".." steps.
      String fullPath = loopPath;
      if ( !path.equals( fullPath ) ) {
        for ( String tmp : path.split( GetXMLDataMeta.N0DE_SEPARATOR ) ) {
          if ( tmp.equals( ".." ) ) {
            fullPath = fullPath.substring( 0, fullPath.lastIndexOf( GetXMLDataMeta.N0DE_SEPARATOR ) );
          } else {
            fullPath += GetXMLDataMeta.N0DE_SEPARATOR + tmp;
          }
        }
      }
      // indexs[k] holds, for each step of fullPath, the index of the longest matching NSPath
      // entry (-1 when no default namespace applies to that step).
      int[] indexs = new int[fullPath.split( GetXMLDataMeta.N0DE_SEPARATOR ).length - 1];
      java.util.Arrays.fill( indexs, -1 );
      int length = 0;
      for ( int i = 0; i < data.NSPath.size(); i++ ) {
        if ( data.NSPath.get( i ).length() > length && fullPath.startsWith( data.NSPath.get( i ) ) ) {
          // longer prefix match wins: mark all steps from this depth onwards with namespace i
          java.util.Arrays.fill( indexs, data.NSPath.get( i ).split( GetXMLDataMeta.N0DE_SEPARATOR ).length - 2,
              indexs.length, i );
          length = data.NSPath.get( i ).length();
        }
      }

      // Rebuild the original (possibly relative) path, injecting "preN:" where a namespace applies.
      StringBuilder newPath = new StringBuilder();
      String[] pathStrs = path.split( GetXMLDataMeta.N0DE_SEPARATOR );
      for ( int i = 0; i < pathStrs.length; i++ ) {
        String tmp = pathStrs[i];
        if ( newPath.length() > 0 ) {
          newPath.append( GetXMLDataMeta.N0DE_SEPARATOR );
        }
        if ( tmp.length() > 0 && !tmp.contains( ":" ) && !tmp.contains( "." )
            && !tmp.contains( GetXMLDataMeta.AT ) ) {
          // align the relative path's steps with the tail of the absolute-path index array
          int index = indexs[i + indexs.length - pathStrs.length];
          if ( index >= 0 ) {
            newPath.append( "pre" ).append( index ).append( ":" ).append( tmp );
          } else {
            newPath.append( tmp );
          }
        } else {
          newPath.append( tmp );
        }
      }
      return newPath.toString();
    }
    return path;
  }

  /**
   * Evaluate the loop XPath (data.PathValue) against the current document and store the resulting
   * node list in data.an, resetting the node cursor.
   *
   * @return true when the XPath was applied; false on any evaluation error (already logged)
   */
  @SuppressWarnings( "unchecked" )
  private boolean applyXPath() {
    try {
      XPath xpath = data.document.createXPath( data.PathValue );
      if ( meta.isNamespaceAware() ) {
        xpath = data.document.createXPath( addNSPrefix( data.PathValue, data.PathValue ) );
        xpath.setNamespaceURIs( data.NAMESPACE );
      }
      // get nodes list
      data.an = xpath.selectNodes( data.document );
      data.nodesize = data.an.size();
      data.nodenr = 0;
    } catch ( Exception e ) {
      logError( BaseMessages.getString( PKG, "GetXMLData.Log.ErrorApplyXPath", e.getMessage() ) );
      return false;
    }
    return true;
  }

  /**
   * Advance to the next input file (file-driven mode): capture the per-file additional field
   * values, parse the document and apply the loop XPath.
   *
   * @return true when a file was opened and parsed; false at end of the file list or on error
   */
  private boolean openNextFile() {
    try {
      if ( data.filenr >= data.files.nrOfFiles() ) {
        // finished processing!
        if ( log.isDetailed() ) {
          logDetailed( BaseMessages.getString( PKG, "GetXMLData.Log.FinishedProcessing" ) );
        }
        return false;
      }
      // get file
      data.file = data.files.getFile( data.filenr );
      data.filename = KettleVFS.getFilename( data.file );

      // Add additional fields?
if ( meta.getShortFileNameField() != null && meta.getShortFileNameField().length() > 0 ) { data.shortFilename = data.file.getName().getBaseName(); } if ( meta.getPathField() != null && meta.getPathField().length() > 0 ) { data.path = KettleVFS.getFilename( data.file.getParent() ); } if ( meta.isHiddenField() != null && meta.isHiddenField().length() > 0 ) { data.hidden = data.file.isHidden(); } if ( meta.getExtensionField() != null && meta.getExtensionField().length() > 0 ) { data.extension = data.file.getName().getExtension(); } if ( meta.getLastModificationDateField() != null && meta.getLastModificationDateField().length() > 0 ) { data.lastModificationDateTime = new Date( data.file.getContent().getLastModifiedTime() ); } if ( meta.getUriField() != null && meta.getUriField().length() > 0 ) { data.uriName = data.file.getName().getURI(); } if ( meta.getRootUriField() != null && meta.getRootUriField().length() > 0 ) { data.rootUriName = data.file.getName().getRootURI(); } // Check if file is empty long fileSize; try { fileSize = data.file.getContent().getSize(); } catch ( FileSystemException e ) { fileSize = -1; } if ( meta.getSizeField() != null && meta.getSizeField().length() > 0 ) { data.size = fileSize; } // Move file pointer ahead! 
data.filenr++; if ( meta.isIgnoreEmptyFile() && fileSize == 0 ) { // log only basic as a warning (was before logError) logBasic( BaseMessages.getString( PKG, "GetXMLData.Error.FileSizeZero", "" + data.file.getName() ) ); openNextFile(); } else { if ( log.isDetailed() ) { logDetailed( BaseMessages.getString( PKG, "GetXMLData.Log.OpeningFile", data.file.toString() ) ); } // Open the XML document if ( !setDocument( null, data.file, false, false ) ) { if ( data.stopPruning ) { return false; // ignore error when stopped while pruning } throw new KettleException( BaseMessages.getString( PKG, "GetXMLData.Log.UnableCreateDocument" ) ); } // Apply XPath and set node list if ( data.prunePath == null ) { // this was already done in processStreaming() if ( !applyXPath() ) { throw new KettleException( BaseMessages.getString( PKG, "GetXMLData.Log.UnableApplyXPath" ) ); } } addFileToResultFilesname( data.file ); if ( log.isDetailed() ) { logDetailed( BaseMessages.getString( PKG, "GetXMLData.Log.FileOpened", data.file.toString() ) ); logDetailed( BaseMessages.getString( PKG, "GetXMLData.Log.LoopFileOccurences", "" + data.nodesize, data.file .getName().getBaseName() ) ); } } } catch ( Exception e ) { logError( BaseMessages.getString( PKG, "GetXMLData.Log.UnableToOpenFile", "" + data.filenr, data.file.toString(), e.toString() ) ); stopAll(); setErrors( 1 ); return false; } return true; } public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException { if ( first && !meta.isInFields() ) { first = false; data.files = meta.getFiles( this ); if ( !meta.isdoNotFailIfNoFile() && data.files.nrOfFiles() == 0 ) { throw new KettleException( BaseMessages.getString( PKG, "GetXMLData.Log.NoFiles" ) ); } handleMissingFiles(); // Create the output row meta-data data.outputRowMeta = new RowMeta(); meta.getFields( data.outputRowMeta, getStepname(), null, null, this, repository, metaStore ); // Create convert meta-data objects that will contain Date & Number 
formatters // For String to <type> conversions, we allocate a conversion meta data row as well... // data.convertRowMeta = data.outputRowMeta.cloneToType( ValueMetaInterface.TYPE_STRING ); } // Grab a row Object[] r = getXMLRow(); if ( data.errorInRowButContinue ) { return true; // continue without putting the row out } if ( r == null ) { setOutputDone(); // signal end to receiver(s) return false; // end of data or error. } return putRowOut( r ); } private boolean putRowOut( Object[] r ) throws KettleException { if ( log.isRowLevel() ) { logRowlevel( BaseMessages.getString( PKG, "GetXMLData.Log.ReadRow", data.outputRowMeta.getString( r ) ) ); } incrementLinesInput(); data.rownr++; putRow( data.outputRowMeta, r ); // copy row to output rowset(s); if ( meta.getRowLimit() > 0 && data.rownr > meta.getRowLimit() ) { // limit has been reached: stop now. setOutputDone(); return false; } return true; } private Object[] getXMLRow() throws KettleException { if ( !meta.isInFields() ) { while ( ( data.nodenr >= data.nodesize || data.file == null ) ) { if ( !openNextFile() ) { data.errorInRowButContinue = false; // stop in all cases return null; } } } return getXMLRowPutRowWithErrorhandling(); } private Object[] getXMLRowPutRowWithErrorhandling() throws KettleException { // Build an empty row based on the meta-data Object[] r; data.errorInRowButContinue = false; try { if ( meta.isInFields() ) { while ( ( data.nodenr >= data.nodesize || data.readrow == null ) ) { if ( !ReadNextString() ) { return null; } if ( data.readrow == null ) { return null; } } } r = processPutRow( data.an.get( data.nodenr ) ); } catch ( Exception e ) { throw new KettleException( BaseMessages.getString( PKG, "GetXMLData.Error.UnableReadFile" ), e ); } return r; } private Object[] processPutRow( AbstractNode node ) throws KettleException { // Create new row... 
Object[] outputRowData = buildEmptyRow(); // Create new row or clone if ( meta.isInFields() ) { System.arraycopy( data.readrow, 0, outputRowData, 0, data.nrReadRow ); } try { data.nodenr++; // Read fields... for ( int i = 0; i < data.nrInputFields; i++ ) { // Get field GetXMLDataField xmlDataField = meta.getInputFields()[i]; // Get the Path to look for String XPathValue = xmlDataField.getXPath(); XPathValue = environmentSubstitute( XPathValue ); if ( xmlDataField.getElementType() == GetXMLDataField.ELEMENT_TYPE_ATTRIBUT ) { // We have an attribute // do we need to add leading @? // Only put @ to the last element in path, not in front at all int last = XPathValue.lastIndexOf( GetXMLDataMeta.N0DE_SEPARATOR ); if ( last > -1 ) { last++; String attribut = XPathValue.substring( last, XPathValue.length() ); if ( !attribut.startsWith( GetXMLDataMeta.AT ) ) { XPathValue = XPathValue.substring( 0, last ) + GetXMLDataMeta.AT + attribut; } } else { if ( !XPathValue.startsWith( GetXMLDataMeta.AT ) ) { XPathValue = GetXMLDataMeta.AT + XPathValue; } } } if ( meta.isuseToken() ) { // See if user use Token inside path field // The syntax is : @_Fieldname- // PDI will search for Fieldname value and replace it // Fieldname must be defined before the current node XPathValue = substituteToken( XPathValue, outputRowData ); if ( isDetailed() ) { logDetailed( XPathValue ); } } // Get node value String nodevalue; // Handle namespaces if ( meta.isNamespaceAware() ) { XPath xpathField = node.createXPath( addNSPrefix( XPathValue, data.PathValue ) ); xpathField.setNamespaceURIs( data.NAMESPACE ); if ( xmlDataField.getResultType() == GetXMLDataField.RESULT_TYPE_VALUE_OF ) { nodevalue = xpathField.valueOf( node ); } else { // nodevalue=xpathField.selectSingleNode(node).asXML(); Node n = xpathField.selectSingleNode( node ); if ( n != null ) { nodevalue = n.asXML(); } else { nodevalue = ""; } } } else { if ( xmlDataField.getResultType() == GetXMLDataField.RESULT_TYPE_VALUE_OF ) { nodevalue = 
node.valueOf( XPathValue ); } else { // nodevalue=node.selectSingleNode(XPathValue).asXML(); Node n = node.selectSingleNode( XPathValue ); if ( n != null ) { nodevalue = n.asXML(); } else { nodevalue = ""; } } } // Do trimming switch ( xmlDataField.getTrimType() ) { case GetXMLDataField.TYPE_TRIM_LEFT: nodevalue = Const.ltrim( nodevalue ); break; case GetXMLDataField.TYPE_TRIM_RIGHT: nodevalue = Const.rtrim( nodevalue ); break; case GetXMLDataField.TYPE_TRIM_BOTH: nodevalue = Const.trim( nodevalue ); break; default: break; } // Do conversions // ValueMetaInterface targetValueMeta = data.outputRowMeta.getValueMeta( data.totalpreviousfields + i ); ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta( data.totalpreviousfields + i ); outputRowData[data.totalpreviousfields + i] = targetValueMeta.convertData( sourceValueMeta, nodevalue ); // Do we need to repeat this field if it is null? if ( meta.getInputFields()[i].isRepeated() ) { if ( data.previousRow != null && Utils.isEmpty( nodevalue ) ) { outputRowData[data.totalpreviousfields + i] = data.previousRow[data.totalpreviousfields + i]; } } } // End of loop over fields... int rowIndex = data.totalpreviousfields + data.nrInputFields; // See if we need to add the filename to the row... if ( meta.includeFilename() && !Utils.isEmpty( meta.getFilenameField() ) ) { outputRowData[rowIndex++] = data.filename; } // See if we need to add the row number to the row... if ( meta.includeRowNumber() && !Utils.isEmpty( meta.getRowNumberField() ) ) { outputRowData[rowIndex++] = data.rownr; } // Possibly add short filename... 
if ( meta.getShortFileNameField() != null && meta.getShortFileNameField().length() > 0 ) { outputRowData[rowIndex++] = data.shortFilename; } // Add Extension if ( meta.getExtensionField() != null && meta.getExtensionField().length() > 0 ) { outputRowData[rowIndex++] = data.extension; } // add path if ( meta.getPathField() != null && meta.getPathField().length() > 0 ) { outputRowData[rowIndex++] = data.path; } // Add Size if ( meta.getSizeField() != null && meta.getSizeField().length() > 0 ) { outputRowData[rowIndex++] = data.size; } // add Hidden if ( meta.isHiddenField() != null && meta.isHiddenField().length() > 0 ) { outputRowData[rowIndex++] = Boolean.valueOf( data.path ); } // Add modification date if ( meta.getLastModificationDateField() != null && meta.getLastModificationDateField().length() > 0 ) { outputRowData[rowIndex++] = data.lastModificationDateTime; } // Add Uri if ( meta.getUriField() != null && meta.getUriField().length() > 0 ) { outputRowData[rowIndex++] = data.uriName; } // Add RootUri if ( meta.getRootUriField() != null && meta.getRootUriField().length() > 0 ) { outputRowData[rowIndex] = data.rootUriName; } RowMetaInterface irow = getInputRowMeta(); if ( irow == null ) { data.previousRow = outputRowData; } else { // clone to previously allocated array to make sure next step doesn't // change it in between... 
System.arraycopy( outputRowData, 0, this.prevRow, 0, outputRowData.length ); // Pick up everything else that needs a real deep clone data.previousRow = irow.cloneRow( outputRowData, this.prevRow ); } } catch ( Exception e ) { if ( getStepMeta().isDoingErrorHandling() ) { // Simply add this row to the error row putError( data.outputRowMeta, outputRowData, 1, e.toString(), null, "GetXMLData001" ); data.errorInRowButContinue = true; return null; } else { logError( e.toString() ); throw new KettleException( e.toString() ); } } return outputRowData; } public String substituteToken( String aString, Object[] outputRowData ) { if ( aString == null ) { return null; } StringBuilder buffer = new StringBuilder(); String rest = aString; // search for closing string int i = rest.indexOf( data.tokenStart ); while ( i > -1 ) { int j = rest.indexOf( data.tokenEnd, i + data.tokenStart.length() ); // search for closing string if ( j > -1 ) { String varName = rest.substring( i + data.tokenStart.length(), j ); Object Value = varName; for ( int k = 0; k < data.nrInputFields; k++ ) { GetXMLDataField Tmp_xmlInputField = meta.getInputFields()[k]; if ( Tmp_xmlInputField.getName().equalsIgnoreCase( varName ) ) { Value = "'" + outputRowData[data.totalpreviousfields + k] + "'"; } } buffer.append( rest.substring( 0, i ) ); buffer.append( Value ); rest = rest.substring( j + data.tokenEnd.length() ); } else { // no closing tag found; end the search buffer.append( rest ); rest = ""; } // keep searching i = rest.indexOf( data.tokenEnd ); } buffer.append( rest ); return buffer.toString(); } public boolean init( StepMetaInterface smi, StepDataInterface sdi ) { meta = (GetXMLDataMeta) smi; data = (GetXMLDataData) sdi; if ( super.init( smi, sdi ) ) { data.rownr = 1L; data.nrInputFields = meta.getInputFields().length; data.PathValue = environmentSubstitute( meta.getLoopXPath() ); if ( Utils.isEmpty( data.PathValue ) ) { logError( BaseMessages.getString( PKG, "GetXMLData.Error.EmptyPath" ) ); return 
false; } if ( !data.PathValue.substring( 0, 1 ).equals( GetXMLDataMeta.N0DE_SEPARATOR ) ) { data.PathValue = GetXMLDataMeta.N0DE_SEPARATOR + data.PathValue; } if ( log.isDetailed() ) { logDetailed( BaseMessages.getString( PKG, "GetXMLData.Log.LoopXPath", data.PathValue ) ); } data.prunePath = environmentSubstitute( meta.getPrunePath() ); if ( data.prunePath != null ) { if ( Utils.isEmpty( data.prunePath.trim() ) ) { data.prunePath = null; } else { // ensure a leading slash if ( !data.prunePath.startsWith( GetXMLDataMeta.N0DE_SEPARATOR ) ) { data.prunePath = GetXMLDataMeta.N0DE_SEPARATOR + data.prunePath; } // check if other conditions apply that do not allow pruning if ( meta.isInFields() ) { data.prunePath = null; // not possible by design, could be changed later on } } } return true; } return false; } public void dispose( StepMetaInterface smi, StepDataInterface sdi ) { meta = (GetXMLDataMeta) smi; data = (GetXMLDataData) sdi; if ( data.file != null ) { try { data.file.close(); } catch ( Exception e ) { // Ignore close errors } } if ( data.an != null ) { data.an.clear(); data.an = null; } if ( data.NAMESPACE != null ) { data.NAMESPACE.clear(); data.NAMESPACE = null; } if ( data.NSPath != null ) { data.NSPath.clear(); data.NSPath = null; } if ( data.readrow != null ) { data.readrow = null; } if ( data.document != null ) { data.document = null; } if ( data.fr != null ) { BaseStep.closeQuietly( data.fr ); } if ( data.is != null ) { BaseStep.closeQuietly( data.is ); } if ( data.files != null ) { data.files = null; } super.dispose( smi, sdi ); } }