/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.poi;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.io.InputStreamCallback;
import org.apache.nifi.processor.io.OutputStreamCallback;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
@Tags({"excel", "csv", "poi"})
@CapabilityDescription("Consumes a Microsoft Excel document and converts each worksheet to csv. Each sheet from the incoming Excel " +
"document will generate a new Flowfile that will be output from this processor. Each output Flowfile's contents will be formatted as a csv file " +
"where the each row from the excel sheet is output as a newline in the csv file. This processor is currently only capable of processing .xlsx " +
"(XSSF 2007 OOXML file format) Excel documents and not older .xls (HSSF '97(-2007) file format) documents. This processor also expects well formatted " +
"CSV content and will not escape cell's containing invalid content such as newlines or additional commas.")
@WritesAttributes({@WritesAttribute(attribute="sheetname", description="The name of the Excel sheet that this particular row of data came from in the Excel document"),
@WritesAttribute(attribute="numrows", description="The number of rows in this Excel Sheet"),
@WritesAttribute(attribute="sourcefilename", description="The name of the Excel document file that this data originated from"),
@WritesAttribute(attribute="convertexceltocsvprocessor.error", description="Error message that was encountered on a per Excel sheet basis. This attribute is" +
" only populated if an error was occured while processing the particular sheet. Having the error present at the sheet level will allow for the end" +
" user to better understand what syntax errors in their excel doc on a larger scale caused the error.")})
public class ConvertExcelToCSVProcessor
extends AbstractProcessor {
private static final String CSV_MIME_TYPE = "text/csv";
public static final String SHEET_NAME = "sheetname";
public static final String ROW_NUM = "numrows";
public static final String SOURCE_FILE_NAME = "sourcefilename";
private static final String SAX_CELL_REF = "c";
private static final String SAX_CELL_TYPE = "t";
private static final String SAX_CELL_STRING = "s";
private static final String SAX_CELL_CONTENT_REF = "v";
private static final String SAX_ROW_REF = "row";
private static final String SAX_SHEET_NAME_REF = "sheetPr";
private static final String DESIRED_SHEETS_DELIMITER = ",";
private static final String UNKNOWN_SHEET_NAME = "UNKNOWN";
private static final String SAX_PARSER = "org.apache.xerces.parsers.SAXParser";
public static final PropertyDescriptor DESIRED_SHEETS = new PropertyDescriptor
.Builder().name("extract-sheets")
.displayName("Sheets to Extract")
.description("Comma separated list of Excel document sheet names that should be extracted from the excel document. If this property" +
" is left blank then all of the sheets will be extracted from the Excel document. The list of names is case in-sensitive. Any sheets not " +
"specified in this value will be ignored.")
.required(false)
.expressionLanguageSupported(true)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
public static final Relationship ORIGINAL = new Relationship.Builder()
.name("original")
.description("Original Excel document received by this processor")
.build();
public static final Relationship SUCCESS = new Relationship.Builder()
.name("success")
.description("Excel data converted to csv")
.build();
public static final Relationship FAILURE = new Relationship.Builder()
.name("failure")
.description("Failed to parse the Excel document")
.build();
private List<PropertyDescriptor> descriptors;
private Set<Relationship> relationships;
@Override
protected void init(final ProcessorInitializationContext context) {
final List<PropertyDescriptor> descriptors = new ArrayList<>();
descriptors.add(DESIRED_SHEETS);
this.descriptors = Collections.unmodifiableList(descriptors);
final Set<Relationship> relationships = new HashSet<>();
relationships.add(ORIGINAL);
relationships.add(SUCCESS);
relationships.add(FAILURE);
this.relationships = Collections.unmodifiableSet(relationships);
}
@Override
public Set<Relationship> getRelationships() {
return this.relationships;
}
@Override
public final List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return descriptors;
}
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
final FlowFile flowFile = session.get();
if ( flowFile == null ) {
return;
}
try {
session.read(flowFile, new InputStreamCallback() {
@Override
public void process(InputStream inputStream) throws IOException {
try {
String desiredSheetsDelimited = context.getProperty(DESIRED_SHEETS)
.evaluateAttributeExpressions().getValue();
OPCPackage pkg = OPCPackage.open(inputStream);
XSSFReader r = new XSSFReader(pkg);
SharedStringsTable sst = r.getSharedStringsTable();
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) r.getSheetsData();
if (desiredSheetsDelimited != null) {
String[] desiredSheets = StringUtils
.split(desiredSheetsDelimited, DESIRED_SHEETS_DELIMITER);
if (desiredSheets != null) {
while (iter.hasNext()) {
InputStream sheet = iter.next();
String sheetName = iter.getSheetName();
for (int i = 0; i < desiredSheets.length; i++) {
//If the sheetName is a desired one parse it
if (sheetName.equalsIgnoreCase(desiredSheets[i])) {
handleExcelSheet(session, flowFile, sst, sheet, sheetName);
break;
}
}
}
} else {
getLogger().debug("Excel document was parsed but no sheets with the specified desired names were found.");
}
} else {
//Get all of the sheets in the document.
while (iter.hasNext()) {
handleExcelSheet(session, flowFile, sst, iter.next(), iter.getSheetName());
}
}
} catch (InvalidFormatException ife) {
getLogger().error("Only .xlsx Excel 2007 OOXML files are supported", ife);
throw new UnsupportedOperationException("Only .xlsx Excel 2007 OOXML files are supported", ife);
} catch (OpenXML4JException e) {
getLogger().error("Error occurred while processing Excel document metadata", e);
}
}
});
session.transfer(flowFile, ORIGINAL);
} catch (RuntimeException ex) {
getLogger().error("Failed to process incoming Excel document", ex);
FlowFile failedFlowFile = session.putAttribute(flowFile,
ConvertExcelToCSVProcessor.class.getName() + ".error", ex.getMessage());
session.transfer(failedFlowFile, FAILURE);
}
}
/**
* Handles an individual Excel sheet from the entire Excel document. Each sheet will result in an individual flowfile.
*
* @param session
* The NiFi ProcessSession instance for the current invocation.
*/
private void handleExcelSheet(ProcessSession session, FlowFile originalParentFF,
SharedStringsTable sst, final InputStream sheetInputStream, String sName) throws IOException {
FlowFile ff = session.create();
try {
XMLReader parser =
XMLReaderFactory.createXMLReader(
SAX_PARSER
);
ExcelSheetRowHandler handler = new ExcelSheetRowHandler(sst);
parser.setContentHandler(handler);
ff = session.write(ff, new OutputStreamCallback() {
@Override
public void process(OutputStream out) throws IOException {
InputSource sheetSource = new InputSource(sheetInputStream);
ExcelSheetRowHandler eh = null;
try {
eh = (ExcelSheetRowHandler) parser.getContentHandler();
eh.setFlowFileOutputStream(out);
parser.setContentHandler(eh);
parser.parse(sheetSource);
sheetInputStream.close();
} catch (SAXException se) {
getLogger().error("Error occurred while processing Excel sheet {}", new Object[]{eh.getSheetName()}, se);
}
}
});
if (handler.getSheetName().equals(UNKNOWN_SHEET_NAME)) {
//Used the named parsed from the handler. This logic is only here because IF the handler does find a value that should take precedence.
ff = session.putAttribute(ff, SHEET_NAME, sName);
} else {
ff = session.putAttribute(ff, SHEET_NAME, handler.getSheetName());
sName = handler.getSheetName();
}
ff = session.putAttribute(ff, ROW_NUM, new Long(handler.getRowCount()).toString());
if (StringUtils.isNotEmpty(originalParentFF.getAttribute(CoreAttributes.FILENAME.key()))) {
ff = session.putAttribute(ff, SOURCE_FILE_NAME, originalParentFF.getAttribute(CoreAttributes.FILENAME.key()));
} else {
ff = session.putAttribute(ff, SOURCE_FILE_NAME, UNKNOWN_SHEET_NAME);
}
//Update the CoreAttributes.FILENAME to have the .csv extension now. Also update MIME.TYPE
ff = session.putAttribute(ff, CoreAttributes.FILENAME.key(), updateFilenameToCSVExtension(ff.getAttribute(CoreAttributes.UUID.key()),
ff.getAttribute(CoreAttributes.FILENAME.key()), sName));
ff = session.putAttribute(ff, CoreAttributes.MIME_TYPE.key(), CSV_MIME_TYPE);
session.transfer(ff, SUCCESS);
} catch (SAXException saxE) {
getLogger().error("Failed to create instance of SAXParser {}", new Object[]{SAX_PARSER}, saxE);
ff = session.putAttribute(ff,
ConvertExcelToCSVProcessor.class.getName() + ".error", saxE.getMessage());
session.transfer(ff, FAILURE);
} finally {
sheetInputStream.close();
}
}
/**
* Extracts every row from an Excel Sheet and generates a corresponding JSONObject whose key is the Excel CellAddress and value
* is the content of that CellAddress converted to a String
*/
private class ExcelSheetRowHandler
extends DefaultHandler {
private SharedStringsTable sst;
private String currentContent;
private boolean nextIsString;
private OutputStream outputStream;
private boolean firstColInRow;
long rowCount;
String sheetName;
private ExcelSheetRowHandler(SharedStringsTable sst) {
this.sst = sst;
this.firstColInRow = true;
this.rowCount = 0l;
this.sheetName = UNKNOWN_SHEET_NAME;
}
public void setFlowFileOutputStream(OutputStream outputStream) {
this.outputStream = outputStream;
}
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
if (name.equals(SAX_CELL_REF)) {
String cellType = attributes.getValue(SAX_CELL_TYPE);
if(cellType != null && cellType.equals(SAX_CELL_STRING)) {
nextIsString = true;
} else {
nextIsString = false;
}
} else if (name.equals(SAX_ROW_REF)) {
firstColInRow = true;
} else if (name.equals(SAX_SHEET_NAME_REF)) {
sheetName = attributes.getValue(0);
}
currentContent = "";
}
public void endElement(String uri, String localName, String name)
throws SAXException {
if (nextIsString) {
int idx = Integer.parseInt(currentContent);
currentContent = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
nextIsString = false;
}
if (name.equals(SAX_CELL_CONTENT_REF)) {
if (firstColInRow) {
firstColInRow = false;
try {
outputStream.write(currentContent.getBytes());
} catch (IOException e) {
getLogger().error("IO error encountered while writing content of parsed cell " +
"value from sheet {}", new Object[]{getSheetName()}, e);
}
} else {
try {
outputStream.write(("," + currentContent).getBytes());
} catch (IOException e) {
getLogger().error("IO error encountered while writing content of parsed cell " +
"value from sheet {}", new Object[]{getSheetName()}, e);
}
}
}
if (name.equals(SAX_ROW_REF)) {
//If this is the first row and the end of the row element has been encountered then that means no columns were present.
if (!firstColInRow) {
try {
rowCount++;
outputStream.write("\n".getBytes());
} catch (IOException e) {
getLogger().error("IO error encountered while writing new line indicator", e);
}
}
}
}
public void characters(char[] ch, int start, int length)
throws SAXException {
currentContent += new String(ch, start, length);
}
public long getRowCount() {
return rowCount;
}
public String getSheetName() {
return sheetName;
}
}
/**
* Takes the original input filename and updates it by removing the file extension and replacing it with
* the .csv extension.
*
* @param origFileName
* Original filename from the input file.
*
* @return
* The new filename with the .csv extension that should be place in the output flowfile's attributes
*/
private String updateFilenameToCSVExtension(String nifiUUID, String origFileName, String sheetName) {
StringBuilder stringBuilder = new StringBuilder();
if (StringUtils.isNotEmpty(origFileName)) {
String ext = FilenameUtils.getExtension(origFileName);
if (StringUtils.isNotEmpty(ext)) {
stringBuilder.append(StringUtils.replace(origFileName, ("." + ext), ""));
} else {
stringBuilder.append(origFileName);
}
} else {
stringBuilder.append(nifiUUID);
}
stringBuilder.append("_");
stringBuilder.append(sheetName);
stringBuilder.append(".");
stringBuilder.append("csv");
return stringBuilder.toString();
}
}