/* * Created on Sep 3, 2011 * Copyright 2010 by Eduard Weissmann (edi.weissmann@gmail.com). * * This file is part of the Sejda source code * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.sejda.conversion; import static org.apache.commons.io.FilenameUtils.getExtension; import static org.sejda.common.XMLUtils.nullSafeGetStringAttribute; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathException; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.sejda.conversion.exception.ConversionException; import org.sejda.model.exception.SejdaRuntimeException; import org.sejda.model.input.PdfFileSource; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; /** * Adapter for a list of {@link PdfFileSource}s. Provides a filePath based constructor. Will parse xml, csv config file formats, and list a directory contents * * @author Eduard Weissmann * */ public class PdfFileSourceListAdapter { private static final Logger LOG = LoggerFactory.getLogger(PdfFileSourceListAdapter.class); private final PdfInputFilesSourceFactory parserFactory = new PdfInputFilesSourceFactory(); private final List<PdfFileSource> fileSourceList = new ArrayList<>(); private final File file; private Pattern pattern = Pattern.compile(".+"); public PdfFileSourceListAdapter(String filePath) { file = new File(filePath); if (!file.exists()) { throw new ConversionException("File '" + file.getPath() + "' does not exist"); } } public PdfFileSourceListAdapter filter(String filterRegExp) { if (StringUtils.isNotBlank(filterRegExp)) { LOG.debug("Applying regular expression: {}", filterRegExp); pattern = Pattern.compile(filterRegExp); } return this; } public List<PdfFileSource> getFileSourceList() { fileSourceList.addAll(parserFactory.createSource(file).getInputFiles(file)); if (fileSourceList.isEmpty()) { throw new ConversionException("No input files specified in '" + file.getPath() + "'"); } return fileSourceList; } /** * Factory for {@link PdfInputFilesSource}s. Depending on input {@link File} (folder, csv file, xml file), a different source will be created. * * @author Eduard Weissmann * */ class PdfInputFilesSourceFactory { private static final String XML_EXTENSION = "xml"; private static final String CSV_EXTENSION = "csv"; PdfInputFilesSource createSource(File file) { String extension = getExtension(file.getName()); if (file.isDirectory()) { return new FolderFileSourceListParser(PdfFileSourceListAdapter.this.pattern); } else if (CSV_EXTENSION.equalsIgnoreCase(extension)) { return new CsvFileSourceListParser(); } else if (XML_EXTENSION.equalsIgnoreCase(extension)) { return new XmlFileSourceListParser(); } throw new SejdaRuntimeException("Cannot read input file names from config file '" + file.getName() + "'. Unsupported file format: " + extension); } } } /** * Source for {@link PdfFileSource} input files * * @author Eduard Weissmann * */ interface PdfInputFilesSource { List<PdfFileSource> getInputFiles(File file); } /** * Abstract base class of {@link PdfInputFilesSource}s * * @author Eduard Weissmann * */ abstract class AbstractPdfInputFilesSource implements PdfInputFilesSource { private static final Logger LOG = LoggerFactory.getLogger(AbstractPdfInputFilesSource.class); @Override public List<PdfFileSource> getInputFiles(File file) { List<String> filenames = parseFileNames(file); LOG.trace("Input files: '" + StringUtils.join(filenames, "', '") + "'"); try { return PdfFileSourceAdapter.fromStrings(filenames); } catch (SejdaRuntimeException e) { throw new ConversionException("Invalid filename found: " + e.getMessage(), e); } } protected abstract List<String> parseFileNames(File file); } /** * Produces the list of input files by listing a directory for files with pdf extension * * @author Eduard Weissmann * */ class FolderFileSourceListParser extends AbstractPdfInputFilesSource { protected static final String PDF_EXTENSION = "pdf"; private Pattern pattern; FolderFileSourceListParser(Pattern pattern) { this.pattern = pattern; } @Override protected List<String> parseFileNames(File file) { List<File> files = Arrays.asList(file.listFiles((dir, filename) -> { return StringUtils.equalsIgnoreCase(getExtension(filename), PDF_EXTENSION) && pattern.matcher(filename).matches(); })); List<String> filenames = new ArrayList<>(); for (File current : files) { filenames.add(current.getAbsolutePath()); } Collections.sort(filenames); return filenames; } } /** * Produces the list of input files by parsing a csv file * * @author Eduard Weissmann * */ class CsvFileSourceListParser extends AbstractPdfInputFilesSource { private static final Logger LOG = LoggerFactory.getLogger(CsvFileSourceListParser.class); @Override protected List<String> parseFileNames(File file) { try { return doParseFileNames(file); } catch (Exception e) { LOG.error("Can't extract filesnames", e); throw new ConversionException("Can't extract filenames from '" + file.getName() + "'. Reason:" + e.getMessage(), e); } } protected List<String> doParseFileNames(File file) throws IOException { List<String> resultingFileNames = new ArrayList<>(); List<String> lines = IOUtils.readLines(new FileInputStream(file), Charset.defaultCharset()); for (String eachLine : lines) { String[] splitLine = StringUtils.split(eachLine.toString(), ","); resultingFileNames.addAll(Arrays.asList(splitLine)); } return resultingFileNames; } } /** * Produces the list of input files by parsing a xml file * * @author Eduard Weissmann * */ class XmlFileSourceListParser extends AbstractPdfInputFilesSource { private static final Logger LOG = LoggerFactory.getLogger(XmlFileSourceListParser.class); private XPathFactory xpathFactory = XPathFactory.newInstance(); @Override protected List<String> parseFileNames(File file) { try { return doParseFileNames(file); } catch (Exception e) { LOG.error("Can't extract filenames", e); throw new ConversionException("Can't extract filenames from '" + file.getName() + "'. Reason:" + e.getMessage(), e); } } protected List<String> doParseFileNames(File file) throws IOException, SAXException, ParserConfigurationException, XPathException { DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance(); domFactory.setNamespaceAware(true); DocumentBuilder builder = domFactory.newDocumentBuilder(); Document doc = builder.parse(file); List<String> result = new ArrayList<>(); result.addAll(parseSingleFiles(doc)); result.addAll(parseFileSets(doc, file)); return result; } /** * Parse fileset definitions <filelist><fileset>[...]</fileset></filelist> ignoring the rest of the document * * @param doc * @return a list of string matching the contents of the <filelist><fileset> tags in the document * @throws XPathExpressionException */ private List<String> parseFileSets(Document doc, File configFile) throws XPathExpressionException { List<String> result = new ArrayList<>(); NodeList nodeList = getNodeListMatchingXpath("//filelist/fileset/file", doc); for (int i = 0; i < nodeList.getLength(); i++) { Node node = nodeList.item(i); Node fileSet = node.getParentNode(); String parentDirPath = nullSafeGetStringAttribute(fileSet, "dir"); if (parentDirPath == null) { parentDirPath = configFile.getAbsoluteFile().getParent(); } String filePath = extractFilePath(node); // warn if file in fileset is using absolute path mode if (FilenameUtils.getPrefixLength(filePath) > 0) { LOG.warn("File " + filePath + " in fileset " + StringUtils.defaultIfBlank(nullSafeGetStringAttribute(fileSet, "dir"), "") + " seems to be an absolute path. Will _not_ be resolved relative to the <fileset>, but as an absolute path. Normally you would want to use relative paths in a //filelist/fileset/file, and absolute paths in a //filelist/file."); } result.add(FilenameUtils.concat(parentDirPath, filePath)); } return result; } private String extractFilePath(Node fileNode) { String password = nullSafeGetStringAttribute(fileNode, "password"); String value = nullSafeGetStringAttribute(fileNode, "value"); return value + (password == null ? "" : PdfFileSourceAdapter.PASSWORD_SEPARATOR_CHARACTER + password); } /** * Parse single file definitions <filelist><file>[...]</file></filelist> ignoring the rest of the document * * @param doc * @return a list of string matching the contents of the <filelist><file> tags in the document * @throws XPathExpressionException */ private List<String> parseSingleFiles(Document doc) throws XPathExpressionException { List<String> result = new ArrayList<>(); NodeList nodeList = getNodeListMatchingXpath("//filelist/file", doc); for (int i = 0; i < nodeList.getLength(); i++) { result.add(extractFilePath(nodeList.item(i))); } return result; } private NodeList getNodeListMatchingXpath(String xpathString, Document doc) throws XPathExpressionException { return (NodeList) xpathFactory.newXPath().evaluate(xpathString, doc, XPathConstants.NODESET); } }