/*
* Copyright 2010-2011 Øyvind Berg (elacin@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.elacin.pdfextract;
import org.apache.commons.cli.*;
import org.apache.log4j.Logger;
import org.elacin.pdfextract.util.FileWalker;
import org.jetbrains.annotations.NotNull;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
/**
* Created by IntelliJ IDEA. User: elacin Date: Apr 8, 2010 Time: 6:50:25 AM To change this template
* use File | Settings | File Templates.
*/
public class TextExtractor {
// ------------------------------ FIELDS ------------------------------
public static final Logger log = Logger.getLogger("pdfextract-interface");
private final File destination;
private final int endPage;
private final String password;
private final List<File> pdfFiles;
private final int startPage;
private final boolean arc;
// --------------------------- CONSTRUCTORS ---------------------------
public TextExtractor(final List<File> pdfFiles, final File destination, final int startPage,
final int endPage, final String password, final boolean arc) {
this.pdfFiles = pdfFiles;
this.destination = destination;
this.startPage = startPage;
this.endPage = endPage;
this.password = password;
this.arc = arc;
}
// -------------------------- STATIC METHODS --------------------------
@NotNull
protected static List<File> findAllPdfFilesUnderDirectory(final String filename) {
List<File> ret = new ArrayList<File>();
File file = new File(filename);
if (!file.exists()) {
throw new RuntimeException("File " + file + " does not exist");
} else if (file.isDirectory()) {
try {
ret.addAll(FileWalker.getFileListing(file, ".pdf"));
} catch (FileNotFoundException e) {
log.error("Could not find file " + filename);
}
} else if (file.isFile()) {
ret.add(file);
}
return ret;
}
@NotNull
private static Options getOptions() {
Options options = new Options();
options.addOption("p", "password", true, "Password for decryption of document");
options.addOption("s", "startpage", true, "First page to parse");
options.addOption("e", "endpage", true, "Last page to parse");
options.addOption("a", "arc", false, "Activate ARC extensions");
return options;
}
@NotNull
private static CommandLine parseParameters(final String[] args) {
Options options = getOptions();
CommandLineParser parser = new PosixParser();
CommandLine cmd = null;
try {
cmd = parser.parse(options, args);
} catch (ParseException e) {
log.error("Could not parse command line options: " + e.getMessage());
usage();
System.exit(1);
}
return cmd;
}
private static void usage() {
new HelpFormatter().printHelp(TextExtractor.class.getSimpleName()
+ "<PDF file/dir> <XML output file/dir>", getOptions());
}
// -------------------------- PUBLIC METHODS --------------------------
public final void processFiles() {
for (File pdfFile : pdfFiles) {
try {
ProcessDocument processDocument = new ProcessDocument(pdfFile, destination, password,
startPage, endPage, arc);
processDocument.processFile();
} catch (Exception e) {
log.error("Error while processing PDF:", e);
}
}
}
// --------------------------- main() method ---------------------------
public static void main(String[] args) {
CommandLine cmd = parseParameters(args);
if (cmd.getArgs().length != 2) {
usage();
return;
}
int startPage = -1;
if (cmd.hasOption("startpage")) {
startPage = Integer.valueOf(cmd.getOptionValue("startpage"));
log.info("LOG00140:Reading from page " + startPage);
}
int endPage = Integer.MAX_VALUE;
if (cmd.hasOption("endpage")) {
endPage = Integer.valueOf(cmd.getOptionValue("endpage"));
log.info("LOG00150:Reading until page " + endPage);
}
String password = null;
if (cmd.hasOption("password")) {
password = cmd.getOptionValue("password");
}
final boolean arc = cmd.hasOption("arc");
List<File> pdfFiles = findAllPdfFilesUnderDirectory(cmd.getArgs()[0]);
final File destination = new File(cmd.getArgs()[1]);
if (pdfFiles.size() > 1) {
/* if we have more than one input file, demand that the output be a directory */
if (destination.exists()) {
if (!destination.isDirectory()) {
log.error("When specifying multiple input files, output needs to be a directory");
return;
}
} else {
if (!destination.mkdirs()) {
log.error("Could not create output directory");
return;
}
}
}
final TextExtractor textExtractor = new TextExtractor(pdfFiles, destination, startPage, endPage,
password, arc);
textExtractor.processFiles();
}
}