/*
 * Copyright 2010-2011 Øyvind Berg (elacin@gmail.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.elacin.pdfextract;

import org.apache.log4j.Logger;
import org.elacin.pdfextract.datasource.DocumentContent;
import org.elacin.pdfextract.datasource.PDFSource;
import org.elacin.pdfextract.datasource.pdfbox.PDFBoxSource;
import org.elacin.pdfextract.logical.LogicalAnalysis;
import org.elacin.pdfextract.physical.GeometricAnalysis;
import org.elacin.pdfextract.renderer.PageRenderer;
import org.elacin.pdfextract.tree.DocumentNode;
import org.elacin.pdfextract.xml.SimpleXMLOutput;
import org.elacin.pdfextract.xml.TEIOutput;
import org.jetbrains.annotations.NotNull;

import java.io.File;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;

import static org.elacin.pdfextract.Constants.*;

/**
 * Runs the extraction pipeline for one PDF file.
 *
 * <p>{@link #processFile()} opens the file through a {@link PDFBoxSource}, reads its pages, runs
 * {@link GeometricAnalysis} and {@link LogicalAnalysis} on the result, and, depending on the
 * switches in {@code Constants}, writes a simple XML dump, rendered page images and a TEI
 * document next to or into the configured destination.
 *
 * <p>Created by elacin, 15.01.11.
 */
public class ProcessDocument {

    // ------------------------------ FIELDS ------------------------------

    private static final Logger log = Logger.getLogger(ProcessDocument.class);

    /** Extension that is swapped for the output extension when output file names are derived. */
    private static final String PDF_EXTENSION = ".pdf";

    /** The PDF file to process. */
    @NotNull
    public final File pdfFile;

    /** Output location; either an existing directory or the path of an output file. */
    @NotNull
    private final File dest;

    /** Password handed to the {@link PDFBoxSource} when the document is opened. */
    public String password;

    /** First page of the range handed to the {@link PDFBoxSource}. */
    public int startPage;

    /** Last page of the range handed to the {@link PDFBoxSource}. */
    public int endPage;

    /** Flag forwarded unchanged to {@link LogicalAnalysis#analyzeDocument}; its meaning is defined there. */
    final boolean arc;

    // --------------------------- CONSTRUCTORS ---------------------------

    /**
     * Stores the processing parameters; no I/O happens until {@link #processFile()} is called.
     *
     * @param pdfFile   the PDF file to process
     * @param dest      output directory or output file path
     * @param password  password used when opening the document
     * @param startPage first page of the range to read
     * @param endPage   last page of the range to read
     * @param arc       flag forwarded to the logical analysis
     */
    public ProcessDocument(File pdfFile,
                           File dest,
                           String password,
                           int startPage,
                           int endPage,
                           final boolean arc) {
        this.dest = dest;
        this.pdfFile = pdfFile;
        this.password = password;
        this.startPage = startPage;
        this.endPage = endPage;
        this.arc = arc;
    }

    // -------------------------- STATIC METHODS --------------------------

    /**
     * Derives the file an output should be written to.
     *
     * <p>If {@code destination} is an existing directory, the result is a file inside it named
     * after {@code baseFile} with a trailing ".pdf" (any case) swapped for {@code extension}; a
     * name without that suffix gets {@code extension} appended, so the output can never carry the
     * same name as the input. Otherwise {@code destination} is treated as a file path: a trailing
     * ".pdf" is swapped for {@code extension}, and a path without that suffix is used unchanged.
     *
     * <p>Only the trailing suffix is touched. Replacing every ".pdf" occurrence would also
     * rewrite directory names such as {@code my.pdfs/} that merely contain the sequence.
     *
     * @param destination output directory or output file path
     * @param baseFile    the input PDF whose name is used in directory mode
     * @param extension   extension (including the leading dot) of the output file
     * @return the file to write to
     */
    @NotNull
    private static File getOutputFile(@NotNull File destination,
                                      @NotNull File baseFile,
                                      String extension) {
        if (destination.isDirectory()) {
            final String name = baseFile.getName();
            final String stem = endsWithPdfExtension(name) ? stripPdfExtension(name) : name;
            return new File(destination, stem + extension);
        }

        final String path = destination.getAbsolutePath();
        if (endsWithPdfExtension(path)) {
            return new File(stripPdfExtension(path) + extension);
        }
        /* an explicitly named, non-pdf destination is used as given */
        return new File(path);
    }

    /** Tells whether {@code path} ends with ".pdf", ignoring case. */
    private static boolean endsWithPdfExtension(@NotNull String path) {
        final int extLength = PDF_EXTENSION.length();
        return path.length() >= extLength
                && path.regionMatches(true, path.length() - extLength, PDF_EXTENSION, 0, extLength);
    }

    /** Cuts the last {@code ".pdf".length()} characters off {@code path}; the caller checks the suffix. */
    @NotNull
    private static String stripPdfExtension(@NotNull String path) {
        return path.substring(0, path.length() - PDF_EXTENSION.length());
    }

    /**
     * Renders every page found under {@code root} to its own file.
     *
     * <p>The absolute path of {@code destination} is used as a template: "%p" is replaced by the
     * page number reported by the page node and "%d" by a timestamp formatted as
     * {@code MMddHHmm}, taken once before the first page is rendered.
     *
     * @param source      the opened PDF source handed to the {@link PageRenderer}
     * @param root        document tree whose children are the pages to render
     * @param destination path template for the rendered files
     */
    static void renderPDF(PDFSource source, @NotNull DocumentNode root, @NotNull File destination) {
        final long t0 = System.currentTimeMillis();

        final PageRenderer renderer = new PageRenderer(source, root, RENDER_RESOLUTION);

        /* the timestamp and the base path are the same for every page, so resolve them once */
        final DateFormat dateFormat = new SimpleDateFormat("MMddHHmm");
        final String timestamp = dateFormat.format(new Date());
        final String pathTemplate = destination.getAbsolutePath().replace("%d", timestamp);

        for (int i = 0; i < root.getChildren().size(); i++) {
            /* page numbers come from the page node itself, not from the loop index */
            final int pageNum = root.getChildren().get(i).getPageNumber();

            final File outputFile = new File(pathTemplate.replace("%p", String.valueOf(pageNum)));
            renderer.renderToFile(pageNum, outputFile);
        }

        log.debug("Rendering of pdf took " + (System.currentTimeMillis() - t0) + " ms");
    }

    // -------------------------- PUBLIC METHODS --------------------------

    /**
     * Runs the whole pipeline on {@link #pdfFile}.
     *
     * <p>Steps, in order: open the source and read the pages, geometric analysis, optional simple
     * XML output, logical analysis, optional page rendering, optional TEI output. The source is
     * closed afterwards whether or not a step failed.
     *
     * @return the document tree created by the geometric analysis, after the logical analysis
     *         has been run on it
     */
    public DocumentNode processFile() {
        PDFSource source = null;

        try {
            source = new PDFBoxSource(pdfFile, startPage, endPage, password);

            final DocumentContent content = source.readPages();
            final DocumentNode documentNode = GeometricAnalysis.analyzeDocument(content);

            /* the simple dump is written before the logical analysis touches the tree */
            if (SIMPLE_OUTPUT_ENABLED) {
                final File xmlOutFile = getOutputFile(dest, pdfFile, SIMPLE_OUTPUT_EXTENSION);
                new SimpleXMLOutput().writeTree(documentNode, xmlOutFile);
            }

            LogicalAnalysis.analyzeDocument(documentNode, arc);

            if (RENDER_ENABLED) {
                /* %d and %p are filled in per page by renderPDF */
                renderPDF(source, documentNode, getOutputFile(dest, pdfFile, ".%d.%p.png"));
            }

            if (TEI_OUTPUT_ENABLED) {
                final File teiOutFile = getOutputFile(dest, pdfFile, TEI_OUTPUT_EXTENSION);
                new TEIOutput().writeTree(documentNode, teiOutFile);
            }

            return documentNode;
        } finally {
            if (source != null) {
                source.closeSource();
            }
        }
    }
}