package edu.berkeley.cs.nlp.ocular.data;

import java.awt.image.BufferedImage;
import java.io.File;
import java.util.Locale;

import edu.berkeley.cs.nlp.ocular.util.FileUtil;

/**
 * A document that reads a page from a pdf file only as it is needed
 * (and then stores the contents in memory for later use).
 *
 * <p>Each instance identifies exactly one page of one pdf file. The page
 * image itself is produced by {@link #doLoadBufferedImage()}.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class LazyRawPdfImageDocument extends LazyRawImageDocument {

	/** The pdf file that contains the page. */
	private final File pdfFile;

	/** Number of the page within {@link #pdfFile}. */
	private final int pageNumber; // starts at one!

	/**
	 * Creates a document for a single page of a pdf file.
	 *
	 * @param pdfFile             the pdf file to read the page from
	 * @param pageNumber          the page to read; numbering starts at one
	 * @param inputPath           forwarded unchanged to the superclass constructor
	 * @param lineHeight          forwarded unchanged to the superclass constructor
	 * @param binarizeThreshold   forwarded unchanged to the superclass constructor
	 * @param crop                forwarded unchanged to the superclass constructor
	 * @param extractedLinesPath  forwarded unchanged to the superclass constructor
	 */
	public LazyRawPdfImageDocument(File pdfFile, int pageNumber, String inputPath, int lineHeight,
			double binarizeThreshold, boolean crop, String extractedLinesPath) {
		super(inputPath, lineHeight, binarizeThreshold, crop, extractedLinesPath);
		this.pdfFile = pdfFile;
		this.pageNumber = pageNumber;
	}

	/**
	 * Prints a progress message to standard output and reads this document's
	 * page from the pdf file via {@link PdfImageReader#readPdfPageAsImage}.
	 *
	 * @return the image returned by {@code PdfImageReader} for the page
	 */
	protected BufferedImage doLoadBufferedImage() {
		System.out.println("Extracting text line images from " + pdfFile + ", page " + pageNumber);
		return PdfImageReader.readPdfPageAsImage(pdfFile, pageNumber);
	}

	/**
	 * @return the pdf file this document's page comes from
	 */
	protected File file() {
		return pdfFile;
	}

	/**
	 * @return the last path component of {@link #baseName()}, i.e. the base
	 *         name with any parent directories removed
	 */
	protected String preext() {
		return new File(baseName()).getName();
	}

	/**
	 * @return the constant string {@code "png"}
	 */
	protected String ext() {
		return "png";
	}

	/**
	 * Builds a per-page name from the pdf path: the path with its extension
	 * removed by {@link FileUtil#withoutExtension}, followed by
	 * {@code "_pdf_page"} and the page number zero-padded to five digits.
	 * For example, page 3 of {@code dir/book.pdf} would presumably yield
	 * {@code dir/book_pdf_page00003} (the exact prefix depends on
	 * {@code FileUtil.withoutExtension}).
	 *
	 * @return the page-specific base name
	 */
	public String baseName() {
		// Locale.ROOT keeps the digits ASCII. The default-locale overload of
		// String.format may emit locale-specific digits, which would make the
		// generated names differ from one machine to another.
		String paddedPageNumber = String.format(Locale.ROOT, "%05d", pageNumber);
		return FileUtil.withoutExtension(pdfFile.getPath()) + "_pdf_page" + paddedPageNumber;
	}

}