package edu.berkeley.cs.nlp.ocular.data; import java.awt.Graphics2D; import java.awt.Image; import java.awt.Rectangle; import java.awt.image.BufferedImage; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.List; import com.sun.pdfview.PDFFile; import com.sun.pdfview.PDFPage; /** * @author Dan Garrette (dhgarrette@gmail.com) */ public class PdfImageReader { public static int numPagesInPdf(File pdfFile) { try { RandomAccessFile raf = new RandomAccessFile(pdfFile, "r"); FileChannel channel = raf.getChannel(); ByteBuffer buf = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size()); PDFFile pdf = new PDFFile(buf); int numPages = pdf.getNumPages(); raf.close(); return numPages; } catch (IOException e) { throw new RuntimeException(e); } } public static List<BufferedImage> readPdfAsImages(File pdfFile) { try { RandomAccessFile raf = new RandomAccessFile(pdfFile, "r"); FileChannel channel = raf.getChannel(); ByteBuffer buf = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size()); PDFFile pdf = new PDFFile(buf); List<BufferedImage> images = new ArrayList<BufferedImage>(); for (int pageNumber = 1; pageNumber <= pdf.getNumPages(); ++pageNumber) { images.add(readPage(pdf, pageNumber)); } raf.close(); return images; } catch (IOException e) { throw new RuntimeException(e); } } /** * * @param pdfFile * Path to the pdf file. * @param pageNumber * One-based page number to read * @return */ public static BufferedImage readPdfPageAsImage(File pdfFile, int pageNumber) { if (pageNumber < 1) throw new RuntimeException("page numbering starts with 1; '" + pageNumber + "' given"); try { RandomAccessFile raf = new RandomAccessFile(pdfFile, "r"); FileChannel channel = raf.getChannel(); ByteBuffer buf = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size()); PDFFile pdf = new PDFFile(buf); BufferedImage image = readPage(pdf, pageNumber); raf.close(); return image; } catch (IOException e) { throw new RuntimeException(e); } } private static BufferedImage readPage(PDFFile pdf, int pageNumber) { double scale = 2.5; // because otherwise the image comes out really tiny PDFPage page = pdf.getPage(pageNumber); Rectangle rect = new Rectangle(0, 0, (int) page.getBBox().getWidth(), (int) page.getBBox().getHeight()); BufferedImage bufferedImage = new BufferedImage((int)(rect.width * scale), (int)(rect.height * scale), BufferedImage.TYPE_INT_RGB); Image image = page.getImage((int)(rect.width * scale), (int)(rect.height * scale), rect, null, true, true); Graphics2D bufImageGraphics = bufferedImage.createGraphics(); bufImageGraphics.drawImage(image, 0, 0, null); return bufferedImage; } }