/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pdfbox.tools; import java.awt.geom.Point2D; import java.awt.image.BufferedImage; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.io.IOUtils; import org.apache.pdfbox.cos.COSStream; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.graphics.image.PDImage; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; import org.apache.pdfbox.tools.imageio.ImageIOUtil; import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine; import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup; import org.apache.pdfbox.pdmodel.graphics.state.PDSoftMask; /** * Extracts the images from a PDF file. * * @author Ben Litchfield */ public final class ExtractImages { private static final String PASSWORD = "-password"; private static final String PREFIX = "-prefix"; private static final String DIRECTJPEG = "-directJPEG"; private static final List<String> JPEG = Arrays.asList( COSName.DCT_DECODE.getName(), COSName.DCT_DECODE_ABBREVIATION.getName()); private boolean directJPEG; private String prefix; private final Set<COSStream> seen = new HashSet<>(); private int imageCounter = 1; private ExtractImages() { } /** * Entry point for the application. * * @param args The command-line arguments. * @throws IOException if there is an error reading the file or extracting the images. */ public static void main(String[] args) throws IOException { // suppress the Dock icon on OS X System.setProperty("apple.awt.UIElement", "true"); ExtractImages extractor = new ExtractImages(); extractor.run(args); } private void run(String[] args) throws IOException { if (args.length < 1 || args.length > 4) { usage(); } else { String pdfFile = null; String password = ""; for(int i = 0; i < args.length; i++) { switch (args[i]) { case PASSWORD: i++; if (i >= args.length) { usage(); } password = args[i]; break; case PREFIX: i++; if (i >= args.length) { usage(); } prefix = args[i]; break; case DIRECTJPEG: directJPEG = true; break; default: if (pdfFile == null) { pdfFile = args[i]; } break; } } if (pdfFile == null) { usage(); } else { if (prefix == null && pdfFile.length() >4) { prefix = pdfFile.substring(0, pdfFile.length() -4); } extract(pdfFile, password); } } } /** * Print the usage requirements and exit. */ private static void usage() { String message = "Usage: java " + ExtractImages.class.getName() + " [options] <inputfile>\n" + "\nOptions:\n" + " -password <password> : Password to decrypt document\n" + " -prefix <image-prefix> : Image prefix(default to pdf name)\n" + " -directJPEG : Forces the direct extraction of JPEG images " + "regardless of colorspace\n" + " <inputfile> : The PDF document to use\n"; System.err.println(message); System.exit(1); } private void extract(String pdfFile, String password) throws IOException { try (PDDocument document = PDDocument.load(new File(pdfFile), password)) { AccessPermission ap = document.getCurrentAccessPermission(); if (!ap.canExtractContent()) { throw new IOException("You do not have permission to extract images"); } for (int i = 0; i < document.getNumberOfPages(); i++) // todo: ITERATOR would be much better { PDPage page = document.getPage(i); ImageGraphicsEngine extractor = new ImageGraphicsEngine(page); extractor.run(); } } } private class ImageGraphicsEngine extends PDFGraphicsStreamEngine { protected ImageGraphicsEngine(PDPage page) throws IOException { super(page); } public void run() throws IOException { PDPage page = getPage(); processPage(page); PDResources res = page.getResources(); for (COSName name : res.getExtGStateNames()) { PDSoftMask softMask = res.getExtGState(name).getSoftMask(); if (softMask != null) { PDTransparencyGroup group = softMask.getGroup(); if (group != null) { processSoftMask(group); } } } } @Override public void drawImage(PDImage pdImage) throws IOException { if (pdImage instanceof PDImageXObject) { PDImageXObject xobject = (PDImageXObject)pdImage; if (seen.contains(xobject.getCOSObject())) { // skip duplicate image return; } seen.add(xobject.getCOSObject()); } // save image String name = prefix + "-" + imageCounter; imageCounter++; System.out.println("Writing image: " + name); write2file(pdImage, name, directJPEG); } @Override public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException { } @Override public void clip(int windingRule) throws IOException { } @Override public void moveTo(float x, float y) throws IOException { } @Override public void lineTo(float x, float y) throws IOException { } @Override public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException { } @Override public Point2D getCurrentPoint() throws IOException { return new Point2D.Float(0, 0); } @Override public void closePath() throws IOException { } @Override public void endPath() throws IOException { } @Override public void strokePath() throws IOException { } @Override public void fillPath(int windingRule) throws IOException { } @Override public void fillAndStrokePath(int windingRule) throws IOException { } @Override public void shadingFill(COSName shadingName) throws IOException { } } private boolean hasMasks(PDImage pdImage) throws IOException { if (pdImage instanceof PDImageXObject) { PDImageXObject ximg = (PDImageXObject) pdImage; return ximg.getMask() != null || ximg.getSoftMask() != null; } return false; } /** * Writes the image to a file with the filename prefix + an appropriate suffix, like "Image.jpg". * The suffix is automatically set depending on the image compression in the PDF. * @param pdImage the image. * @param prefix the filename prefix. * @param directJPEG if true, force saving JPEG streams as they are in the PDF file. * @throws IOException When something is wrong with the corresponding file. */ private void write2file(PDImage pdImage, String filename, boolean directJPEG) throws IOException { String suffix = pdImage.getSuffix(); if (suffix == null) { suffix = "png"; } try (FileOutputStream out = new FileOutputStream(filename + "." + suffix)) { BufferedImage image = pdImage.getImage(); if (image != null) { if ("jpg".equals(suffix)) { String colorSpaceName = pdImage.getColorSpace().getName(); if (directJPEG || !hasMasks(pdImage) && (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) || PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName))) { // RGB or Gray colorspace: get and write the unmodified JPEG stream InputStream data = pdImage.createInputStream(JPEG); IOUtils.copy(data, out); IOUtils.closeQuietly(data); } else { // for CMYK and other "unusual" colorspaces, the JPEG will be converted ImageIOUtil.writeImage(image, suffix, out); } } else { ImageIOUtil.writeImage(image, suffix, out); } } out.flush(); } } }