/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; import java.io.IOException; import java.io.OutputStream; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.cos.COSStream; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.common.PDStream; /** * A simple command line utility for reducing the size of the ref-guide PDF. * <p> * Currently this script focuses on using {@link COSName#FLATE_DECODE} to compress the (decoded) Objects * in the source PDF, but other improvements may be possible in the future. * </p> * <p> * This code is originally based on the <code>WriteDecodedDoc</code> example provided with <a href="https://pdfbox.apache.org/">Apache PDFBox</a>. * </p> * <p> * <b>NOTE:</b> This class should <em>NOT</em> be considered a general purpose tool for reducing the size of * <em>any</em> PDF. * Decisions made in this code can and will be focused explicitly on serving the purpose of reducing the size of the * Solr Reference Guide PDF, as originally produced by asciidoctor, and may not be generally useful for all PDFs * "in the wild". * </p> */ public class ReducePDFSize { public static void main(String[] args) throws IOException { if (2 != args.length) { throw new RuntimeException("arg0 must be input file, org1 must be output file"); } String in = args[0]; String out = args[1]; PDDocument doc = null; try { doc = PDDocument.load(new File(in)); doc.setAllSecurityToBeRemoved(true); for (COSObject cosObject : doc.getDocument().getObjects()) { COSBase base = cosObject.getObject(); // if it's a stream: decode it, then re-write it using FLATE_DECODE if (base instanceof COSStream) { COSStream stream = (COSStream) base; byte[] bytes; try { bytes = new PDStream(stream).toByteArray(); } catch (IOException ex) { // NOTE: original example code from PDFBox just logged & "continue;"d here, 'skipping' this stream. // If this type of failure ever happens, we can (perhaps) consider (re)ignoring this type of failure? // // IIUC then that will leave the original (non-decoded / non-flated) stream in place? throw new RuntimeException("can't serialize byte[] from: " + cosObject.getObjectNumber() + " " + cosObject.getGenerationNumber() + " obj: " + ex.getMessage(), ex); } stream.removeItem(COSName.FILTER); OutputStream streamOut = stream.createOutputStream(COSName.FLATE_DECODE); streamOut.write(bytes); streamOut.close(); } } doc.getDocumentCatalog(); doc.save( out ); } finally { if ( doc != null ) { doc.close(); } } } }