/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.examples.util;

import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFOperator;

import java.util.ArrayList;
import java.util.List;

/**
 * This is an example on how to remove all text from a PDF document.
 *
 * Usage: java org.apache.pdfbox.examples.util.RemoveAllText &lt;input-pdf&gt; &lt;output-pdf&gt;
 *
 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
 * @version $Revision: 1.2 $
 */
public class RemoveAllText
{
    /**
     * Default constructor.
     */
    private RemoveAllText()
    {
        //example class should not be instantiated
    }

    /**
     * This will remove all text from a PDF document.
     *
     * Every page's content stream is parsed into tokens, the <code>Tj</code> and
     * <code>TJ</code> operators (together with their single operand) are dropped,
     * and the remaining tokens are written back as a new compressed content stream.
     * The result is saved to the output file; the input file is not modified.
     *
     * @param args The command line arguments: <code>args[0]</code> is the input PDF,
     *             <code>args[1]</code> is the output PDF.
     *
     * @throws Exception If there is an error parsing the document.
     */
    public static void main( String[] args ) throws Exception
    {
        if( args == null || args.length != 2 )
        {
            usage();
            return;
        }

        String inputFile = args[0];
        String outputFile = args[1];

        PDDocument document = null;
        try
        {
            document = PDDocument.load( inputFile );
            if( document.isEncrypted() )
            {
                System.err.println( "Error: Encrypted documents are not supported for this example." );
                System.exit( 1 );
            }

            List allPages = document.getDocumentCatalog().getAllPages();
            for( int i=0; i<allPages.size(); i++ )
            {
                PDPage page = (PDPage)allPages.get( i );
                PDStream contents = page.getContents();
                if( contents == null )
                {
                    // assumes getContents() yields null for a page without a
                    // content stream; such a page has nothing to strip
                    continue;
                }

                PDFStreamParser parser = new PDFStreamParser( contents );
                parser.parse();
                List newTokens = removeTextShowingOperators( parser.getTokens() );

                PDStream newContents = new PDStream( document );
                ContentStreamWriter writer = new ContentStreamWriter( newContents.createOutputStream() );
                writer.writeTokens( newTokens );
                newContents.addCompression();
                page.setContents( newContents );
            }
            document.save( outputFile );
        }
        finally
        {
            if( document != null )
            {
                document.close();
            }
        }
    }

    /**
     * Copies a content stream token list, leaving out every <code>Tj</code> / <code>TJ</code>
     * operator and the token that immediately precedes it (its one operand).
     *
     * NOTE(review): only <code>Tj</code> and <code>TJ</code> are handled here; other
     * text-showing operators, if present in a stream, are left untouched.
     *
     * @param tokens The parsed tokens of one content stream, in stream order.
     *
     * @return A new list holding the tokens that are kept, in their original order.
     */
    private static List<Object> removeTextShowingOperators( List tokens )
    {
        List<Object> kept = new ArrayList<Object>( tokens.size() );
        for( Object token : tokens )
        {
            if( token instanceof PDFOperator )
            {
                String operation = ((PDFOperator)token).getOperation();
                if( "TJ".equals( operation ) || "Tj".equals( operation ) )
                {
                    //remove the one argument to this operator; a stream that
                    //starts with the operator has no operand to remove
                    if( !kept.isEmpty() )
                    {
                        kept.remove( kept.size() - 1 );
                    }
                    continue;
                }
            }
            kept.add( token );
        }
        return kept;
    }

    /**
     * This will print the usage for this document.
     */
    private static void usage()
    {
        System.err.println( "Usage: java org.apache.pdfbox.examples.util.RemoveAllText <input-pdf> <output-pdf>" );
    }
}