package org.icepdf.os.examples.extraction; /* * Copyright 2006-2017 ICEsoft Technologies Canada Corp. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the * License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an "AS * IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language * governing permissions and limitations under the License. */ import org.icepdf.core.exceptions.PDFException; import org.icepdf.core.exceptions.PDFSecurityException; import org.icepdf.core.pobjects.Document; import org.icepdf.core.pobjects.graphics.text.LineText; import org.icepdf.core.pobjects.graphics.text.PageText; import org.icepdf.ri.util.FontPropertiesManager; import org.icepdf.ri.util.PropertiesManager; import java.io.File; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.ResourceBundle; /** * The <code>PageTextExtraction</code> class is an example of how to extract * text from a PDF document. A file specified at the command line is opened * and any text in the first page's content is saved to a text file. * * @since 2.0 */ public class PageTextExtraction { public static void main(String[] args) { // Get a file from the command line to open String filePath = args[0]; // read/store the font cache. ResourceBundle messageBundle = ResourceBundle.getBundle( PropertiesManager.DEFAULT_MESSAGE_BUNDLE); PropertiesManager properties = new PropertiesManager(System.getProperties(), ResourceBundle.getBundle(PropertiesManager.DEFAULT_MESSAGE_BUNDLE)); new FontPropertiesManager(properties, System.getProperties(), messageBundle); // open the url Document document = new Document(); try { document.setFile(filePath); } catch (PDFException ex) { System.out.println("Error parsing PDF document " + ex); } catch (PDFSecurityException ex) { System.out.println("Error encryption not supported " + ex); } catch (FileNotFoundException ex) { System.out.println("Error file not found " + ex); } catch (IOException ex) { System.out.println("Error handling PDF document " + ex); } try { // create a file to write the extracted text to File file = new File("extracted_text.txt"); FileWriter fileWriter = new FileWriter(file); // Get text from the first page of the document, assuming that there // is text to extract. for (int pageNumber = 0, max = document.getNumberOfPages(); pageNumber < max; pageNumber++) { PageText pageText = document.getPageText(pageNumber); System.out.println("Extracting page text: " + pageNumber); if (pageText != null && pageText.getPageLines() != null) { ArrayList<LineText> pageLines = pageText.getPageLines(); for (LineText lineText : pageLines) { fileWriter.write(lineText.toString()); fileWriter.write('\n'); } } } // close the writer fileWriter.close(); } catch (IOException ex) { System.out.println("Error writing to file " + ex); } catch (InterruptedException ex) { System.out.println("Error paring page " + ex); } // clean up resources document.dispose(); } }