package org.icepdf.os.examples.extraction;
/*
* Copyright 2006-2017 ICEsoft Technologies Canada Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an "AS
* IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
import org.icepdf.core.exceptions.PDFException;
import org.icepdf.core.exceptions.PDFSecurityException;
import org.icepdf.core.pobjects.Document;
import org.icepdf.core.pobjects.graphics.text.LineText;
import org.icepdf.core.pobjects.graphics.text.PageText;
import org.icepdf.ri.util.FontPropertiesManager;
import org.icepdf.ri.util.PropertiesManager;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.ResourceBundle;
/**
 * The <code>PageTextExtraction</code> class is an example of how to extract
 * text from a PDF document. A file specified at the command line is opened
 * and the text found on every page is written, one extracted line per output
 * line, to the file <code>extracted_text.txt</code>.
 *
 * @since 2.0
 */
public class PageTextExtraction {

    /** Name of the file the extracted text is written to. */
    private static final String OUTPUT_FILE_NAME = "extracted_text.txt";

    /**
     * Opens the PDF named by the first command line argument and writes the
     * text of all of its pages to {@link #OUTPUT_FILE_NAME}. Problems are
     * reported on standard output; nothing is thrown to the caller.
     *
     * @param args command line arguments; <code>args[0]</code> must be the
     *             path of the PDF file to read.
     */
    public static void main(String[] args) {
        // A file path is mandatory; print a usage hint instead of failing
        // with an ArrayIndexOutOfBoundsException.
        if (args == null || args.length == 0) {
            System.out.println("Usage: PageTextExtraction <pdf file path>");
            return;
        }
        // Get a file from the command line to open
        String filePath = args[0];

        // read/store the font cache.
        ResourceBundle messageBundle = ResourceBundle.getBundle(
                PropertiesManager.DEFAULT_MESSAGE_BUNDLE);
        PropertiesManager properties =
                new PropertiesManager(System.getProperties(), messageBundle);
        new FontPropertiesManager(properties, System.getProperties(), messageBundle);

        Document document = new Document();
        try {
            // Stop here when the document could not be opened so the output
            // file is not created/truncated for a document that never loaded.
            if (!openDocument(document, filePath)) {
                return;
            }
            writeDocumentText(document, new File(OUTPUT_FILE_NAME));
        } finally {
            // clean up resources, also when extraction fails unexpectedly
            document.dispose();
        }
    }

    /**
     * Loads the file at <code>filePath</code> into <code>document</code>,
     * printing a message describing the failure if it cannot be loaded.
     *
     * @param document document instance to load the file into.
     * @param filePath path of the PDF file to open.
     * @return true if the file was opened without an exception, false
     *         otherwise.
     */
    private static boolean openDocument(Document document, String filePath) {
        try {
            document.setFile(filePath);
            return true;
        } catch (PDFException ex) {
            System.out.println("Error parsing PDF document " + ex);
        } catch (PDFSecurityException ex) {
            System.out.println("Error encryption not supported " + ex);
        } catch (FileNotFoundException ex) {
            System.out.println("Error file not found " + ex);
        } catch (IOException ex) {
            System.out.println("Error handling PDF document " + ex);
        }
        return false;
    }

    /**
     * Writes the text lines of every page of <code>document</code> to
     * <code>outputFile</code>, each followed by a newline character. Pages
     * that yield no text are skipped. The writer is always closed, whether
     * or not extraction succeeds.
     *
     * @param document   opened document to extract text from.
     * @param outputFile file that receives the extracted text.
     */
    private static void writeDocumentText(Document document, File outputFile) {
        FileWriter fileWriter = null;
        try {
            // create a file to write the extracted text to
            fileWriter = new FileWriter(outputFile);
            // Get text from each page of the document, assuming that there
            // is text to extract.
            for (int pageNumber = 0, max = document.getNumberOfPages();
                 pageNumber < max; pageNumber++) {
                PageText pageText = document.getPageText(pageNumber);
                System.out.println("Extracting page text: " + pageNumber);
                if (pageText == null) {
                    continue;
                }
                ArrayList<LineText> pageLines = pageText.getPageLines();
                if (pageLines == null) {
                    continue;
                }
                for (LineText lineText : pageLines) {
                    fileWriter.write(lineText.toString());
                    fileWriter.write('\n');
                }
            }
        } catch (IOException ex) {
            System.out.println("Error writing to file " + ex);
        } catch (InterruptedException ex) {
            System.out.println("Error parsing page " + ex);
            // restore the interrupt flag so code further up can observe it
            Thread.currentThread().interrupt();
        } finally {
            // close the writer even when a page failed part way through
            if (fileWriter != null) {
                try {
                    fileWriter.close();
                } catch (IOException ex) {
                    System.out.println("Error writing to file " + ex);
                }
            }
        }
    }
}