package filetools.pdf; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import org.apache.commons.io.FilenameUtils; import org.apache.pdfbox.pdmodel.PDDocument; public class PdfTextExtraction { public static void main(String args[]) throws IOException { String examinedFolder = utilities.BrowserDialogs.chooseFolder(); PrintWriter outputXml = new PrintWriter(new FileWriter(examinedFolder + "\\PdfTextEctraction.xml")); if (examinedFolder != null) { ArrayList<File> files = utilities.ListsFiles.getPaths(new File(examinedFolder), new ArrayList<File>()); if (files != null) { for (int i = 0; i < files.size(); i++) { String extension = FilenameUtils.getExtension(files.get(i).toString()).toLowerCase(); if (extension.equals("pdf")) { outputXml.println (i+1); outputXml.println(files.get(i).toString()); long filesizePdf = files.get(i).length(); // PDDocument testfileOrg = PDDocument.load(OrgPdf); String[] linesPdf = PdfAnalysis.extractsPdfLines(files.get(i).toString()); int lenPdf = linesPdf.length; for (int j = 0; j < lenPdf; j++) { outputXml.println(linesPdf[j]); } } } } } outputXml.close(); } }