/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pdfbox.text; import difflib.ChangeDelta; import difflib.DeleteDelta; import difflib.DiffUtils; import difflib.InsertDelta; import difflib.Patch; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FilenameFilter; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.LineNumberReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.PrintStream; import java.io.Writer; import java.net.URISyntaxException; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import junit.framework.Test; import junit.framework.TestCase; import junit.framework.TestSuite; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.TestPDPageTree; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; /** * Test suite for PDFTextStripper. * * FILE SET VALIDATION * * This test suite is designed to test PDFTextStripper using a set of PDF * files and known good output for each. The default mode of testAll() * is to process each *.pdf file in "src/test/resources/input". An output * file is created in "target/test-output" with the same name as the PDF file, * plus an additional ".txt" suffix. * * The output file is then tested against a known good result file from * the input directory (again, with the same name as the tested PDF file, * but with the additional ".txt" suffix). The process is performed both * with and without sorting enabled. The sorted files have a "-sorted.txt" * suffix. * * So for the file "src/test/resources/input/hello.pdf", an output file will * be generated named "target/test-output/hello.pdf.txt". Then that file * will be compared to the known good file * "src/test/resources/input/hello.pdf.txt", if it exists. * * To support testing with files that are not officially distributed * with PDFBox, this test will also look in the "target/test-input-ext" * directory. * * Any errors are logged, and at the end of processing all *.pdf files, if * there were any errors, the test fails. The logging is at INFO, as the * general goal is overall validation, and on failure, the indication of * which file or files failed. * * When processing new PDF files, you may use testAll() to generate output, * verify the output manually, then move the output file to the test input * directory to use as the basis for future validations. * * SINGLE FILE VALIDATION * * To further research individual failures, the org.apache.pdfbox.util.TextStripper.file * system property may be set with the name of a single file in the "test/input" * directory. In this mode, testAll() will evaluate only that file, and will * do so with DEBUG level logging. * * @author Robert Dickinson * @author Ben Litchfield */ public class TestTextStripper extends TestCase { /** * Logger instance. */ private static final Log log = LogFactory.getLog(TestTextStripper.class); private boolean bFail = false; private PDFTextStripper stripper = null; private static final String ENCODING = "UTF-8"; /** * Test class constructor. * * @param name The name of the test class. * * @throws IOException If there is an error creating the test. */ public TestTextStripper( String name ) throws IOException { super( name ); stripper = new PDFTextStripper(); stripper.setLineSeparator("\n"); } /** * Test suite setup. */ @Override public void setUp() { // If you want to test a single file using DEBUG logging, from an IDE, // you can do something like this: // // System.setProperty("org.apache.pdfbox.util.TextStripper.file", "FVS318Ref.pdf"); } /** * Determine whether two strings are equal, where two null strings are * considered equal. * * @param expected Expected string * @param actual Actual String * @return <code>true</code> is the strings are both null, * or if their contents are the same, otherwise <code>false</code>. */ private boolean stringsEqual(String expected, String actual) { boolean equals = true; if( (expected == null) && (actual == null) ) { return true; } else if( expected != null && actual != null ) { expected = expected.trim(); actual = actual.trim(); char[] expectedArray = expected.toCharArray(); char[] actualArray = actual.toCharArray(); int expectedIndex = 0; int actualIndex = 0; while( expectedIndex<expectedArray.length && actualIndex<actualArray.length ) { if( expectedArray[expectedIndex] != actualArray[actualIndex] ) { equals = false; log.warn("Lines differ at index" + " expected:" + expectedIndex + "-" + (int)expectedArray[expectedIndex] + " actual:" + actualIndex + "-" + (int)actualArray[actualIndex] ); break; } expectedIndex = skipWhitespace( expectedArray, expectedIndex ); actualIndex = skipWhitespace( actualArray, actualIndex ); expectedIndex++; actualIndex++; } if( equals ) { if( expectedIndex != expectedArray.length ) { equals = false; log.warn("Expected line is longer at:" + expectedIndex ); } if( actualIndex != actualArray.length ) { equals = false; log.warn("Actual line is longer at:" + actualIndex ); } } } else { equals = (expected == null && actual != null && actual.trim().isEmpty()) || (actual == null && expected != null && expected.trim().isEmpty()); } return equals; } /** * If the current index is whitespace then skip any subsequent whitespace. */ private int skipWhitespace( char[] array, int index ) { //if we are at a space character then skip all space //characters, but when all done rollback 1 because stringsEqual //will roll forward 1 if( array[index] == ' ' || array[index] > 256 ) { while( index < array.length && (array[index] == ' ' || array[index] > 256)) { index++; } index--; } return index; } /** * Validate text extraction on a single file. * * @param inFile The PDF file to validate * @param outDir The directory to store the output in * @param bLogResult Whether to log the extracted text * @param bSort Whether or not the extracted text is sorted * @throws Exception when there is an exception */ public void doTestFile(File inFile, File outDir, boolean bLogResult, boolean bSort) throws Exception { if(bSort) { log.info("Preparing to parse " + inFile.getName() + " for sorted test"); } else { log.info("Preparing to parse " + inFile.getName() + " for standard test"); } if (!outDir.exists()) { if (!outDir.mkdirs()) { throw (new Exception("Error creating " + outDir.getAbsolutePath() + " directory")); } } //System.out.println(" " + inFile + (bSort ? " (sorted)" : "")); try (PDDocument document = PDDocument.load(inFile)) { File outFile; File diffFile; File expectedFile; if(bSort) { outFile = new File(outDir, inFile.getName() + "-sorted.txt"); diffFile = new File(outDir, inFile.getName() + "-sorted-diff.txt"); expectedFile = new File(inFile.getParentFile(), inFile.getName() + "-sorted.txt"); } else { outFile = new File(outDir, inFile.getName() + ".txt"); diffFile = new File(outDir, inFile.getName() + "-diff.txt"); expectedFile = new File(inFile.getParentFile(), inFile.getName() + ".txt"); } // delete possible leftover diffFile.delete(); try (OutputStream os = new FileOutputStream(outFile)) { os.write (0xEF); os.write (0xBB); os.write (0xBF); try (Writer writer = new BufferedWriter(new OutputStreamWriter(os, ENCODING))) { //Allows for sorted tests stripper.setSortByPosition(bSort); stripper.writeText(document, writer); // close the written file before reading it again } } if (bLogResult) { log.info("Text for " + inFile.getName() + ":"); log.info(stripper.getText(document)); } if (!expectedFile.exists()) { this.bFail = true; log.error("FAILURE: Input verification file: " + expectedFile.getAbsolutePath() + " did not exist"); return; } boolean localFail = false; try (LineNumberReader expectedReader = new LineNumberReader(new InputStreamReader(new FileInputStream(expectedFile), ENCODING)); LineNumberReader actualReader = new LineNumberReader(new InputStreamReader(new FileInputStream(outFile), ENCODING))) { while (true) { String expectedLine = expectedReader.readLine(); while( expectedLine != null && expectedLine.trim().length() == 0 ) { expectedLine = expectedReader.readLine(); } String actualLine = actualReader.readLine(); while( actualLine != null && actualLine.trim().length() == 0 ) { actualLine = actualReader.readLine(); } if (!stringsEqual(expectedLine, actualLine)) { this.bFail = true; localFail = true; log.error("FAILURE: Line mismatch for file " + inFile.getName() + " (sort = "+bSort+")" + " at expected line: " + expectedReader.getLineNumber() + " at actual line: " + actualReader.getLineNumber() + "\nexpected line was: \"" + expectedLine + "\"" + "\nactual line was: \"" + actualLine + "\"" + "\n"); //lets report all lines, even though this might produce some verbose logging //break; } if( expectedLine == null || actualLine==null) { break; } } } if (!localFail) { outFile.delete(); } else { // https://code.google.com/p/java-diff-utils/wiki/SampleUsage List<String> original = fileToLines(expectedFile); List<String> revised = fileToLines(outFile); // Compute diff. Get the Patch object. Patch is the container for computed deltas. Patch patch = DiffUtils.diff(original, revised); try (PrintStream diffPS = new PrintStream(diffFile, ENCODING)) { for (Object delta : patch.getDeltas()) { if (delta instanceof ChangeDelta) { ChangeDelta cdelta = (ChangeDelta) delta; diffPS.println("Org: " + cdelta.getOriginal()); diffPS.println("New: " + cdelta.getRevised()); diffPS.println(); } else if (delta instanceof DeleteDelta) { DeleteDelta ddelta = (DeleteDelta) delta; diffPS.println("Org: " + ddelta.getOriginal()); diffPS.println("New: " + ddelta.getRevised()); diffPS.println(); } else if (delta instanceof InsertDelta) { InsertDelta idelta = (InsertDelta) delta; diffPS.println("Org: " + idelta.getOriginal()); diffPS.println("New: " + idelta.getRevised()); diffPS.println(); } else { diffPS.println(delta); } } } } } } // Helper method for get the file content private static List<String> fileToLines(File file) { List<String> lines = new LinkedList<>(); String line; try { try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file), ENCODING))) { while ((line = in.readLine()) != null) { lines.add(line); } } } catch (IOException e) { e.printStackTrace(); } return lines; } private int findOutlineItemDestPageNum(PDDocument doc, PDOutlineItem oi) throws IOException { PDPageDestination pageDest = (PDPageDestination) oi.getDestination(); // two methods to get the page index, the result should be identical! int indexOfPage = doc.getPages().indexOf(oi.findDestinationPage(doc)); int pageNum = pageDest.retrievePageNumber(); assertEquals(indexOfPage, pageNum); return pageNum; } /** * Test whether stripping controlled by outline items works properly. The test file has 4 * outline items at the top level, that point to 0-based pages 0, 2, 3 and 4. We are testing * text stripping by outlines pointing to 0-based pages 2 and 3, and also text stripping of the * 0-based page 2. The test makes sure that the output is different to a complete strip, not * empty, different to each other when different bookmark intervals are used, but identical from * bookmark intervals to strips with page intervals. When fed with orphan bookmarks, stripping * must be empty. * * @throws IOException * @throws URISyntaxException */ public void testStripByOutlineItems() throws IOException, URISyntaxException { PDDocument doc = PDDocument.load(new File(TestPDPageTree.class.getResource("with_outline.pdf").toURI())); PDDocumentOutline outline = doc.getDocumentCatalog().getDocumentOutline(); Iterable<PDOutlineItem> children = outline.children(); Iterator<PDOutlineItem> it = children.iterator(); PDOutlineItem oi0 = it.next(); PDOutlineItem oi2 = it.next(); PDOutlineItem oi3 = it.next(); PDOutlineItem oi4 = it.next(); assertEquals(0, findOutlineItemDestPageNum(doc, oi0)); assertEquals(2, findOutlineItemDestPageNum(doc, oi2)); assertEquals(3, findOutlineItemDestPageNum(doc, oi3)); assertEquals(4, findOutlineItemDestPageNum(doc, oi4)); String textFull = stripper.getText(doc); assertFalse(textFull.isEmpty()); String expectedTextFull = "First level 1\n" + "First level 2\n" + "Fist level 3\n" + "Some content\n" + "Some other content\n" + "Second at level 1\n" + "Second level 2\n" + "Content\n" + "Third level 1\n" + "Third level 2\n" + "Third level 3\n" + "Content\n" + "Fourth level 1\n" + "Content\n" + "Content\n"; assertEquals(expectedTextFull, textFull.replaceAll("\r", "")); // this should grab 0-based pages 2 and 3, i.e. 1-based pages 3 and 4 // by their bookmarks stripper.setStartBookmark(oi2); stripper.setEndBookmark(oi3); String textoi23 = stripper.getText(doc); assertFalse(textoi23.isEmpty()); assertFalse(textoi23.equals(textFull)); String expectedTextoi23 = "Second at level 1\n" + "Second level 2\n" + "Content\n" + "Third level 1\n" + "Third level 2\n" + "Third level 3\n" + "Content\n"; assertEquals(expectedTextoi23, textoi23.replaceAll("\r", "")); // this should grab 0-based pages 2 and 3, i.e. 1-based pages 3 and 4 // by their page numbers stripper.setStartBookmark(null); stripper.setEndBookmark(null); stripper.setStartPage(3); stripper.setEndPage(4); String textp34 = stripper.getText(doc); assertFalse(textp34.isEmpty()); assertFalse(textoi23.equals(textFull)); assertTrue(textoi23.equals(textp34)); // this should grab 0-based page 2, i.e. 1-based page 3 // by the bookmark stripper.setStartBookmark(oi2); stripper.setEndBookmark(oi2); String textoi2 = stripper.getText(doc); assertFalse(textoi2.isEmpty()); assertFalse(textoi2.equals(textoi23)); assertFalse(textoi23.equals(textFull)); String expectedTextoi2 = "Second at level 1\n" + "Second level 2\n" + "Content\n"; assertEquals(expectedTextoi2, textoi2.replaceAll("\r", "")); // this should grab 0-based page 2, i.e. 1-based page 3 // by the page number stripper.setStartBookmark(null); stripper.setEndBookmark(null); stripper.setStartPage(3); stripper.setEndPage(3); String textp3 = stripper.getText(doc); assertFalse(textp3.isEmpty()); assertFalse(textp3.equals(textp34)); assertFalse(textoi23.equals(textFull)); assertTrue(textoi2.equals(textp3)); // Test with orphan bookmark PDOutlineItem oiOrphan = new PDOutlineItem(); stripper.setStartBookmark(oiOrphan); stripper.setEndBookmark(oiOrphan); String textOiOrphan = stripper.getText(doc); assertTrue(textOiOrphan.isEmpty()); } /** * Process each file in the specified directory. * @param inDir Input directory search for PDF files in. * @param outDir Output directory where the temp files will be created. */ private void doTestDir(File inDir, File outDir) throws Exception { File[] testFiles = inDir.listFiles(new FilenameFilter() { @Override public boolean accept(File dir, String name) { return (name.endsWith(".pdf")); } }); for (File testFile : testFiles) { //Test without sorting doTestFile(testFile, outDir, false, false); //Test with sorting doTestFile(testFile, outDir, false, true); } } /** * Test to validate text extraction of file set. * * @throws Exception when there is an exception */ public void testExtract() throws Exception { String filename = System.getProperty("org.apache.pdfbox.util.TextStripper.file"); File inDir = new File("src/test/resources/input"); File outDir = new File("target/test-output"); File inDirExt = new File("target/test-input-ext"); File outDirExt = new File("target/test-output-ext"); if ((filename == null) || (filename.length() == 0)) { doTestDir(inDir, outDir); if (inDirExt.exists()) { doTestDir(inDirExt, outDirExt); } } else { //Test without sorting doTestFile(new File(inDir, filename), outDir, true, false); //Test with sorting doTestFile(new File(inDir, filename), outDir, true, true); } if (this.bFail) { fail("One or more failures, see test log for details"); } } /** * Set the tests in the suite for this test class. * * @return the Suite. */ public static Test suite() { return new TestSuite( TestTextStripper.class ); } /** * Command line execution. * * @param args Command line arguments. */ public static void main( String[] args ) { String[] arg = {TestTextStripper.class.getName() }; junit.textui.TestRunner.main( arg ); } }