/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.util;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.PDDocument;
/**
* Test suite for PDFTextStripper.
*
* FILE SET VALIDATION
*
* This test suite is designed to test PDFTextStripper using a set of PDF
* files and known good output for each. The default mode of testExtract()
* is to process each *.pdf file in "src/test/resources/input". An output
* file is created in "target/test-output" with the same name as the PDF file,
* plus an additional ".txt" suffix.
*
* The output file is then tested against a known good result file from
* the input directory (again, with the same name as the tested PDF file,
* but with the additional ".txt" suffix). The process is performed both
* with and without sorting enabled. The sorted files have a "-sorted.txt"
* suffix.
*
* So for the file "src/test/resources/input/hello.pdf", an output file will
* be generated named "target/test-output/hello.pdf.txt". Then that file
* will be compared to the known good file
* "src/test/resources/input/hello.pdf.txt". If the known good file does not
* exist, that is reported as a failure.
*
* To support testing with files that are not officially distributed
* with PDFBox, this test will also look in the "target/test-input-ext"
* directory.
*
* Any errors are logged, and at the end of processing all *.pdf files, if
* there were any errors, the test fails. The logging is at INFO, as the
* general goal is overall validation, and on failure, the indication of
* which file or files failed.
*
* When processing new PDF files, you may use testExtract() to generate output,
* verify the output manually, then move the output file to the test input
* directory to use as the basis for future validations.
*
* SINGLE FILE VALIDATION
*
* To further research individual failures, the org.apache.pdfbox.util.TextStripper.file
* system property may be set with the name of a single file in the
* "src/test/resources/input" directory. In this mode, testExtract() will evaluate
* only that file, and will additionally log the extracted text at INFO level.
* You can set this property from ant by defining "file", as in:
*
* ant testextract -Dfile=hello.pdf
*
* @author Robert Dickinson (bob@brutesquadlabs.com)
* @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
* @version $Revision: 1.19 $
*/
public class TestTextStripper extends TestCase
{
    /**
     * Logger instance.
     */
    private static final Log log = LogFactory.getLog(TestTextStripper.class);

    /**
     * Encoding used to write the extracted text and to read back both the
     * generated file and the known good file.
     */
    private static final String ENCODING = "UTF-16LE";

    /**
     * Set to <code>true</code> as soon as any validation fails; evaluated at
     * the end of testExtract().
     */
    private boolean bFail = false;

    /**
     * The stripper under test, reused for every file processed by this instance.
     */
    private final PDFTextStripper stripper;

    /**
     * Test class constructor.
     *
     * @param name The name of the test class.
     *
     * @throws IOException If there is an error creating the test.
     */
    public TestTextStripper( String name ) throws IOException
    {
        super( name );
        stripper = new PDFTextStripper(ENCODING);
        stripper.setLineSeparator("\n");
    }

    /**
     * Test suite setup.
     */
    public void setUp()
    {
        // If you want to test a single file using DEBUG logging, from an IDE,
        // you can do something like this:
        //
        // System.setProperty("org.apache.pdfbox.util.TextStripper.file", "FVS318Ref.pdf");
    }

    /**
     * Determine whether two strings are equal, where two null strings are
     * considered equal. Both strings are trimmed first, and runs of ignorable
     * characters (see skipWhitespace) are collapsed while comparing.
     *
     * @param expected Expected string
     * @param actual Actual String
     * @return <code>true</code> if the strings are both null,
     * or if their contents are the same, otherwise <code>false</code>.
     */
    private boolean stringsEqual(String expected, String actual)
    {
        if( expected == null && actual == null )
        {
            return true;
        }
        if( expected == null || actual == null )
        {
            //basically there are some cases where pdfbox will put an extra line
            //at the end of the file, who cares, this is not enough to report
            // a failure
            String present = (expected != null) ? expected : actual;
            return present.trim().equals( "" );
        }

        char[] expectedArray = expected.trim().toCharArray();
        char[] actualArray = actual.trim().toCharArray();
        int expectedIndex = 0;
        int actualIndex = 0;
        boolean equals = true;
        while( expectedIndex<expectedArray.length && actualIndex<actualArray.length )
        {
            if( expectedArray[expectedIndex] != actualArray[actualIndex] )
            {
                equals = false;
                log.warn("Lines differ at index"
                    + " expected:" + expectedIndex + "-" + (int)expectedArray[expectedIndex]
                    + " actual:" + actualIndex + "-" + (int)actualArray[actualIndex] );
                break;
            }
            // the current characters match; if they start a run of ignorable
            // characters, jump to the end of that run on each side
            expectedIndex = skipWhitespace( expectedArray, expectedIndex );
            actualIndex = skipWhitespace( actualArray, actualIndex );
            expectedIndex++;
            actualIndex++;
        }
        if( equals )
        {
            // every compared character matched, so any leftover characters
            // on either side make the lines different
            if( expectedIndex != expectedArray.length )
            {
                equals = false;
                log.warn("Expected line is longer at:" + expectedIndex );
            }
            if( actualIndex != actualArray.length )
            {
                equals = false;
                log.warn("Actual line is longer at:" + actualIndex );
            }
        }
        return equals;
    }

    /**
     * If the current index is whitespace then skip any subsequent whitespace.
     * A character counts as whitespace here when it is a space or when its
     * code is greater than 256.
     *
     * @param array The characters being scanned
     * @param index A valid index into <code>array</code>
     * @return the index of the last character of the whitespace run starting
     * at <code>index</code>, or <code>index</code> itself if the character at
     * that position is not whitespace.
     */
    private int skipWhitespace( char[] array, int index )
    {
        //if we are at a space character then skip all space
        //characters, but when all done rollback 1 because stringsEqual
        //will roll forward 1
        if( array[index] == ' ' || array[index] > 256 )
        {
            while( index < array.length && (array[index] == ' ' || array[index] > 256))
            {
                index++;
            }
            index--;
        }
        return index;
    }

    /**
     * Validate text extraction on a single file.
     *
     * The extracted text is written to <code>outDir</code> and then compared
     * line by line with the known good file located next to the PDF. Any
     * mismatch, or a missing known good file, is logged and marks the test
     * as failed.
     *
     * @param inFile The PDF file to validate
     * @param outDir The directory to store the output in
     * @param bLogResult Whether to log the extracted text
     * @param bSort Whether or not the extracted text is sorted
     * @throws Exception when there is an exception
     */
    public void doTestFile(File inFile, File outDir, boolean bLogResult, boolean bSort)
    throws Exception
    {
        if(bSort)
        {
            log.info("Preparing to parse " + inFile.getName() + " for sorted test");
        }
        else
        {
            log.info("Preparing to parse " + inFile.getName() + " for standard test");
        }

        if (!outDir.exists() && !outDir.mkdirs())
        {
            throw new IOException("Error creating " + outDir.getAbsolutePath() + " directory");
        }

        PDDocument document = PDDocument.load(inFile);
        try
        {
            // sorted and unsorted runs use different file name suffixes
            String suffix = bSort ? "-sorted.txt" : ".txt";
            File outFile = new File(outDir, inFile.getName() + suffix);
            File expectedFile = new File(inFile.getParentFile(), inFile.getName() + suffix);

            writeExtractedText(document, outFile, bSort);

            if (bLogResult)
            {
                log.info("Text for " + inFile.getName() + ":");
                log.info(stripper.getText(document));
            }

            if (!expectedFile.exists())
            {
                this.bFail = true;
                log.error(
                    "FAILURE: Input verification file: " + expectedFile.getAbsolutePath() +
                    " did not exist");
                return;
            }

            compareFiles(inFile, expectedFile, outFile, bSort);
        }
        finally
        {
            document.close();
        }
    }

    /**
     * Extract the text of a document into a file, preceded by the two bytes
     * 0xFF 0xFE (the UTF-16LE byte order mark).
     *
     * @param document The document to extract the text from
     * @param outFile The file to write the text to
     * @param bSort Whether the stripper should sort by position
     * @throws Exception when the file cannot be written or the extraction fails
     */
    private void writeExtractedText(PDDocument document, File outFile, boolean bSort)
    throws Exception
    {
        OutputStream os = new FileOutputStream(outFile);
        try
        {
            os.write( 0xFF );
            os.write( 0xFE );
            Writer writer = new OutputStreamWriter(os, ENCODING);
            try
            {
                //Allows for sorted tests
                stripper.setSortByPosition(bSort);
                stripper.writeText(document, writer);
            }
            finally
            {
                // close the written file before reading it again
                writer.close();
            }
        }
        finally
        {
            os.close();
        }
    }

    /**
     * Compare the generated output with the known good file, ignoring blank
     * lines. Every mismatching line is logged and marks the test as failed.
     * Both readers are closed before returning, even when reading fails.
     *
     * @param inFile The PDF file being validated (used for log messages only)
     * @param expectedFile The known good file
     * @param outFile The generated output file
     * @param bSort Whether this is the sorted run (used for log messages only)
     * @throws IOException when one of the files cannot be read
     */
    private void compareFiles(File inFile, File expectedFile, File outFile, boolean bSort)
    throws IOException
    {
        LineNumberReader expectedReader = new LineNumberReader(
            new InputStreamReader(new FileInputStream(expectedFile), ENCODING));
        try
        {
            LineNumberReader actualReader = new LineNumberReader(
                new InputStreamReader(new FileInputStream(outFile), ENCODING));
            try
            {
                while (true)
                {
                    String expectedLine = readNonBlankLine(expectedReader);
                    String actualLine = readNonBlankLine(actualReader);

                    if (!stringsEqual(expectedLine, actualLine))
                    {
                        this.bFail = true;
                        log.error("FAILURE: Line mismatch for file " + inFile.getName() +
                            " ( sort = "+bSort+")" +
                            " at expected line: " + expectedReader.getLineNumber() +
                            " at actual line: " + actualReader.getLineNumber());
                        log.error(" expected line was: \"" + expectedLine + "\"");
                        log.error(" actual line was: \"" + actualLine + "\"" + "\n");
                        //lets report all lines, even though this might produce some verbose logging
                    }

                    // stop as soon as either file is exhausted
                    if( expectedLine == null || actualLine == null )
                    {
                        break;
                    }
                }
            }
            finally
            {
                actualReader.close();
            }
        }
        finally
        {
            expectedReader.close();
        }
    }

    /**
     * Read the next line that contains something other than whitespace.
     *
     * @param reader The reader to read from
     * @return the next non-blank line, or <code>null</code> at end of input
     * @throws IOException when reading fails
     */
    private String readNonBlankLine(LineNumberReader reader) throws IOException
    {
        String line = reader.readLine();
        while( line != null && line.trim().length() == 0 )
        {
            line = reader.readLine();
        }
        return line;
    }

    /**
     * Process each file in the specified directory, once without and once
     * with sorting.
     *
     * @param inDir Input directory search for PDF files in.
     * @param outDir Output directory where the temp files will be created.
     * @throws Exception when the directory cannot be listed or a file fails
     * to process
     */
    private void doTestDir(File inDir, File outDir) throws Exception
    {
        File[] testFiles = inDir.listFiles(new FilenameFilter()
        {
            public boolean accept(File dir, String name)
            {
                return (name.endsWith(".pdf"));
            }
        });
        // listFiles returns null when the path is not a directory or cannot be read
        if (testFiles == null)
        {
            throw new IOException("Unable to list PDF files in " + inDir.getAbsolutePath());
        }

        for (int n = 0; n < testFiles.length; n++)
        {
            //Test without sorting
            doTestFile(testFiles[n], outDir, false, false);
            //Test with sorting
            doTestFile(testFiles[n], outDir, false, true);
        }
    }

    /**
     * Test to validate text extraction of file set.
     *
     * When the org.apache.pdfbox.util.TextStripper.file system property is
     * set, only that file is validated and its extracted text is logged;
     * otherwise every PDF in the input directories is validated.
     *
     * @throws Exception when there is an exception
     */
    public void testExtract()
    throws Exception
    {
        String filename = System.getProperty("org.apache.pdfbox.util.TextStripper.file");
        File inDir = new File("src/test/resources/input");
        File outDir = new File("target/test-output");
        File inDirExt = new File("target/test-input-ext");
        File outDirExt = new File("target/test-output-ext");

        if ((filename == null) || (filename.length() == 0))
        {
            doTestDir(inDir, outDir);
            // the extension directory is optional
            if (inDirExt.exists())
            {
                doTestDir(inDirExt, outDirExt);
            }
        }
        else
        {
            File singleFile = new File(inDir, filename);
            //Test without sorting
            doTestFile(singleFile, outDir, true, false);
            //Test with sorting
            doTestFile(singleFile, outDir, true, true);
        }

        if (this.bFail)
        {
            fail("One or more failures, see test log for details");
        }
    }

    /**
     * Set the tests in the suite for this test class.
     *
     * @return the Suite.
     */
    public static Test suite()
    {
        return new TestSuite( TestTextStripper.class );
    }

    /**
     * Command line execution.
     *
     * @param args Command line arguments.
     */
    public static void main( String[] args )
    {
        String[] arg = {TestTextStripper.class.getName() };
        junit.textui.TestRunner.main( arg );
    }
}