package technology.tabula; import java.io.File; import java.io.FileWriter; import java.io.FilenameFilter; import java.io.IOException; import java.util.*; import java.util.logging.Level; import java.util.logging.Logger; import static org.junit.Assert.*; import com.google.gson.Gson; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; import org.w3c.dom.*; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import org.apache.pdfbox.pdmodel.PDDocument; import technology.tabula.detectors.NurminenDetectionAlgorithm; /** * Created by matt on 2015-12-14. */ @RunWith(Parameterized.class) public class TestTableDetection { private static int numTests = 0; private static int numPassingTests = 0; private static int totalExpectedTables = 0; private static int totalCorrectlyDetectedTables = 0; private static int totalErroneouslyDetectedTables = 0; private static Level defaultLogLevel; private static final class TestStatus { public int numExpectedTables; public int numCorrectlyDetectedTables; public int numErroneouslyDetectedTables; public boolean expectedFailure; private transient boolean firstRun; private transient String pdfFilename; public TestStatus() { this(null); } public TestStatus(String pdfFilename) { this.numExpectedTables = 0; this.numCorrectlyDetectedTables = 0; this.expectedFailure = false; this.pdfFilename = pdfFilename; } public static TestStatus load(String pdfFilename) { TestStatus status; try { String json = UtilsForTesting.loadJson(jsonFilename(pdfFilename)); status = new Gson().fromJson(json, TestStatus.class); status.pdfFilename = pdfFilename; } catch (IOException ioe) { status = new TestStatus(pdfFilename); status.firstRun = true; } return status; } public void save() { try { FileWriter w = new FileWriter(jsonFilename(this.pdfFilename)); Gson gson = new Gson(); w.write(gson.toJson(this)); w.close(); } catch (Exception e) { } } public boolean isFirstRun() { return this.firstRun; } private static String jsonFilename(String pdfFilename) { return pdfFilename.replace(".pdf", ".json"); } } @BeforeClass public static void disableLogging() { Logger pdfboxLogger = Logger.getLogger("org.apache.pdfbox"); defaultLogLevel = pdfboxLogger.getLevel(); pdfboxLogger.setLevel(Level.OFF); } @AfterClass public static void enableLogging() { Logger.getLogger("org.apache.pdfbox").setLevel(defaultLogLevel); } @Parameterized.Parameters public static Collection<Object[]> data() { String[] regionCodes = {"eu", "us"}; ArrayList<Object[]> data = new ArrayList<Object[]>(); for (String regionCode : regionCodes) { String directoryName = "src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-" + regionCode + "/"; File dir = new File(directoryName); File[] pdfs = dir.listFiles(new FilenameFilter() { @Override public boolean accept(File dir, String name) { return name.toLowerCase().endsWith(".pdf"); } }); for (File pdf : pdfs) { data.add(new Object[] {pdf}); } } return data; } private File pdf; private DocumentBuilder builder; private TestStatus status; private int numCorrectlyDetectedTables = 0; private int numErroneouslyDetectedTables = 0; public TestTableDetection(File pdf) { this.pdf = pdf; this.status = TestStatus.load(pdf.getAbsolutePath()); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); try { this.builder = factory.newDocumentBuilder(); } catch (Exception e) { } } private void printTables(Map<Integer, List<Rectangle>> tables) { for (Integer page : tables.keySet()) { System.out.println("Page " + page.toString()); for (Rectangle table : tables.get(page)) { System.out.println(table); } } } @Test public void testDetectionOfTables() throws Exception { numTests++; // xml parsing stuff for ground truth Document regionDocument = this.builder.parse(this.pdf.getAbsolutePath().replace(".pdf", "-reg.xml")); NodeList tables = regionDocument.getElementsByTagName("table"); // tabula extractors PDDocument pdfDocument = PDDocument.load(this.pdf); ObjectExtractor extractor = new ObjectExtractor(pdfDocument); // parse expected tables from the ground truth dataset Map<Integer, List<Rectangle>> expectedTables = new HashMap<Integer, List<Rectangle>>(); int numExpectedTables = 0; for (int i=0; i<tables.getLength(); i++) { Element table = (Element) tables.item(i); Element region = (Element) table.getElementsByTagName("region").item(0); Element boundingBox = (Element) region.getElementsByTagName("bounding-box").item(0); // we want to know where tables appear in the document - save the page and areas where tables appear Integer page = Integer.decode(region.getAttribute("page")); float x1 = Float.parseFloat(boundingBox.getAttribute("x1")); float y1 = Float.parseFloat(boundingBox.getAttribute("y1")); float x2 = Float.parseFloat(boundingBox.getAttribute("x2")); float y2 = Float.parseFloat(boundingBox.getAttribute("y2")); List<Rectangle> pageTables = expectedTables.get(page); if (pageTables == null) { pageTables = new ArrayList<Rectangle>(); expectedTables.put(page, pageTables); } // have to invert y co-ordinates // unfortunately the ground truth doesn't contain page dimensions // do some extra work to extract the page with tabula and get the dimensions from there Page extractedPage = extractor.extractPage(page); float top = (float)extractedPage.getHeight() - y2; float left = x1; float width = x2 - x1; float height = y2 - y1; pageTables.add(new Rectangle(top, left, width, height)); numExpectedTables++; } // now find tables detected by tabula-java Map<Integer, List<Rectangle>> detectedTables = new HashMap<Integer, List<Rectangle>>(); // the algorithm we're going to be testing NurminenDetectionAlgorithm detectionAlgorithm = new NurminenDetectionAlgorithm(); PageIterator pages = extractor.extract(); while (pages.hasNext()) { Page page = pages.next(); List<Rectangle> tablesOnPage = detectionAlgorithm.detect(page); if (tablesOnPage.size() > 0) { detectedTables.put(new Integer(page.getPageNumber()), tablesOnPage); } } // now compare System.out.println("Testing " + this.pdf.getName()); List<String> errors = new ArrayList<String>(); this.status.numExpectedTables = numExpectedTables; totalExpectedTables += numExpectedTables; for (Integer page : expectedTables.keySet()) { List<Rectangle> expectedPageTables = expectedTables.get(page); List<Rectangle> detectedPageTables = detectedTables.get(page); if (detectedPageTables == null) { errors.add("Page " + page.toString() + ": " + expectedPageTables.size() + " expected tables not found"); continue; } errors.addAll(this.comparePages(page, detectedPageTables, expectedPageTables)); detectedTables.remove(page); } // leftover pages means we detected extra tables for (Integer page : detectedTables.keySet()) { List<Rectangle> detectedPageTables = detectedTables.get(page); errors.add("Page " + page.toString() + ": " + detectedPageTables.size() + " tables detected where there are none"); this.numErroneouslyDetectedTables += detectedPageTables.size(); totalErroneouslyDetectedTables += detectedPageTables.size(); } boolean failed = errors.size() > 0; if (failed) { System.out.println("==== CURRENT TEST ERRORS ===="); for (String error : errors) { System.out.println(error); } } else { numPassingTests++; } System.out.println("==== CUMULATIVE TEST STATISTICS ===="); System.out.println(numPassingTests + " out of " + numTests + " currently passing"); System.out.println(totalCorrectlyDetectedTables + " out of " + totalExpectedTables + " expected tables detected"); System.out.println(totalErroneouslyDetectedTables + " tables incorrectly detected"); if(this.status.isFirstRun()) { // make the baseline this.status.expectedFailure = failed; this.status.numCorrectlyDetectedTables = this.numCorrectlyDetectedTables; this.status.numErroneouslyDetectedTables = this.numErroneouslyDetectedTables; this.status.save(); } else { // compare to baseline if (this.status.expectedFailure) { // make sure the failure didn't get worse assertTrue("This test is an expected failure, but it now detects even fewer tables.", this.numCorrectlyDetectedTables >= this.status.numCorrectlyDetectedTables); assertTrue("This test is an expected failure, but it now detects more bad tables.", this.numErroneouslyDetectedTables <= this.status.numErroneouslyDetectedTables); assertTrue("This test used to fail but now it passes! Hooray! Please update the test's JSON file accordingly.", failed); } else { assertFalse("Table detection failed. Please see the error messages for more information.", failed); } } } private List<String> comparePages(Integer page, List<Rectangle> detected, List<Rectangle> expected) { ArrayList<String> errors = new ArrayList<String>(); // go through the detected tables and try to match them with expected tables // from http://www.orsigiorgio.net/wp-content/papercite-data/pdf/gho*12.pdf (comparing regions): // for other (e.g.“black-box”) algorithms, bounding boxes and content are used. A region is correct if it // contains the minimal bounding box of the ground truth without intersecting additional content. for (Iterator<Rectangle> detectedIterator = detected.iterator(); detectedIterator.hasNext();) { Rectangle detectedTable = detectedIterator.next(); for (int i=0; i<expected.size(); i++) { if (detectedTable.contains(expected.get(i))) { // we have a candidate for the detected table, make sure it doesn't intersect any others boolean intersectsOthers = false; for (int j=0; j<expected.size(); j++) { if (i == j) continue; if (detectedTable.intersects(expected.get(j))) { intersectsOthers = true; break; } } if (!intersectsOthers) { // success detectedIterator.remove(); expected.remove(i); this.numCorrectlyDetectedTables++; totalCorrectlyDetectedTables++; break; } } } } // any expected tables left over weren't detected for (Rectangle expectedTable : expected) { errors.add("Page " + page.toString() + ": " + expectedTable.toString() + " not detected"); } // any detected tables left over were detected erroneously for (Rectangle detectedTable : detected) { errors.add("Page " + page.toString() + ": " + detectedTable.toString() + " detected where there is no table"); this.numErroneouslyDetectedTables++; totalErroneouslyDetectedTables++; } return errors; } }