package edu.isi.bmkeg.lapdf.controller; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import org.apache.log4j.Logger; import org.jpedal.exception.PdfException; import edu.isi.bmkeg.lapdf.classification.ruleBased.RuleBasedChunkClassifier; import edu.isi.bmkeg.lapdf.extraction.exceptions.AccessException; import edu.isi.bmkeg.lapdf.extraction.exceptions.ClassificationException; import edu.isi.bmkeg.lapdf.extraction.exceptions.EncryptionException; import edu.isi.bmkeg.lapdf.model.Block; import edu.isi.bmkeg.lapdf.model.ChunkBlock; import edu.isi.bmkeg.lapdf.model.LapdfDocument; import edu.isi.bmkeg.lapdf.model.PageBlock; import edu.isi.bmkeg.lapdf.model.RTree.RTModelFactory; import edu.isi.bmkeg.lapdf.model.ordering.SpatialOrdering; import edu.isi.bmkeg.lapdf.parser.RuleBasedParser; import edu.isi.bmkeg.lapdf.text.SectionsTextWriter; import edu.isi.bmkeg.lapdf.text.SpatialLayoutFeaturesReportGenerator; import edu.isi.bmkeg.lapdf.text.SpatiallyOrderedChunkTextWriter; import edu.isi.bmkeg.lapdf.text.SpatiallyOrderedChunkTypeFilteredTextWriter; import edu.isi.bmkeg.lapdf.utils.JPedalPDFRenderer; import edu.isi.bmkeg.lapdf.utils.PageImageOutlineRenderer; import edu.isi.bmkeg.lapdf.xml.OpenAccessXMLWriter; import edu.isi.bmkeg.lapdf.xml.SpatialXMLWriter; import edu.isi.bmkeg.utils.Converters; /** * Basic Java API to high-level LAPDFText functionality, including: * * 1) Gathering layout statistics for the PDF file * 2) Running Block-based spatial chunker on PDF. * 3) Classifying texts of blocks in the file to categories based on a rule file. * 4) Outputting text or XML to file * 5) Rendering pages images of text layout or the original PDF file as PNG files * 6) Serializing LAPDFText object to a VPDMf database record. * * @author burns * */ public class LapdfEngine { private static Logger logger = Logger.getLogger(LapdfEngine.class); private RuleBasedParser parser; private File ruleFile; private boolean imgFlag = false; private JPedalPDFRenderer imagifier = new JPedalPDFRenderer(); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ public LapdfEngine() throws Exception { this.parser = new RuleBasedParser(new RTModelFactory()); URL u = this.getClass().getClassLoader().getResource("rules/general.drl"); this.setRuleFile(new File(u.getPath())); } public LapdfEngine(File ruleFile) throws Exception { this.parser = new RuleBasedParser(new RTModelFactory()); this.setRuleFile(ruleFile); } public LapdfEngine(boolean imgFlag) throws Exception { this.parser = new RuleBasedParser(new RTModelFactory()); URL u = this.getClass().getClassLoader().getResource("rules/general.drl"); this.setRuleFile(new File(u.getPath())); this.setImgFlag(imgFlag); } public LapdfEngine(File ruleFile, boolean imgFlag) throws Exception { this.parser = new RuleBasedParser(new RTModelFactory()); this.setRuleFile(ruleFile); this.setImgFlag(imgFlag); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ public RuleBasedParser getParser() { return parser; } public void setParser(RuleBasedParser parser) { this.parser = parser; } public boolean isImgFlag() { return imgFlag; } public void setImgFlag(boolean imgFlag) { this.imgFlag = imgFlag; } public File getRuleFile() { return ruleFile; } public void setRuleFile(File ruleFile) { this.ruleFile = ruleFile; } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ public void processBlocks(File inFile, File outDir, boolean reportBlocks, boolean extractUnclassified) throws Exception { String stem = inFile.getName(); stem = stem.substring(0, stem.lastIndexOf(".")); this.parser.setPath(outDir.getPath()); LapdfDocument doc = blockifyPdfFile(inFile); if (doc == null) { logger.info("Error encountered while performing block detection." + " Skipping " + inFile.getPath() + " because doc is null"); return; } logger.info("Writing spatial block xml to " + outDir.getPath() + "/" + stem + "_spatial.xml"); if( this.isImgFlag() ) this.renderImageOutlines(doc, outDir, stem, LapdfMode.BLOCK_ONLY); SpatialXMLWriter sxw = new SpatialXMLWriter(); sxw.write(doc, outDir.getPath() + "/" + stem + "_spatial.xml"); if (reportBlocks) { logger.info("Running block feature reporter on " + inFile.getPath()); SpatialLayoutFeaturesReportGenerator slfrg = new SpatialLayoutFeaturesReportGenerator(); slfrg.write(doc, outDir.getPath() + "/" + stem + "_spatialFeatures.dat"); } if (extractUnclassified) { SpatiallyOrderedChunkTextWriter soctw = new SpatiallyOrderedChunkTextWriter(); soctw.write(doc, outDir.getPath() + "/" + stem + "_unclassifiedFlowAwareText.dat"); } } public void processClassify(File inFile, File outDir, boolean reportBlocks, boolean extractUnclassified) throws Exception { String stem = inFile.getName(); stem = stem.substring(0, stem.lastIndexOf(".")); this.parser.setPath(outDir.getPath()); LapdfDocument doc = blockifyPdfFile(inFile); if (doc == null) { logger.info("Error encountered while performing block detection. Skipping " + inFile.getPath() + " because doc is null"); return; } logger.info("Writing spatial block xml to " + outDir.getPath() + "/" + stem + "_spatial.xml"); SpatialXMLWriter sxw = new SpatialXMLWriter(); sxw.write(doc, outDir.getPath() + "/" + stem + "_spatial.xml"); logger.info("Running block classification on " + inFile.getPath()); classifyDocument(doc, this.getRuleFile()); if( this.isImgFlag() ) this.renderImageOutlines(doc, outDir, stem, LapdfMode.CLASSIFY); logger.info("Writing block classified XML in OpenAccess format " + outDir.getPath() + "/" + stem + "_rhetorical.xml"); OpenAccessXMLWriter oaxw = new OpenAccessXMLWriter(); oaxw.write(doc, outDir.getPath() + "/" + stem + "_rhetorical.xml"); if (reportBlocks) { logger.info("Running block feature reporter on " + inFile.getPath()); SpatialLayoutFeaturesReportGenerator slfrg = new SpatialLayoutFeaturesReportGenerator(); slfrg.write(doc, outDir.getPath() + "/" + stem + "_spatialFeatures.dat"); } if (extractUnclassified) { SpatiallyOrderedChunkTextWriter soctw = new SpatiallyOrderedChunkTextWriter(); soctw.write(doc, outDir.getPath() + "/" + stem + "_unclassifiedFlowAwareText.dat"); } } public void processSectionFilter(File inFile, File outDir, boolean reportBlocks, boolean extractUnclassified) throws Exception { String stem = inFile.getName(); stem = stem.substring(0, stem.lastIndexOf(".")); this.parser.setPath(outDir.getPath()); logger.info("Running block detection on " + inFile.getPath()); LapdfDocument doc = blockifyPdfFile(inFile); if (doc == null) { logger.info("Error encountered while performing block detection. Skipping " + inFile.getPath() + " because doc is null"); return; } logger.info("Running block classification on " + inFile.getPath()); classifyDocument(doc, this.getRuleFile()); if( this.isImgFlag() ) this.renderImageOutlines(doc, outDir, stem, LapdfMode.SECTION_FILTER); SpatiallyOrderedChunkTypeFilteredTextWriter soctftw = new SpatiallyOrderedChunkTypeFilteredTextWriter(true, true); soctftw.write(doc, outDir.getPath() + "/" + stem + "_spatialFiltered.txt"); logger.info("Writing block classified XML in OpenAccess format " + outDir.getPath() + "/" + stem + "_rhetorical.xml"); if (reportBlocks) { logger.info("Running block feature reporter on " + inFile.getPath()); SpatialLayoutFeaturesReportGenerator slfrg = new SpatialLayoutFeaturesReportGenerator(); slfrg.write(doc, outDir.getPath() + "/" + stem + "_spatialFeatures.dat"); } if (extractUnclassified) { SpatiallyOrderedChunkTextWriter soctw = new SpatiallyOrderedChunkTextWriter(); soctw.write(doc, outDir.getPath() + "/" + stem + "_unclassifiedFlowAwareText.dat"); } } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // File Processing functions // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /** * Extracts the blocks within file to generate a LapdfDocument Object * @param file - input file * @return * @throws PdfException * @throws AccessException * @throws EncryptionException * @throws IOException */ public LapdfDocument blockifyPdfFile(File pdf) throws Exception { LapdfDocument doc = null; doc = parser.parse(pdf); doc.setPdfFile( pdf ); if (doc.hasjPedalDecodeFailed()) { return null; } return doc; } public void classifyDocumentWithBaselineRules(LapdfDocument document) throws ClassificationException, IOException, URISyntaxException { File f = Converters .extractFileFromJarClasspath("rules/general.drl"); this.classifyDocument(document, f); if( this.isImgFlag() ) this.renderImageOutlines(document, new File("."), "debug", LapdfMode.CLASSIFY); } /** * Classifies the chunks in a file based on the rule file * @param document - an instantiated LapdfDocument * @param ruleFile - a rule file on disk * @throws IOException */ public void classifyDocument(LapdfDocument document, File ruleFile) throws ClassificationException, IOException { RuleBasedChunkClassifier classfier = new RuleBasedChunkClassifier( ruleFile.getPath(), new RTModelFactory()); for (int i = 1; i <= document.getTotalNumberOfPages(); i++) { PageBlock page = document.getPage(i); List<ChunkBlock> chunkList = page.getAllChunkBlocks( SpatialOrdering.COLUMN_AWARE_MIXED_MODE); classfier.classify(chunkList); } } public String readBasicText(LapdfDocument document) throws IOException,FileNotFoundException { List<Set<String>> stack = new ArrayList<Set<String>>(); Set<String> sections = new HashSet<String>(); sections.add(Block.TYPE_BODY); sections.add(Block.TYPE_HEADING); stack.add(sections); sections = new HashSet<String>(); sections.add(Block.TYPE_FIGURE_LEGEND); stack.add(sections); return this.readClassifiedText(document, stack); } public String readClassifiedText(LapdfDocument document, List<Set<String>> stack) throws IOException,FileNotFoundException { StringBuilder text = new StringBuilder(); Iterator<Set<String>> it = stack.iterator(); while( it.hasNext() ) { Set<String> sections = it.next(); text.append( this.readClassifiedText(document, sections) ); } return text.toString(); } public String readClassifiedText(LapdfDocument document, Set<String> sections) throws IOException,FileNotFoundException { StringBuilder sb = new StringBuilder(); int n = document.getTotalNumberOfPages(); for (int i = 1; i <= n; i++) { PageBlock page = document.getPage(i); List<ChunkBlock> chunksPerPage = page.getAllChunkBlocks( SpatialOrdering.PAGE_COLUMN_AWARE_MIXED_MODE ); for(ChunkBlock chunkBlock:chunksPerPage){ if( sections.contains( chunkBlock.getType() ) ) { sb.append(chunkBlock.readChunkText() + "\n"); } } } return sb.toString(); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Output functions // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /** * Write out the blocked LapdfDocument object to XML * @param doc * @param out */ public void writeSpatialXmlToFile(LapdfDocument doc, File out) { logger.info("Writing spatial block XML to " + out.getPath() ); SpatialXMLWriter sxw = new SpatialXMLWriter(); sxw.write(doc, out.getPath() ); } /** * Write an LapdfDocument out to an OpenAccess-compatible XML format * @param doc * @param out */ public void writeSectionsToOpenAccessXmlFile(LapdfDocument doc, File out) { logger.info("Writing block-classified XML in OpenAccess format " + out.getPath() ); OpenAccessXMLWriter oaxw = new OpenAccessXMLWriter(); oaxw.write(doc, out.getPath() ); } /** * Write an LapdfDocument out to an OpenAccess-compatible XML format * @param doc * @param out * @throws IOException */ public void writeBlockStatisticsReport(LapdfDocument doc, File out) throws IOException { logger.info("Writing spatial features report to " + out.getPath() ); SpatialLayoutFeaturesReportGenerator slfrg = new SpatialLayoutFeaturesReportGenerator(); slfrg.write(doc, out.getPath()); } /** * Render images of the pages of the PDF file * @param pdfFile * @param outputDir * @throws Exception */ public void renderPageImages(File pdfFile, File outputDir) throws Exception { this.imagifier.generateImages(pdfFile, outputDir); } /** * Render images of the positions of words on each page of pdf, color coded by section * @param doc * @param dir * @param stem * @param mode * @throws IOException */ public void renderImageOutlines(LapdfDocument doc, File dir, String stem, int lapdfMode) throws IOException { for (int i = 1; i <= doc.getTotalNumberOfPages(); i++) { PageBlock page = doc.getPage(i); File imgFile = new File(dir.getPath() + "/" + stem + "_" + page.getPageNumber() + ".png"); PageImageOutlineRenderer.createPageImage(page, imgFile, stem + "_" + page.getPageNumber(), lapdfMode); } } /** * Writing text based report of spatial features of the PDF file * @param doc * @param out * @throws IOException */ public void writeSpatialFeaturesReport(LapdfDocument doc, File out) throws IOException { logger.info("Writing block feature report of " + doc.getPdfFile().getPath() + " to " + out.getPath()); SpatialLayoutFeaturesReportGenerator slfrg = new SpatialLayoutFeaturesReportGenerator(); slfrg.write(doc, out.getPath()); } public void writeTextToFile(LapdfDocument doc, Set<String> sections, File out) throws Exception { logger.info("Writing text of "+ doc.getPdfFile().getPath() + " to " + out.getPath()); SectionsTextWriter stw = new SectionsTextWriter(); stw.addToStack(sections); stw.write(doc, out.getPath() ); } public void writeTextToFile(LapdfDocument doc, List<Set<String>> stack, File out) throws Exception { logger.info("Writing text of "+ doc.getPdfFile().getPath() + " to " + out.getPath()); SectionsTextWriter stw = new SectionsTextWriter(); Iterator<Set<String>> it = stack.iterator(); while( it.hasNext() ) { Set<String> sections = it.next(); stw.addToStack(sections); } stw.write(doc, out.getPath() ); } }