package edu.isi.bmkeg.lapdf.extraction; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.TreeSet; import java.util.Vector; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.jpedal.PdfDecoder; import org.jpedal.exception.PdfException; import org.jpedal.grouping.PdfGroupingAlgorithms; import org.jpedal.objects.PdfPageData; import org.jpedal.utils.Strip; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.xml.sax.SAXException; import edu.isi.bmkeg.lapdf.extraction.exceptions.AccessException; import edu.isi.bmkeg.lapdf.extraction.exceptions.EmptyPDFException; import edu.isi.bmkeg.lapdf.extraction.exceptions.EncryptionException; import edu.isi.bmkeg.lapdf.model.WordBlock; import edu.isi.bmkeg.lapdf.model.factory.AbstractModelFactory; import edu.isi.bmkeg.lapdf.model.ordering.SpatialOrdering; public class JPedalExtractor implements Extractor { Set<WordBlock> wordListPerPage = null; PdfDecoder PDFDecoder = null; int currentPage = 1; int pageCount; private static Document xmlDocument; private static DocumentBuilder docBuilder; private static int pageHeight; private static int pageWidth; private AbstractModelFactory modelFactory; public JPedalExtractor(AbstractModelFactory modelFactory) throws Exception { DocumentBuilderFactory dbfac = DocumentBuilderFactory.newInstance(); docBuilder = dbfac.newDocumentBuilder(); this.modelFactory = modelFactory; this.PDFDecoder = new PdfDecoder(false); PDFDecoder.setExtractionMode(PdfDecoder.TEXT); // extract just text PDFDecoder.init(true); PdfGroupingAlgorithms.useUnrotatedCoords = true; // if you do not require XML content, pure text extraction is much // faster. PDFDecoder.useXMLExtraction(); System.setProperty("hacked", "true"); } public void init(File file) throws Exception { if (PDFDecoder.isOpen()) { PDFDecoder.flushObjectValues(true); PDFDecoder.closePdfFile(); } PDFDecoder.openPdfFile(file.getPath()); currentPage = 1; pageCount = PDFDecoder.getPageCount(); if (!PDFDecoder.isExtractionAllowed()) { throw new AccessException(file.getPath()); } else if (PDFDecoder.isEncrypted()) { throw new EncryptionException(file.getPath()); } } private int[] generatePageBoundaries(PdfPageData currentPageData) { // 0:TLX, 1:TLY, 2:BRX, 3:BRY int[] dimensions = new int[4]; // Using just cropbox if( currentPageData.getCropBoxHeight(currentPage) != currentPageData.getMediaBoxHeight(currentPage) ) { dimensions[0] = currentPageData.getCropBoxX(currentPage); dimensions[2] = currentPageData.getCropBoxWidth(currentPage) + dimensions[0]; dimensions[3] = currentPageData.getCropBoxY(currentPage); dimensions[1] = currentPageData.getCropBoxHeight(currentPage) + dimensions[3]; } else { dimensions[0] = currentPageData.getMediaBoxX(currentPage); dimensions[2] = currentPageData.getMediaBoxWidth(currentPage) + dimensions[0]; dimensions[3] = currentPageData.getMediaBoxY(currentPage); dimensions[1] = currentPageData.getMediaBoxHeight(currentPage) + dimensions[3]; } return dimensions; } private String getFontData(String xml, String item) throws UnsupportedEncodingException, IOException { xml = "<root>" + xml + "</root>"; try { xmlDocument = docBuilder.parse(new ByteArrayInputStream(xml .getBytes("UTF-8"))); } catch (SAXException e) { // e.printStackTrace(); return null; } Element font = (Element) xmlDocument.getElementsByTagName("font").item( 0); return font.getAttribute(item); } private void decodeFile() throws Exception { String font = null; String currentWord; String style = null; PDFDecoder.decodePage(currentPage); PdfGroupingAlgorithms currentGrouping = PDFDecoder.getGroupingObject(); PdfPageData currentPageData = PDFDecoder.getPdfPageData(); int[] dimensions; // pageHeight.add(currentPageData.getCropBoxHeight(page)); // pageWidth.add(currentPageData.getCropBoxWidth(page)); dimensions = generatePageBoundaries(currentPageData); pageWidth = Math.abs(dimensions[2] - dimensions[0]); pageHeight = Math.abs(dimensions[1] - dimensions[3]); //currentGrouping.extractTextAsWordlist(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7) List words = currentGrouping.extractTextAsWordlist( dimensions[0], dimensions[1], dimensions[2], dimensions[3], currentPage, true, "&:=()!;\\/\"\"\'\'" ); Iterator<String> wordIterator = words.iterator(); if (wordListPerPage == null) wordListPerPage = new TreeSet<WordBlock>(new SpatialOrdering(SpatialOrdering.MIXED_MODE_ABSOLUTE)); else { wordListPerPage.clear(); } while (wordIterator.hasNext()) { currentWord = wordIterator.next(); font = getFontData(currentWord, "face"); style = getFontData(currentWord, "style"); currentWord = Strip.convertToText(currentWord, true); int wx1 = roundUp(Float.parseFloat((wordIterator.next() + ""))); int wy1 = roundUp(Float.parseFloat((wordIterator.next() + ""))); int wx2 = roundUp(Float.parseFloat((wordIterator.next() + ""))); int wy2 = roundUp(Float.parseFloat((wordIterator.next() + ""))); wy1 = dimensions[1] - wy1; wy2 = dimensions[1] - wy2; WordBlock wordBlock = modelFactory.createWordBlock(wx1, wy1, wx2, wy2, 1, font, style, currentWord); wordListPerPage.add(wordBlock); } currentPage++; PDFDecoder.flushObjectValues(false); } @Override public boolean hasNext() { if (currentPage <= pageCount) { try { decodeFile(); } catch (EmptyPDFException e) { return false; } catch (Exception e) { return false; } return true; } else { PDFDecoder.flushObjectValues(true); PDFDecoder.closePdfFile(); return false; } } @Override public List<WordBlock> next() { return new ArrayList<WordBlock>(wordListPerPage); } @Override public void remove() { } @Override public int getCurrentPageBoxHeight() { return pageHeight; } @Override public int getCurrentPageBoxWidth() { return pageWidth; } private int roundUp(double value) { value = Math.floor(value); return (int) value; } }