JPedalExtractor.java example

Explorer

lapdftext-master
- src
  - main
    - autogen
      - edu
        isi
        bmkeg
        pdf
        DocumentInformation.java
        DocumentInformation_Type.java
    - java
      - edu
        isi
        bmkeg
        lapdf
        bin
        BlockStatistics.java
        Blockify.java
        BlockifyClassify.java
        CommandLineTool.java
        CustomBatchProcessingTool.java
        ImagifyBlocks.java
        ImagifySections.java
        PDFExtractionConstants.java
        ReadSectionText.java
        TextPanel.java
        WatchDirectory.java
        classification
        Classifier.java
        ruleBased
        RuleBasedChunkClassifier.java
        controller
        LapdfEngine.java
        LapdfMode.java
        extraction
        Extractor.java
        JPedalExtractor.java
        JPedalPageImageExtractor.java
        exceptions
        AccessException.java
        ClassificationException.java
        EmptyPDFException.java
        EncryptionException.java
        InvalidPopularSpaceValueException.java
        features
        ChunkFeatures.java
        HorizontalSplitFeature.java
        WordFeatures.java
        model
        Block.java
        ChunkBlock.java
        LapdfDirection.java
        LapdfDocument.java
        PageBlock.java
        RTree
        RTChunkBlock.java
        RTDummyProcedure.java
        RTModelFactory.java
        RTPageBlock.java
        RTProcedure.java
        RTSpatialEntity.java
        RTSpatialRepresentation.java
        RTWordBlock.java
        WordBlock.java
        factory
        AbstractModelFactory.java
        ordering
        SpatialOrdering.java
        spatial
        SpatialEntity.java
        SpatialRepresentation.java
        parser
        Parser.java
        RuleBasedParser.java
        text
        SectionBasedTextExtractor.java
        SectionsTextWriter.java
        SpatialLayoutFeaturesReportGenerator.java
        SpatiallyOrderedChunkTextWriter.java
        SpatiallyOrderedChunkTypeFilteredTextWriter.java
        TextWriter.java
        uima
        ae
        ParserRuleBasedClassfierAE.java
        cpe
        CommandLineFitPipeline.java
        cr
        DirectoryCollectionReader.java
        utils
        JPedalPDFRenderer.java
        PageImageOutlineRenderer.java
        PdfDirWatcher.java
        xml
        OpenAccessXMLWriter.java
        SpatialXMLWriter.java
        XMLWriter.java
        utils
        FileUtils.java
        FrequencyCounter.java
        ISI_UIMA_PDFUtils.java
        IntegerFrequencyCounter.java
        PipelineLauncher.java
        ReflectionUtils.java
        StringCleaner.java
        StringCleanerException.java
  - test
    - java
      - edu
        isi
        bmkeg
        CommandLineToolTest.java
        ladpdf
        bin
        BlockifyClassifyTest.java
        BlockifyTest.java
        BlocksStatisticsTest.java
        ImagifyBlocksTest.java
        ImagifySectionsTest.java
        ReadSectionTextTest.java
        dirWatchers
        WatchDirectory_BLOCKIFY_CLASSIFY_Test.java
        WatchDirectory_BLOCKIFY_Test.java
        WatchDirectory_IMAGIFY_BLOCKS_Test.java
        WatchDirectory_IMAGIFY_SECTIONS_Test.java
        WatchDirectory_READ_SECTION_TEXT_Test.java

package edu.isi.bmkeg.lapdf.extraction;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.Vector;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.jpedal.PdfDecoder;
import org.jpedal.exception.PdfException;
import org.jpedal.grouping.PdfGroupingAlgorithms;
import org.jpedal.objects.PdfPageData;
import org.jpedal.utils.Strip;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.SAXException;

import edu.isi.bmkeg.lapdf.extraction.exceptions.AccessException;
import edu.isi.bmkeg.lapdf.extraction.exceptions.EmptyPDFException;
import edu.isi.bmkeg.lapdf.extraction.exceptions.EncryptionException;
import edu.isi.bmkeg.lapdf.model.WordBlock;
import edu.isi.bmkeg.lapdf.model.factory.AbstractModelFactory;
import edu.isi.bmkeg.lapdf.model.ordering.SpatialOrdering;

/**
 * {@link Extractor} implementation that reads a PDF one page at a time with
 * JPedal and turns the words of each page into {@link WordBlock}s.
 *
 * <p>
 * Usage: construct once, call {@link #init(File)} for each PDF, then loop
 * <code>while (hasNext()) { next(); }</code>.
 *
 * <p>
 * Note that {@link #hasNext()} is not a pure query: every call that returns
 * <code>true</code> has decoded the next page and advanced the page counter,
 * so it must be called exactly once before each {@link #next()}.
 */
public class JPedalExtractor implements Extractor {

	/**
	 * Punctuation argument handed to JPedal's
	 * <code>extractTextAsWordlist</code> for every page.
	 */
	private static final String WORD_PUNCTUATION = "&:=()!;\\/\"\"\'\'";

	/** Words of the most recently decoded page, kept in spatial order. */
	Set<WordBlock> wordListPerPage = null;

	/** JPedal decoder for the currently opened PDF. */
	PdfDecoder PDFDecoder = null;

	/** 1-based number of the page the next decode will read. */
	int currentPage = 1;

	/** Number of pages of the currently opened PDF. */
	int pageCount;

	// Per-instance state. These used to be static, which made two extractor
	// instances overwrite each other's page dimensions and XML parser.
	private final DocumentBuilder docBuilder;
	private int pageHeight;
	private int pageWidth;
	private final AbstractModelFactory modelFactory;

	/**
	 * Creates an extractor and configures a JPedal decoder for text
	 * extraction with XML markup.
	 *
	 * @param modelFactory
	 *            factory used to create a {@link WordBlock} for every word
	 * @throws Exception
	 *             if the XML parser or the JPedal decoder cannot be set up
	 */
	public JPedalExtractor(AbstractModelFactory modelFactory)
			throws Exception {

		this.docBuilder = DocumentBuilderFactory.newInstance()
				.newDocumentBuilder();
		this.modelFactory = modelFactory;
		this.PDFDecoder = new PdfDecoder(false);

		PDFDecoder.setExtractionMode(PdfDecoder.TEXT); // extract just text
		PDFDecoder.init(true);
		PdfGroupingAlgorithms.useUnrotatedCoords = true;

		// Plain text extraction would be faster, but the font face and style
		// of each word are read from the XML markup JPedal wraps around it.
		PDFDecoder.useXMLExtraction();

		System.setProperty("hacked", "true");

	}

	/**
	 * Opens a PDF and rewinds the extractor to its first page. A document
	 * left open by a previous call is flushed and closed first.
	 *
	 * @param file
	 *            the PDF to read
	 * @throws AccessException
	 *             if the document does not allow extraction
	 * @throws EncryptionException
	 *             if extraction is allowed but the document is encrypted
	 * @throws Exception
	 *             if JPedal fails to open the file
	 */
	public void init(File file) throws Exception {

		if (PDFDecoder.isOpen()) {
			releaseDocument();
		}

		PDFDecoder.openPdfFile(file.getPath());
		currentPage = 1;
		pageCount = PDFDecoder.getPageCount();

		if (!PDFDecoder.isExtractionAllowed()) {
			throw new AccessException(file.getPath());
		}
		if (PDFDecoder.isEncrypted()) {
			throw new EncryptionException(file.getPath());
		}

	}

	/** Flushes the decoder's cached objects and closes the open PDF. */
	private void releaseDocument() {
		PDFDecoder.flushObjectValues(true);
		PDFDecoder.closePdfFile();
	}

	/**
	 * Computes the bounding box of the current page.
	 *
	 * <p>
	 * The crop box is used when its height differs from the media box
	 * height; otherwise the media box is used.
	 * NOTE(review): only the heights are compared, so a crop box that
	 * differs from the media box in width or origin alone is ignored -
	 * confirm that this is intended.
	 *
	 * @param currentPageData
	 *            JPedal page geometry of the open document
	 * @return four values: 0:TLX, 1:TLY, 2:BRX, 3:BRY
	 */
	private int[] generatePageBoundaries(PdfPageData currentPageData) {

		final int x;
		final int y;
		final int width;
		final int height;

		if (currentPageData.getCropBoxHeight(currentPage)
				!= currentPageData.getMediaBoxHeight(currentPage)) {
			x = currentPageData.getCropBoxX(currentPage);
			y = currentPageData.getCropBoxY(currentPage);
			width = currentPageData.getCropBoxWidth(currentPage);
			height = currentPageData.getCropBoxHeight(currentPage);
		} else {
			x = currentPageData.getMediaBoxX(currentPage);
			y = currentPageData.getMediaBoxY(currentPage);
			width = currentPageData.getMediaBoxWidth(currentPage);
			height = currentPageData.getMediaBoxHeight(currentPage);
		}

		// 0:TLX, 1:TLY, 2:BRX, 3:BRY
		return new int[] { x, y + height, x + width, y };

	}

	/**
	 * Parses the XML fragment JPedal produced for one word and returns its
	 * first <code>font</code> element.
	 *
	 * @param xml
	 *            XML fragment for a single word
	 * @return the first <code>font</code> element, or <code>null</code> if
	 *         the fragment is not well-formed XML or has no such element
	 * @throws UnsupportedEncodingException
	 *             if UTF-8 is not available
	 * @throws IOException
	 *             if the parser fails to read the fragment
	 */
	private Element parseFontElement(String xml)
			throws UnsupportedEncodingException, IOException {

		// The fragment may hold several sibling nodes; give it a single root.
		String wrapped = "<root>" + xml + "</root>";

		Document parsed;
		try {
			parsed = docBuilder.parse(new ByteArrayInputStream(wrapped
					.getBytes("UTF-8")));
		} catch (SAXException e) {
			// Malformed markup: the word is kept, just without font data.
			return null;
		}

		// item(0) is null when the fragment carries no font element.
		return (Element) parsed.getElementsByTagName("font").item(0);

	}

	/**
	 * Decodes the page numbered {@link #currentPage}, replaces the contents
	 * of {@link #wordListPerPage} with that page's words, records the page
	 * dimensions and advances {@link #currentPage}.
	 *
	 * <p>
	 * JPedal's word list is read as consecutive groups of five entries: the
	 * word markup followed by four coordinates.
	 * NOTE(review): if JPedal returns <code>null</code> for a page without
	 * text, the resulting NullPointerException ends the iteration through
	 * {@link #hasNext()} - confirm whether such pages should yield an empty
	 * list instead.
	 *
	 * @throws Exception
	 *             if JPedal cannot decode the page or a word entry cannot be
	 *             parsed
	 */
	private void decodeFile() throws Exception {

		PDFDecoder.decodePage(currentPage);

		PdfGroupingAlgorithms currentGrouping = PDFDecoder.getGroupingObject();
		PdfPageData currentPageData = PDFDecoder.getPdfPageData();

		int[] dimensions = generatePageBoundaries(currentPageData);
		pageWidth = Math.abs(dimensions[2] - dimensions[0]);
		pageHeight = Math.abs(dimensions[1] - dimensions[3]);

		List<?> words = currentGrouping.extractTextAsWordlist(
				dimensions[0], dimensions[1], dimensions[2], dimensions[3],
				currentPage,
				true,
				WORD_PUNCTUATION
				);

		Iterator<?> wordIterator = words.iterator();

		if (wordListPerPage == null) {
			wordListPerPage = new TreeSet<WordBlock>(new SpatialOrdering(SpatialOrdering.MIXED_MODE_ABSOLUTE));
		} else {
			wordListPerPage.clear();
		}

		while (wordIterator.hasNext()) {

			String rawWord = (String) wordIterator.next();

			// Parse the markup once and read both attributes from it. A word
			// without usable font markup gets null font data instead of
			// aborting the whole page.
			Element fontElement = parseFontElement(rawWord);
			String font = null;
			String style = null;
			if (fontElement != null) {
				font = fontElement.getAttribute("face");
				style = fontElement.getAttribute("style");
			}

			String text = Strip.convertToText(rawWord, true);

			int wx1 = floorToInt(Float.parseFloat(String.valueOf(wordIterator.next())));
			int wy1 = floorToInt(Float.parseFloat(String.valueOf(wordIterator.next())));
			int wx2 = floorToInt(Float.parseFloat(String.valueOf(wordIterator.next())));
			int wy2 = floorToInt(Float.parseFloat(String.valueOf(wordIterator.next())));

			// Re-express y relative to the top edge (TLY) of the page box.
			wy1 = dimensions[1] - wy1;
			wy2 = dimensions[1] - wy2;

			WordBlock wordBlock = modelFactory.createWordBlock(wx1, wy1, wx2,
					wy2, 1, font, style, text);

			wordListPerPage.add(wordBlock);

		}

		currentPage++;
		PDFDecoder.flushObjectValues(false);

	}

	/**
	 * Decodes the next page, if there is one, so that {@link #next()} can
	 * return its words.
	 *
	 * <p>
	 * When all pages have been read the document is flushed and closed. A
	 * page that cannot be decoded (any exception, including
	 * {@link EmptyPDFException}) ends the iteration: the document is
	 * released and <code>false</code> is returned.
	 *
	 * @return <code>true</code> if a page was decoded and is ready for
	 *         {@link #next()}, <code>false</code> otherwise
	 */
	@Override
	public boolean hasNext() {

		if (currentPage > pageCount) {
			releaseDocument();
			return false;
		}

		try {
			decodeFile();
		} catch (Exception e) {
			// Best effort, as before: a failing page stops the extraction
			// without propagating. Mark the document as exhausted and close
			// it so the file handle is not kept open after the failure.
			currentPage = pageCount + 1;
			try {
				releaseDocument();
			} catch (RuntimeException ignored) {
				// The decoder is already in a failed state; nothing more
				// can be done here and the caller still gets 'false'.
			}
			return false;
		}

		return true;

	}

	/**
	 * Returns the words of the page decoded by the preceding
	 * {@link #hasNext()} call.
	 *
	 * @return a new list holding the page's words in the iteration order of
	 *         {@link #wordListPerPage}
	 */
	@Override
	public List<WordBlock> next() {
		return new ArrayList<WordBlock>(wordListPerPage);
	}

	/** Does nothing; the call is silently ignored. */
	@Override
	public void remove() {
	}

	/**
	 * @return height of the page box of the most recently decoded page
	 */
	@Override
	public int getCurrentPageBoxHeight() {

		return pageHeight;
	}

	/**
	 * @return width of the page box of the most recently decoded page
	 */
	@Override
	public int getCurrentPageBoxWidth() {

		return pageWidth;
	}

	/**
	 * Converts a coordinate to an int by rounding towards negative infinity.
	 * (This helper used to be called <code>roundUp</code> although it has
	 * always floored.)
	 */
	private int floorToInt(double value) {

		return (int) Math.floor(value);
	}

}