Java Examples for org.apache.pdfbox.pdmodel.PDDocument

The following Java examples will help you understand the usage of org.apache.pdfbox.pdmodel.PDDocument. These source code samples are taken from different open source projects.

Example 1

Project: PaperManager-master File: TestPDFBox.java View source code

/**
 * Loads {@code test.pdf} and prints its page count, title and author to
 * standard output. I/O failures are reported via their stack trace.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    PDDocument document = null;
    try {
        document = PDDocument.load("test.pdf");
        PDDocumentInformation info = document.getDocumentInformation();
        System.out.println("Page Count=" + document.getNumberOfPages());
        System.out.println("Title=" + info.getTitle());
        System.out.println("Author=" + info.getAuthor());
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Always release the document, including when reading its metadata fails.
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

Example 2

Project: aplikator-master File: PDFLoader.java View source code

/**
 * Renders page index 0 of the PDF read from {@code stream} as an RGB image at 160 DPI.
 * Afterwards the document (if it was loaded) is closed and the stream is handed to
 * {@code IOUtils.tryClose}.
 *
 * @param stream source of the PDF data
 * @return the rendered page
 * @throws IOException if loading, rendering or closing the document fails
 */
public static BufferedImage load(InputStream stream) throws IOException {
    final int dpi = 160;
    final int pageIndex = 0;
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(stream);
        return new PDFRenderer(pdf).renderImageWithDPI(pageIndex, dpi, ImageType.RGB);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
        IOUtils.tryClose(stream);
    }
}

Example 3

Project: dss-master File: PdfBoxSignatureService.java View source code

/**
 * Loads the document to sign, creates a signature dictionary from {@code parameters} and
 * returns the result of {@code signDocumentAndReturnDigest}, called with an empty
 * signature value and an in-memory output buffer.
 *
 * @param toSignDocument  stream containing the PDF to sign
 * @param parameters      signature parameters
 * @param digestAlgorithm digest algorithm passed through to the signing helper
 * @return the bytes returned by {@code signDocumentAndReturnDigest}
 * @throws DSSException wrapping any {@link IOException} raised while processing
 */
@Override
public byte[] digest(final InputStream toSignDocument, final PAdESSignatureParameters parameters, final DigestAlgorithm digestAlgorithm) throws DSSException {
    final byte[] emptySignatureValue = DSSUtils.EMPTY_BYTE_ARRAY;
    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    PDDocument loadedDocument = null;
    try {
        loadedDocument = PDDocument.load(toSignDocument);
        final PDSignature signatureDictionary = createSignatureDictionary(parameters);
        return signDocumentAndReturnDigest(parameters, emptySignatureValue, buffer, loadedDocument, signatureDictionary, digestAlgorithm);
    } catch (IOException ioFailure) {
        throw new DSSException(ioFailure);
    } finally {
        Utils.closeQuietly(loadedDocument);
        Utils.closeQuietly(buffer);
    }
}

Example 4

Project: padaf-master File: SynchronizedMetaDataValidation.java View source code

/**
 * Runs the title, author, subject, keywords, producer, creator tool, creation date and
 * modify date analyzers, which compare the document information dictionary with the
 * Dublin Core, Adobe PDF and XMP Basic schemas of the XMP metadata.
 *
 * @param document
 *          the PDF document whose information dictionary is read
 * @param metadata
 *          the XMP metadata providing the schemas
 * @return the list of validation errors collected by the analyzers
 * @throws ValidationException
 *           if {@code document} or {@code metadata} is null
 */
public List<ValidationError> validateMetadataSynchronization(PDDocument document, XMPMetadata metadata) throws ValidationException {
    List<ValidationError> errors = new ArrayList<ValidationError>();
    if (document == null) {
        throw new ValidationException("Document provided is null");
    }
    // The information dictionary is fetched before the metadata null check on purpose.
    PDDocumentInformation info = document.getDocumentInformation();
    if (metadata == null) {
        throw new ValidationException("Metadata provided are null");
    }

    // Dublin Core schema: title, author, subject
    DublinCoreSchema dublinCore = metadata.getDublinCoreSchema();
    analyzeTitleProperty(info, dublinCore, errors);
    analyzeAuthorProperty(info, dublinCore, errors);
    analyzeSubjectProperty(info, dublinCore, errors);

    // Adobe PDF schema: keywords, producer
    AdobePDFSchema adobePdf = metadata.getAdobePDFSchema();
    analyzeKeywordsProperty(info, adobePdf, errors);
    analyzeProducerProperty(info, adobePdf, errors);

    // XMP Basic schema: creator tool, creation date, modify date
    XMPBasicSchema xmpBasic = metadata.getXMPBasicSchema();
    analyzeCreatorToolProperty(info, xmpBasic, errors);
    analyzeCreationDateProperty(info, xmpBasic, errors);
    analyzeModifyDateProperty(info, xmpBasic, errors);

    return errors;
}

Example 5

Project: preservation-tools-master File: PdfAValidator.java View source code

/**
 * Asks the user for a folder, runs the PDFBox Preflight validation on every file in it
 * that {@code PdfAnalysis} reports as PDF/A, and writes a detailed report
 * ({@code PdfAValidation.xml}) plus a short summary ({@code PdfAValidationShortSummary.xml})
 * into that folder.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if the report files or style sheets cannot be written
 */
public static void main(String args[]) throws IOException {
    try {
        changecolor();
        String path = "D://Eclipse New//PDFBoxLogo.gif";
        String description = "PDFBox Logo";
        ImageIcon icon = new ImageIcon(path, description);
        JOptionPane.showMessageDialog(null, "Please choose the folder with PDF/A files to validate.", "PDFBox Validation", JOptionPane.QUESTION_MESSAGE, icon);
        examinedFolder = utilities.BrowserDialogs.chooseFolder();
        // NOTE(review): examinedFolder is only null-checked further down, after it has
        // already been concatenated into the report paths here - confirm what
        // chooseFolder() returns when the dialog is cancelled.
        outputfile = new PrintWriter(new FileWriter(examinedFolder + "//" + "PdfAValidation.xml"));
        shortSummary = new PrintWriter(new FileWriter(examinedFolder + "//" + "PdfAValidationShortSummary.xml"));
        String xmlVersion = "xml version='1.0'";
        String xmlEncoding = "encoding='ISO-8859-1'";
        String xsltStyleSheet = "<?xml-stylesheet type=\"text/xsl\" href=\"PdfBoxValidationStyle.xsl\"?>";
        String xsltStyleSheetSummary = "<?xml-stylesheet type=\"text/xsl\" href=\"PdfBoxSummaryStyle.xsl\"?>";
        String xsltLocation = examinedFolder + "//" + "PdfBoxValidationStyle.xsl";
        String xsltLocationSum = examinedFolder + "//" + "PdfBoxSummaryStyle.xsl";
        output.XslStyleSheets.PdfBoxCustomizedXsl(xsltLocation);
        output.XslStyleSheets.PdfBoxSummaryCustomizedXsl(xsltLocationSum);
        outputfile.println("<?" + xmlVersion + " " + xmlEncoding + "?>");
        outputfile.println(xsltStyleSheet);
        outputfile.println("<PdfBoxValidation>");
        shortSummary.println("<?" + xmlVersion + " " + xmlEncoding + "?>");
        shortSummary.println(xsltStyleSheetSummary);
        shortSummary.println("<PdfBoxValidationSummary>");
        int examinedPdfa = 0;
        int validPdfa = 0;
        int invalidPdfa = 0;
        if (examinedFolder != null) {
            ArrayList<File> files = utilities.ListsFiles.getPaths(new File(examinedFolder), new ArrayList<File>());
            for (int i = 0; i < files.size(); i++) {
                if (files.get(i) != null) {
                    try {
                        // Only files that pass the basic PDF sanity test are examined.
                        if (PdfAnalysis.testPdfOk(files.get(i))) {
                            String PdfType = PdfAnalysis.checkIfPdfA(files.get(i));
                            if (PdfType.contains("PDF/A")) {
                                outputfile.println("<PdfAFile>");
                                shortSummary.println("<PdfAFile>");
                                int syntaxError = 0;
                                int graphicError = 0;
                                int fontError = 0;
                                int transparencyError = 0;
                                int annotationError = 0;
                                int actionError = 0;
                                int metadataError = 0;
                                examinedPdfa++;
                                outputfile.println("<FileName>" + utilities.fileStringUtilities.getFileName(files.get(i)) + "</FileName>");
                                shortSummary.println("<FileName>" + utilities.fileStringUtilities.getFileName(files.get(i)) + "</FileName>");
                                // Load the document directly (no throwaway PDDocument instance)
                                // and close it even if reading the metadata fails.
                                PDDocument pd = PDDocument.load(files.get(i));
                                try {
                                    PDDocumentInformation info = pd.getDocumentInformation();
                                    getsomeMetadata(info);
                                } finally {
                                    pd.close();
                                }
                                /*
                                 * the actual PdfAValidation starts here
                                 */
                                ValidationResult result = null;
                                FileDataSource fd = new FileDataSource(files.get(i).toString());
                                PreflightParser parser = new PreflightParser(fd);
                                try {
                                    parser.parse();
                                    PreflightDocument document = parser.getPreflightDocument();
                                    try {
                                        document.validate();
                                        result = document.getResult();
                                    } catch (NullPointerException e) {
                                        outputfile.println("<Error>" + e + "</Error>");
                                        shortSummary.println("<Error>" + e + "</Error>");
                                        logger.error("Error analyzing " + files.get(i).getAbsolutePath(), e);
                                    } finally {
                                        // Close the preflight document on every path, not only on success.
                                        if (document != null) {
                                            document.close();
                                        }
                                    }
                                } catch (SyntaxValidationException e) {
                                    result = e.getResult();
                                    logger.error("Error analyzing " + files.get(i).getAbsolutePath(), e);
                                }
                                if (result != null) {
                                    if (result.isValid()) {
                                        outputfile.println("<Status>" + "Valid" + "</Status>");
                                        shortSummary.println("<Status>" + "Valid" + "</Status>");
                                        validPdfa++;
                                    } else {
                                        int errorslen = 0;
                                        outputfile.println("<Status>" + "Invalid" + "</Status>");
                                        shortSummary.println("<Status>" + "Invalid" + "</Status>");
                                        invalidPdfa++;
                                        for (ValidationError error : result.getErrorsList()) {
                                            errorslen++;
                                            String errorCode = error.getErrorCode().toString();
                                            outputfile.println("<Code>" + error.getErrorCode() + "</Code>");
                                            String errorDetails = utilities.fileStringUtilities.reduceXmlEscapors(error.getDetails());
                                            // The first character of the error code selects the category.
                                            if (errorCode.startsWith("1")) {
                                                outputfile.println("<Details Category=\"SyntaxError\">" + errorDetails + "</Details>");
                                                syntaxError++;
                                            }
                                            if (errorCode.startsWith("2")) {
                                                outputfile.println("<Details Category=\"GraphicError\">" + errorDetails + "</Details>");
                                                graphicError++;
                                            }
                                            if (errorCode.startsWith("3")) {
                                                outputfile.println("<Details Category=\"FontError\">" + errorDetails + "</Details>");
                                                fontError++;
                                            }
                                            if (errorCode.startsWith("4")) {
                                                outputfile.println("<Details Category=\"TransparencyError\">" + errorDetails + "</Details>");
                                                transparencyError++;
                                            }
                                            if (errorCode.startsWith("5")) {
                                                outputfile.println("<Details Category=\"AnnotationError\">" + errorDetails + "</Details>");
                                                annotationError++;
                                            }
                                            if (errorCode.startsWith("6")) {
                                                outputfile.println("<Details Category=\"ActionError\">" + errorDetails + "</Details>");
                                                actionError++;
                                            }
                                            if (errorCode.startsWith("7")) {
                                                outputfile.println("<Details Category=\"MetadataError\">" + errorDetails + "</Details>");
                                                metadataError++;
                                            }
                                        }
                                        outputfile.println("<SyntaxErrors>" + syntaxError + "</SyntaxErrors>");
                                        outputfile.println("<GraphicErrors>" + graphicError + "</GraphicErrors>");
                                        outputfile.println("<FontErrors>" + fontError + "</FontErrors>");
                                        outputfile.println("<TransparencyErrors>" + transparencyError + "</TransparencyErrors>");
                                        outputfile.println("<AnnotationErrors>" + annotationError + "</AnnotationErrors>");
                                        outputfile.println("<ActionErrors>" + actionError + "</ActionErrors>");
                                        outputfile.println("<MetadataErrors>" + metadataError + "</MetadataErrors>");
                                        shortSummary.println("<ErrorsCount>" + errorslen + "</ErrorsCount>");
                                    }
                                }
                                outputfile.println("</PdfAFile>");
                                shortSummary.println("</PdfAFile>");
                            }
                        }
                    } catch (IOException e) {
                        outputfile.println("<Error>" + e + "</Error>");
                        JOptionPane.showMessageDialog(null, e, "error message", JOptionPane.ERROR_MESSAGE);
                    }
                }
            }
        }
        shortSummary.println("<Summary>");
        shortSummary.println("<ExaminedPdfAFiles>" + examinedPdfa + "</ExaminedPdfAFiles>");
        shortSummary.println("<ValidPdfAFiles>" + validPdfa + "</ValidPdfAFiles>");
        shortSummary.println("<InvalidPdfAFiles>" + invalidPdfa + "</InvalidPdfAFiles>");
        shortSummary.println("</Summary>");
        outputfile.println("</PdfBoxValidation>");
        shortSummary.println("</PdfBoxValidationSummary>");
    } catch (FileNotFoundException e) {
        logger.error("Error analyzing " + e);
        JOptionPane.showMessageDialog(null, e, "error message", JOptionPane.ERROR_MESSAGE);
    } finally {
        // Flush and close the reports even when validation aborts with an exception.
        if (shortSummary != null) {
            shortSummary.close();
        }
        if (outputfile != null) {
            outputfile.close();
        }
    }
}

Example 6

Project: tizzit-master File: PdfPreviewFrameTest.java View source code

/**
 * Feeds the bytes of a one-page blank PDF to {@code setDocumentContent} on a partial mock
 * and checks that the frame's internal {@code pdffile} state is a non-null {@code PDFFile}.
 */
@Test
public void testSetDocumentContent() throws Exception {
    // Build a minimal single-page PDF in memory.
    ByteArrayOutputStream pdfBytes = new ByteArrayOutputStream();
    PDDocument blankPdf = new PDDocument();
    blankPdf.addPage(new PDPage());
    blankPdf.save(pdfBytes);
    blankPdf.close();

    PdfPreviewFrame frame = createPartialMockForAllMethodsExcept(PdfPreviewFrame.class, "setDocumentContent");
    frame.setDocumentContent(pdfBytes.toByteArray());

    Object pdfFileState = Whitebox.getInternalState(frame, "pdffile");
    assertNotNull(pdfFileState);
    assertTrue(pdfFileState instanceof PDFFile);
}

Example 7

Project: brigen-base-master File: PagesAppender.java View source code

@Override
public PDDocument append(int end, int start, PDDocument document) throws IOException {
    PDPageable pageable;
    try {
        pageable = new PDPageable(document);
    } catch (IllegalArgumentExceptionPrinterException |  e) {
        throw new IOException(e);
    }
    int pages = pageable.getNumberOfPages();
    if (0 < pages) {
        PDDocument overDoc = new PDDocument();
        PDDocument underDoc = new PDDocument();
        for (int i = 0; i < pages; i++) {
            int page = i + 1;
            PDPageContentStream overStream;
            {
                PDPage overPage = new PDPage();
                overDoc.addPage(overPage);
                overStream = new PDPageContentStream(overDoc, overPage, true, true);
            }
            PDPageContentStream underStream;
            {
                PDPage underPage = new PDPage();
                underDoc.addPage(underPage);
                underStream = new PDPageContentStream(underDoc, underPage, true, true);
            }
            PDRectangle rect;
            {
                PDPage pdPage = (PDPage) document.getDocumentCatalog().getAllPages().get(i);
                rect = pdPage.getMediaBox();
            }
            appendUnderContent(end, start, pages, page, underStream, rect);
            appendOverContent(end, start, pages, page, overStream, rect);
            underStream.close();
            overStream.close();
        }
        {
            Overlay overlay = new Overlay();
            document = overlay.overlay(document, underDoc);
        }
        {
            Overlay overlay = new Overlay();
            document = overlay.overlay(overDoc, document);
        }
    }
    return document;
}

Example 8

Project: com.revolsys.open-master File: PdfViewport.java View source code

/**
 * Returns the font cached under {@code path}, loading and caching it on first request.
 * The bundled ArialMT TrueType resource is loaded regardless of the value of {@code path}.
 *
 * @param path cache key for the font
 * @return the cached or newly loaded font
 * @throws IOException if the font resource cannot be read
 */
private PDFont getFont(final String path) throws IOException {
    PDFont font = this.fonts.get(path);
    if (font == null) {
        final InputStream fontStream = PDDocument.class.getResourceAsStream("/org/apache/pdfbox/resources/ttf/ArialMT.ttf");
        try {
            font = PDTrueTypeFont.loadTTF(this.document, fontStream);
        } finally {
            // Release the resource stream once the font has been read.
            if (fontStream != null) {
                fontStream.close();
            }
        }
        // Store under the same key used for the lookup so later calls hit the cache;
        // previously the entry was stored under the resource path and never found again.
        this.fonts.put(path, font);
    }
    return font;
}

Example 9

Project: GeoBI-master File: ImageOutputScalableFactory.java View source code

/**
 * Converts every page of the PDF in {@code tmpFile} to an RGB image, writes each image to
 * a temporary TIFF file and returns one {@code ImageInfo} per page.
 *
 * @param jsonSpec passed to {@code calculateDPI}
 * @param tmpFile  the PDF file to rasterize
 * @param context  passed to {@code calculateDPI}
 * @return file, width and height of each generated image, in page order
 * @throws IOException if the PDF cannot be read or an image cannot be written
 */
private List<ImageInfo> createImages(PJsonObject jsonSpec, File tmpFile, RenderingContext context) throws IOException {
    List<ImageInfo> images = new ArrayList<ImageInfo>();
    PDDocument pdf = PDDocument.load(tmpFile);
    try {
        List<PDPage> pages = pdf.getDocumentCatalog().getAllPages();
        for (PDPage page : pages) {
            BufferedImage img = page.convertToImage(BufferedImage.TYPE_INT_RGB, calculateDPI(context, jsonSpec));
            // The suffix needs its leading dot, otherwise the name ends in "...tiff" with no extension.
            File file = File.createTempFile("pdfToImage", ".tiff");
            // ImageIO.write returns false (without throwing) when no writer handles the format.
            if (!ImageIO.write(img, "TIF", file)) {
                throw new IOException("No ImageIO writer available for format TIF");
            }
            images.add(new ImageInfo(file, img.getWidth(), img.getHeight()));
        }
    } finally {
        pdf.close();
    }
    return images;
}

Example 10

Project: java-wkhtmltopdf-wrapper-master File: PdfTest.java View source code

/**
 * Generates a PDF from an HTML string containing a non-ASCII character and checks that
 * the text extracted from the PDF still contains it.
 */
@Test
public void testPdfFromStringTo() throws Exception {
    // GIVEN a html template containing special characters that java stores in utf-16 internally
    Pdf pdf = new Pdf();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Müller</h1></html>", PageType.htmlAsString);
    // WHEN
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // that is a valid PDF (otherwise an IOException occurs)
    parser.parse();
    PDFTextStripper pdfTextStripper = new PDFTextStripper();
    PDDocument document = new PDDocument(parser.getDocument());
    try {
        String pdfText = pdfTextStripper.getText(document);
        Assert.assertThat("document should contain the creditorName", pdfText, containsString("Müller"));
    } finally {
        // Close the parsed document even when the assertion fails.
        document.close();
    }
}

Example 11

Project: josm-plugins-master File: PdfBoxParser.java View source code

/**
 * Parses a single-page, unencrypted PDF: draws the page through a {@code GraphicsProcessor}
 * bound to {@code target} and stores the page's media box as the target bounds.
 *
 * @param file     the PDF file to parse
 * @param maxPaths passed to the {@code GraphicsProcessor}
 * @param monitor  progress monitor; its task is finished on every exit path
 * @throws IOException              if the file cannot be read
 * @throws IllegalArgumentException if the document is encrypted or does not have exactly one page
 */
public void parse(File file, int maxPaths, ProgressMonitor monitor) throws IOException {
    // NOTE(review): the 1 is passed to tr(), not to beginTask() - confirm whether
    // beginTask(tr("Parsing PDF"), 1) was intended.
    monitor.beginTask(tr("Parsing PDF", 1));
    try (PDDocument document = PDDocument.load(file)) {
        if (document.isEncrypted()) {
            throw new IllegalArgumentException(tr("Encrypted documents not supported."));
        }
        List<?> allPages = document.getDocumentCatalog().getAllPages();
        if (allPages.size() != 1) {
            throw new IllegalArgumentException(tr("The PDF file must have exactly one page."));
        }
        PDPage page = (PDPage) allPages.get(0);
        PDRectangle pageSize = page.findMediaBox();
        Integer rotationVal = page.getRotation();
        int rotation = 0;
        if (rotationVal != null) {
            rotation = rotationVal.intValue();
        }
        new PageDrawer().drawPage(new GraphicsProcessor(target, rotation, maxPaths, monitor), page);
        this.target.bounds = new Rectangle2D.Double(pageSize.getLowerLeftX(), pageSize.getLowerLeftY(), pageSize.getWidth(), pageSize.getHeight());
    } finally {
        // Finish the task on failure too, so the monitor is not left in the "running" state.
        monitor.finishTask();
    }
}

Example 12

Project: ontopia-master File: PDFFormatModule.java View source code

/**
 * Extracts the text of the PDF held in {@code cc} and reports it to {@code handler} as a
 * single region named "document".
 *
 * @param cc      supplies the PDF bytes via {@code getContent()}
 * @param handler receives the extracted text
 * @throws OntopiaRuntimeException wrapping any exception raised while reading the PDF
 */
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
    try {
        PDDocument pdoc = PDDocument.load(new BufferedInputStream(new ByteArrayInputStream(cc.getContent())));
        String s;
        try {
            PDFTextStripper stripper = new PDFTextStripper();
            s = stripper.getText(pdoc);
        } finally {
            // Close the document even when text extraction fails.
            pdoc.close();
        }
        char[] c = s.toCharArray();
        handler.startRegion("document");
        handler.text(c, 0, c.length);
        handler.endRegion();
    } catch (Exception e) {
        throw new OntopiaRuntimeException(e);
    }
}

Example 13

Project: OpenLegislation-master File: TranscriptPdfView.java View source code

/**
 * Renders a transcript as a PDF in Courier, one letter-size page per formatted page of
 * transcript text, and saves the result to {@code outputStream}.
 *
 * @param transcript   the transcript to render; must not be null
 * @param outputStream destination for the generated PDF
 * @throws IllegalArgumentException if {@code transcript} is null
 * @throws IOException              if writing page content or saving fails
 * @throws COSVisitorException      if saving the document fails
 */
public static void writeTranscriptPdf(Transcript transcript, OutputStream outputStream) throws IOException, COSVisitorException {
    if (transcript == null) {
        throw new IllegalArgumentException("Supplied transcript cannot be null when converting to pdf.");
    }
    try (PDDocument pdf = new PDDocument()) {
        final PDFont courier = PDType1Font.COURIER;
        for (List<String> pageLines : TranscriptTextUtils.getPdfFormattedPages(transcript.getText())) {
            PDPage pdfPage = new PDPage(PDPage.PAGE_SIZE_LETTER);
            PDPageContentStream stream = new PDPageContentStream(pdf, pdfPage);

            drawBorder(stream);

            // Text block: page lines followed by the stenographer line.
            stream.beginText();
            stream.setFont(courier, fontSize);
            moveStreamToTopOfPage(stream);
            int linesDrawn = drawPageText(pageLines, stream);
            drawStenographer(transcript, stream, linesDrawn);
            stream.endText();

            stream.close();
            pdf.addPage(pdfPage);
        }
        pdf.save(outputStream);
    }
}

Example 14

Project: pdfbox-master File: CatalogValidationProcess.java View source code

/**
 * Stores the document catalog of the context's document and runs the catalog validations
 * (actions, language, names, optional content properties, output intent). A missing
 * catalog is reported as {@code ERROR_SYNTAX_NOCATALOG} and nothing else is validated.
 *
 * @param ctx the preflight context providing the document and collecting errors
 * @throws ValidationException if one of the validation steps fails
 */
@Override
public void validate(PreflightContext ctx) throws ValidationException {
    this.catalog = ctx.getDocument().getDocumentCatalog();
    if (this.catalog == null) {
        ctx.addValidationError(new ValidationError(ERROR_SYNTAX_NOCATALOG, "There are no Catalog entry in the Document"));
        return;
    }
    validateActions(ctx);
    validateLang(ctx);
    validateNames(ctx);
    validateOCProperties(ctx);
    validateOutputIntent(ctx);
}

Example 15

Project: Plain-of-JARs-master File: webcomic2pdf.java View source code

/**
 * Interactive command-line entry point: lists the comics described in
 * {@code jar_files/comics.json}, asks for a comic and a page range on the console,
 * downloads the page images into the {@code pages} directory, bundles the {@code .jpg}
 * files into {@code <comic name>.pdf} and finally removes the {@code pages} directory.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if a non-I/O failure occurs (for example invalid numeric input)
 */
public static void main(String[] args) throws Exception {
    String version = "1.0.3";
    String program = "Webcomic2PDF";
    System.out.println(program + " " + version);
    // Start from an empty working directory.
    File directory = new File("pages");
    if (!directory.exists()) {
        directory.mkdir();
    } else {
        File[] files = directory.listFiles();
        if (files != null) {
            for (int i = 0; i < files.length; i++) {
                files[i].delete();
            }
        }
    }
    try {
        int pages_count = 0;
        int current_page = 1;
        int i = 1;
        int current_pages_count = 0;
        System.out.println("");
        System.out.println("List of all supported comics:");
        ObjectMapper mapper = new ObjectMapper();
        JsonNode rootNode;
        BufferedReader fileReader = new BufferedReader(new InputStreamReader(webcomic2pdf.class.getResourceAsStream("jar_files/comics.json")));
        try {
            rootNode = mapper.readTree(fileReader);
        } finally {
            fileReader.close();
        }
        JsonNode comics = rootNode.get("comics");
        System.out.println("#\tName");
        for (int comic = 0; comic < comics.size(); comic++) {
            System.out.println(comic + "\t" + comics.get(comic).get("name").textValue());
        }
        System.out.println("");
        System.out.print("Select comic #:");
        int comic = Integer.parseInt(System.console().readLine());
        String comic_name = comics.get(comic).get("name").textValue();
        String comic_url = comics.get(comic).get("url").textValue();
        String comic_image_selector = comics.get(comic).get("image_selector").textValue();
        String comic_last_page_selector = comics.get(comic).get("last_page_selector").textValue();
        String comic_first_page = comics.get(comic).get("first_page").textValue();
        String comic_url_parameter = comics.get(comic).get("url_parameter").textValue();
        pages_count = getPages(comic_first_page, comic_last_page_selector, comic_url_parameter);
        // Turn off every handler of the root logger.
        Logger log = LogManager.getLogManager().getLogger("");
        for (Handler h : log.getHandlers()) {
            h.setLevel(Level.OFF);
        }
        System.out.println(comic_name);
        System.out.println("Total available pages:" + pages_count);
        System.out.print("From page #:");
        int from = Integer.parseInt(System.console().readLine());
        System.out.print("To page #:");
        int to = Integer.parseInt(System.console().readLine());
        current_page = from;
        current_pages_count = to - from + 1;
        while (current_page <= to) {
            String image_real = getPage(current_page, comic_url, comic_image_selector);
            HttpURLConnection conn = (HttpURLConnection) (new URL(image_real).openConnection());
            conn.setConnectTimeout(60000);
            conn.setReadTimeout(60000);
            conn.connect();
            // The content-length header is optional; without it no percentage is shown.
            long fileSize = -1;
            String content_length = conn.getHeaderField("content-length");
            if (content_length != null) {
                try {
                    fileSize = Long.parseLong(content_length.trim());
                } catch (NumberFormatException ignored) {
                    // Unparseable header: treat the size as unknown.
                    fileSize = -1;
                }
            }
            String[] array = image_real.split("/");
            String image_local = array[array.length - 1];
            InputStream is = conn.getInputStream();
            try {
                OutputStream outstream = new FileOutputStream(new File("pages/" + current_page + "_" + image_local));
                try {
                    long bytesRead = 0;
                    byte[] buffer = new byte[4096];
                    int len;
                    while ((len = is.read(buffer)) > 0) {
                        outstream.write(buffer, 0, len);
                        bytesRead += len;
                        String n_perct = "";
                        if (fileSize > 0) {
                            int n = (int) (100 * bytesRead / fileSize);
                            n_perct = n + "%    ";
                        }
                        System.out.print("\rDownloading page " + i + " of " + current_pages_count + " " + n_perct + "");
                    }
                } finally {
                    outstream.close();
                }
            } finally {
                is.close();
            }
            i++;
            current_page++;
        }
        File[] myarray = directory.listFiles(new FileFilter() {

            public boolean accept(File dir) {
                return dir.toString().endsWith(".jpg") && dir.isFile();
            }
        });
        System.out.println("");
        if (myarray != null && myarray.length > 0) {
            System.out.println("Generating PDF");
            PDDocument document = new PDDocument();
            try {
                for (int k = 0; k < myarray.length; k++) {
                    // First pass over the file: decode it only to learn the page size.
                    BufferedImage bimg;
                    InputStream in = new FileInputStream(myarray[k]);
                    try {
                        bimg = ImageIO.read(in);
                    } finally {
                        in.close();
                    }
                    if (bimg == null) {
                        // ImageIO.read returns null when no reader can decode the file.
                        System.err.println("Skipping unreadable image: " + myarray[k]);
                        continue;
                    }
                    float width = bimg.getWidth();
                    float height = bimg.getHeight();
                    PDPage page = new PDPage(new PDRectangle(width, height));
                    document.addPage(page);
                    // Second pass: embed the JPEG data itself.
                    InputStream jpegIn = new FileInputStream(myarray[k]);
                    try {
                        PDXObjectImage img = new PDJpeg(document, jpegIn);
                        PDPageContentStream contentStream = new PDPageContentStream(document, page);
                        try {
                            contentStream.drawImage(img, 0, 0);
                        } finally {
                            contentStream.close();
                        }
                    } finally {
                        jpegIn.close();
                    }
                }
                document.save(comic_name + ".pdf");
            } finally {
                document.close();
            }
        }
        File[] files_del = directory.listFiles();
        if (files_del != null) {
            for (int k = 0; k < files_del.length; k++) {
                files_del[k].delete();
            }
        }
        directory.delete();
        System.out.println("Done");
    } catch (IOException e) {
        // Report the failure instead of silently exiting.
        System.err.println("Error: " + e);
    }
}

Example 16

Project: stocks-master File: CreateTextFromPDFHandler.java View source code

/**
 * Lets the user pick a single PDF file, extracts its text sorted by position and shows
 * the text in a {@code DisplayTextDialog}. Returns without action when no file name was
 * chosen; I/O failures are logged and shown in an error dialog.
 *
 * @param part  the active part (not used in this method)
 * @param shell the active shell, used as parent for all dialogs
 * @throws IOException declared by the signature; I/O errors from loading are handled here
 */
@Execute
public void execute(@Named(IServiceConstants.ACTIVE_PART) MPart part, @Named(IServiceConstants.ACTIVE_SHELL) Shell shell) throws IOException {
    // Single-selection open dialog restricted to PDF files.
    FileDialog picker = new FileDialog(shell, SWT.OPEN | SWT.SINGLE);
    picker.setText(Messages.PDFImportDebugTextExtraction);
    picker.setFilterNames(new String[] { Messages.PDFImportFilterName });
    picker.setFilterExtensions(new String[] { "*.pdf" }); //$NON-NLS-1$
    picker.open();

    String selectedName = picker.getFileName();
    boolean nothingSelected = selectedName == null || selectedName.isEmpty();
    if (nothingSelected) {
        return;
    }

    File pdfFile = new File(picker.getFilterPath(), selectedName);
    try (PDDocument pdf = PDDocument.load(pdfFile)) {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(true);
        String extracted = stripper.getText(pdf);
        new DisplayTextDialog(shell, extracted).open();
    } catch (IOException ioFailure) {
        PortfolioPlugin.log(ioFailure);
        MessageDialog.openError(shell, Messages.LabelError, ioFailure.getMessage());
    }
}

Example 17

Project: with-aes-master File: SynchronizedMetaDataValidation.java View source code

/**
 * Checks the document information dictionary against the XMP metadata by running one
 * analyzer per property: title, author and subject (Dublin Core schema), keywords and
 * producer (Adobe PDF schema), creator tool, creation date and modify date (XMP Basic
 * schema).
 *
 * @param document
 *          the PDF document whose information dictionary is read
 * @param metadata
 *          the XMP metadata providing the schemas
 * @return the validation errors collected by the analyzers
 * @throws ValidationException
 *           if {@code document} or {@code metadata} is null
 */
public List<ValidationError> validateMetadataSynchronization(PDDocument document, XMPMetadata metadata) throws ValidationException {
    List<ValidationError> found = new ArrayList<ValidationError>();
    if (document == null) {
        throw new ValidationException("Document provided is null");
    }
    // Fetched ahead of the metadata null check, matching the established call order.
    PDDocumentInformation docInfo = document.getDocumentInformation();
    if (metadata == null) {
        throw new ValidationException("Metadata provided are null");
    }

    DublinCoreSchema dcSchema = metadata.getDublinCoreSchema();
    analyzeTitleProperty(docInfo, dcSchema, found);
    analyzeAuthorProperty(docInfo, dcSchema, found);
    analyzeSubjectProperty(docInfo, dcSchema, found);

    AdobePDFSchema pdfSchema = metadata.getAdobePDFSchema();
    analyzeKeywordsProperty(docInfo, pdfSchema, found);
    analyzeProducerProperty(docInfo, pdfSchema, found);

    XMPBasicSchema basicSchema = metadata.getXMPBasicSchema();
    analyzeCreatorToolProperty(docInfo, basicSchema, found);
    analyzeCreationDateProperty(docInfo, basicSchema, found);
    analyzeModifyDateProperty(docInfo, basicSchema, found);

    return found;
}

Example 18

Project: wkhtmltopdf-master File: PdfTest.java View source code

/**
 * Generates a PDF from an HTML string containing a non-ASCII character and checks that
 * the text extracted from the PDF still contains it.
 */
@Test
public void testPdfFromStringTo() throws Exception {
    // GIVEN a html template containing special characters that java stores in utf-16 internally
    Pdf pdf = new Pdf();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Müller</h1></html>", PageType.htmlAsString);
    // WHEN
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // that is a valid PDF (otherwise an IOException occurs)
    parser.parse();
    PDFTextStripper pdfTextStripper = new PDFTextStripper();
    PDDocument document = new PDDocument(parser.getDocument());
    try {
        String pdfText = pdfTextStripper.getText(document);
        Assert.assertThat("document should contain the creditorName", pdfText, containsString("Müller"));
    } finally {
        // Close the parsed document even when the assertion fails.
        document.close();
    }
}

Example 19

Project: AGIA-master File: SplitPDFTasklet.java View source code

/**
 * Splits the source PDF into its parts and writes every part to its own
 * destination file obtained from {@code destinationFactory}.
 *
 * @param sSourceResource the PDF resource to split
 * @param sChunkContext the current chunk context; when it (or its step context)
 *        is {@code null}, a {@code null} step execution is handed to the
 *        destination factory
 * @return the number of parts written
 * @throws SplitPDFException if no destination is returned by the factory, or if
 *         (without {@code forceReplace}) every attempt produced an already
 *         existing destination
 * @throws Exception if loading, writing or closing a document fails
 */
private int splitFile(Resource sSourceResource, ChunkContext sChunkContext) throws Exception {
    Map<String, Object> aDestinationParams = new HashMap<String, Object>();
    aDestinationParams.put(ResourceFactoryConstants.PARAM_SOURCE, sSourceResource);
    aDestinationParams.put(ResourceFactoryConstants.PARAM_STEP_EXEC, ((sChunkContext != null) && (sChunkContext.getStepContext() != null)) ? sChunkContext.getStepContext().getStepExecution() : null);
    Resource aDestination = null;
    int aResult = 0;
    PDDocumentContainer aDocumentContainer = null;
    try {
        aDocumentContainer = documentFactory.getDocument(sSourceResource.getFile());
        List<PDDocument> documents = aDocumentContainer.getParts();
        for (int i = 0; i < documents.size(); i++) {
            PDDocument doc = documents.get(i);
            // Ask the factory for a destination, retrying a bounded number of times
            // while the proposed file already exists (unless replacing is allowed).
            int aTryCount = 10;
            do {
                aDestination = destinationFactory.getResource(aDestinationParams);
                aTryCount--;
            } while (!forceReplace && (aTryCount > 0) && (aDestination != null) && aDestination.exists());
            // Fail only when the last proposal is still taken. Testing the counter
            // alone would also reject a free destination found on the final attempt.
            if (!forceReplace && (aDestination != null) && aDestination.exists()) {
                throw new SplitPDFException("Cannot create a new destination filename");
            }
            if (aDestination != null) {
                if (aDestination.exists() && LOGGER.isWarnEnabled()) {
                    LOGGER.warn("Replacing {}", aDestination.getFile().getAbsolutePath());
                }
                writeDocument(doc, aDestination.getFile().getAbsolutePath());
                doc.close();
            } else {
                throw new SplitPDFException("No destination specified");
            }
            aResult++;
        }
    } finally {
        if (aDocumentContainer != null) {
            aDocumentContainer.close();
        }
    }
    return aResult;
}

Example 20

Project: batchers-master File: MonthlyTaxReportServiceTest.java View source code

/**
 * Generates the monthly tax report and checks that the resulting PDF contains
 * the success line, the failure line and the period line.
 */
@Test
public void generateReportWithCorrectData() throws PDFGenerationException, IOException {
    byte[] reportBytes = monthlyTaxReportService.generateReport(3L, TEST_YEAR, TEST_MONTH);

    String expectedSuccessLine = "WEBSERVICE RETURNS SUCCESS " + SUCCESS_AMOUNT + " euro";
    String expectedFailureLine = "WEBSERVICE RETURNS FAILURE " + FAILED_AMOUNT + " euro";
    String expectedPeriodLine = "PERIOD: " + 5 + " " + TEST_YEAR;

    // A fresh document is loaded from the same bytes for every assertion
    // (presumably because containsText consumes the document - TODO confirm).
    assertThat(PDDocument.load(new ByteArrayInputStream(reportBytes))).containsText(expectedSuccessLine);
    assertThat(PDDocument.load(new ByteArrayInputStream(reportBytes))).containsText(expectedFailureLine);
    assertThat(PDDocument.load(new ByteArrayInputStream(reportBytes))).containsText(expectedPeriodLine);
}

Example 21

Project: BBAW_CMS-master File: PdfParserImpl.java View source code

/**
 * Parses a PDF document page by page and returns the object produced by the
 * {@link ISaveStrategy}.
 *
 * @param startUri
 *          not used by this implementation
 * @param uri
 *          location of the PDF, read through the resource reader
 * @return the document model generated by the {@link ISaveStrategy}, with an
 *         empty {@code MetadataRecord} attached
 * @throws ApplicationException
 *           if reading or parsing the PDF fails with an I/O error
 * @throws IllegalArgumentException
 *           if the uri is null or empty.
 * @throws IllegalStateException
 *           if the {@link ISaveStrategy} wasn't set before.
 */
public Object parse(final String startUri, final String uri) throws ApplicationException {
    if (uri == null || uri.isEmpty()) {
        throw new IllegalArgumentException("The value for the parameter parser in the method parse() in PdfParserImpl mustn't be empty.");
    }
    if (this.saveStrategy == null) {
        throw new IllegalStateException("You must define a saveStategy before calling the parse()-method in ResourceParser.");
    }
    try {
        List<String> pagesTexts = new ArrayList<String>();
        InputStream input = this.resourceReader.read(uri);
        try {
            PDDocument document = PDDocument.load(input);
            try {
                PDFTextStripper stripper = new PDFTextStripper();
                // Page numbers are 1-based; restrict the stripper to one page per pass.
                for (int i = 1; i <= document.getNumberOfPages(); i++) {
                    stripper.setStartPage(i);
                    stripper.setEndPage(i);
                    pagesTexts.add(stripper.getText(document));
                }
            } finally {
                // Close the document even when text extraction fails.
                document.close();
            }
        } finally {
            if (input != null) {
                input.close();
            }
        }
        PdfDocument doc = (PdfDocument) this.saveStrategy.generateDocumentModel(uri, uri, pagesTexts);
        // Set the standard metadata
        doc.setMetadata(new MetadataRecord());
        return doc;
    } catch (IOException e) {
        throw new ApplicationException("Problem while parsing file " + uri + "  -- exception: " + e.getMessage() + "\n");
    }
}

Example 22

Project: camel-master File: PdfProducer.java View source code

/**
 * Appends the message body, read as a String, to the PDF document supplied in
 * the {@code PDF_DOCUMENT_HEADER_NAME} header and returns the saved result.
 * An encrypted document is opened with the decryption material taken from the
 * {@code DECRYPTION_MATERIAL_HEADER_NAME} header and has its security removed.
 *
 * @param exchange the exchange carrying the text body and the PDF headers
 * @return the output stream the modified document was saved to
 * @throws IllegalArgumentException if the document header is missing, or the
 *         document is encrypted and no decryption material header is present
 */
private Object doAppend(Exchange exchange) throws IOException, BadSecurityHandlerException, CryptographyException, InvalidPasswordException, COSVisitorException {
    LOG.debug("Got {} operation, going to append text to provided pdf.", pdfConfiguration.getOperation());

    String textToAppend = exchange.getIn().getBody(String.class);
    PDDocument pdf = exchange.getIn().getHeader(PDF_DOCUMENT_HEADER_NAME, PDDocument.class);
    if (pdf == null) {
        throw new IllegalArgumentException(String.format("%s header is expected for append operation", PDF_DOCUMENT_HEADER_NAME));
    }

    if (pdf.isEncrypted()) {
        DecryptionMaterial material = exchange.getIn().getHeader(DECRYPTION_MATERIAL_HEADER_NAME, DecryptionMaterial.class);
        if (material == null) {
            throw new IllegalArgumentException(String.format("%s header is expected for %s operation " + "on encrypted document", DECRYPTION_MATERIAL_HEADER_NAME, pdfConfiguration.getOperation()));
        }
        pdf.openProtection(material);
        pdf.setAllSecurityToBeRemoved(true);
    }

    ProtectionPolicy policy = exchange.getIn().getHeader(PROTECTION_POLICY_HEADER_NAME, ProtectionPolicy.class);
    appendToPdfDocument(textToAppend, pdf, policy);

    OutputStream saved = new ByteArrayOutputStream();
    pdf.save(saved);
    return saved;
}

Example 23

Project: ddf-master File: GeoPdfParserImpl.java View source code

/**
 * Builds a WKT string from the GeoPDF map frames found on the pages of a PDF.
 * Only map frames whose projection type equals {@code GEOGRAPHIC} contribute;
 * all other frames are logged at debug level and skipped.
 *
 * @param pdfDocument - The PDF document
 * @return a single polygon when exactly one frame qualifies, a multi-polygon
 *         when several do, or {@code null} when none does
 * @throws IOException
 */
@Override
public String apply(PDDocument pdfDocument) throws IOException {
    ToDoubleVisitor coordinateVisitor = new ToDoubleVisitor();
    LinkedList<String> wktPolygons = new LinkedList<>();

    for (PDPage page : pdfDocument.getPages()) {
        COSDictionary pageDictionary = page.getCOSObject();
        COSBase lgiEntry = pageDictionary.getObjectFromPath(LGIDICT);

        if (lgiEntry instanceof COSArray) {
            // Several map frames on this page: visit each array element by path.
            COSArray mapFrames = (COSArray) lgiEntry;
            for (int frameIndex = 0; frameIndex < mapFrames.size(); frameIndex++) {
                String framePath = LGIDICT + "/[" + frameIndex + "]";
                COSDictionary frame = (COSDictionary) pageDictionary.getObjectFromPath(framePath);
                COSDictionary projection = (COSDictionary) frame.getDictionaryObject(PROJECTION);
                if (projection == null) {
                    LOGGER.debug("No projection array found on the map frame.  Map Frame will be skipped.");
                    continue;
                }
                String projectionType = ((COSString) projection.getItem(PROJECTION_TYPE)).getString();
                if (!GEOGRAPHIC.equals(projectionType)) {
                    LOGGER.debug("Unsupported projection type {}.  Map Frame will be skipped.", projectionType);
                    continue;
                }
                COSArray neatLine = (COSArray) pageDictionary.getObjectFromPath(framePath + "/" + NEATLINE);
                wktPolygons.add(getWktFromNeatLine(frame, neatLine, coordinateVisitor));
            }
        } else if (lgiEntry instanceof COSDictionary) {
            // Exactly one map frame on this page.
            COSDictionary frame = (COSDictionary) lgiEntry;
            COSDictionary projection = (COSDictionary) frame.getDictionaryObject(PROJECTION);
            if (projection == null) {
                LOGGER.debug("No projection array found on the map frame.  Map Frame will be skipped.");
                continue;
            }
            String projectionType = ((COSString) projection.getItem(PROJECTION_TYPE)).getString();
            if (!GEOGRAPHIC.equals(projectionType)) {
                LOGGER.debug("Unsupported projection type {}.  Map Frame will be skipped.", projectionType);
                continue;
            }
            COSArray neatLine = (COSArray) pageDictionary.getObjectFromPath(LGIDICT + "/" + NEATLINE);
            if (neatLine == null) {
                // No neat line stored for the single frame: derive one from the page dimensions.
                neatLine = generateNeatLineFromPDFDimensions(page);
            }
            wktPolygons.add(getWktFromNeatLine(frame, neatLine, coordinateVisitor));
        }
    }

    if (wktPolygons.isEmpty()) {
        LOGGER.debug("No GeoPDF information found on PDF during transformation.  Metacard location will not be set.");
        return null;
    }
    if (wktPolygons.size() == 1) {
        return POLYGON + wktPolygons.getFirst() + "))";
    }
    return wktPolygons.stream()
            .map(ring -> "((" + ring + "))")
            .collect(Collectors.joining(",", MULTIPOLYGON, ")"));
}

Example 24

Project: dlibrary-master File: CitationDocument.java View source code

/**
 * Creates a cited document from the given bitstream by putting a generated
 * cover page in front of the source PDF.
 * <p>
 * The process for adding a cover page is as follows:
 * </p>
 * <ol>
 *  <li>Load the source PDF from the bitstream and create an empty document
 *     to put the cover page into.</li>
 *  <li>Create the cover page and add content to it.</li>
 *  <li>Concatenate the cover page and the source document.</li>
 * </ol>
 *
 * @param bitstream The source bitstream being cited. This must be a PDF.
 * @return The file {@code bitstream.cover.pdf} inside the temporary directory,
 *         holding the finished, cited document.
 * @throws IOException if the source cannot be loaded or a document cannot be
 *         saved or closed
 * @throws SQLException
 * @throws org.dspace.authorize.AuthorizeException
 * @throws COSVisitorException
 */
public File makeCitedDocument(Bitstream bitstream) throws IOException, SQLException, AuthorizeException, COSVisitorException {
    PDDocument document = new PDDocument();
    // Start with no source document: creating a placeholder instance here would
    // leak it as soon as the loaded document replaces the reference.
    PDDocument sourceDocument = null;
    try {
        Item item = (Item) bitstream.getParentObject();
        sourceDocument = PDDocument.load(bitstream.retrieve());
        PDPage coverPage = new PDPage(PDPage.PAGE_SIZE_LETTER);
        generateCoverPage(document, coverPage, item);
        addCoverPageToDocument(document, sourceDocument, coverPage);
        String citedPath = tempDir.getAbsolutePath() + "/bitstream.cover.pdf";
        document.save(citedPath);
        return new File(citedPath);
    } finally {
        // Nested finally so a failure closing the source still closes the output document.
        try {
            if (sourceDocument != null) {
                sourceDocument.close();
            }
        } finally {
            document.close();
        }
    }
}

Example 25

Project: DSpace-master File: CitationDocumentServiceImpl.java View source code

/**
 * Creates a cited document by putting a generated cover page in front of the
 * PDF retrieved for the given bitstream, and saves it as
 * {@code bitstream.cover.pdf} inside the temporary directory.
 *
 * @param context the context used to look up the parent item and retrieve the bitstream
 * @param bitstream the source bitstream being cited
 * @return the file holding the cited document
 * @throws IOException if the source cannot be loaded or a document cannot be
 *         saved or closed
 */
@Override
public File makeCitedDocument(Context context, Bitstream bitstream) throws IOException, SQLException, AuthorizeException {
    PDDocument document = new PDDocument();
    // Start with no source document: creating a placeholder instance here would
    // leak it as soon as the loaded document replaces the reference.
    PDDocument sourceDocument = null;
    try {
        Item item = (Item) bitstreamService.getParentObject(context, bitstream);
        sourceDocument = PDDocument.load(bitstreamService.retrieve(context, bitstream));
        // TODO: needs to be configurable
        PDPage coverPage = new PDPage(PDRectangle.LETTER);
        generateCoverPage(context, document, coverPage, item);
        addCoverPageToDocument(document, sourceDocument, coverPage);
        String citedPath = tempDir.getAbsolutePath() + "/bitstream.cover.pdf";
        document.save(citedPath);
        return new File(citedPath);
    } finally {
        // Nested finally so a failure closing the source still closes the output document.
        try {
            if (sourceDocument != null) {
                sourceDocument.close();
            }
        } finally {
            document.close();
        }
    }
}

Example 26

Project: DSpace-SVN-Deprecated-master File: PDFPackager.java View source code

/**
 * Reads the Info dictionary of a PDF stream and copies its entries onto the
 * item as Dublin Core values, then updates the item.
 *
 * Mapping of PDF Info entries to DC (this is not a crosswalk plugin because
 * the source data is not XML and the mapping is not useful elsewhere):
 *   Title        -> title (language "en")
 *   Author       -> contributor.author
 *   Creator      -> description.provenance (application that created the original)
 *   Producer     -> description.provenance (converter to PDF)
 *   Subject      -> description.abstract
 *   Keywords     -> subject.other
 *   CreationDate -> date.created, falling back to ModDate when absent
 *
 * @throws MetadataValidationException if the PDF is encrypted or has no Title entry
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cosDocument = null;
    try {
        PDFParser pdfParser = new PDFParser(metadata);
        pdfParser.parse();
        cosDocument = pdfParser.getDocument();

        // Encrypted documents are rejected up front.
        if (cosDocument.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }

        PDDocumentInformation info = new PDDocument(cosDocument).getDocumentInformation();

        // A title is mandatory.
        String title = info.getTitle();
        if (title == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + title + "\"");
        }
        item.addDC("title", null, "en", title);

        String author = info.getAuthor();
        if (author != null) {
            item.addDC("contributor", "author", null, author);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + author + "\"");
            }
        }

        String creator = info.getCreator();
        if (creator != null) {
            item.addDC("description", "provenance", "en", "Application that created the original document: " + creator);
        }

        String producer = info.getProducer();
        if (producer != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + producer);
        }

        String subject = info.getSubject();
        if (subject != null) {
            item.addDC("description", "abstract", null, subject);
        }

        String keywords = info.getKeywords();
        if (keywords != null) {
            item.addDC("subject", "other", null, keywords);
        }

        // date.created takes the creation date when present, otherwise the
        // modification date; there is no separate "last modified" target.
        Calendar creationDate = info.getCreationDate();
        Calendar created = (creationDate != null) ? creationDate : info.getModificationDate();
        if (created != null) {
            item.addDC("date", "created", null, (new DCDate(created.getTime())).toString());
        }

        item.update();
    } finally {
        if (cosDocument != null) {
            cosDocument.close();
        }
    }
}

Example 27

Project: Europeana-Cloud-master File: PdfBoxExtractor.java View source code

/**
 * Extracts the text of a PDF read from the given stream and, as a side effect,
 * replaces {@code extractedMetadata} with the key/value pairs of the document
 * information dictionary.
 *
 * @param is the PDF data; may be {@code null}
 * @return the extracted text, or {@code null} when {@code is} is {@code null}
 *         or an I/O error occurs during parsing or extraction
 */
@Override
public String extractText(InputStream is) {
    if (is == null) {
        LOGGER.warn("No data for extraction.");
        return null;
    }
    String parsedText = null;
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    try {
        PDFParser parser = new PDFParser(is);
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        PDDocumentInformation info = pdDoc.getDocumentInformation();
        Set<String> mdKeys = info.getMetadataKeys();
        extractedMetadata = new HashMap<>();
        for (String key : mdKeys) {
            String value = (String) info.getPropertyStringValue(key);
            extractedMetadata.put(key, value);
        }
        // NOTE(review): original comment warns of a possible NullPointerException
        // here for encrypted documents; it is not caught by this method.
        parsedText = pdfStripper.getText(pdDoc);
    } catch (IOException ex) {
        LOGGER.warn("Can not extract text from pdf because: " + ex.getMessage());
    } finally {
        // Close each handle on its own so a failure closing one cannot skip the
        // other, and report close failures instead of dropping them silently.
        if (cosDoc != null) {
            try {
                cosDoc.close();
            } catch (IOException ex) {
                LOGGER.warn("Can not close pdf document because: " + ex.getMessage());
            }
        }
        if (pdDoc != null) {
            try {
                pdDoc.close();
            } catch (IOException ex) {
                LOGGER.warn("Can not close pdf document because: " + ex.getMessage());
            }
        }
    }
    return parsedText;
}

Example 28

Project: grobid-master File: FigureTableVisualizer.java View source code

/**
 * Runs the full-text engine on a PDF, annotates figures and tables in it and
 * saves the annotated result to {@code /tmp/testFigures.pdf}. When an output
 * folder is given and the PDF was annotated, the result is also copied there.
 *
 * Working files are placed under the hard-coded directory {@code /tmp/contentDir},
 * which is deleted first.
 *
 * @param input the PDF file to process
 * @param outputFolder folder receiving a copy of the annotated PDF; may be {@code null}
 * @throws Exception if loading, processing, saving or copying fails
 */
private static void processPdfFile(File input, File outputFolder) throws Exception {
    inputPdf = input;
    annotated = false;
    annotatedFigure = false;
    final PDDocument document = PDDocument.load(input);
    PDDocument out = null;
    try {
        File outPdf = new File("/tmp/testFigures.pdf");
        final Engine engine = setupEngine();
        File contentDir = new File("/tmp/contentDir");
        FileUtils.deleteDirectory(contentDir);
        File assetPath = new File(contentDir, "tei");
        GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder().pdfAssetPath(assetPath).withPreprocessImages(false).withProcessVectorGraphics(true).build();
        DocumentSource documentSource = DocumentSource.fromPdf(input);
        File pdf2xmlDirectory = new File(contentDir, "pdf2xml");
        pdf2xmlDirectory.mkdirs();
        FileUtils.copyFileToDirectory(input, contentDir);
        FileUtils.copyFile(documentSource.getXmlFile(), new File(pdf2xmlDirectory, "input.xml"));
        FileUtils.copyDirectory(new File(documentSource.getXmlFile().getAbsolutePath() + "_data"), new File(pdf2xmlDirectory, documentSource.getXmlFile().getName() + "_data"));
        System.out.println(documentSource.getXmlFile());
        blacklistedPages = getVectorGraphicPages(pdf2xmlDirectory);
        Document teiDoc = engine.fullTextToTEIDoc(documentSource, config);
        out = annotateFigureAndTables(document, documentSource.getXmlFile(), teiDoc, false, false, true, true);
        if (out != null) {
            out.save(outPdf);
            if (singleFile) {
                if (Desktop.isDesktopSupported()) {
                    Desktop.getDesktop().open(outPdf);
                }
            }
        }
        if (outputFolder != null && annotated) {
            Engine.getCntManager().i("TABLES_TEST", "ANNOTATED_PDFS");
            // annotated is known to be true here, so only the figure flag selects the suffix.
            String targetName = annotatedFigure ? input.getName() + "_annotatedFigure.pdf" : input.getName() + "_annotated.pdf";
            FileUtils.copyFile(outPdf, new File(outputFolder, targetName));
        }
    } finally {
        // Release the loaded PDF, plus the annotated one if it is a different
        // instance; both would otherwise stay open.
        try {
            if (out != null && out != document) {
                out.close();
            }
        } finally {
            document.close();
        }
    }
}

Example 29

Project: iswc2012metadata-master File: TaskParsePdf.java View source code

/**
 * Extracts the text of a PDF file, feeds it line by line to a
 * {@code DataPaperInPdf} parser until the parser reaches its {@code content}
 * state, and prints the parser's report.
 *
 * @param f the PDF file to read
 * @throws IOException if the file cannot be loaded, read or closed
 */
private static void extractText(File f) throws IOException {
    PDDocument pddDocument = PDDocument.load(f);
    try {
        PDFTextStripper textStripper = new PDFTextStripper();
        String content = textStripper.getText(pddDocument);
        TreeMap<PROP, String> temp = new TreeMap<PROP, String>();
        temp.put(PROP.lineSeparator, textStripper.getLineSeparator());
        temp.put(PROP.paragraphStart, textStripper.getParagraphStart());
        System.out.println(temp);
        DataPaperInPdf parser = new DataPaperInPdf(f.getName());
        // Note: String.split treats the line separator as a regular expression.
        for (String line : content.split(temp.get(PROP.lineSeparator))) {
            parser.processLine(line);
            if (DataPaperInPdf.STATE.content.equals(parser.state)) {
                break;
            }
        }
        parser.printReport();
        System.out.println("-----");
        //System.out.println(content.substring(0, 500));
        /*
        PDDocumentInformation info = pddDocument.getDocumentInformation();
        System.out.println( "Page Count=" + pddDocument.getNumberOfPages() );
        System.out.println( "Title=" + info.getTitle() );
        System.out.println( "Author=" + info.getAuthor() );
        System.out.println( "Subject=" + info.getSubject() );
        System.out.println( "Keywords=" + info.getKeywords() );
        System.out.println( "Creator=" + info.getCreator() );
        System.out.println( "Producer=" + info.getProducer() );
        System.out.println( "Creation Date=" + info.getCreationDate() );
        System.out.println( "Modification Date=" + info.getModificationDate());
        System.out.println( "Trapped=" + info.getTrapped() );
        */
    } finally {
        // The loaded document was never closed before; release it on every path.
        pddDocument.close();
    }
}

Example 30

Project: liferay-portal-master File: PDFProcessorImpl.java View source code

/**
 * Tries each configured decrypt password, followed by the blank password, to
 * open the encrypted PDF. The first password that works is used to save a
 * copy with all security removed.
 *
 * @param encryptedFile the password-protected PDF
 * @param decryptedFile the file the unprotected copy is saved to
 * @return the page count of the decrypted document, or 0 when no password
 *         allowed the document to be loaded and saved
 */
private int _getPreviewFilesCount(File encryptedFile, File decryptedFile) {
    String[] candidatePasswords = ArrayUtil.append(PropsValues.DL_FILE_ENTRY_PREVIEW_GENERATION_DECRYPT_PASSWORDS_PDFBOX, StringPool.BLANK);

    for (String candidatePassword : candidatePasswords) {
        try (PDDocument decrypted = PDDocument.load(encryptedFile, candidatePassword)) {
            decrypted.setAllSecurityToBeRemoved(true);
            decrypted.save(decryptedFile);

            return decrypted.getNumberOfPages();
        } catch (InvalidPasswordException ignored) {
            // Wrong password: fall through and try the next candidate.
        } catch (IOException ioe) {
            _log.error(ioe, ioe);
        }
    }

    return 0;
}

Example 31

Project: modeshape-master File: PdfBasicMetadata.java View source code

/**
 * Loads the PDF from {@code in} and fills this object's fields from it: the
 * encryption flag, page count, first-page orientation, version, the document
 * information entries (only when the document is not encrypted), and one
 * page-metadata entry per page holding its file attachments.
 *
 * @return {@code true} once the document has been read completely
 * @throws Exception if the document cannot be loaded or read
 */
public boolean check() throws Exception {
    try (PDDocument pdf = PDDocument.load(in)) {
        PDDocumentCatalog catalog = pdf.getDocumentCatalog();
        PageFormat firstPageFormat = new PDFPageable(pdf).getPageFormat(0);

        encrypted = pdf.isEncrypted();
        pageCount = pdf.getNumberOfPages();
        orientation = ORIENTATION_STRINGS[firstPageFormat.getOrientation()];

        // Start from the document's own version; a non-empty catalog version
        // takes precedence over it.
        version = String.valueOf(pdf.getDocument().getVersion());
        String catalogVersion = catalog.getVersion();
        boolean hasCatalogVersion = catalogVersion != null && !catalogVersion.isEmpty();
        if (hasCatalogVersion) {
            version = catalogVersion;
        }

        if (!encrypted) {
            PDDocumentInformation info = pdf.getDocumentInformation();
            author = info.getAuthor();
            creationDate = info.getCreationDate();
            creator = info.getCreator();
            keywords = info.getKeywords();
            modificationDate = info.getModificationDate();
            producer = info.getProducer();
            subject = info.getSubject();
            title = info.getTitle();
        }

        // Collect the file attachments of every page; page numbers start at 1.
        int pagesSeen = 0;
        for (Object pageObject : catalog.getPages()) {
            pagesSeen++;
            PdfPageMetadata pageMetadata = new PdfPageMetadata();
            pageMetadata.setPageNumber(pagesSeen);
            for (PDAnnotation annotation : ((PDPage) pageObject).getAnnotations()) {
                if (!(annotation instanceof PDAnnotationFileAttachment)) {
                    continue;
                }
                PdfAttachmentMetadata attachment = new PdfAttachmentMetadata();
                PDAnnotationFileAttachment fileAnnotation = (PDAnnotationFileAttachment) annotation;
                PDComplexFileSpecification specification = (PDComplexFileSpecification) fileAnnotation.getFile();
                PDEmbeddedFile embedded = specification.getEmbeddedFile();
                attachment.setSubject(fileAnnotation.getSubject());
                attachment.setName(specification.getFilename());
                attachment.setCreationDate(embedded.getCreationDate());
                attachment.setModificationDate(embedded.getModDate());
                attachment.setMimeType(embedded.getSubtype());
                attachment.setData(embedded.toByteArray());
                pageMetadata.addAttachment(attachment);
            }
            pages.add(pageMetadata);
        }
        return true;
    }
}

Example 32

Project: NeighborNote-master File: PDFPreview.java View source code

/**
 * Returns the number of pages of the PDF at the given path.
 *
 * @param filePath path of the PDF; on Windows, backslashes are replaced by
 *        forward slashes before loading
 * @return the page count, or 0 when the document cannot be loaded or read
 */
public int getPageCount(String filePath) {
    try {
        String whichOS = System.getProperty("os.name");
        if (whichOS.contains("Windows")) {
            filePath = filePath.replace("\\", "/");
        }
        PDDocument document = PDDocument.load(filePath);
        try {
            return document.getNumberOfPages();
        } finally {
            // The document was never closed before, leaking its resources on every call.
            try {
                document.close();
            } catch (Exception ignored) {
                // A failed close must not replace the page count already determined.
            }
        }
    } catch (Exception e) {
        return 0;
    }
}

Example 33

Project: nevernote-master File: PDFPreview.java View source code

/**
 * Returns the number of pages of the PDF at the given path.
 *
 * @param filePath path of the PDF; on Windows, backslashes are replaced by
 *        forward slashes before loading
 * @return the page count, or 0 when the document cannot be loaded or read
 */
public int getPageCount(String filePath) {
    try {
        String whichOS = System.getProperty("os.name");
        if (whichOS.contains("Windows")) {
            filePath = filePath.replace("\\", "/");
        }
        PDDocument document = PDDocument.load(filePath);
        try {
            return document.getNumberOfPages();
        } finally {
            // The document was never closed before, leaking its resources on every call.
            try {
                document.close();
            } catch (Exception ignored) {
                // A failed close must not replace the page count already determined.
            }
        }
    } catch (Exception e) {
        return 0;
    }
}

Example 34

Project: pdf-image-compare-master File: PdfToImageConverter.java View source code

/**
 * Renders a range of pages of a PDF document to images.
 *
 * @param pdDocument  the source document
 * @param imageFormat the requested image format, e.g. "jpeg" (not read by this method)
 * @param startPage   the first page to render (1-based)
 * @param endPage     the last page to render; pages beyond the end of the document are ignored
 * @param resolution  the requested resolution, passed to {@code calculateResolution} per page
 * @param color       the color model, e.g. "rgb", "gray"
 * @return one image per rendered page, in page order
 * @throws Exception the conversion failed
 */
@SuppressWarnings("unchecked")
public List<BufferedImage> toImages(PDDocument pdDocument, String imageFormat, int startPage, int endPage, int resolution, String color) throws Exception {
    /*
     Argument validation is currently disabled:
     Validate.notNull(pdDocument, "pdDocument is null");
     Validate.notEmpty(imageFormat, "imageFormat is null");
     Validate.isTrue(startPage > 0, "invalid start page : " + startPage);
     Validate.isTrue(endPage >= startPage, "invalid end page : " + endPage);
     Validate.isTrue(resolution >= 0, "invalid resolution : " + resolution);
     */
    int imageType = getImageType(color);
    List<PDPage> allPages = pdDocument.getDocumentCatalog().getAllPages();
    // Stop at the requested end page or the end of the document, whichever comes first.
    int endExclusive = Math.min(endPage, allPages.size());

    List<BufferedImage> images = new ArrayList<BufferedImage>();
    for (int index = startPage - 1; index < endExclusive; index++) {
        PDPage current = allPages.get(index);
        PDRectangle cropBox = current.findCropBox();
        int pageResolution = calculateResolution(resolution, cropBox.getWidth(), cropBox.getHeight());
        images.add(current.convertToImage(imageType, pageResolution));
    }
    return images;
}

Example 35

Project: PDF-to-unusual-HTML-master File: PDTrueTypeFont.java View source code

/**
 * Embeds a TrueType font into a document and returns the font object for it.
 *
 * @param doc The PDF document that will hold the embedded font.
 * @param stream the TTF data to embed.
 * @return a PDTrueTypeFont backed by the embedded font file, using WinAnsi encoding.
 * @throws IOException If there is an error loading the data.
 */
public static PDTrueTypeFont loadTTF(PDDocument doc, InputStream stream) throws IOException {
    PDFontDescriptorDictionary descriptor = new PDFontDescriptorDictionary();
    PDTrueTypeFont font = new PDTrueTypeFont();
    font.setFontDescriptor(descriptor);

    // Wrap the TTF bytes in a PDF stream, record their length, then compress.
    PDStream embeddedFont = new PDStream(doc, stream, false);
    embeddedFont.getStream().setInt(COSName.LENGTH1, embeddedFont.getByteArray().length);
    embeddedFont.addCompression();
    descriptor.setFontFile2(embeddedFont);

    // The caller's stream was closed within the PDStream constructor, so the
    // font data is read back from the embedded stream.
    InputStream fontData = embeddedFont.createInputStream();
    try {
        font.loadDescriptorDictionary(descriptor, fontData);
    } finally {
        fontData.close();
    }

    // Only WinAnsi encoding is supported for now; Identity-H with a unicode
    // mapping would be the better choice.
    font.setFontEncoding(new WinAnsiEncoding());
    font.setEncoding(COSName.WIN_ANSI_ENCODING);
    return font;
}

Example 36

Project: rdf-indexer-master File: RdfTextSpider.java View source code

/**
 * Downloads the PDF at the given URI and extracts its text.
 *
 * @param uri the URL of the PDF to fetch
 * @return the extracted text as bytes in the platform default charset
 * @throws IOException if the server answers with a status other than 200, or
 *         the PDF cannot be downloaded or parsed
 */
private byte[] scrapeExternalPDF(final String uri) throws IOException {
    GetMethod get = new GetMethod(uri);
    InputStream is = null;
    PDDocument pdfDoc = null;
    try {
        int result = httpClient.executeMethod(get);
        if (result != 200) {
            throw new IOException(result + " code returned for URL: " + uri);
        }
        is = get.getResponseBodyAsStream();
        pdfDoc = PDDocument.load(is);
        PDFTextStripper pdfStrip = new PDFTextStripper();
        // NOTE(review): getBytes() uses the platform default charset - confirm callers expect that.
        return pdfStrip.getText(pdfDoc).getBytes();
    } finally {
        // Best-effort cleanup. Each step is isolated so a failure in one cannot
        // skip the others: document first, then the stream, then the connection.
        try {
            if (pdfDoc != null) {
                pdfDoc.close();
            }
        } catch (Exception ignored) {
            // Nothing useful can be done if closing fails.
        }
        try {
            IOUtils.closeQuietly(is);
            get.releaseConnection();
        } catch (Exception ignored) {
            // Nothing useful can be done if releasing fails.
        }
    }
}

Example 37

Project: streamflow-core-master File: TaskFormDraftSummaryContext.java View source code

/**
 * Generates the PDF for a submitted form. The template is taken from the
 * selected form template; if that is absent, from the proxy user's
 * organization form template, and then from the organization's default
 * template. When no template is found, a {@code null} URI is passed on.
 *
 * @param submittedFormValue the submitted form to render
 * @return the document produced by the PDF generator
 * @throws Throwable whatever the role lookups or the generator throw
 */
private PDDocument generatePdf(SubmittedFormValue submittedFormValue) throws Throwable {
    // The looked-up draft itself is not used; the lookup call is retained.
    role(FormDraftDTO.class);

    FormPdfTemplate.Data formTemplateHolder = role(FormPdfTemplate.Data.class);
    AttachedFile.Data pdfTemplate = (AttachedFile.Data) formTemplateHolder.formPdfTemplate().get();
    if (pdfTemplate == null) {
        // Fall back to the templates configured on the proxy user's organization.
        ProxyUser proxyUser = role(ProxyUser.class);
        pdfTemplate = (AttachedFile.Data) ((FormPdfTemplate.Data) proxyUser.organization().get()).formPdfTemplate().get();
        if (pdfTemplate == null) {
            pdfTemplate = (AttachedFile.Data) ((DefaultPdfTemplate.Data) proxyUser.organization().get()).defaultPdfTemplate().get();
        }
    }

    String templateUri = (pdfTemplate != null) ? pdfTemplate.uri().get() : null;
    CaseId.Data caseId = role(CaseId.Data.class);
    return pdfGenerator.generateSubmittedFormPdf(submittedFormValue, caseId, templateUri, locale);
}

Example 38

Project: vtechworks-master File: PDFPackager.java View source code

/**
 * Copies entries of the PDF Info dictionary onto the item as Dublin Core
 * metadata and then updates the item.
 *
 * Mapping applied (PDF Info entry -> DC element.qualifier):
 *   Title        -> title (language "en"), mandatory
 *   Author       -> contributor.author
 *   Creator      -> description.provenance (application that created the original)
 *   Producer     -> description.provenance (converter to PDF)
 *   Subject      -> description.abstract
 *   Keywords     -> subject.other
 *   CreationDate -> date.created, falling back to ModDate when absent
 *
 * This is done inline rather than in a crosswalk plugin because the source
 * data is not XML and the mapping is not useful anywhere else.
 *
 * @param context  current context (not used in this method body)
 * @param item     item receiving the metadata
 * @param metadata stream containing the PDF
 * @throws MetadataValidationException if the PDF is encrypted or has no Title
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cosDocument = null;
    try {
        PDFParser pdfParser = new PDFParser(metadata);
        pdfParser.parse();
        cosDocument = pdfParser.getDocument();

        // Encrypted documents cannot be read reliably, so reject them up front.
        if (cosDocument.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }

        PDDocument pdDocument = new PDDocument(cosDocument);
        PDDocumentInformation info = pdDocument.getDocumentInformation();

        // The item must end up with a title, so a missing one is fatal.
        String title = info.getTitle();
        if (title == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + title + "\"");
        }
        item.addDC("title", null, "en", title);

        String author = info.getAuthor();
        if (author != null) {
            item.addDC("contributor", "author", null, author);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + author + "\"");
            }
        }

        String creator = info.getCreator();
        if (creator != null) {
            item.addDC("description", "provenance", "en", "Application that created the original document: " + creator);
        }

        String producer = info.getProducer();
        if (producer != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + producer);
        }

        String subject = info.getSubject();
        if (subject != null) {
            item.addDC("description", "abstract", null, subject);
        }

        String keywords = info.getKeywords();
        if (keywords != null) {
            item.addDC("subject", "other", null, keywords);
        }

        // DC has no slot for "last modified", so the modification date is
        // only used as a stand-in when no creation date is present.
        Calendar created = info.getCreationDate();
        if (created == null) {
            created = info.getModificationDate();
        }
        if (created != null) {
            item.addDC("date", "created", null, (new DCDate(created.getTime())).toString());
        }

        item.update();
    } finally {
        if (cosDocument != null) {
            cosDocument.close();
        }
    }
}

Example 39

Project: catma-core-master File: PDFContentHandler.java View source code

/* (non-Javadoc)
	 * @see de.catma.document.source.contenthandler.SourceContentHandler#load(java.io.InputStream)
	 *
	 * Extracts the text of the PDF read from the stream and stores it via
	 * setContent(), with every character outside the XML-valid ranges replaced
	 * by '?'. Throws IOException when the document is encrypted, when content
	 * extraction is not permitted, or when reading/extraction fails.
	 */
public void load(InputStream is) throws IOException {
    PDDocument document = null;
    try {
        document = PDDocument.load(is, false);
        if (document.isEncrypted()) {
            throw new IOException("can not open pdf document because it is encrypted");
        }
        AccessPermission ap = document.getCurrentAccessPermission();
        if (!ap.canExtractContent()) {
            throw new IOException("You do not have permission to extract text");
        }
        PDFTextStripper stripper = new PDFTextStripper("UTF-8");
        stripper.setForceParsing(false);
        stripper.setSortByPosition(false);
        stripper.setShouldSeparateByBeads(true);
        stripper.setStartPage(1);
        stripper.setEndPage(Integer.MAX_VALUE);
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        // Encode and decode with an explicit charset: going through the
        // platform default could silently lose characters it cannot represent.
        Writer w = new OutputStreamWriter(os, "UTF-8");
        try {
            stripper.writeText(document, w);
        } finally {
            w.close();
        }
        // some pdfs seem to include non valid unicode characters
        // and this causes problems when converting text to HTML
        // for GUI delivery and during indexing.
        // Supplementary code points need the \x{...} form: "\\u10000" is read
        // by the regex engine as \u1000 followed by a literal '0', which made
        // the original class reject every character above U+FFFF.
        setContent(os.toString("UTF-8").replaceAll("[^\\x09\\x0A\\x0D\\x20-\\uD7FF\\uE000-\\uFFFD\\x{10000}-\\x{10FFFF}]", "?"));
    } finally {
        if (document != null) {
            document.close();
        }
    }
}

Example 40

Project: extension-aws-master File: PdfParser.java View source code

/**
 * Parses a PDF from the given stream and collects its text, page count,
 * title and the rounded width/height of the first page.
 *
 * Failures never propagate: they are logged and whatever was collected up
 * to that point is returned.
 *
 * @param inContent stream containing the PDF
 * @return the (possibly partially filled) parse result, never null
 */
public Parse parse(InputStream inContent) {
    Parse results = new Parse();
    PDDocument pdf = null;
    try {
        PDFParser parser = new PDFParser(inContent);
        parser.parse();
        pdf = parser.getPDDocument();
        if (pdf.isEncrypted()) {
            DocumentEncryption decryptor = new DocumentEncryption(pdf);
            // Just try using the default password and move on
            decryptor.decryptDocument("");
        }
        // collect text
        PDFTextStripper stripper = new PDFTextStripper();
        //TODO: Write this out to a temp file that will be indexed seperately
        String text;
        try {
            text = stripper.getText(pdf);
        } catch (Throwable e) {
            // Text extraction is optional: fall back to empty text so the
            // metadata below is still collected.
            log.error("Could not parse", e);
            text = "";
        }
        results.setText(scrubChars(text));
        int pageCount = pdf.getNumberOfPages();
        results.setPages(pageCount);
        // collect title
        PDDocumentInformation info = pdf.getDocumentInformation();
        results.setTitle(info.getTitle());
        if (pageCount > 0) {
            PDPage page = (PDPage) pdf.getDocumentCatalog().getAllPages().get(0);
            PDRectangle mediaBox = page.getMediaBox();
            if (mediaBox == null) {
                mediaBox = page.getArtBox();
            }
            if (mediaBox != null) {
                results.put("width", String.valueOf(Math.round(mediaBox.getWidth())));
                results.put("height", String.valueOf(Math.round(mediaBox.getHeight())));
            }
        }
    } catch (CryptographyException e) {
        // Pass the exception itself so the stack trace is logged, matching
        // the "Could not parse" call above.
        log.error("Error decrypting document. " + e, e);
    } catch (InvalidPasswordException e) {
        log.error("Can't decrypt document - invalid password. " + e, e);
    } catch (Exception e) {
        log.error("Can't be handled as pdf document. " + e, e);
    } finally {
        try {
            if (pdf != null) {
                pdf.close();
            }
        } catch (IOException ignored) {
            // closing is best effort; the result has already been collected
        }
    }
    return results;
}

Example 41

Project: FXDesktopSearch-master File: PDFPreviewGenerator.java View source code

/**
 * Renders the first page of the given PDF into a THUMB_WIDTH x THUMB_HEIGHT
 * image, scaled so that the page width matches THUMB_WIDTH.
 *
 * @param aFile the PDF file to preview
 * @return the preview, or null when the document has no pages or cannot be
 *         loaded or rendered (the failure is logged)
 */
@Override
public Preview createPreviewFor(File aFile) {
    try (PDDocument theDocument = PDDocument.load(aFile)) {
        PDPageTree thePages = theDocument.getPages();
        if (thePages.getCount() == 0) {
            return null;
        }
        PDPage theFirstPage = (PDPage) thePages.get(0);
        PDRectangle mBox = theFirstPage.getMediaBox();
        float theWidthPt = mBox.getWidth();
        int theWidthPx = THUMB_WIDTH;
        int theHeightPx = THUMB_HEIGHT;
        // Scale factor that maps the page width (in points) onto the thumbnail width.
        float theScaling = THUMB_WIDTH / theWidthPt;
        BufferedImage theImage = new BufferedImage(theWidthPx, theHeightPx, BufferedImage.TYPE_INT_RGB);
        Graphics2D theGraphics = (Graphics2D) theImage.getGraphics();
        try {
            theGraphics.setBackground(new Color(255, 255, 255, 0));
            theGraphics.clearRect(0, 0, theImage.getWidth(), theImage.getHeight());
            PDFRenderer theRenderer = new PDFRenderer(theDocument);
            theRenderer.renderPageToGraphics(0, theGraphics, theScaling);
        } finally {
            // Release the graphics context even when rendering fails.
            theGraphics.dispose();
        }
        // The previous version additionally built a rotated copy for pages
        // rotated by 90/270 degrees, but never returned it and never disposed
        // its Graphics2D. That dead work is removed; the returned image is
        // the same one as before.
        return new Preview(theImage);
    } catch (Exception e) {
        LOGGER.error("Error creating preview for " + aFile, e);
        return null;
    }
}

Example 42

Project: java-image-processing-survival-guide-master File: PdfBoxPreviewTest.java View source code

/**
 * Converts the pages of a sample PDF to images, checks that at least one
 * image of the requested type was produced and writes each page as a JPEG.
 */
@Test
public void shouldCreatePdfPreviewImages() throws Exception {
    final int imageType = TYPE_INT_RGB;
    // final PDDocument pdDocument = PDDocument.load("./../../pdf/test-large-scan.pdf");
    final PDDocument pdDocument = PDDocument.load("./../../pdf/erste-document-01.pdf");
    try {
        final List<BufferedImage> images = toImages(pdDocument, START_PAGE, LAST_PAGE, DPI_72, imageType);
        assertNotNull(images);
        assertFalse(images.isEmpty());
        // expected value first, so a failure message reports the right sides
        assertEquals(imageType, images.get(0).getType());
        for (int i = 0; i < images.size(); i++) {
            File targetImageFile = createOutputFileName("shouldCreatePdfPreviewImages", "page-" + i, "jpeg");
            writeBufferedImage(images.get(i), "jpeg", targetImageFile);
        }
    } finally {
        // always release the document, also when an assertion fails
        pdDocument.close();
    }
}

Example 43

Project: knowledge_vault-master File: MetadataExtractor.java View source code

/**
 * Extract metadata from PDF.
 *
 * Reads the page count and the Info dictionary entries (title, author,
 * subject, keywords, creator, producer, trapped, creation and modification
 * date). The stream itself is not closed here.
 *
 * @param is stream containing the PDF
 * @return the extracted metadata
 * @throws IOException if the PDF cannot be loaded or read
 */
public static PdfMetadata pdfExtractor(InputStream is) throws IOException {
    PDDocument doc = PDDocument.load(is);
    try {
        PDDocumentInformation info = doc.getDocumentInformation();
        PdfMetadata md = new PdfMetadata();
        md.setNumberOfPages(doc.getNumberOfPages());
        md.setTitle(info.getTitle());
        md.setAuthor(info.getAuthor());
        md.setSubject(info.getSubject());
        md.setKeywords(info.getKeywords());
        md.setCreator(info.getCreator());
        md.setProducer(info.getProducer());
        md.setTrapped(info.getTrapped());
        md.setCreationDate(info.getCreationDate());
        md.setModificationDate(info.getModificationDate());
        log.info("pdfExtractor: {}", md);
        return md;
    } finally {
        // the document was previously never closed
        doc.close();
    }
}

Example 44

Project: nuxeo-versions-difference-master File: TestPdfBoxN.java View source code

/**
 * Opens and parses the given PDF file and initialises the file, parser,
 * cosDoc, pdfStripper and pdDoc fields from it.
 *
 * @param FileName path of the PDF file to open
 * @return true when the file was parsed successfully; false when it does not
 *         exist or cannot be opened or parsed (a message is written to stderr)
 */
private boolean setMain(String FileName) throws Exception {
    file = new File(FileName);
    if (!file.isFile()) {
        // report the file that was actually requested, not a fixed name
        System.err.println("File " + FileName + " does not exist.");
        return false;
    }
    FileInputStream input = null;
    try {
        input = new FileInputStream(file);
        parser = new PDFParser(input);
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        if (input != null) {
            try {
                input.close();
            } catch (IOException ignored) {
                // already failing; nothing more to report
            }
        }
        return false;
    }
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
    } catch (Exception e) {
        System.err.println("Unable to parse PDF " + FileName + ". " + e.getMessage());
        // do not leave the file handle open when parsing failed
        try {
            input.close();
        } catch (IOException ignored) {
            // already failing; nothing more to report
        }
        return false;
    }
    return true;
}

Example 45

Project: OmegaT-master File: PdfFilter.java View source code

/**
 * Extracts the text of the PDF file, sorted by position and using "\n" as
 * line separator, and returns a reader over that text.
 *
 * @param infile   the PDF file to read
 * @param encoding not used in this method body
 * @return a reader over the extracted text
 * @throws TranslationException when a class needed while loading or
 *         extracting is missing (logged with the PDFFILTER_ENCRYPTED_FILE message)
 */
@Override
public BufferedReader createReader(File infile, String encoding) throws IOException, TranslationException {
    PDFTextStripper textStripper = new PDFTextStripper();
    textStripper.setLineSeparator("\n");
    textStripper.setSortByPosition(true);
    try (PDDocument pdf = PDDocument.load(infile)) {
        return new BufferedReader(new StringReader(textStripper.getText(pdf)));
    } catch (NoClassDefFoundError missingClass) {
        Logger.getLogger(getClass().getName()).log(Level.WARNING, OStrings.getString("PDFFILTER_ENCRYPTED_FILE"), infile);
        throw new TranslationException(missingClass);
    }
}

Example 46

Project: openolat-master File: ImageHelperImpl.java View source code

/**
 * Renders the first page of a PDF at 72 dpi and scales it into the
 * thumbnail file.
 *
 * @param pdfFile       the PDF to read
 * @param thumbnailFile the file receiving the scaled image
 * @param maxWidth      maximum thumbnail width passed to scaleImage
 * @param maxHeight     maximum thumbnail height passed to scaleImage
 * @return the size reported by scaleImage, or null when the document is
 *         encrypted and cannot be opened with an empty password, or when
 *         any other error occurs
 */
@Override
public Size thumbnailPDF(VFSLeaf pdfFile, VFSLeaf thumbnailFile, int maxWidth, int maxHeight) {
    InputStream pdfStream = null;
    PDDocument pdf = null;
    try {
        WorkThreadInformations.setInfoFiles(null, pdfFile);
        WorkThreadInformations.set("Generate thumbnail VFSLeaf=" + pdfFile);
        pdfStream = pdfFile.getInputStream();
        pdf = PDDocument.load(pdfStream);
        if (pdf.isEncrypted()) {
            // Try the empty password; anything else cannot be thumbnailed.
            try {
                pdf.decrypt("");
            } catch (Exception decryptFailure) {
                log.info("PDF document is encrypted: " + pdfFile);
                throw new CannotGenerateThumbnailException("PDF document is encrypted: " + pdfFile);
            }
        }
        PDPage firstPage = (PDPage) pdf.getDocumentCatalog().getAllPages().get(0);
        BufferedImage rendered = firstPage.convertToImage(BufferedImage.TYPE_INT_BGR, 72);
        // scaleImage may return null, which is passed straight on to the caller
        return scaleImage(rendered, thumbnailFile, maxWidth, maxHeight);
    } catch (CannotGenerateThumbnailException expected) {
        return null;
    } catch (Exception unexpected) {
        log.warn("Unable to create image from pdf file.", unexpected);
        return null;
    } finally {
        WorkThreadInformations.unset();
        FileUtils.closeSafely(pdfStream);
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException ignored) {
            }
        }
    }
}

Example 47

Project: sakai-cle-master File: PDFContentDigester.java View source code

/**
 * Extracts the text of a PDF content resource for indexing.
 *
 * @param contentResource the resource holding the PDF; must not be null
 * @return the cleaned text, or null when the parser yields no document
 * @throws RuntimeException when contentResource is null, or wrapping the
 *         ServerOverloadException / IOException raised while reading
 */
public String getContent(ContentResource contentResource) {
    if (contentResource == null) {
        throw new RuntimeException("Null contentResource passed to getContent");
    }
    InputStream rawStream = null;
    PDDocument pdf = null;
    try {
        rawStream = contentResource.streamContent();
        PDFParser pdfParser = new PDFParser(new BufferedInputStream(rawStream));
        pdfParser.parse();
        pdf = pdfParser.getPDDocument();
        if (pdf == null) {
            return null;
        }
        PDFTextStripper textStripper = new PDFTextStripper();
        textStripper.setLineSeparator("\n");
        CharArrayWriter extracted = new CharArrayWriter();
        textStripper.writeText(pdf, extracted);
        return SearchUtils.appendCleanString(extracted.toCharArray(), null).toString();
    } catch (ServerOverloadException e) {
        // fall back to toString() when the exception carries no message
        String detail = e.getMessage();
        if (detail == null) {
            detail = e.toString();
        }
        throw new RuntimeException("Failed to get content for indexing: cause: ServerOverloadException: " + detail, e);
    } catch (IOException e) {
        String detail = e.getMessage();
        if (detail == null) {
            detail = e.toString();
        }
        throw new RuntimeException("Failed to get content for indexing: cause: IOException:  " + detail, e);
    } finally {
        // close the document first, then the stream it was read from
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException closeFailure) {
                log.debug(closeFailure);
            }
        }
        if (rawStream != null) {
            try {
                rawStream.close();
            } catch (IOException closeFailure) {
                log.debug(closeFailure);
            }
        }
    }
}

Example 48

Project: tabula-java-master File: Debug.java View source code

/**
 * Renders one page of a PDF at 72 dpi, draws the requested debug overlays
 * on top of it and writes the result to outPath.
 *
 * @param pdfPath    path of the PDF to read
 * @param outPath    path of the image to write
 * @param pageNumber zero-based page index (the extractor is called with pageNumber + 1)
 * @param area       optional area to restrict the extracted page to; may be null
 * @param drawTextChunks ... drawDetectedTables flags selecting which overlays to draw;
 *                   drawClippingPaths is currently a no-op
 * @throws IOException if the PDF cannot be read or the image cannot be written
 */
public static void renderPage(String pdfPath, String outPath, int pageNumber, Rectangle area, boolean drawTextChunks, boolean drawSpreadsheets, boolean drawRulings, boolean drawIntersections, boolean drawColumns, boolean drawCharacters, boolean drawArea, boolean drawCells, boolean drawUnprocessedRulings, boolean drawProjectionProfile, boolean drawClippingPaths, boolean drawDetectedTables) throws IOException {
    BufferedImage image;
    PDDocument document = PDDocument.load(new File(pdfPath));
    try {
        ObjectExtractor oe = new ObjectExtractor(document);
        Page page = oe.extract(pageNumber + 1);
        if (area != null) {
            page = page.getArea(area);
        }
        PDPage p = (PDPage) document.getPage(pageNumber);
        image = Utils.pageConvertToImage(p, 72, ImageType.RGB);
        Graphics2D g = (Graphics2D) image.getGraphics();
        try {
            if (drawTextChunks) {
                debugTextChunks(g, page);
            }
            if (drawSpreadsheets) {
                debugSpreadsheets(g, page);
            }
            if (drawRulings) {
                debugRulings(g, page);
            }
            if (drawIntersections) {
                debugIntersections(g, page);
            }
            if (drawColumns) {
                debugColumns(g, page);
            }
            if (drawCharacters) {
                debugCharacters(g, page);
            }
            if (drawArea) {
                g.setColor(Color.ORANGE);
                drawShape(g, area);
            }
            if (drawCells) {
                debugCells(g, area, page);
            }
            if (drawUnprocessedRulings) {
                debugNonCleanRulings(g, page);
            }
            if (drawProjectionProfile) {
                debugProjectionProfile(g, page);
            }
            if (drawClippingPaths) {
                // TODO: Enable when oe.clippingPaths is done
                //drawShapes(g, oe.clippingPaths,
                //		new BasicStroke(2f, BasicStroke.CAP_BUTT, BasicStroke.JOIN_MITER, 10f, new float[] { 3f }, 0f));
            }
            if (drawDetectedTables) {
                debugDetectedTables(g, page);
            }
        } finally {
            g.dispose();
        }
    } finally {
        // close the document even when extraction or drawing fails
        document.close();
    }
    // As before, the image is written after the document has been closed.
    ImageIOUtil.writeImage(image, outPath, 72);
}

Example 49

Project: tika-master File: PDF2XHTML.java View source code

/**
 * Converts the given PDF document (and related metadata) to a stream
 * of XHTML SAX events sent to the given content handler.
 *
 * @param document PDF document
 * @param handler  SAX content handler
 * @param context  parse context handed to the converter
 * @param metadata PDF metadata
 * @param config   parser configuration applied to the converter before extraction
 * @throws SAXException  if the content handler fails to process SAX events
 * @throws TikaException if there was an exception outside of per page processing,
 *                       or (wrapping the first one) if per page exceptions were recorded
 */
public static void process(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws SAXException, TikaException {
    final PDF2XHTML converter;
    try {
        converter = new PDF2XHTML(document, handler, context, metadata, config);
        config.configure(converter);
        // All output goes to the content handler through overridden methods,
        // so the writer handed to writeText simply discards everything.
        Writer discardingWriter = new Writer() {

            @Override
            public void write(char[] cbuf, int off, int len) {
            }

            @Override
            public void flush() {
            }

            @Override
            public void close() {
            }
        };
        converter.writeText(document, discardingWriter);
    } catch (IOException e) {
        // A SAXException travelling inside an IOException is unwrapped and rethrown as is.
        Throwable cause = e.getCause();
        if (cause instanceof SAXException) {
            throw (SAXException) cause;
        }
        throw new TikaException("Unable to extract PDF content", e);
    }
    if (converter.exceptions.size() > 0) {
        // only the first recorded exception is reported
        throw new TikaException("Unable to extract PDF content", converter.exceptions.get(0));
    }
}

Example 50

Project: xcmis-master File: PDFDocumentRenditionProvider.java View source code

/**
    * {@inheritDoc}
    *
    * Renders the first page of the PDF, scales it down by an integer factor
    * derived from maxHeight and maxWidth, and returns it as a PNG rendition.
    */
public RenditionContentStream getRenditionStream(ContentStream stream) throws IOException {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(stream.getStream());
        PDPage page = (PDPage) pdf.getDocumentCatalog().getAllPages().get(0);
        BufferedImage image = page.convertToImage();
        // Determine scale and be sure both width and height are not greater the max
        int scale = (int) Math.max(Math.floor((image.getHeight() / maxHeight) + 1.0d), Math.floor((image.getWidth() / maxWidth) + 1.0d));
        int height = image.getHeight() / scale;
        int width = image.getWidth() / scale;
        BufferedImage scaledImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
        Graphics2D graphics2D = scaledImage.createGraphics();
        graphics2D.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BILINEAR);
        graphics2D.drawImage(image, 0, 0, width, height, null);
        graphics2D.dispose();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        ImageIO.write(scaledImage, "png", out);
        // The subtype used to be " png" (leading space), which does not
        // describe a valid image/png media type.
        return new RenditionContentStream(out.toByteArray(), null, new MimeType("image", "png"), getKind(), height, width);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}

Example 51

Project: CZ3003_Backend-master File: CReport.java View source code

/**
 * Generates a one-page "Incident Summary" PDF containing a heading, an
 * image and a two-column table built from the given JSON objects, and saves
 * it as "Hello World.pdf".
 *
 * Every key/value pair of every object in the array becomes one table row,
 * after a header row ("Incident Type", "").
 *
 * NOTE(review): the image path is hard-coded and names a .png file although
 * it is loaded through PDJpeg; confirm this is intended.
 *
 * @param pObjAry array of JSON objects whose values are expected to be strings
 * @throws IOException if the image cannot be read or the PDF cannot be written
 * @throws COSVisitorException if saving the document fails
 */
public static void genReport(JSONArray pObjAry) throws IOException, COSVisitorException {
    String imagePath = "C:\\Users\\Bryden\\Desktop\\pie-sample.png";
    List<List<String>> lstContents = new ArrayList<>();
    List<String> aryLst = new ArrayList<>();
    aryLst.add("Incident Type");
    aryLst.add("");
    lstContents.add(aryLst);
    for (Object obj : pObjAry) {
        JSONObject objJson = (JSONObject) obj;
        Iterator<?> keys = objJson.keySet().iterator();
        while (keys.hasNext()) {
            // keys are dynamic, so each entry is turned into a (key, value) row
            String key = (String) keys.next();
            String value = (String) objJson.get(key);
            List<String> aryValues = new ArrayList<>();
            aryValues.add(key);
            aryValues.add(value);
            lstContents.add(aryValues);
        }
    }
    // Create a document and add a page to it
    try (PDDocument document = new PDDocument()) {
        PDPage page = new PDPage(PDPage.PAGE_SIZE_A4);
        document.addPage(page);
        // Create a new font object selecting one of the PDF base fonts
        PDFont font = PDType1Font.HELVETICA_BOLD;
        // The image stream was previously never closed; it now stays open
        // until the document has been saved and is then released.
        try (InputStream in = Files.newInputStream(Paths.get(imagePath))) {
            PDJpeg img = new PDJpeg(document, in);
            // Start a new content stream which will "hold" the to be created content
            try (PDPageContentStream contentStream = new PDPageContentStream(document, page)) {
                contentStream.beginText();
                contentStream.setFont(font, 20);
                contentStream.moveTextPositionByAmount(70, 720);
                contentStream.drawString("Incident Summary " + new Date());
                contentStream.endText();
                contentStream.beginText();
                contentStream.setFont(font, 20);
                contentStream.moveTextPositionByAmount(100, 670);
                contentStream.drawString("Statistics");
                contentStream.endText();
                contentStream.drawImage(img, 10, 10);
                drawTable(page, contentStream, 650, 100, lstContents);
            }
            img.clear();
            // Save the results; the document is closed by try-with-resources
            document.save("Hello World.pdf");
        }
    }
}

Example 52

Project: drc-master File: PdfContentExtractor.java View source code

/**
	 * Extracts the page content of a PDF file. Also records the path in the
	 * {@code location} field.
	 *
	 * @param pdfName
	 *            The full path to the PDF file to extract content from
	 * @return The PageInfo object for the PDF, or null if the file could not
	 *         be read (the stack trace is printed)
	 */
public static PageInfo extractContentFromPdf(String pdfName) {
    try {
        location = pdfName;
        PDDocument document = PDDocument.load(new File(pdfName));
        try {
            PdfContentExtractor x = initExtractor(document);
            return x.toPageInfo();
        } finally {
            // close the document even when extraction throws
            document.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}

Example 53

Project: gcs-master File: PdfRenderer.java View source code

/**
 * Renders one page of a PDF to an image and, when requested, runs a text
 * pass over the same page with the text to highlight.
 *
 * @param pdf             the document to render
 * @param pageIndex       zero-based index of the page
 * @param scale           scale factor; multiplied by screen resolution / 72 before rendering
 * @param textToHighlight text to highlight, or null for a plain rendering
 * @return the rendered image, or null if anything fails (the error is logged)
 */
public static BufferedImage create(PDDocument pdf, int pageIndex, float scale, String textToHighlight) {
    try {
        PDFRenderer renderer = new PDFRenderer(pdf);
        scale = scale * Toolkit.getDefaultToolkit().getScreenResolution() / 72f;
        BufferedImage img = renderer.renderImage(pageIndex, scale);
        if (textToHighlight != null) {
            Graphics2D gc = img.createGraphics();
            try {
                gc.setStroke(new BasicStroke(0.1f));
                gc.scale(scale, scale);
                PdfRenderer processor = new PdfRenderer(gc, textToHighlight);
                processor.setSortByPosition(true);
                // the text pass uses one-based page numbers
                processor.setStartPage(pageIndex + 1);
                processor.setEndPage(pageIndex + 1);
                try (DummyWriter writer = new DummyWriter()) {
                    processor.writeText(pdf, writer);
                }
            } finally {
                // release the graphics context even when the text pass fails
                gc.dispose();
            }
        }
        return img;
    } catch (Exception exception) {
        Log.error(exception);
        return null;
    }
}

Example 54

Project: infoLink-master File: TextExtractor.java View source code

/**
 * Extracts the text of a PDF file into a text file and returns the
 * descriptor of that text file.
 *
 * The output name is the input name with a "txt" extension, placed in the
 * execution's output directory or, when none is set, in a temporary
 * directory that is deleted on exit. When overwriting is disabled and the
 * output file already exists, extraction is skipped and the existing text
 * is only used to compute the MD5.
 *
 * NOTE(review): the {@code tokenize} parameter is not read; tokenization is
 * controlled by {@code getExecution().isTokenize()}. Confirm this is intended.
 *
 * @param inFile    the PDF file to extract text from
 * @param startPage first page passed on to extractText
 * @param tokenize  currently unused (see note above)
 * @return the descriptor of the text file
 * @throws IOException if reading the PDF or writing the text fails
 */
public InfolisFile extract(InfolisFile inFile, int startPage, boolean tokenize) throws IOException {
    String asText = null;
    // TODO make configurable
    String outFileName = SerializationUtils.changeFileExtension(inFile.getFileName(), "txt");
    // if no output directory is given, create temporary output files
    if (null == getExecution().getOutputDirectory() || getExecution().getOutputDirectory().equals("")) {
        String EXTRACTED_DIR_PREFIX = "extracted-";
        String tempDir = Files.createTempDirectory(InfolisConfig.getTmpFilePath().toAbsolutePath(), EXTRACTED_DIR_PREFIX).toString();
        FileUtils.forceDeleteOnExit(new File(tempDir));
        outFileName = SerializationUtils.changeBaseDir(outFileName, tempDir);
    } else {
        outFileName = SerializationUtils.changeBaseDir(outFileName, getExecution().getOutputDirectory());
    }
    InfolisFile outFile = new InfolisFile();
    outFile.setFileName(outFileName);
    outFile.setOriginalName(inFile.getFileName());
    outFile.setMediaType("text/plain");
    if (getExecution().getOverwriteTextfiles() == false) {
        File _outFile = new File(outFileName);
        if (_outFile.exists()) {
            debug(log, "File exists: {}, skipping text extraction for {}", _outFile, inFile);
            asText = FileUtils.readFileToString(_outFile, "utf-8");
            outFile.setMd5(SerializationUtils.getHexMd5(asText));
            outFile.setFileStatus("AVAILABLE");
            return outFile;
        }
    }
    InputStream inStream = null;
    OutputStream outStream = null;
    PDDocument pdfIn = null;
    try {
        inStream = getInputFileResolver().openInputStream(inFile);
        try {
            pdfIn = PDDocument.load(inStream);
            asText = extractText(pdfIn, startPage);
            if (null == asText) {
                throw new IOException("extractText returned null!");
            }
            if (getExecution().isRemoveBib()) {
                asText = removeBibSection(asText);
            }
            if (getExecution().isTokenize()) {
                asText = tokenizeText(asText);
            }
            Set<String> tagsToSet = getExecution().getTags();
            tagsToSet.addAll(inFile.getTags());
            tagsToSet.addAll(executionTags);
            outFile.setTags(tagsToSet);
            outFile.setMd5(SerializationUtils.getHexMd5(asText));
            outFile.setFileStatus("AVAILABLE");
            try {
                outStream = getOutputFileResolver().openOutputStream(outFile);
                try {
                    // Write as UTF-8 so the file matches the "utf-8" used when
                    // an existing output file is read back above.
                    IOUtils.write(asText, outStream, "UTF-8");
                } catch (IOException e) {
                    warn(log, "Error copying text to output stream: " + e);
                    throw e;
                }
            } catch (IOException e) {
                warn(log, "Error opening output stream to text file: " + e);
                throw e;
            }
            return outFile;
        } catch (IOException e) {
            warn(log, "Error reading PDF from stream: " + e);
            throw e;
        }
    } catch (IOException e) {
        warn(log, "Error opening input stream: " + e);
        throw e;
    } catch (Exception e) {
        warn(log, "Error converting PDF to text: " + e);
        throw e;
    } finally {
        // Close each resource in its own finally so that a failing close
        // cannot prevent the remaining resources from being released.
        try {
            if (null != outStream) {
                outStream.close();
            }
        } finally {
            try {
                if (null != inStream) {
                    inStream.close();
                }
            } finally {
                if (null != pdfIn) {
                    pdfIn.close();
                }
            }
        }
    }
}

Example 55

Project: jabref-2.9.2-master File: PdfContentImporter.java View source code

/**
 * Builds a single BibTeX entry from the text of the first page of a PDF.
 * <p>
 * The document is loaded from {@code in}; if loading fails the error is logged and an
 * empty list is returned. Page 1 is extracted as text. When {@code Util.getDOI} yields
 * something shorter than the whole text, the DOI fetcher is asked first and its result
 * is returned if it produced any entry. Otherwise the page text is parsed line by line
 * with heuristics: an optional conference/copyright line, the title, the authors,
 * "Abstract"/"Keywords"/"technical" lines, and finally the blocks at the end of the page
 * (Springer "(Eds.)" line, DOI, IEEE footer).
 * <p>
 * The parsing state ({@code split}, {@code i}, {@code curString}, {@code year}) lives in
 * fields of the enclosing class that the helper methods called here also use.
 * The stream {@code in} is not closed by this method; the loaded document is.
 *
 * @param in     stream containing the PDF
 * @param status receives a message when the Bouncy Castle library is missing
 * @return a list with the fetched or parsed entry, or an empty list if nothing could be read
 * @throws IOException if extracting the text or closing the document fails
 */
@Override
public List<BibtexEntry> importEntries(InputStream in, OutputPrinter status) throws IOException {
    final ArrayList<BibtexEntry> res = new ArrayList<BibtexEntry>(1);
    PDDocument document = null;
    try {
        document = PDDocument.load(in);
    } catch (IOException e) {
        logger.log(Level.SEVERE, "Could not load document", e);
        return res;
    }
    try {
        if (document.isEncrypted()) {
            // only logged: there is no early return, text extraction is still attempted
            logger.log(Level.INFO, Globals.lang("Encrypted documents are not supported"));
        }
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setStartPage(1);
        stripper.setEndPage(1);
        stripper.setSortByPosition(true);
        stripper.setParagraphEnd(System.getProperty("line.separator"));
        StringWriter writer = new StringWriter();
        stripper.writeText(document, writer);
        String textResult = writer.toString();
        String doi = Util.getDOI(textResult);
        if (doi.length() < textResult.length()) {
            // A DOI was found in the text
            // We do NO parsing of the text, but use the DOI fetcher
            ImportInspector i = new ImportInspector() {

                @Override
                public void toFront() {
                }

                @Override
                public void setProgress(int current, int max) {
                }

                @Override
                public void addEntry(BibtexEntry entry) {
                    // add the entry to the result object
                    res.add(entry);
                }
            };
            doiToBibTeXFetcher.processQuery(doi, i, status);
            if (res.size() != 0) {
                // if something has been found, return the result
                return res;
            }
            // otherwise, we just parse the PDF
        }
        String author = null;
        String editor = null;
        // institution is never detected by the heuristics below; it stays null
        String institution = null;
        String abstractT = null;
        String keywords = null;
        String title = null;
        String conference = null;
        String DOI = null;
        String series = null;
        String volume = null;
        String number = null;
        String pages = null;
        // year is a class variable as the method extractYear() uses it;
        String publisher = null;
        BibtexEntryType type = BibtexEntryType.INPROCEEDINGS;
        final String lineBreak = System.getProperty("line.separator");
        split = textResult.split(lineBreak);
        // idea: split[] contains the different lines
        // blocks are separated by empty lines
        // treat each block
        //   or do special treatment at authors (which are not broken)
        //   therefore, we do a line-based and not a block-based splitting
        // i points to the current line
        // curString (mostly) contains the current block
        //   the different lines are joined into one and thereby separated by " "
        proceedToNextNonEmptyLine();
        if (i >= split.length) {
            // return empty list
            return res;
        }
        curString = split[i];
        i = i + 1;
        if (curString.length() > 4) {
            // special case: possibly conference as first line on the page
            extractYear();
            if (curString.contains("Conference")) {
                fillCurStringWithNonEmptyLines();
                conference = curString;
                curString = "";
            } else {
                // e.g. Copyright (c) 1998 by the Genetics Society of America
                // future work: get year using RegEx
                String lower = curString.toLowerCase();
                if (lower.contains("copyright")) {
                    fillCurStringWithNonEmptyLines();
                    publisher = curString;
                    curString = "";
                }
            }
        }
        // start: title
        fillCurStringWithNonEmptyLines();
        title = streamlineTitle(curString);
        curString = "";
        //i points to the next non-empty line
        // after title: authors
        author = null;
        while ((i < split.length) && (!split[i].equals(""))) {
            // author names are unlikely to be split among different lines
            // treat them line by line
            curString = streamlineNames(split[i]);
            if (author == null) {
                author = curString;
            } else {
                if (curString.equals("")) {
                    // if split[i] is "and" then "" is returned by streamlineNames -> do nothing
                } else {
                    author = author.concat(" and ").concat(curString);
                }
            }
            i++;
        }
        curString = "";
        i++;
        // then, abstract and keywords follow
        while (i < split.length) {
            curString = split[i];
            if ((curString.length() >= "Abstract".length()) && (curString.substring(0, "Abstract".length()).equalsIgnoreCase("Abstract"))) {
                if (curString.length() == "Abstract".length()) {
                    // only word "abstract" found -- skip line
                    curString = "";
                } else {
                    curString = curString.substring("Abstract".length() + 1).trim().concat(lineBreak);
                }
                i++;
                // whereas we need linebreak as separator
                while ((i < split.length) && (!split[i].equals(""))) {
                    curString = curString.concat(split[i]).concat(lineBreak);
                    i++;
                }
                abstractT = curString;
                i++;
            } else if ((curString.length() >= "Keywords".length()) && (curString.substring(0, "Keywords".length()).equalsIgnoreCase("Keywords"))) {
                if (curString.length() == "Keywords".length()) {
                    // only word "Keywords" found -- skip line
                    curString = "";
                } else {
                    curString = curString.substring("Keywords".length() + 1).trim();
                }
                i++;
                fillCurStringWithNonEmptyLines();
                keywords = removeNonLettersAtEnd(curString);
            } else {
                String lower = curString.toLowerCase();
                int pos = lower.indexOf("technical");
                if (pos >= 0) {
                    type = BibtexEntryType.TECHREPORT;
                    pos = curString.trim().lastIndexOf(' ');
                    if (pos >= 0) {
                        // assumption: last character of curString is NOT ' '
                        //   otherwise pos+1 leads to an out-of-bounds exception
                        number = curString.substring(pos + 1);
                    }
                }
                i++;
                proceedToNextNonEmptyLine();
            }
        }
        i = split.length - 1;
        while (i >= 0) {
            readLastBlock();
            // i now points to the block before or is -1
            // curString contains the last block, separated by " "
            extractYear();
            int pos = curString.indexOf("(Eds.)");
            if ((pos >= 0) && (publisher == null)) {
                // looks like a Springer last line
                // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
                publisher = "Springer";
                // clamp: "(Eds.)" may be the very first token of the block
                editor = streamlineNames(curString.substring(0, Math.max(pos - 1, 0)));
                //+2 because of ":" after (Eds.) and the subsequent space
                // clamp: the block may end directly after "(Eds.)"
                curString = curString.substring(Math.min(pos + "(Eds.)".length() + 2, curString.length()));
                String[] springerSplit = curString.split(", ");
                if (springerSplit.length >= 4) {
                    conference = springerSplit[0];
                    String seriesData = springerSplit[1];
                    int lastSpace = seriesData.lastIndexOf(' ');
                    if (lastSpace >= 0) {
                        // series name and volume number are separated by the last space
                        series = seriesData.substring(0, lastSpace);
                        volume = seriesData.substring(lastSpace + 1);
                    }
                    if (springerSplit[2].length() >= 4) {
                        // skip the leading "pp. "
                        pages = springerSplit[2].substring(4);
                    }
                    if (springerSplit[3].length() >= 4) {
                        year = springerSplit[3].substring(0, 4);
                    }
                }
            } else {
                if (DOI == null) {
                    pos = curString.indexOf("DOI");
                    if (pos < 0) {
                        pos = curString.indexOf("doi");
                    }
                    if (pos >= 0) {
                        pos += 3;
                        // guard: "DOI" may be the last token of the block, then there is nothing to read
                        if (pos < curString.length()) {
                            char delimiter = curString.charAt(pos);
                            if ((delimiter == ':') || (delimiter == ' ')) {
                                pos++;
                            }
                            int nextSpace = curString.indexOf(' ', pos);
                            if (nextSpace > 0) {
                                DOI = curString.substring(pos, nextSpace);
                            } else {
                                DOI = curString.substring(pos);
                            }
                        }
                    }
                }
                if ((publisher == null) && (curString.indexOf("IEEE") >= 0)) {
                    // IEEE has the conference things at the end
                    publisher = "IEEE";
                    if (conference == null) {
                        pos = curString.indexOf('$');
                        if (pos > 0) {
                            // we found the price
                            // before the price, the ISSN is stated
                            // skip that
                            pos -= 2;
                            while ((pos >= 0) && (curString.charAt(pos) != ' ')) {
                                pos--;
                            }
                            if (pos > 0) {
                                conference = curString.substring(0, pos);
                            }
                        }
                    }
                }
            }
        }
        BibtexEntry entry = new BibtexEntry();
        entry.setType(type);
        if (author != null) {
            entry.setField("author", author);
        }
        if (editor != null) {
            entry.setField("editor", editor);
        }
        if (institution != null) {
            entry.setField("institution", institution);
        }
        if (abstractT != null) {
            entry.setField("abstract", abstractT);
        }
        if (keywords != null) {
            entry.setField("keywords", keywords);
        }
        if (title != null) {
            entry.setField("title", title);
        }
        if (conference != null) {
            entry.setField("booktitle", conference);
        }
        if (DOI != null) {
            entry.setField("doi", DOI);
        }
        if (series != null) {
            entry.setField("series", series);
        }
        if (volume != null) {
            entry.setField("volume", volume);
        }
        if (number != null) {
            entry.setField("number", number);
        }
        if (pages != null) {
            entry.setField("pages", pages);
        }
        if (year != null) {
            entry.setField("year", year);
        }
        if (publisher != null) {
            entry.setField("publisher", publisher);
        }
        // the complete page text is kept in the "review" field
        entry.setField("review", textResult);
        res.add(entry);
    } catch (NoClassDefFoundError e) {
        // constant-first equals: the error message may be null
        if ("org/bouncycastle/jce/provider/BouncyCastleProvider".equals(e.getMessage())) {
            status.showMessage(Globals.lang("Java Bouncy Castle library not found. Please download and install it. For more information see http://www.bouncycastle.org/."));
        } else {
            logger.log(Level.SEVERE, e.getLocalizedMessage(), e);
        }
    } finally {
        document.close();
    }
    return res;
}

Example 56

Project: jabref-master File: PdfContentImporter.java View source code

/**
 * Builds a single entry from the text of the first page of the PDF at {@code filePath}.
 * <p>
 * If a DOI is found in the first page, the entry is fetched by DOI and that result is
 * returned (possibly without an entry). Otherwise the page text is parsed line by line
 * with heuristics: an optional conference/copyright line, the title, the authors,
 * "Abstract"/"Keywords"/"technical" lines, and finally the blocks at the end of the page
 * (Springer "(Eds.)" line, DOI, IEEE footer).
 * <p>
 * The parsing state ({@code lines}, {@code i}, {@code curString}, {@code year}) lives in
 * fields of the enclosing class that the helper methods called here also use.
 *
 * @param filePath        path of the PDF file to read
 * @param defaultEncoding not used by this implementation
 * @return the parser result holding the fetched or parsed entry, an empty result if the
 *         page has no non-empty line, or an error result if reading, decrypting or
 *         fetching fails
 */
@Override
public ParserResult importDatabase(Path filePath, Charset defaultEncoding) {
    final ArrayList<BibEntry> result = new ArrayList<>(1);
    try (FileInputStream fileStream = new FileInputStream(filePath.toFile());
        PDDocument document = XMPUtil.loadWithAutomaticDecryption(fileStream)) {
        String firstPageContents = getFirstPageContents(document);
        Optional<DOI> doi = DOI.findInText(firstPageContents);
        if (doi.isPresent()) {
            ParserResult parserResult = new ParserResult(result);
            Optional<BibEntry> entry = new DoiFetcher(importFormatPreferences).performSearchById(doi.get().getDOI());
            entry.ifPresent(parserResult.getDatabase()::insertEntry);
            return parserResult;
        }
        // idea: split[] contains the different lines
        // blocks are separated by empty lines
        // treat each block
        //   or do special treatment at authors (which are not broken)
        //   therefore, we do a line-based and not a block-based splitting
        // i points to the current line
        // curString (mostly) contains the current block
        //   the different lines are joined into one and thereby separated by " "
        lines = firstPageContents.split(System.lineSeparator());
        proceedToNextNonEmptyLine();
        if (i >= lines.length) {
            // return empty list
            return new ParserResult();
        }
        // we start at the current line
        curString = lines[i];
        // i might get incremented later and curString modified, too
        i = i + 1;
        String author;
        String editor = null;
        String abstractT = null;
        String keywords = null;
        String title;
        String conference = null;
        String DOI = null;
        String series = null;
        String volume = null;
        String number = null;
        String pages = null;
        // year is a class variable as the method extractYear() uses it;
        String publisher = null;
        EntryType type = BibtexEntryTypes.INPROCEEDINGS;
        if (curString.length() > 4) {
            // special case: possibly conference as first line on the page
            extractYear();
            if (curString.contains("Conference")) {
                fillCurStringWithNonEmptyLines();
                conference = curString;
                curString = "";
            } else {
                // e.g. Copyright (c) 1998 by the Genetics Society of America
                // future work: get year using RegEx
                String lower = curString.toLowerCase(Locale.ROOT);
                if (lower.contains("copyright")) {
                    fillCurStringWithNonEmptyLines();
                    publisher = curString;
                    curString = "";
                }
            }
        }
        // start: title
        fillCurStringWithNonEmptyLines();
        title = streamlineTitle(curString);
        curString = "";
        //i points to the next non-empty line
        // after title: authors
        author = null;
        while ((i < lines.length) && !"".equals(lines[i])) {
            // author names are unlikely to be lines among different lines
            // treat them line by line
            curString = streamlineNames(lines[i]);
            if (author == null) {
                author = curString;
            } else {
                if ("".equals(curString)) {
                    // if lines[i] is "and" then "" is returned by streamlineNames -> do nothing
                } else {
                    author = author.concat(" and ").concat(curString);
                }
            }
            i++;
        }
        curString = "";
        i++;
        // then, abstract and keywords follow
        while (i < lines.length) {
            curString = lines[i];
            if ((curString.length() >= "Abstract".length()) && "Abstract".equalsIgnoreCase(curString.substring(0, "Abstract".length()))) {
                if (curString.length() == "Abstract".length()) {
                    // only word "abstract" found -- skip line
                    curString = "";
                } else {
                    curString = curString.substring("Abstract".length() + 1).trim().concat(System.lineSeparator());
                }
                i++;
                // whereas we need linebreak as separator
                while ((i < lines.length) && !"".equals(lines[i])) {
                    curString = curString.concat(lines[i]).concat(System.lineSeparator());
                    i++;
                }
                abstractT = curString.trim();
                i++;
            } else if ((curString.length() >= "Keywords".length()) && "Keywords".equalsIgnoreCase(curString.substring(0, "Keywords".length()))) {
                if (curString.length() == "Keywords".length()) {
                    // only word "Keywords" found -- skip line
                    curString = "";
                } else {
                    curString = curString.substring("Keywords".length() + 1).trim();
                }
                i++;
                fillCurStringWithNonEmptyLines();
                keywords = removeNonLettersAtEnd(curString);
            } else {
                String lower = curString.toLowerCase(Locale.ROOT);
                int pos = lower.indexOf("technical");
                if (pos >= 0) {
                    type = BibtexEntryTypes.TECHREPORT;
                    pos = curString.trim().lastIndexOf(' ');
                    if (pos >= 0) {
                        // assumption: last character of curString is NOT ' '
                        //   otherwise pos+1 leads to an out-of-bounds exception
                        number = curString.substring(pos + 1);
                    }
                }
                i++;
                proceedToNextNonEmptyLine();
            }
        }
        i = lines.length - 1;
        while (i >= 0) {
            readLastBlock();
            // i now points to the block before or is -1
            // curString contains the last block, separated by " "
            extractYear();
            int pos = curString.indexOf("(Eds.)");
            if ((pos >= 0) && (publisher == null)) {
                // looks like a Springer last line
                // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
                publisher = "Springer";
                // clamp: "(Eds.)" may be the very first token of the block
                editor = streamlineNames(curString.substring(0, Math.max(pos - 1, 0)));
                //+2 because of ":" after (Eds.) and the subsequent space
                // clamp: the block may end directly after "(Eds.)"
                curString = curString.substring(Math.min(pos + "(Eds.)".length() + 2, curString.length()));
                String[] springerSplit = curString.split(", ");
                if (springerSplit.length >= 4) {
                    conference = springerSplit[0];
                    String seriesData = springerSplit[1];
                    int lastSpace = seriesData.lastIndexOf(' ');
                    if (lastSpace >= 0) {
                        // series name and volume number are separated by the last space
                        series = seriesData.substring(0, lastSpace);
                        volume = seriesData.substring(lastSpace + 1);
                    }
                    if (springerSplit[2].length() >= 4) {
                        // skip the leading "pp. "
                        pages = springerSplit[2].substring(4);
                    }
                    if (springerSplit[3].length() >= 4) {
                        year = springerSplit[3].substring(0, 4);
                    }
                }
            } else {
                if (DOI == null) {
                    pos = curString.indexOf("DOI");
                    if (pos < 0) {
                        pos = curString.indexOf(FieldName.DOI);
                    }
                    if (pos >= 0) {
                        pos += 3;
                        // guard: "DOI" may be the last token of the block, then there is nothing to read
                        if (pos < curString.length()) {
                            char delimiter = curString.charAt(pos);
                            if ((delimiter == ':') || (delimiter == ' ')) {
                                pos++;
                            }
                            int nextSpace = curString.indexOf(' ', pos);
                            if (nextSpace > 0) {
                                DOI = curString.substring(pos, nextSpace);
                            } else {
                                DOI = curString.substring(pos);
                            }
                        }
                    }
                }
                if ((publisher == null) && curString.contains("IEEE")) {
                    // IEEE has the conference things at the end
                    publisher = "IEEE";
                    if (conference == null) {
                        pos = curString.indexOf('$');
                        if (pos > 0) {
                            // we found the price
                            // before the price, the ISSN is stated
                            // skip that
                            pos -= 2;
                            while ((pos >= 0) && (curString.charAt(pos) != ' ')) {
                                pos--;
                            }
                            if (pos > 0) {
                                conference = curString.substring(0, pos);
                            }
                        }
                    }
                }
            }
        }
        BibEntry entry = new BibEntry();
        entry.setType(type);
        if (author != null) {
            entry.setField(FieldName.AUTHOR, author);
        }
        if (editor != null) {
            entry.setField(FieldName.EDITOR, editor);
        }
        if (abstractT != null) {
            entry.setField(FieldName.ABSTRACT, abstractT);
        }
        if (!Strings.isNullOrEmpty(keywords)) {
            entry.setField(FieldName.KEYWORDS, keywords);
        }
        if (title != null) {
            entry.setField(FieldName.TITLE, title);
        }
        if (conference != null) {
            entry.setField(FieldName.BOOKTITLE, conference);
        }
        if (DOI != null) {
            entry.setField(FieldName.DOI, DOI);
        }
        if (series != null) {
            entry.setField(FieldName.SERIES, series);
        }
        if (volume != null) {
            entry.setField(FieldName.VOLUME, volume);
        }
        if (number != null) {
            entry.setField(FieldName.NUMBER, number);
        }
        if (pages != null) {
            entry.setField(FieldName.PAGES, pages);
        }
        if (year != null) {
            entry.setField(FieldName.YEAR, year);
        }
        if (publisher != null) {
            entry.setField(FieldName.PUBLISHER, publisher);
        }
        result.add(entry);
    } catch (EncryptedPdfsNotSupportedException e) {
        return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported."));
    } catch (IOException exception) {
        return ParserResult.fromError(exception);
    } catch (FetcherException e) {
        return ParserResult.fromErrorMessage(e.getMessage());
    }
    return new ParserResult(result);
}

Example 57

Project: java-thumbnailer-master File: PDFBoxThumbnailer.java View source code

/**
 * Writes a thumbnail of the first page of the PDF {@code input} to {@code output}.
 * <p>
 * Any existing {@code output} file is deleted first. When the rendered page does not
 * already have the thumbnail width, it is passed through {@code ResizeImage} in
 * crop-only mode; otherwise it is written directly as PNG.
 *
 * @param input  PDF file to read; must pass the existence check and be non-empty
 * @param output file the thumbnail is written to
 * @throws IOException          if the input is missing or empty, or writing fails
 * @throws ThumbnailerException if the PDF cannot be loaded
 */
@Override
public void generateThumbnail(File input, File output) throws IOException, ThumbnailerException {
    FileDoesNotExistException.check(input);
    if (input.length() == 0) {
        throw new FileDoesNotExistException("File is empty");
    }
    FileUtils.deleteQuietly(output);

    PDDocument pdf = null;
    try {
        try {
            pdf = PDDocument.load(input);
        } catch (IOException loadFailure) {
            throw new ThumbnailerException("Could not load PDF File", loadFailure);
        }

        BufferedImage firstPage = writeImageFirstPage(pdf, BufferedImage.TYPE_INT_RGB);
        if (firstPage.getWidth() != thumbWidth) {
            // width differs from the target: let ResizeImage crop it
            ResizeImage cropper = new ResizeImage(thumbWidth, thumbHeight);
            cropper.resizeMethod = ResizeImage.NO_RESIZE_ONLY_CROP;
            cropper.setInputImage(firstPage);
            cropper.writeOutput(output);
        } else {
            ImageIO.write(firstPage, "PNG", output);
        }
    } finally {
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException ignored) {
                // a failure while closing the document is not reported
            }
        }
    }
}

Example 58

Project: lucene-solr-master File: ReducePDFSize.java View source code

/**
 * Rewrites a PDF with all security removed and every stream object re-encoded
 * with the Flate filter.
 *
 * @param args {@code args[0]} is the input file, {@code args[1]} the output file
 * @throws IOException      if the document cannot be loaded, rewritten or saved
 * @throws RuntimeException if the argument count is wrong or a stream cannot be decoded
 */
public static void main(String[] args) throws IOException {
    if (2 != args.length) {
        throw new RuntimeException("arg0 must be input file, arg1 must be output file");
    }
    String in = args[0];
    String out = args[1];
    // try-with-resources closes the document even when rewriting or saving fails
    try (PDDocument doc = PDDocument.load(new File(in))) {
        doc.setAllSecurityToBeRemoved(true);
        for (COSObject cosObject : doc.getDocument().getObjects()) {
            COSBase base = cosObject.getObject();
            // if it's a stream: decode it, then re-write it using FLATE_DECODE
            if (base instanceof COSStream) {
                COSStream stream = (COSStream) base;
                byte[] bytes;
                try {
                    bytes = new PDStream(stream).toByteArray();
                } catch (IOException ex) {
                    throw new RuntimeException("can't serialize byte[] from: " + cosObject.getObjectNumber() + " " + cosObject.getGenerationNumber() + " obj: " + ex.getMessage(), ex);
                }
                stream.removeItem(COSName.FILTER);
                // the stream's output is closed even if write() throws
                try (OutputStream streamOut = stream.createOutputStream(COSName.FLATE_DECODE)) {
                    streamOut.write(bytes);
                }
            }
        }
        doc.getDocumentCatalog();
        doc.save(out);
    }
}

Example 59

Project: MEditor-master File: KrameriusImageSupport.java View source code

/**
 * Reads an image from the given URL.
 * <p>
 * Natively supported types are decoded with {@code ImageIO}; DjVu types are decoded
 * with the DjVu library; PDFs are rendered with PDFBox at 96 DPI. Streams opened on
 * the URL are closed before the method returns.
 *
 * @param url
 *        the location of the image, DjVu document or PDF
 * @param type
 *        the mime type that selects the decoder
 * @param page
 *        the page index to read from a DjVu or PDF document; for DjVu, a non-zero
 *        index at or beyond the document size falls back to page 0
 * @return the image; {@code null} for a DjVu page that does not decode to exactly one image
 * @throws IOException
 *         Signals that an I/O exception has occurred.
 * @throws IllegalArgumentException
 *         if the mime type is not supported
 */
public static Image readImage(URL url, ImageMimeType type, int page) throws IOException {
    if (type.javaNativeSupport()) {
        // ImageIO.read(InputStream) does not close the stream, so close it here
        try (java.io.InputStream in = url.openStream()) {
            return ImageIO.read(in);
        }
    } else if ((type.equals(ImageMimeType.DJVU)) || (type.equals(ImageMimeType.VNDDJVU)) || (type.equals(ImageMimeType.XDJVU))) {
        com.lizardtech.djvu.Document doc = new com.lizardtech.djvu.Document(url);
        doc.setAsync(false);
        DjVuPage[] p = new DjVuPage[1];
        // read page from the document - index 0, priority 1, favorFast true
        int size = doc.size();
        if ((page != 0) && (page >= size)) {
            page = 0;
        }
        p[0] = doc.getPage(page, 1, true);
        p[0].setAsync(false);
        DjVuImage djvuImage = new DjVuImage(p, true);
        Rectangle pageBounds = djvuImage.getPageBounds(0);
        Image[] images = djvuImage.getImage(new JPanel(), new Rectangle(pageBounds.width, pageBounds.height));
        if (images.length == 1) {
            return images[0];
        } else {
            return null;
        }
    } else if (type.equals(ImageMimeType.PDF)) {
        // resources close in reverse order: the document first, then the URL stream
        try (java.io.InputStream in = url.openStream();
            PDDocument document = PDDocument.load(in)) {
            PDFRenderer pdfRenderer = new PDFRenderer(document);
            int resolution = 96;
            return pdfRenderer.renderImageWithDPI(page, resolution, ImageType.RGB);
        }
    } else {
        throw new IllegalArgumentException("unsupported mimetype '" + type.getValue() + "'");
    }
}

Example 60

Project: nuxeo-core-master File: PDF2TextConverter.java View source code

/**
 * Extracts the text of a PDF blob with PDFBox and returns it as a UTF-8 text blob.
 * <p>
 * If the document's current access permission does not allow content extraction,
 * an empty string blob is returned instead. The text is staged in a temporary file
 * that is deleted before the method returns.
 *
 * @param blobHolder holder whose blob contains the PDF
 * @param parameters not used by this implementation
 * @return a holder with the extracted text, or with an empty string blob
 * @throws ConversionException if anything fails while loading or extracting
 */
@Override
public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {
    PDDocument document = null;
    File f = null;
    OutputStream fas = null;
    try {
        document = PDDocument.load(blobHolder.getBlob().getStream());
        // NXP-1556: if document is protected an IOException will be raised
        // Instead of catching the exception based on its message string
        // lets avoid sending messages that will generate this error
        // code taken from PDFTextStripper.writeText source.
        // only care about standard encryption and if it was decrypted with
        // the user password
        AccessPermission permission = document.getCurrentAccessPermission();
        if (permission.canExtractContent()) {
            PatchedPDFTextStripper textStripper = new PatchedPDFTextStripper();
            // use the position information to heuristically organize the
            // extracted paragraphs. This is also important for
            // right-to-left languages.
            textStripper.setSortByPosition(true);
            String text = textStripper.getText(document);
            // replace non breaking space by regular spaces (why?)
            // text = text.replace("\u00a0", " ");
            f = File.createTempFile("pdfboplugin", ".txt");
            fas = new FileOutputStream(f);
            fas.write(text.getBytes("UTF-8"));
            // close the read stream before the finally block deletes the temp file;
            // the blob cannot depend on the stream afterwards since the file is removed
            try (FileInputStream is = new FileInputStream(f)) {
                return new SimpleCachableBlobHolder(new FileBlob(is, "text/plain", "UTF-8"));
            }
        } else {
            return new SimpleCachableBlobHolder(new StringBlob(""));
        }
    } catch (Exception e) {
        throw new ConversionException("Error during text extraction with PDFBox", e);
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (Exception e) {
                log.error("Error while closing PDFBox document", e);
            }
        }
        if (fas != null) {
            try {
                fas.close();
            } catch (IOException e) {
                log.error(e);
            }
        }
        if (f != null) {
            f.delete();
        }
    }
}

Example 61

Project: nuxeo-master File: PDF2TextConverter.java View source code

/**
 * Extracts the text of a PDF blob with PDFBox and returns it as a UTF-8 text blob.
 * <p>
 * If the document's current access permission does not allow content extraction,
 * an empty string blob is returned instead. The text is staged in a temporary file
 * that is deleted before the method returns.
 *
 * @param blobHolder holder whose blob contains the PDF
 * @param parameters not used by this implementation
 * @return a holder with the extracted text, or with an empty string blob
 * @throws ConversionException if an I/O error occurs while loading or extracting
 */
@Override
public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {
    PDDocument pdf = null;
    File tempFile = null;
    OutputStream tempOut = null;
    try {
        pdf = PDDocument.load(blobHolder.getBlob().getStream());
        // NXP-1556: a protected document would raise an IOException during
        // extraction. Rather than matching on the exception message, check the
        // permission up front (approach taken from PDFTextStripper.writeText);
        // only standard encryption decrypted with the user password matters.
        AccessPermission granted = pdf.getCurrentAccessPermission();
        if (!granted.canExtractContent()) {
            return new SimpleCachableBlobHolder(Blobs.createBlob(""));
        }

        PatchedPDFTextStripper stripper = new PatchedPDFTextStripper();
        // position-based sorting heuristically orders the extracted paragraphs,
        // which also matters for right-to-left languages
        stripper.setSortByPosition(true);
        String extracted = stripper.getText(pdf);

        tempFile = Framework.createTempFile("pdfboplugin", ".txt");
        tempOut = new FileOutputStream(tempFile);
        byte[] encoded = extracted.getBytes("UTF-8");
        tempOut.write(encoded);

        try (FileInputStream tempIn = new FileInputStream(tempFile)) {
            Blob textBlob = Blobs.createBlob(tempIn, "text/plain", "UTF-8");
            return new SimpleCachableBlobHolder(textBlob);
        }
    } catch (IOException e) {
        throw new ConversionException("Error during text extraction with PDFBox", e);
    } finally {
        // cleanup order: PDF document, temp output stream, temp file
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException e) {
                log.error("Error while closing PDFBox document", e);
            }
        }
        if (tempOut != null) {
            try {
                tempOut.close();
            } catch (IOException e) {
                log.error(e);
            }
        }
        if (tempFile != null) {
            tempFile.delete();
        }
    }
}

Example 62

Project: nuxeo-services-master File: BaseConverterTest.java View source code

/**
 * Checks whether the XMP metadata of a PDF declares PDF/A conformance level "A".
 *
 * <p>The XMP packet is expected to contain an element such as:
 * <pre>
 * &lt;rdf:Description xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/" rdf:about=""&gt;
 *   &lt;pdfaid:part&gt;1&lt;/pdfaid:part&gt;
 *   &lt;pdfaid:conformance&gt;A&lt;/pdfaid:conformance&gt;
 * &lt;/rdf:Description&gt;
 * </pre>
 *
 * @param pdfFile the PDF file to inspect
 * @return {@code true} if the first {@code pdfaid:conformance} element has the text "A";
 *         {@code false} if the document has no XMP metadata, no such element, or another value
 * @throws Exception if the file cannot be loaded or its XMP metadata cannot be parsed
 */
public static boolean isPDFA(File pdfFile) throws Exception {
    PDDocument pddoc = PDDocument.load(pdfFile);
    try {
        // A document without an XMP packet cannot declare PDF/A conformance.
        if (pddoc.getDocumentCatalog().getMetadata() == null) {
            return false;
        }
        XMPMetadata xmp = pddoc.getDocumentCatalog().getMetadata().exportXMPMetadata();
        Document doc = xmp.getXMPDocument();
        NodeList list = doc.getElementsByTagName("pdfaid:conformance");
        // NodeList.item(0) returns null for an empty list, so check the length first.
        return list != null && list.getLength() > 0 && "A".equals(list.item(0).getTextContent());
    } finally {
        // Release the file handle and scratch resources held by PDFBox.
        pddoc.close();
    }
}

Example 63

Project: paper2ebook-master File: Transformer.java View source code

/**
 * Builds a new PDF that holds one page for every interesting area found on
 * the pages of the source document that lie within [startPage, endPage]
 * (1-based, inclusive).
 *
 * @return the newly assembled document; the caller is responsible for closing it
 * @throws IOException if a page cannot be imported
 */
@Override
public PDDocument extract() throws IOException {
    PDDocument result = new PDDocument();
    // carry over document-level information and viewer settings
    result.setDocumentInformation(sourceDocument.getDocumentInformation());
    result.getDocumentCatalog().setViewerPreferences(sourceDocument.getDocumentCatalog().getViewerPreferences());
    @SuppressWarnings("unchecked") List<PDPage> sourcePages = sourceDocument.getDocumentCatalog().getAllPages();
    int pageNumber = 0;
    for (PDPage sourcePage : sourcePages) {
        pageNumber++;
        if (pageNumber < startPage || pageNumber > endPage) {
            // outside the requested page window
            continue;
        }
        // each fragment becomes its own output page, cropped to the fragment
        for (PDRectangle area : getFragments(sourcePage)) {
            PDPage copy = result.importPage(sourcePage);
            copy.setCropBox(area);
            copy.setMediaBox(sourcePage.getMediaBox());
            copy.setResources(sourcePage.findResources());
            copy.setRotation(sourcePage.findRotation());
            // TODO: rotate the page in landscape mode is width > height
        }
    }
    return result;
}

Example 64

Project: PDFExtract-master File: PDFBoxSource.java View source code

// -------------------------- STATIC METHODS --------------------------
/**
 * Opens a PDF file and, when it is encrypted and a password is supplied, decrypts it.
 *
 * <p>The file name is stored under the MDC key {@code "doc"} for the duration of processing;
 * the key is reset to the empty string when opening fails.
 *
 * @param pdfFile  the PDF file to open
 * @param password the password used for decryption, or {@code null} if none is available;
 *                 an encrypted file without a password is returned undecrypted after a warning
 * @return the opened document; the caller is responsible for closing it
 * @throws RuntimeException if the file cannot be read or decryption fails
 */
@NotNull
protected static PDDocument openPdfDocument(@NotNull final File pdfFile, @Nullable final String password) {
    long t0 = System.currentTimeMillis();
    MDC.put("doc", pdfFile.getName());
    log.info("LOG00120:Opening PDF file " + pdfFile + ".");
    try {
        final PDDocument document = PDDocument.load(pdfFile);
        if (document.isEncrypted()) {
            if (password != null) {
                try {
                    document.decrypt(password);
                } catch (Exception e) {
                    // The document will never reach the caller, so release it here
                    // instead of leaking the underlying file handle.
                    try {
                        document.close();
                    } catch (IOException closeError) {
                        log.warn("Could not close " + pdfFile + " after failed decryption: " + closeError);
                    }
                    // Keep both failure paths consistent: clear the MDC entry.
                    MDC.put("doc", "");
                    throw new RuntimeException("Error while reading encrypted PDF:", e);
                }
            } else {
                log.warn("File claims to be encrypted, a password should be provided");
            }
        }
        log.debug("load()took" + (System.currentTimeMillis() - t0) + "ms");
        return document;
    } catch (IOException e) {
        MDC.put("doc", "");
        throw new RuntimeException("Error while reading " + pdfFile + ".", e);
    }
}

Example 65

Project: seng310-ebookme-master File: PdfExtractor.java View source code

/**
 * Parses the given stream as a PDF and hands the resulting document to
 * {@code processDocument}, closing the document afterwards.
 *
 * @param stream   the PDF content
 * @param charset  not used by this method
 * @param mimeType not used by this method
 * @param result   passed through to {@code processDocument}
 * @throws ExtractorException if parsing or closing the document fails with an I/O error
 */
public void extract(InputStream stream, Charset charset, String mimeType, Map result) throws ExtractorException {
    // phase 1: parse the stream into a PDDocument
    final PDDocument pdf;
    try {
        PDFParser pdfParser = new PDFParser(stream);
        pdfParser.parse();
        pdf = pdfParser.getPDDocument();
    } catch (IOException parseFailure) {
        throw new ExtractorException(parseFailure);
    }
    // phase 2: decrypt and extract info, always closing the document
    try {
        processDocument(pdf, result);
    } finally {
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException closeFailure) {
                throw new ExtractorException(closeFailure);
            }
        }
    }
}

Example 66

Project: syncope-master File: BinaryPDFPreviewer.java View source code

/**
 * Renders the first page of the uploaded PDF as a thumbnail and wraps it in
 * the "previewContainer" markup container. When the document is encrypted or
 * cannot be read, the "noPreviewFragment" is shown instead.
 *
 * @param uploadedBytes raw bytes of the uploaded PDF
 * @return the result of adding (or replacing) the preview container on this component
 */
@Override
public Component preview(final byte[] uploadedBytes) {
    firstPage = null;
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(new ByteArrayInputStream(uploadedBytes));
        if (!pdf.isEncrypted()) {
            firstPage = new PDFRenderer(pdf).renderImage(0, RESOLUTION, IMAGE_TYPE);
        } else {
            LOG.info("Document is encrypted, no preview is possible");
        }
    } catch (IOException e) {
        LOG.error("While generating thumbnail from first page", e);
    } finally {
        IOUtils.closeQuietly(pdf);
    }
    // pick the fragment depending on whether a thumbnail could be rendered
    final Fragment content;
    if (firstPage != null) {
        content = new Fragment("preview", "previewFragment", this);
        content.add(new NonCachingImage("previewImage", new ThumbnailImageResource(firstPage)));
    } else {
        content = new Fragment("preview", "noPreviewFragment", this);
    }
    WebMarkupContainer container = new WebMarkupContainer("previewContainer");
    container.setOutputMarkupId(true);
    container.add(content);
    return this.addOrReplace(container);
}

Example 67

Project: amos-ss15-proj4-master File: ZipGenerator.java View source code

/**
 * Writes an AES-256 encrypted zip archive to {@code out} containing two entries:
 * {@code employee.txt} (plain-text dump of the employee's fields) and
 * {@code employee.pdf} (the same data rendered on a single PDF page).
 *
 * <p>The zip stream, and with it {@code out}, is closed when the method completes normally.
 *
 * @param out         destination for the zip archive
 * @param locale      locale used to resolve the section headings
 * @param height      vertical distance between consecutive text lines in the PDF
 * @param employee    source of the personal-data and tax fields
 * @param fontSize    font size for the field lines in the PDF
 * @param zipPassword archive password; {@code null} selects a built-in default
 * @throws ZipException               if a zip entry cannot be written
 * @throws NoSuchMessageException     if a heading message key cannot be resolved
 * @throws IOException                if writing the archive or the PDF fails
 * @throws COSVisitorException        if the PDF cannot be serialized
 * @throws CloneNotSupportedException if the zip parameters cannot be cloned
 */
public void generate(OutputStream out, Locale locale, float height, Employee employee, int fontSize, String zipPassword) throws ZipException, NoSuchMessageException, IOException, COSVisitorException, CloneNotSupportedException {
    final ZipOutputStream zout = new ZipOutputStream(out);
    if (zipPassword == null) {
        // Use default password if none is set.
        // NOTE(review): a hard-coded fallback password offers little protection; consider requiring one.
        zipPassword = "fragebogen";
    }
    ZipParameters params = new ZipParameters();
    params.setFileNameInZip("employee.txt");
    // COMP_DEFLATE is a compression *method*; the original passed it to setCompressionLevel,
    // where it was immediately overwritten by the next call.
    params.setCompressionMethod(Zip4jConstants.COMP_DEFLATE);
    params.setCompressionLevel(Zip4jConstants.DEFLATE_LEVEL_ULTRA);
    params.setEncryptFiles(true);
    params.setReadHiddenFiles(false);
    params.setEncryptionMethod(Zip4jConstants.ENC_METHOD_AES);
    params.setAesKeyStrength(Zip4jConstants.AES_STRENGTH_256);
    params.setPassword(zipPassword);
    // entries are fed from memory, not from files on disk
    params.setSourceExternalStream(true);

    // ---- entry 1: employee.txt ----
    // NOTE(review): getBytes() uses the platform default charset — confirm this is intended.
    zout.putNextEntry(null, params);
    zout.write((AppContext.getApplicationContext().getMessage("HEADER", null, locale) + "\n\n").getBytes());
    zout.write((AppContext.getApplicationContext().getMessage("print.section.personalData", null, locale) + "\n\n").getBytes());
    Iterator it = employee.getPersonalDataFields().entrySet().iterator();
    while (it.hasNext()) {
        Map.Entry pair = (Map.Entry) it.next();
        zout.write((pair.getKey() + ": " + pair.getValue() + '\n').getBytes());
        // NOTE(review): the original comment claims this "avoids a ConcurrentModificationException",
        // but it also empties the map being iterated. Kept to preserve behavior; verify whether
        // getPersonalDataFields() returns a fresh map on each call, otherwise the PDF section
        // below will find no entries.
        it.remove();
    }
    zout.write(("\n\n" + AppContext.getApplicationContext().getMessage("print.section.taxes", null, locale) + "\n\n").getBytes());
    it = employee.getTaxesFields().entrySet().iterator();
    while (it.hasNext()) {
        Map.Entry pair = (Map.Entry) it.next();
        zout.write((pair.getKey() + ": " + pair.getValue() + '\n').getBytes());
        // see NOTE(review) above
        it.remove();
    }
    zout.closeEntry();

    // ---- entry 2: employee.pdf (rendered into memory first) ----
    ByteArrayOutputStream pdfout = new ByteArrayOutputStream();
    PDDocument document = new PDDocument();
    try {
        PDPage page = new PDPage();
        document.addPage(page);
        int margin = 100;
        // Base font for the field lines; headings use TIMES_BOLD.
        PDFont font = PDType1Font.TIMES_ROMAN;
        PDPageContentStream contentStream = new PDPageContentStream(document, page);
        contentStream.beginText();
        // Start near the top of the page; every subsequent move is relative to the previous position.
        float y = page.getMediaBox().getHeight() - margin + height;
        contentStream.moveTextPositionByAmount(margin, y);
        contentStream.setFont(PDType1Font.TIMES_BOLD, 36);
        contentStream.drawString(AppContext.getApplicationContext().getMessage("HEADER", null, locale));
        contentStream.setFont(PDType1Font.TIMES_BOLD, 14);
        contentStream.moveTextPositionByAmount(0, -4 * height);
        contentStream.drawString(AppContext.getApplicationContext().getMessage("print.section.personalData", null, locale));
        contentStream.moveTextPositionByAmount(0, -2 * height);
        contentStream.setFont(font, fontSize);
        it = employee.getPersonalDataFields().entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry pair = (Map.Entry) it.next();
            contentStream.drawString(pair.getKey() + ": " + pair.getValue());
            contentStream.moveTextPositionByAmount(0, -height);
            // see NOTE(review) above
            it.remove();
        }
        contentStream.setFont(PDType1Font.TIMES_BOLD, 14);
        contentStream.moveTextPositionByAmount(0, -2 * height);
        contentStream.drawString(AppContext.getApplicationContext().getMessage("print.section.taxes", null, locale));
        contentStream.moveTextPositionByAmount(0, -2 * height);
        contentStream.setFont(font, fontSize);
        it = employee.getTaxesFields().entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry pair = (Map.Entry) it.next();
            contentStream.drawString(pair.getKey() + ": " + pair.getValue());
            contentStream.moveTextPositionByAmount(0, -height);
            // see NOTE(review) above
            it.remove();
        }
        contentStream.endText();
        // The content stream must be closed before the document is saved.
        contentStream.close();
        document.save(pdfout);
    } finally {
        // Release the document even when rendering or saving fails.
        document.close();
    }
    ZipParameters params2 = (ZipParameters) params.clone();
    params2.setFileNameInZip("employee.pdf");
    zout.putNextEntry(null, params2);
    zout.write(pdfout.toByteArray());
    zout.closeEntry();
    // Write the zip to client
    zout.finish();
    zout.flush();
    zout.close();
}

Example 68

Project: ARX-master File: ARXCertificate.java View source code

/**
 * Renders the document, applies the watermark and writes the result to the given stream.
 *
 * <p>The rendered document is first saved to a temporary file, then reloaded with PDFBox
 * for watermarking. The temporary file is removed and the PDFBox document is closed
 * even when rendering, watermarking or saving fails.
 *
 * @param stream destination of the watermarked PDF; it is not closed by this method
 * @throws IOException if rendering, loading or saving fails
 */
public void save(OutputStream stream) throws IOException {
    // Render
    Document document = new Document(style.gethMargin(), style.gethMargin(), style.getvMargin(), style.getvMargin());
    for (Element element : this.elements) {
        element.render(document, 0, this.style);
    }
    // Save to temp file
    File tmp = File.createTempFile("arx", "certificate");
    try {
        document.save(tmp);
        // Load and watermark
        PDDocument pdDocument = PDDocument.load(tmp);
        try {
            Watermark watermark = new Watermark(pdDocument);
            watermark.mark(pdDocument);
            // Save
            pdDocument.save(stream);
        } finally {
            pdDocument.close();
        }
    } finally {
        // Never leave the intermediate file behind, whatever happened above.
        tmp.delete();
    }
}

Example 69

Project: cider-master File: pdfIdiom.java View source code

/**
 * Parses a PDF data source into an RDF model holding the document's author, subject,
 * title, keywords and full text content.
 *
 * <p>An encrypted document is opened with an empty password; parsing fails if that does
 * not grant permission to extract content. The PDF document is always closed before this
 * method returns or throws; a failure while closing is logged rather than reported.
 *
 * @param source the data source providing the PDF stream and, optionally, its URI
 * @return a model with one resource describing the document
 * @throws ParserException if the stream cannot be parsed, the document cannot be decrypted,
 *                         or text extraction fails
 */
@Override
public Model parse(DataSource source) throws ParserException {
    // create an empty Model
    Model model = ModelFactory.createDefaultModel();
    Resource resource = source.hasURI() ? model.createResource(source.getURI().toNormalform(true, true)) : model.createResource();
    // open pdf document
    final PDDocument theDocument;
    try {
        final PDFParser parser = new PDFParser(source.getStream());
        parser.parse();
        theDocument = parser.getPDDocument();
    } catch (IOException e) {
        log.error(e.getMessage(), e);
        // keep the cause so callers can see the underlying I/O failure
        throw new ParserException(e.getMessage(), source.getURI(), e);
    }
    try {
        if (theDocument.isEncrypted()) {
            try {
                // try the empty user password
                theDocument.openProtection(new StandardDecryptionMaterial(""));
            } catch (BadSecurityHandlerException e) {
                throw new ParserException("PDF Encrypted (BadSecurityHandlerException): " + e.getMessage(), source.getURI(), e);
            } catch (IOException e) {
                throw new ParserException("PDF Encrypted (IOException): " + e.getMessage(), source.getURI(), e);
            } catch (CryptographyException e) {
                throw new ParserException("PDF Encrypted (CryptographyException): " + e.getMessage(), source.getURI(), e);
            }
            final AccessPermission perm = theDocument.getCurrentAccessPermission();
            if (perm == null || !perm.canExtractContent()) {
                throw new ParserException("PDF cannot be decrypted", source.getURI());
            }
        }
        // get metadata
        final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
        String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
        if (theDocInfo != null) {
            docTitle = theDocInfo.getTitle();
            docSubject = theDocInfo.getSubject();
            docAuthor = theDocInfo.getAuthor();
            docKeywordStr = theDocInfo.getKeywords();
        }
        if (docAuthor != null && docAuthor.length() > 0) {
            resource.addProperty(VCARD.FN, docAuthor);
            resource.addProperty(DC.creator, docAuthor);
        }
        if (docSubject != null && docSubject.length() > 0) {
            resource.addProperty(DC.subject, docSubject);
        }
        if (docTitle != null && docTitle.length() > 0) {
            resource.addProperty(DC.title, docTitle);
        }
        if (docKeywordStr != null && docKeywordStr.length() > 0) {
            // keywords are separated by spaces or commas
            String[] docKeywords = docKeywordStr.split(" |,");
            resource.addProperty(DC.coverage, concat(docKeywords));
        }
        // get the content
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        Writer writer;
        try {
            writer = new OutputStreamWriter(baos, "UTF-8");
        } catch (UnsupportedEncodingException e1) {
            writer = new OutputStreamWriter(baos);
        }
        try {
            final PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText(theDocument, writer);
            // closing flushes the encoder so baos holds the complete text
            writer.close();
        } catch (IOException e) {
            try {
                writer.close();
            } catch (final Exception ignored) {
                // best effort: the extraction failure below is the error worth reporting
            }
            throw new ParserException("PDF content reader", source.getURI(), e);
        }
        String content;
        try {
            content = new String(baos.toByteArray(), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            content = new String(baos.toByteArray());
        }
        if (content.length() > 0) {
            resource.addProperty(CIDER.data_content_text, content);
        }
        return model;
    } finally {
        // Release the document on success and on every failure path.
        try {
            theDocument.close();
        } catch (IOException e) {
            log.error(e.getMessage(), e);
        }
    }
}

Example 70

Project: dkpro-core-master File: Pdf2CasConverter.java View source code

/**
 * Loads a PDF from the stream and writes its text into the given CAS via
 * {@code writeText(PDDocument)}. The document is closed in every case.
 *
 * @param aCas the CAS that receives the text
 * @param aIs  the PDF input
 * @throws IOException if the PDF cannot be loaded or processed, or if it is encrypted
 */
public void writeText(final CAS aCas, final InputStream aIs) throws IOException {
    final PDDocument pdf = PDDocument.load(aIs);
    try {
        if (!pdf.isEncrypted()) {
            // reset per-document state before delegating
            cas = aCas;
            text = new StringBuilder();
            writeText(pdf);
        } else {
            throw new IOException("Encrypted documents currently not supported");
        }
    } finally {
        pdf.close();
    }
}

Example 71

Project: DrakkarKeel-master File: PdfParser.java View source code

/**
 * Extracts metadata and text content from a PDF file.
 *
 * <p>The document is processed only if it is not encrypted, permits content extraction
 * and has at least one page. Title, author, page count, creation and modification dates
 * and the full text are stored on this parser when document information is available.
 * The document is closed exactly once, in the {@code finally} block; a failure while
 * closing is reported but does not change the return value.
 *
 * @param f the PDF file to analyze
 * @return {@code true} if the document was eligible and processed without an exception;
 *         {@code false} otherwise
 */
public boolean analyzePdfDocument(File f) {
    try {
        pdoc = PDDocument.load(f);
        if (!pdoc.isEncrypted() && pdoc.getCurrentAccessPermission().canExtractContent() && pdoc.getNumberOfPages() != 0) {
            this.numberPages = pdoc.getNumberOfPages();
            pdfText = new PDFTextStripper();
            swriter = new StringWriter();
            // document data
            pinf = pdoc.getDocumentInformation();
            if (pinf == null) {
                OutputMonitor.printLine("The document does not have available information.", OutputMonitor.INFORMATION_MESSAGE);
            } else {
                setTitle(pinf.getTitle());
                setAuthor(pinf.getAuthor());
                setNumberpages(pdoc.getNumberOfPages());
                setCalCreation(pinf.getCreationDate());
                setCalModification(pinf.getModificationDate());
                pdfText.writeText(pdoc, swriter);
                allContent = swriter.getBuffer().toString();
            }
            // pdoc is closed in the finally block; closing it here as well would close it twice.
            swriter.close();
            return true;
        } else {
            OutputMonitor.printLine("Encrypted document.", OutputMonitor.INFORMATION_MESSAGE);
        }
    } catch (Exception ex) {
        OutputMonitor.printStream("", ex);
    } finally {
        if (pdoc != null) {
            try {
                pdoc.close();
            } catch (IOException ex) {
                OutputMonitor.printStream("IO", ex);
            }
        }
    }
    return false;
}

Example 72

Project: EasySendToKindle-master File: FileUtil.java View source code

/**
 * Extracts the text of a PDF into a UTF-8 {@code .txt} file.
 *
 * <p>{@code file} is first interpreted as a URL; if it is not a valid URL it is treated
 * as a local file name. The output name is derived by replacing the last four characters
 * (the ".pdf" extension) with ".txt". For a URL only the file name part is used, so the
 * text file is written relative to the working directory.
 *
 * @param file URL or path of the PDF to convert
 * @throws IllegalArgumentException if no output name can be derived because the name has
 *                                  four characters or fewer
 * @throws Exception                if the PDF cannot be loaded or the text cannot be written
 */
public static void pdf2txt(String file) throws Exception {
    final boolean sort = false;
    final String encoding = "UTF-8";
    final int startPage = 1;
    final int endPage = Integer.MAX_VALUE;
    String textFile = null;
    Writer output = null;
    PDDocument document = null;
    try {
        try {
            URL url = new URL(file);
            // Load through the URL; loading by the raw string would treat it as a file name.
            document = PDDocument.load(url);
            String fileName = url.getFile();
            if (fileName.length() > 4) {
                File outputFile = new File(fileName.substring(0, fileName.length() - 4) + ".txt");
                textFile = outputFile.getName();
            }
        } catch (MalformedURLException e) {
            // not a URL: fall back to a local file name
            document = PDDocument.load(file);
            if (file.length() > 4) {
                textFile = file.substring(0, file.length() - 4) + ".txt";
            }
        }
        if (textFile == null) {
            // Fail with a clear message instead of a NullPointerException from FileOutputStream.
            throw new IllegalArgumentException("Cannot derive an output file name from: " + file);
        }
        output = new OutputStreamWriter(new FileOutputStream(textFile), encoding);
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(sort);
        stripper.setStartPage(startPage);
        stripper.setEndPage(endPage);
        stripper.writeText(document, output);
    } finally {
        // Nested so that a failure closing the writer cannot skip closing the document.
        try {
            if (output != null) {
                output.close();
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }
}

Example 73

Project: eid-applet-master File: PdfSpikeTest.java View source code

/**
 * Spike test: creates a two-page sample PDF, signs it with the signing key of an eID card
 * obtained through {@code PcscEid}, verifies the signature by reading the signed file back,
 * and finally reads the signature dictionary again with Apache PDFBox.
 *
 * NOTE(review): the Pdf* / AcroFields classes used for creation and signing appear to be
 * iText — confirm against the file's imports. The test blocks in
 * {@code waitForEidPresent()} until a card is present, and the streams, readers and the
 * PDFBox document opened here are never closed.
 */
@Test
public void testSignPDF() throws Exception {
    // create a sample PDF file
    Document document = new Document();
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    PdfWriter.getInstance(document, baos);
    document.open();
    Paragraph titleParagraph = new Paragraph("This is a test.");
    titleParagraph.setAlignment(Paragraph.ALIGN_CENTER);
    document.add(titleParagraph);
    document.newPage();
    Paragraph textParagraph = new Paragraph("Hello world.");
    document.add(textParagraph);
    document.close();
    // persist the in-memory PDF to a temp file that serves as the signing input
    File tmpFile = File.createTempFile("test-", ".pdf");
    LOG.debug("tmp file: " + tmpFile.getAbsolutePath());
    FileUtils.writeByteArrayToFile(tmpFile, baos.toByteArray());
    // eID
    PcscEid pcscEid = new PcscEid(new TestView(), new Messages(Locale.getDefault()));
    if (false == pcscEid.isEidPresent()) {
        LOG.debug("insert eID card");
        pcscEid.waitForEidPresent();
    }
    // copy the signing certificate chain into the array form expected by setCrypto below
    List<X509Certificate> signCertificateChain = pcscEid.getSignCertificateChain();
    Certificate[] certs = new Certificate[signCertificateChain.size()];
    for (int idx = 0; idx < certs.length; idx++) {
        certs[idx] = signCertificateChain.get(idx);
    }
    // open the pdf
    FileInputStream pdfInputStream = new FileInputStream(tmpFile);
    File signedTmpFile = File.createTempFile("test-signed-", ".pdf");
    PdfReader reader = new PdfReader(pdfInputStream);
    FileOutputStream pdfOutputStream = new FileOutputStream(signedTmpFile);
    PdfStamper stamper = PdfStamper.createSignature(reader, pdfOutputStream, '\0', null, true);
    // add extra page
    Rectangle pageSize = reader.getPageSize(1);
    int pageCount = reader.getNumberOfPages();
    int extraPageIndex = pageCount + 1;
    stamper.insertPage(extraPageIndex, pageSize);
    // calculate unique signature field name
    int signatureNameIndex = 1;
    String signatureName;
    AcroFields existingAcroFields = reader.getAcroFields();
    List<String> existingSignatureNames = existingAcroFields.getSignatureNames();
    // try Signature1, Signature2, ... until a name not yet present in the document is found
    do {
        signatureName = "Signature" + signatureNameIndex;
        signatureNameIndex++;
    } while (existingSignatureNames.contains(signatureName));
    LOG.debug("new unique signature name: " + signatureName);
    // configure the signature appearance; the visible signature is placed on the extra page
    PdfSignatureAppearance signatureAppearance = stamper.getSignatureAppearance();
    signatureAppearance.setCrypto(null, certs, null, PdfSignatureAppearance.SELF_SIGNED);
    signatureAppearance.setCertificationLevel(PdfSignatureAppearance.CERTIFIED_NO_CHANGES_ALLOWED);
    signatureAppearance.setReason("PDF Signature Test");
    signatureAppearance.setLocation("Belgium");
    signatureAppearance.setVisibleSignature(new Rectangle(54, 440, 234, 566), extraPageIndex, signatureName);
    // placeholder digest values; the real ones are set on the signer after the card has signed
    signatureAppearance.setExternalDigest(new byte[128], new byte[20], "RSA");
    signatureAppearance.preClose();
    // hash the signed byte range and let the eID card sign the SHA-1 digest
    byte[] content = IOUtils.toByteArray(signatureAppearance.getRangeStream());
    byte[] hash = MessageDigest.getInstance("SHA-1").digest(content);
    byte[] signatureBytes = pcscEid.sign(hash, "SHA-1");
    pcscEid.close();
    // inject the externally produced signature and write it into the Contents entry
    PdfSigGenericPKCS sigStandard = signatureAppearance.getSigStandard();
    PdfPKCS7 signature = sigStandard.getSigner();
    signature.setExternalDigest(signatureBytes, hash, "RSA");
    PdfDictionary dictionary = new PdfDictionary();
    dictionary.put(PdfName.CONTENTS, new PdfString(signature.getEncodedPKCS1()).setHexWriting(true));
    signatureAppearance.close(dictionary);
    LOG.debug("signed tmp file: " + signedTmpFile.getAbsolutePath());
    // verify the signature
    reader = new PdfReader(new FileInputStream(signedTmpFile));
    AcroFields acroFields = reader.getAcroFields();
    ArrayList<String> signatureNames = acroFields.getSignatureNames();
    // the verification results are only logged, nothing is asserted
    for (String signName : signatureNames) {
        LOG.debug("signature name: " + signName);
        LOG.debug("signature covers whole document: " + acroFields.signatureCoversWholeDocument(signName));
        LOG.debug("document revision " + acroFields.getRevision(signName) + " of " + acroFields.getTotalRevisions());
        PdfPKCS7 pkcs7 = acroFields.verifySignature(signName);
        Calendar signDate = pkcs7.getSignDate();
        LOG.debug("signing date: " + signDate.getTime());
        LOG.debug("Subject: " + PdfPKCS7.getSubjectFields(pkcs7.getSigningCertificate()));
        LOG.debug("Document modified: " + !pkcs7.verify());
        Certificate[] verifyCerts = pkcs7.getCertificates();
        for (Certificate certificate : verifyCerts) {
            X509Certificate x509Certificate = (X509Certificate) certificate;
            LOG.debug("cert subject: " + x509Certificate.getSubjectX500Principal());
        }
    }
    /*
     * Reading the signature using Apache PDFBox.
     */
    PDDocument pdDocument = PDDocument.load(signedTmpFile);
    COSDictionary trailer = pdDocument.getDocument().getTrailer();
    /*
     * PDF Reference - third edition - Adobe Portable Document Format -
     * Version 1.4 - 3.6.1 Document Catalog
     */
    COSDictionary documentCatalog = (COSDictionary) trailer.getDictionaryObject(COSName.ROOT);
    /*
     * 8.6.1 Interactive Form Dictionary
     */
    COSDictionary acroForm = (COSDictionary) documentCatalog.getDictionaryObject(COSName.ACRO_FORM);
    COSArray fields = (COSArray) acroForm.getDictionaryObject(COSName.FIELDS);
    // walk the form fields and log the details of every signature field (FT == "Sig")
    for (int fieldIdx = 0; fieldIdx < fields.size(); fieldIdx++) {
        COSDictionary field = (COSDictionary) fields.getObject(fieldIdx);
        String fieldType = field.getNameAsString("FT");
        if ("Sig".equals(fieldType)) {
            COSDictionary signatureDictionary = (COSDictionary) field.getDictionaryObject(COSName.V);
            /*
             * TABLE 8.60 Entries in a signature dictionary
             */
            COSString signatoryName = (COSString) signatureDictionary.getDictionaryObject(COSName.NAME);
            if (null != signatoryName) {
                LOG.debug("signatory name: " + signatoryName.getString());
            }
            COSString reason = (COSString) signatureDictionary.getDictionaryObject(COSName.REASON);
            if (null != reason) {
                LOG.debug("reason: " + reason.getString());
            }
            COSString location = (COSString) signatureDictionary.getDictionaryObject(COSName.LOCATION);
            if (null != location) {
                LOG.debug("location: " + location.getString());
            }
            Calendar signingTime = signatureDictionary.getDate(COSName.M);
            if (null != signingTime) {
                LOG.debug("signing time: " + signingTime.getTime());
            }
            String signatureHandler = signatureDictionary.getNameAsString(COSName.FILTER);
            LOG.debug("signature handler: " + signatureHandler);
        }
    }
}

Example 74

Project: elexis-3-base-master File: PrintVaccinationEntriesHandler.java View source code

/**
 * Creates a one-page A4 PDF for the patient, saves it as
 * {@code impfplan_<patCode>.pdf} in the configured output directory and opens it with
 * the desktop's default application.
 *
 * <p>The page shows the patient label as heading, a line with the issue date and the
 * selected mandator, and the supplied image scaled to the remaining page area. The
 * PDF document is closed even when rendering or saving fails.
 *
 * @param patient the patient the document is issued for
 * @param image   the image to place on the page
 * @throws IOException         if writing the page content, saving or opening the file fails
 * @throws COSVisitorException if the document cannot be serialized
 */
private void createPDF(Patient patient, Image image) throws IOException, COSVisitorException {
    PDDocumentInformation pdi = new PDDocumentInformation();
    Mandant mandant = (Mandant) ElexisEventDispatcher.getSelected(Mandant.class);
    pdi.setAuthor(mandant.getName() + " " + mandant.getVorname());
    pdi.setCreationDate(new GregorianCalendar());
    pdi.setTitle("Impfausweis " + patient.getLabel());
    final File pdf;
    PDDocument document = new PDDocument();
    try {
        document.setDocumentInformation(pdi);
        PDPage page = new PDPage();
        page.setMediaBox(PDPage.PAGE_SIZE_A4);
        document.addPage(page);
        PDRectangle pageSize = page.findMediaBox();
        PDFont font = PDType1Font.HELVETICA_BOLD;
        PDFont subFont = PDType1Font.HELVETICA;
        PDPageContentStream contentStream = new PDPageContentStream(document, page);
        // heading: patient label, 40 units from the left and top edges
        contentStream.beginText();
        contentStream.setFont(font, 14);
        contentStream.moveTextPositionByAmount(40, pageSize.getUpperRightY() - 40);
        contentStream.drawString(patient.getLabel());
        contentStream.endText();
        // sub heading: issue date and mandator
        String dateLabel = sdf.format(Calendar.getInstance().getTime());
        String title = Person.load(mandant.getId()).get(Person.TITLE);
        String mandantLabel = title + " " + mandant.getName() + " " + mandant.getVorname();
        contentStream.beginText();
        contentStream.setFont(subFont, 10);
        contentStream.moveTextPositionByAmount(40, pageSize.getUpperRightY() - 55);
        contentStream.drawString("Ausstellung " + dateLabel + ", " + mandantLabel);
        contentStream.endText();
        // image: fills the page below the headings, leaving a margin
        BufferedImage imageAwt = convertToAWT(image.getImageData());
        PDXObjectImage pdPixelMap = new PDPixelMap(document, imageAwt);
        contentStream.drawXObject(pdPixelMap, 40, 30, pageSize.getWidth() - 80, pageSize.getHeight() - 100);
        contentStream.close();
        String outputPath = CoreHub.userCfg.get(PreferencePage.VAC_PDF_OUTPUTDIR, CoreHub.getWritableUserDir().getAbsolutePath());
        // tell the user where the file goes when only the fallback directory is in effect
        if (outputPath.equals(CoreHub.getWritableUserDir().getAbsolutePath())) {
            SWTHelper.showInfo("Kein Ausgabeverzeichnis definiert", "Ausgabe erfolgt in: " + outputPath + "\nDas Ausgabeverzeichnis kann unter Einstellungen\\Klinische Hilfsmittel\\Impfplan definiert werden.");
        }
        File outputDir = new File(outputPath);
        pdf = new File(outputDir, "impfplan_" + patient.getPatCode() + ".pdf");
        document.save(pdf);
    } finally {
        // Release the document on success and on failure.
        document.close();
    }
    Desktop.getDesktop().open(pdf);
}

Example 75

Project: geoserver-master File: PDFGetMapTest.java View source code

/**
 * Processes the first page of the given PDF and returns the last tiling pattern that was
 * set as non-stroking color while doing so. Intended for documents that contain a single
 * tiling pattern which is actually used to fill shapes.
 *
 * @param pdfDocument the PDF document bytes
 * @return the last tiling pattern encountered, or {@code null} if none was found
 * @throws InvalidPasswordException
 * @throws IOException
 */
PDTilingPattern getTilingPattern(byte[] pdfDocument) throws InvalidPasswordException, IOException {
    // PDFBox is used for parsing (iText is no good for parsing tiling patterns, mostly works
    // well for text and image extraction, spent a few hours trying to use it with no results)
    PDDocument parsed = PDDocument.load(pdfDocument);
    PDPage firstPage = parsed.getPage(0);
    // holder the operator trap below writes into
    AtomicReference<PDTilingPattern> lastSeen = new AtomicReference<>();
    // a graphics stream engine is the only thing found that parses the PDF deep enough
    // to allow catching the tiling pattern in parsed form; all drawing callbacks are no-ops
    PDFStreamEngine streamEngine = new PDFGraphicsStreamEngine(firstPage) {

        @Override
        public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException {
        }

        @Override
        public void moveTo(float x, float y) throws IOException {
        }

        @Override
        public void lineTo(float x, float y) throws IOException {
        }

        @Override
        public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException {
        }

        @Override
        public Point2D getCurrentPoint() throws IOException {
            return null;
        }

        @Override
        public void closePath() throws IOException {
        }

        @Override
        public void endPath() throws IOException {
        }

        @Override
        public void clip(int windingRule) throws IOException {
        }

        @Override
        public void strokePath() throws IOException {
        }

        @Override
        public void fillPath(int windingRule) throws IOException {
        }

        @Override
        public void fillAndStrokePath(int windingRule) throws IOException {
        }

        @Override
        public void shadingFill(COSName shadingName) throws IOException {
        }

        @Override
        public void drawImage(PDImage pdImage) throws IOException {
        }
    };
    // setup the tiling pattern trap
    streamEngine.addOperator(new SetNonStrokingColorN() {

        @Override
        public void process(Operator operator, List<COSBase> arguments) throws IOException {
            super.process(operator, arguments);
            if (!(context.getGraphicsState().getNonStrokingColorSpace() instanceof PDPattern)) {
                // not a pattern fill, nothing to record
                return;
            }
            PDPattern patternSpace = (PDPattern) context.getGraphicsState().getNonStrokingColorSpace();
            PDColor fillColor = context.getGraphicsState().getNonStrokingColor();
            PDAbstractPattern candidate = patternSpace.getPattern(fillColor);
            if (candidate instanceof PDTilingPattern) {
                lastSeen.set((PDTilingPattern) candidate);
            }
        }
    });
    // run it
    streamEngine.processPage(firstPage);
    return lastSeen.get();
}

Example 76

Project: is.idega.idegaweb.marathon-master File: PDFTester.java View source code

/**
 * Writes a three-page A4 demo PDF that draws {@code message} with different
 * text transformations and saves it to {@code outfile}.
 *
 * <ul>
 *   <li>Page 1: the message rotated in steps of PI/4, once with positive and
 *       once with negative angles.</li>
 *   <li>Page 2: the message drawn ten times with growing scale factors.</li>
 *   <li>Page 3: four explicit text matrices combining scaling and rotation.</li>
 * </ul>
 *
 * @param message text to draw; a running index is appended to every occurrence
 * @param outfile destination passed to {@code PDDocument.save}
 * @throws IOException if writing content or saving fails
 * @throws COSVisitorException declared by the PDFBox save call
 */
public void doIt(String message, String outfile) throws IOException, COSVisitorException {
    PDDocument document = null;
    try {
        document = new PDDocument();
        final PDFont helvetica = PDType1Font.HELVETICA;

        // ---- Page 1: rotations ----
        PDPage currentPage = new PDPage();
        currentPage.setMediaBox(PDPage.PAGE_SIZE_A4);
        document.addPage(currentPage);
        float size = 12.0f;
        final PDRectangle mediaBox = currentPage.findMediaBox();
        final float anchorX = (mediaBox.getWidth() - size / 1000f) / 2f;
        final float messageWidth = helvetica.getStringWidth(message);
        final float anchorY = (mediaBox.getHeight() - (messageWidth * size) / 1000f) / 3f;
        PDPageContentStream canvas = new PDPageContentStream(document, currentPage, false, false);
        canvas.setFont(helvetica, size);
        canvas.beginText();
        // positive angles: counterclockwise, anchored in the upper part of the page
        for (int step = 0; step < 8; step++) {
            canvas.setTextRotation(step * Math.PI * 0.25, anchorX, mediaBox.getHeight() - anchorY);
            canvas.drawString(message + " " + step);
        }
        // negative angles: clockwise, anchored in the lower part of the page
        for (int step = 0; step < 8; step++) {
            canvas.setTextRotation(-step * Math.PI * 0.25, anchorX, anchorY);
            canvas.drawString(message + " " + step);
        }
        canvas.endText();
        canvas.close();

        // ---- Page 2: scaling only (font size 1, the scale factors carry the size) ----
        currentPage = new PDPage();
        currentPage.setMediaBox(PDPage.PAGE_SIZE_A4);
        document.addPage(currentPage);
        size = 1.0f;
        canvas = new PDPageContentStream(document, currentPage, false, false);
        canvas.setFont(helvetica, size);
        canvas.beginText();
        for (int step = 0; step < 10; step++) {
            canvas.setTextScaling(12 + (step * 6), 12 + (step * 6), 100, 100 + step * 50);
            canvas.drawString(message + " " + step);
        }
        canvas.endText();
        canvas.close();

        // ---- Page 3: scaling combined with rotation via explicit text matrices ----
        currentPage = new PDPage();
        currentPage.setMediaBox(PDPage.PAGE_SIZE_A4);
        document.addPage(currentPage);
        size = 1.0f;
        canvas = new PDPageContentStream(document, currentPage, false, false);
        canvas.setFont(helvetica, size);
        canvas.beginText();
        final double matrixY = anchorY * 1.5;
        int label = 0;
        canvas.setTextMatrix(12, 0, 0, 12, anchorX, matrixY);
        canvas.drawString(message + " " + label++);
        canvas.setTextMatrix(0, 18, -18, 0, anchorX, matrixY);
        canvas.drawString(message + " " + label++);
        canvas.setTextMatrix(-24, 0, 0, -24, anchorX, matrixY);
        canvas.drawString(message + " " + label++);
        canvas.setTextMatrix(0, -30, 30, 0, anchorX, matrixY);
        canvas.drawString(message + " " + label++);
        canvas.endText();
        canvas.close();

        document.save(outfile);
    } finally {
        if (document != null) {
            document.close();
        }
    }
}

Example 77

Project: pdf2alto-master File: PrintWordLocations.java View source code

/**
 * Loads the PDF named by the single command-line argument and prints an ALTO
 * XML skeleton to standard output, delegating the content of every page to a
 * {@code PrintWordLocations} instance.
 *
 * <p>With any other argument count only {@code usage()} is invoked. An
 * encrypted document is decrypted with the empty password; if that is
 * rejected the process exits with status 1.
 *
 * @param args command-line arguments; exactly one (the PDF to load) is expected
 * @throws Exception if loading, decrypting or processing the document fails
 */
public void processDocuments(String[] args) throws Exception {
    if (args.length != 1) {
        usage();
        return;
    }
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(args[0]);
        if (pdf.isEncrypted()) {
            try {
                pdf.decrypt("");
            } catch (InvalidPasswordException e) {
                System.err.println("Error: Document is encrypted with a password.");
                System.exit(1);
            }
        }
        PrintWordLocations printer = new PrintWordLocations();
        List pageList = pdf.getDocumentCatalog().getAllPages();
        System.out.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?><alto xmlns=\"http://www.loc.gov/standards/alto/alto-v2.0.xsd\"><Description><MeasurementUnit>inch1200</MeasurementUnit></Description><Layout>");
        for (Object entry : pageList) {
            PDPage page = (PDPage) entry;
            if (page.getCropBox() != null) {
                // shift coordinates by the distance between crop box and media box origins
                PDRectangle media = (PDRectangle) page.getMediaBox();
                PDRectangle crop = (PDRectangle) page.getCropBox();
                float offsetX = crop.getLowerLeftX() - media.getLowerLeftX();
                float offsetY = crop.getLowerLeftY() - media.getLowerLeftY();
                printer.setOffset(new MarginOffset(offsetX, offsetY));
            }
            System.out.println("<Page>");
            System.out.println("<PrintSpace>");
            System.out.println("<TextBlock>");
            System.out.println("<TextLine>");
            PDStream pageContents = page.getContents();
            if (pageContents != null) {
                printer.processStream(page, page.findResources(), page.getContents().getStream());
            }
            endOfPage();
            System.out.println("</TextLine>");
            System.out.println("</TextBlock>");
            System.out.println("</PrintSpace>");
            System.out.println("</Page>");
        }
        System.out.println("</Layout></alto>");
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}

Example 78

Project: s2robot-master File: PdfExtractor.java View source code

/*
 * (non-Javadoc)
 *
 * Extracts the text of a PDF read from the given stream.
 *
 * The extraction itself runs on a daemon thread so that it can be abandoned
 * when it does not finish within `timeout`; the whole operation is serialized
 * on pdfBoxLockObj. Every failure surfaces as an ExtractException:
 * an ExtractException raised inside is rethrown as-is (not wrapped a second
 * time), and if the calling thread is interrupted its interrupt flag is
 * restored before the exception is wrapped.
 *
 * @see org.seasar.robot.extractor.Extractor#getText(java.io.InputStream,
 * java.util.Map)
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new RobotSystemException("The inputstream is null.");
    }
    synchronized (pdfBoxLockObj) {
        PDDocument document = null;
        try {
            document = PDDocument.load(in, null, force);
            if (document.isEncrypted() && params != null) {
                // explicit password parameter first, then the lookup keyed by URL / resource name
                String password = params.get(ExtractData.PDF_PASSWORD);
                if (password == null) {
                    password = getPassword(params.get(ExtractData.URL), params.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
                }
                if (password != null) {
                    final StandardDecryptionMaterial sdm = new StandardDecryptionMaterial(password);
                    document.openProtection(sdm);
                    final AccessPermission ap = document.getCurrentAccessPermission();
                    if (!ap.canExtractContent()) {
                        throw new IOException("You do not have permission to extract text.");
                    }
                }
            }
            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            final Writer output = new OutputStreamWriter(baos, encoding);
            final PDFTextStripper stripper = new PDFTextStripper(encoding);
            stripper.setForceParsing(force);
            final AtomicBoolean done = new AtomicBoolean(false);
            final PDDocument doc = document;
            // written by the worker before done.set(true), read here only after done.get()
            final Set<Exception> exceptionSet = new HashSet<>();
            Thread task = new Thread(new Runnable() {

                @Override
                public void run() {
                    try {
                        stripper.writeText(doc, output);
                    } catch (Exception e) {
                        exceptionSet.add(e);
                    } finally {
                        done.set(true);
                    }
                }
            });
            task.setDaemon(true);
            task.start();
            // NOTE(review): Thread.join takes milliseconds while the message below says "sec." - confirm the unit of timeout
            task.join(timeout);
            if (!done.get()) {
                // keep nudging the worker for up to ~5 seconds before giving up on it
                for (int i = 0; i < 100 && !done.get(); i++) {
                    task.interrupt();
                    Thread.sleep(50);
                }
                throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec.");
            } else if (!exceptionSet.isEmpty()) {
                throw exceptionSet.iterator().next();
            }
            output.flush();
            final ExtractData extractData = new ExtractData(baos.toString(encoding));
            extractMetadata(document, extractData);
            return extractData;
        } catch (final ExtractException e) {
            // already the exception type callers expect; do not hide it inside another wrapper
            throw e;
        } catch (final InterruptedException e) {
            // join/sleep cleared the flag; restore it so callers can still observe the interruption
            Thread.currentThread().interrupt();
            throw new ExtractException(e);
        } catch (final Exception e) {
            throw new ExtractException(e);
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (final IOException ignored) {
                    // best effort: a failure to close must not mask the extraction result or error
                }
            }
        }
    }
}

Example 79

Project: sad-analyzer-master File: PdfParser.java View source code

/**
 * Parses the PDF at {@code urlSad} into a section tree.
 *
 * <p>If the document has an outline, the sections are built from its first
 * outline item (after validating it against the template when one has been
 * loaded); otherwise the whole text is stored as a single item named after
 * the file. The PDF is always closed before this method returns.
 *
 * @param pathTemplate path of the structure template; when non-empty it
 *                     replaces the current {@code structureXml}
 * @param urlSad       path of the PDF file to read
 * @return the parsed section, or {@code null} when the template validation
 *         fails or the file cannot be read
 * @see input.parser.SadParser#getSad(java.lang.String)
 */
public Section getSad(String pathTemplate, String urlSad) {
    Section section = new CompositeSection();
    if (!pathTemplate.isEmpty()) {
        structureXml = new XmlReader(pathTemplate);
    }
    PDDocument doc = null;
    try {
        File input = new File(urlSad);
        doc = PDDocument.load(input);
        PDDocumentOutline root = doc.getDocumentCatalog().getDocumentOutline();
        if (root != null) {
            // Request the first node of the outline tree
            PDOutlineItem item = root.getFirstChild();
            if (structureXml != null) {
                if (validateTemplate(item)) {
                    section = parserSections(item, doc);
                } else {
                    section = null;
                }
            } else {
                section = parserSections(item, doc);
            }
        } else {
            Item s = new Item();
            s.setText(extractText(0, doc, doc.getNumberOfPages()));
            s.setName(input.getName());
            ((CompositeSection) section).addSection(s);
        }
        return section;
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // NOTE(review): assumes parserSections/extractText copy the text eagerly and do not
        // keep using the document after returning - confirm before relying on this
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return null;
}

Example 80

Project: shr5rcp-master File: SourceBookView.java View source code

/**
 * Extracts the text for {@code src} from the PDF, caches it in {@code pageMap}
 * under {@code key}, and then publishes it on the display thread.
 * Always reports {@code Status.OK_STATUS}; I/O failures are only logged.
 */
@Override
protected IStatus run(IProgressMonitor monitor) {
    try {
        final PDDocument loaded = getpdfDoc(file, monitor);
        if (loaded != null) {
            final String pageText = getTextFromPage(src, loaded);
            pageMap.put(key, pageText);
            // UI updates are queued on the display thread
            Display.getDefault().asyncExec(new Runnable() {

                @Override
                public void run() {
                    displayedText.setValue(pageText);
                    processText(pageText, src);
                }
            });
        }
    } catch (IOException e) {
        Activator.logError(e);
    }
    return Status.OK_STATUS;
}

Example 81

Project: smartly-master File: PDFUtils.java View source code

/**
 * Opens {@code pdfFile} and hands every page to {@code callback} together
 * with a 1-based counter and the total page count.
 *
 * <p>Iteration stops as soon as the callback returns {@code false}. A
 * {@code null} callback is tolerated: the document is opened and closed
 * without any page being reported.
 *
 * @param pdfFile       the PDF to open
 * @param nonSequential {@code true} to load with {@code PDDocument.loadNonSeq},
 *                      {@code false} for {@code PDDocument.load}
 * @param callback      receives {@code (page, index, total)}; may be {@code null}
 * @throws IOException if the document cannot be loaded or closed
 */
public static void forEachPage(final File pdfFile, final boolean nonSequential, Delegates.Function<Boolean> callback) throws IOException {
    final PDDocument document;
    if (nonSequential) {
        document = PDDocument.loadNonSeq(pdfFile, null);
    } else {
        document = PDDocument.load(pdfFile);
    }
    try {
        final List<PDPage> allPages = document.getDocumentCatalog().getAllPages();
        final int total = allPages.size();
        int index = 1;
        for (final PDPage current : allPages) {
            if (callback == null) {
                continue;
            }
            if (!callback.handle(current, index, total)) {
                break;
            }
            index++;
        }
    } finally {
        document.close();
    }
}

Example 82

Project: trantor-pdf-converter-master File: PdfDoc.java View source code

/**
 * Renders the first page of a PDF as an RGB image at {@code Consts.PREVIEW_DPI}
 * and writes it to {@code output} in the {@code Consts.PNG} format.
 *
 * <p>Any failure while loading, rendering or writing is logged at SEVERE level
 * and not rethrown; only an error while closing the document propagates.
 *
 * @param pdf    path of the PDF to load
 * @param output path of the image file to write
 * @throws IOException if closing the document fails
 */
public static void pdfToPngPreview(String pdf, String output) throws IOException {
    PDDocument source = null;
    try {
        source = PDDocument.load(pdf);
        final ListIterator cursor = source.getDocumentCatalog().getAllPages().listIterator();
        final PDPage cover = (PDPage) cursor.next();
        final BufferedImage rendered = cover.convertToImage(BufferedImage.TYPE_INT_RGB, Consts.PREVIEW_DPI);
        final File target = new File(output);
        ImageIO.write(rendered, Consts.PNG, target);
    } catch (Exception ex) {
        Logger.getLogger(PdfDoc.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
        if (source != null) {
            source.close();
        }
    }
}

Example 83

Project: webarchive-discovery-master File: PDFParser.java View source code

/**
 * Parses a PDF stream into XHTML events and metadata.
 *
 * <p>Failures during parsing are deliberately not propagated: they are logged
 * (including the stack trace) and the method returns normally. Only errors
 * from the cleanup in the {@code finally} block can still escape.
 *
 * @param stream   the PDF data; it is shielded from being closed by PDFBox
 * @param handler  receives the generated XHTML content
 * @param metadata receives the content type and the extracted PDF metadata;
 *                 may also carry the password under {@code PASSWORD}
 * @param context  may supply a {@code PasswordProvider}
 * @throws IOException   if closing the document or disposing temporary resources fails
 * @throws SAXException  declared for interface compatibility
 * @throws TikaException if disposing temporary resources fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    PDDocument pdfDocument = null;
    TemporaryResources tmp = new TemporaryResources();
    try {
        // PDFBox can process entirely in memory, or can use a temp file
        //  for unpacked / processed resources
        // Decide which to do based on if we're reading from a file or not already
        TikaInputStream tstream = TikaInputStream.cast(stream);
        if (tstream != null && tstream.hasFile()) {
            // File based, take that as a cue to use a temporary file
            RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
            pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true);
        } else {
            // Go for the normal, stream based in-memory parsing
            pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
        }
        if (pdfDocument.isEncrypted()) {
            String password = null;
            // Did they supply a new style Password Provider?
            PasswordProvider passwordProvider = context.get(PasswordProvider.class);
            if (passwordProvider != null) {
                password = passwordProvider.getPassword(metadata);
            }
            // Fall back on the old style metadata if set
            if (password == null) {
                password = metadata.get(PASSWORD);
            }
            // If no password is given, use an empty string as the default
            if (password == null) {
                password = "";
            }
            try {
                pdfDocument.decrypt(password);
            } catch (Exception ignored) {
                // Best effort: carry on and let the extraction below deal with a
                // document that could not be decrypted.
            }
        }
        metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
        extractMetadata(pdfDocument, metadata);
        PDF2XHTML.process(pdfDocument, handler, metadata, extractAnnotationText, enableAutoSpace, suppressDuplicateOverlappingText, sortByPosition);
    } catch (Exception e) {
        // pass the throwable as well so the stack trace is not lost
        log.error("Exception while parsing PDF: " + e, e);
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
        tmp.dispose();
    }
}

Example 84

Project: Xponents-master File: PDFConverter.java View source code

/**
     * Extracts the text and basic document information of a PDF file into a
     * {@link ConvertedDocument}.
     *
     * <p>Implementation is informed by PDFBox authors. The method is
     * synchronized, presumably because it reuses the shared {@code stripper}
     * field, which is reset at the start of every extraction.
     *
     * @param doc the PDF file to convert
     * @return the converted document carrying the extracted text and, when the
     *         PDF provides document information, author, creation date,
     *         creator tool, keywords, subject and title
     * @throws IOException if loading the PDF, extracting its text or closing it fails
     */
@Override
public synchronized ConvertedDocument convert(java.io.File doc) throws IOException {
    /*
         * Licensed to the Apache Software Foundation (ASF) under one or more
         * contributor license agreements.  See the NOTICE file distributed with
         * this work for additional information regarding copyright ownership.
         * The ASF licenses this file to You under the Apache License, Version 2.0
         * (the "License"); you may not use this file except in compliance with
         * the License.  You may obtain a copy of the License at
         *
         *      http://www.apache.org/licenses/LICENSE-2.0
         *
         * Unless required by applicable law or agreed to in writing, software
         * distributed under the License is distributed on an "AS IS" BASIS,
         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         * See the License for the specific language governing permissions and
         * limitations under the License.
         */
    /**
         * Adapted from LucenePDFDocument.java from PDFBox lucene project
         *
         * This class is used to create a document for the lucene search engine.
         * This should easily plug into the IndexHTML or IndexFiles that comes
         * with the lucene project. This class will populate the following
         * fields.
         * <table> <tr> <th>Lucene Field Name</th> <th>Description</th> </tr>
         * <tr>
         * <td>path</td> <td>File system path if loaded from a file</td> </tr>
         * <tr>
         * <td>url</td> <td>URL to PDF document</td> </tr> <tr>
         * <td>contents</td>
         * <td>Entire contents of PDF document, indexed but not stored</td>
         * </tr>
         * <tr> <td>summary</td> <td>First 500 characters of content</td> </tr>
         * <tr>
         * <td>modified</td> <td>The modified date/time according to the url or
         * path</td> </tr> <tr> <td>uid</td> <td>A unique identifier for the
         * Lucene document.</td> </tr> <tr> <td>CreationDate</td> <td>From PDF
         * meta-data if available</td> </tr> <tr> <td>Creator</td> <td>From PDF
         * meta-data if available</td> </tr> <tr> <td>Keywords</td> <td>From PDF
         * meta-data if available</td> </tr> <tr> <td>ModificationDate</td>
         * <td>From PDF meta-data if available</td> </tr> <tr> <td>Producer</td>
         * <td>From PDF meta-data if available</td> </tr> <tr> <td>Subject</td>
         * <td>From PDF meta-data if available</td> </tr> <tr> <td>Trapped</td>
         * <td>From PDF meta-data if available</td> </tr> <tr>
         * <td>Encrypted</td> <td>From PDF meta-data if available</td> </tr>
         * </table>
         *
         * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
         * @version $Revision: 1.23 $
         *
         * @throws IOException If there is an error parsing the document.
         */
    PDDocument pdfDocument = null;
    ConvertedDocument textdoc = new ConvertedDocument(doc);
    try {
        pdfDocument = PDDocument.load(doc);
        if (pdfDocument.isEncrypted()) {
            // Only flag the document as encrypted; no explicit decrypt call is made
            // in this method. Text extraction is still attempted below and any
            // resulting IOException propagates to the caller.
            textdoc.addProperty("encrypted", "YES");
        }
        //create a writer where to append the text content.
        StringWriter writer = new StringWriter();
        // the stripper is a shared field, so clear its state before reuse
        stripper.resetEngine();
        stripper.writeText(pdfDocument, writer);
        PDDocumentInformation info = pdfDocument.getDocumentInformation();
        if (info != null) {
            textdoc.addAuthor(info.getAuthor());
            try {
                textdoc.addCreateDate(info.getCreationDate());
            } catch (IOException io) {
                // ignore, bad date but continue with the remaining properties
            }
            textdoc.addProperty("creator_tool", info.getCreator());
            textdoc.addProperty("keywords", info.getKeywords());
            /* try {
                 metadata.add("ModificationDate", info.getModificationDate());
                 } catch (IOException io) {
                 //ignore, bad date but continue with indexing
                 } */
            //metadata.add("Producer", info.getProducer());
            textdoc.addProperty("subject", info.getSubject());
            // fall back to the file name when the PDF has no usable title
            String ttl = info.getTitle();
            if (ttl == null || "untitled".equalsIgnoreCase(ttl)) {
                ttl = textdoc.filename;
            }
            textdoc.addTitle(ttl);
            // metadata.add("Trapped", info.getTrapped());
            // TODO: Character set is what?
            // NOTE(review): the encoding is only set when document information exists - confirm this is intended
            textdoc.setEncoding("UTF-8");
        }
        // Note: the buffer to string operation is costless;
        // the char array value of the writer buffer and the content string
        // is shared as long as the buffer content is not modified, which will
        // not occur here.
        textdoc.setText(writer.getBuffer().toString());
        return textdoc;
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}

Example 85

Project: xtf-dsc-master File: PDFToString.java View source code

//////////////////////////////////////////////////////////////////////////////
/** Convert a PDF file into an XML string.
   *
   *  <p>The output is a <code>pdfDocument</code> element with a
   *  <code>pageCount</code> attribute, containing one <code>pdfPage</code>
   *  element (with a <code>number</code> attribute) per page whose body is
   *  the normalized text of that page.
   *
   *  @param PDFInputStream  The stream of PDF data to convert to an
   *                         XML string.
   *
   *  @return
   *      The string produced by the XML formatter. Note that errors
   *      (including an encrypted PDF) are only logged through
   *      <code>Trace</code>: in that case whatever the formatter holds at
   *      that point is still returned, so the result may be incomplete.
   *      This method does not return <code>null</code> on error.
   *
   *  @throws IOException  presumably only from creating the shared
   *      <code>PDFTextStripper</code>, which happens outside the try blocks;
   *      everything thrown inside them is caught and logged.
   */
static String convert(InputStream PDFInputStream) throws IOException {
    // Make a stripper if we haven't already.
    if (stripper == null)
        stripper = new PDFTextStripper();
    // Workaround: using PDFTextStripper normally results in a Window
    // being created. However, since we're running in a servlet container, this
    // isn't generally desirable (and often isn't possible.) So we let AWT know
    // that it's running in "headless" mode, and this prevents the window from
    // being created.
    //
    System.setProperty("java.awt.headless", "true");
    XMLFormatter formatter = new XMLFormatter();
    // The outer try exists to catch anything thrown by the inner finally
    // block (i.e. by closing the document).
    try {
        PDDocument pdfDoc = null;
        try {
            // Get hold of the PDF document to convert.
            pdfDoc = PDDocument.load(PDFInputStream);
            // If the document is encrypted, we've got a problem.
            if (pdfDoc.isEncrypted()) {
                Trace.info("*** PDF File is Encrypted. File Skipped.");
                // A bare Exception is used only to jump to the catch block
                // below, which logs it; nothing has been written to the
                // formatter yet at this point.
                throw new Exception();
            }
            // Start the XML with an XML format tag.
            formatter.procInstr("xml version=\"1.0\" encoding=\"utf-8\"");
            // Set up the tab size and blank line formatting.
            formatter.tabSize(4);
            formatter.blankLineAfterTag(false);
            // Determine how many pages there are in the PDF file.
            int pageCount = pdfDoc.getNumberOfPages();
            // Create an all-enclosing document tag summarizing
            // the original document name and the number of pages.
            //
            formatter.beginTag("pdfDocument");
            formatter.attr("pageCount", pageCount);
            // Process each page in the PDF document (page numbers are 1-based).
            for (int i = 1; i <= pageCount; i++) {
                // Start with a new page tag.
                formatter.beginTag("pdfPage");
                formatter.attr("number", i);
                // Tell the stripper to only process the current page.
                stripper.setStartPage(i);
                stripper.setEndPage(i);
                // Get the text for this page.
                String pdfText = stripper.getText(pdfDoc);
                // Escape and normalize characters.
                pdfText = XMLIndexSource.normalize(pdfText);
                // Tack the text onto the XML output, nicely formatted
                // into lines of 128 characters or less.
                //
                formatter.text(pdfText, 128);
                formatter.newLineAfterText();
                // End the current page tag.
                formatter.endTag();
            }
            // for( int i = 1; i <= pageCount; i++ )
            // End any remaining open tags (should only be the pdfDocument
            // tag.)
            //
            formatter.endAllTags();
        }// If anything went wrong, say what it was. Open tags are NOT
         // closed on this path, so the returned XML may be unterminated.
         catch (Throwable t) {
            Trace.error("*** PDFToXML.convert() Exception: " + t.getClass());
            Trace.error("                    With message: " + t.getMessage());
        } finally // Finally, close up the PDF document.
        {
            if (pdfDoc != null)
                pdfDoc.close();
        }
    }// Shunt out any other exceptions (e.g. a failure while closing).
     catch (Throwable t) {
        Trace.error("*** PDFToXML.convert() Exception: " + t.getClass());
        Trace.error("                    With message: " + t.getMessage());
    }
    // Return the resulting XML string to the caller (also after an error).
    return formatter.toString();
}

Example 86

Project: xtf-master File: PDFToString.java View source code

//////////////////////////////////////////////////////////////////////////////
/** Convert a PDF file into an XML string.
   *
   *  <p>The output is a <code>pdfDocument</code> element with a
   *  <code>pageCount</code> attribute, containing one <code>pdfPage</code>
   *  element (with a <code>number</code> attribute) per page whose body is
   *  the normalized text of that page.
   *
   *  @param PDFInputStream  The stream of PDF data to convert to an
   *                         XML string.
   *
   *  @return
   *      The string produced by the XML formatter. Note that errors
   *      (including an encrypted PDF) are only logged through
   *      <code>Trace</code>: in that case whatever the formatter holds at
   *      that point is still returned, so the result may be incomplete.
   *      This method does not return <code>null</code> on error.
   *
   *  @throws IOException  presumably only from creating the shared
   *      <code>PDFTextStripper</code>, which happens outside the try blocks;
   *      everything thrown inside them is caught and logged.
   */
static String convert(InputStream PDFInputStream) throws IOException {
    // Make a stripper if we haven't already.
    if (stripper == null)
        stripper = new PDFTextStripper();
    // Workaround: using PDFTextStripper normally results in a Window
    // being created. However, since we're running in a servlet container, this
    // isn't generally desirable (and often isn't possible.) So we let AWT know
    // that it's running in "headless" mode, and this prevents the window from
    // being created.
    //
    System.setProperty("java.awt.headless", "true");
    XMLFormatter formatter = new XMLFormatter();
    // The outer try exists to catch anything thrown by the inner finally
    // block (i.e. by closing the document).
    try {
        PDDocument pdfDoc = null;
        try {
            // Get hold of the PDF document to convert.
            pdfDoc = PDDocument.load(PDFInputStream);
            // If the document is encrypted, we've got a problem.
            if (pdfDoc.isEncrypted()) {
                Trace.info("*** PDF File is Encrypted. File Skipped.");
                // A bare Exception is used only to jump to the catch block
                // below, which logs it; nothing has been written to the
                // formatter yet at this point.
                throw new Exception();
            }
            // Start the XML with an XML format tag.
            formatter.procInstr("xml version=\"1.0\" encoding=\"utf-8\"");
            // Set up the tab size and blank line formatting.
            formatter.tabSize(4);
            formatter.blankLineAfterTag(false);
            // Determine how many pages there are in the PDF file.
            int pageCount = pdfDoc.getNumberOfPages();
            // Create an all-enclosing document tag summarizing
            // the original document name and the number of pages.
            //
            formatter.beginTag("pdfDocument");
            formatter.attr("pageCount", pageCount);
            // Process each page in the PDF document (page numbers are 1-based).
            for (int i = 1; i <= pageCount; i++) {
                // Start with a new page tag.
                formatter.beginTag("pdfPage");
                formatter.attr("number", i);
                // Tell the stripper to only process the current page.
                stripper.setStartPage(i);
                stripper.setEndPage(i);
                // Get the text for this page.
                String pdfText = stripper.getText(pdfDoc);
                // Escape and normalize characters.
                pdfText = XMLIndexSource.normalize(pdfText);
                // Tack the text onto the XML output, nicely formatted
                // into lines of 128 characters or less.
                //
                formatter.text(pdfText, 128);
                formatter.newLineAfterText();
                // End the current page tag.
                formatter.endTag();
            }
            // for( int i = 1; i <= pageCount; i++ )
            // End any remaining open tags (should only be the pdfDocument
            // tag.)
            //
            formatter.endAllTags();
        }// If anything went wrong, say what it was. Open tags are NOT
         // closed on this path, so the returned XML may be unterminated.
         catch (Throwable t) {
            Trace.error("*** PDFToXML.convert() Exception: " + t.getClass());
            Trace.error("                    With message: " + t.getMessage());
        } finally // Finally, close up the PDF document.
        {
            if (pdfDoc != null)
                pdfDoc.close();
        }
    }// Shunt out any other exceptions (e.g. a failure while closing).
     catch (Throwable t) {
        Trace.error("*** PDFToXML.convert() Exception: " + t.getClass());
        Trace.error("                    With message: " + t.getMessage());
    }
    // Return the resulting XML string to the caller (also after an error).
    return formatter.toString();
}

Example 87

Project: DynamicSpotter-master File: ResourceViewer.java View source code

/**
 * Creates image data from the first page of the given PDF file.
 *
 * <p>The document is closed again once the page has been rendered, whether
 * or not rendering succeeded.
 *
 * @param resourceFile path of the PDF file to load
 * @return the rendered first page as SWT image data
 * @throws SWTException with {@code SWT.ERROR_INVALID_IMAGE} if the PDF has no
 *         pages, or {@code SWT.ERROR_IO} if it cannot be read or rendered
 */
private ImageData createImageDataFromPdf(String resourceFile) {
    PDDocument document = null;
    try {
        document = PDDocument.load(resourceFile);
        @SuppressWarnings("unchecked") List<PDPage> pages = document.getDocumentCatalog().getAllPages();
        if (pages.isEmpty()) {
            throw new SWTException(SWT.ERROR_INVALID_IMAGE);
        }
        BufferedImage bufferedImage = pages.get(0).convertToImage(PDF_IMAGE_TYPE, PDF_VIEW_RESOLUTION);
        return ImageUtils.convertToImageData(bufferedImage);
    } catch (IOException e) {
        throw new SWTException(SWT.ERROR_IO);
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException ignored) {
                // best effort: a failure to close must not replace the result or the
                // exception that is already on its way to the caller
            }
        }
    }
}

Example 88

Project: ServerDevelopmentGuideV2-master File: CmisCustomPdfWatermarkServiceWrapper.java View source code

/**
 * Returns the content stream of the wrapped service, replacing it with a
 * watermarked copy when the calling user equals {@code userToWatermark}
 * (case-insensitive) and the stream's MIME type contains "pdf".
 *
 * <p>All other streams are passed through unchanged. The watermarked PDF is
 * written to a temp-store output stream and is always closed afterwards,
 * also when saving fails.
 *
 * @return the original or the watermarked content stream
 * @throws CmisRuntimeException if the watermarked PDF cannot be saved or
 *         re-wrapped; the underlying exception is attached as the cause
 */
@Override
public ContentStream getContentStream(String repositoryId, String objectId, String streamId, BigInteger offset, BigInteger length, ExtensionsData extension) {
    slflog("getContentStream override from Chameleon module --------------", repositoryId);
    long startTime = System.currentTimeMillis();
    CallContext sharedContext = this.getCallContext();
    // Get the native domain object from the call context if one is shared
    // by the vendor (example only)
    // Your CMIS vendor's documentation must expose the name of any shared
    // objects they place here for extensions.
    // Object objShared = sharedContext.get("shared_key_name_from_vendor");
    ContentStream retVal = getWrappedService().getContentStream(repositoryId, objectId, streamId, offset, length, extension);
    if (sharedContext.getUsername().equalsIgnoreCase(userToWatermark)) {
        if ((retVal != null) && (retVal.getMimeType().contains("pdf"))) {
            InputStream rawStream = retVal.getStream();
            // return a pdfbox document object
            // for debugging only - load to pdfbox and stream out
            // PDDocument modifiedPDF = watermarkPDF_loadOnly(rawStream);
            // actual watermark code
            PDDocument modifiedPDF = watermarkPDF(rawStream);
            // Use a temp-store stream so very large objects can be handled
            // in a small memory footprint.
            TempStoreOutputStream out;
            TempStoreOutputStreamFactory outFactory = (TempStoreOutputStreamFactory) sharedContext.get(CallContext.STREAM_FACTORY);
            if (outFactory != null) {
                // reuse the server factory configuration
                out = outFactory.newOutputStream();
            } else {
                // there is no default ThresholdOutputStreamFactory
                // -> create a stream manually:
                // default temp directory, max 4MiB in main memory,
                // unlimited content size
                out = new ThresholdOutputStream(null, 4 * 1024 * 1024, -1);
            }
            try {
                try {
                    modifiedPDF.save(out);
                } finally {
                    // release the document even when saving fails; a close failure
                    // still reaches the catch block below
                    modifiedPDF.close();
                }
                InputStream modifiedInputStream = out.getInputStream();
                // Extra credit here. Handle offset and length if provided
                // by the client.
                // now write the stream back to the ContentStream object
                retVal = new ContentStreamImpl(retVal.getFileName(), null, "application/pdf", modifiedInputStream);
            } catch (Exception e) {
                slflog("error transposing stream getContentStream ", e.getMessage());
                LOG.error("Could not watermark PDF document: {}", e.getMessage(), e);
                // keep the cause so callers can see what actually went wrong
                throw new CmisRuntimeException("Could not watermark PDF document!", e);
            }
        }
    // if pdf stream
    }
    // if user matches filter param
    LOG.info("[CmisCustomServiceWrapper] Exiting method getContentStream. time (ms):" + (System.currentTimeMillis() - startTime));
    return retVal;
}

Example 89

Project: xwiki-enterprise-master File: PDFTest.java View source code

/**
 * Downloads the PDF at the given URL and returns the text produced by a
 * {@code PDFText2HTML} stripper.
 *
 * @param url location of the PDF document, opened as an HTTP connection
 * @return the stripper output for the whole document
 * @throws Exception if the connection, the PDF parsing or the text extraction fails
 */
private String getPDFContent(URL url) throws Exception {
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    InputStream is = connection.getInputStream();
    // The stream is released in the outer finally so it is closed even when
    // PDDocument.load() or pdd.close() throws.
    try {
        PDDocument pdd = PDDocument.load(is);
        try {
            PDFText2HTML stripper = new PDFText2HTML();
            return stripper.getText(pdd);
        } finally {
            pdd.close();
        }
    } finally {
        is.close();
    }
}

Example 90

Project: converge-1.x-master File: MetaDataService.java View source code

/**
 * {@inheritDoc }
 * <p>
 * Text is extracted from PDF renditions with PDFBox and from Word renditions
 * with {@code HWPFDocument}/{@code WordExtractor}. Any other content type, a
 * {@code null} content type, or a failure during extraction yields an empty
 * string.
 */
@Override
public String extractContent(MediaItemRendition mir) {
    String contentType = mir.getContentType();
    String story = "";
    if (contentType == null) {
        LOG.log(Level.WARNING, "Content type is null");
        return story;
    }
    if (contentType.equals("application/pdf")) {
        // Extract text in PDF
        try {
            URL originalFile = new URL(mir.getAbsoluteFilename());
            PDDocument doc = null;
            try {
                // Read PDF
                PDFParser parser = new PDFParser(originalFile.openStream());
                parser.parse();
                COSDocument cosDoc = parser.getDocument();
                // Assign to the variable that the finally block closes; the
                // document used to be held in a separate local and was never closed.
                doc = new PDDocument(cosDoc);
                PDFTextStripper stripper = new PDFTextStripper();
                story = stripper.getText(doc);
            } catch (IOException ex) {
                LOG.log(Level.SEVERE, ex.getMessage());
                LOG.log(Level.FINEST, "", ex);
            } finally {
                if (doc != null) {
                    try {
                        doc.close();
                    } catch (IOException ex) {
                        LOG.log(Level.SEVERE, ex.getMessage());
                        LOG.log(Level.FINEST, "", ex);
                    }
                }
            }
        } catch (MalformedURLException ex) {
            // Still best-effort (an empty story is returned), but no longer silent.
            LOG.log(Level.SEVERE, ex.getMessage());
            LOG.log(Level.FINEST, "", ex);
        }
    } else if (contentType.equals("application/msword") || contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
        try {
            URL originalFile = new URL(mir.getAbsoluteFilename());
            HWPFDocument doc = new HWPFDocument(originalFile.openStream());
            WordExtractor extractor = new WordExtractor(doc);
            story = extractor.getText();
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, ex.getMessage());
            LOG.log(Level.FINEST, "", ex);
        }
    }
    return story;
}

Example 91

Project: CrossRefX-master File: PlainTextDialog.java View source code

/**
 * Reads the text of the first two pages of a PDF file.
 *
 * @param pdfFile the PDF file to read
 * @return the extracted text; the fixed message
 *         {@code "Encrypted documents are not supported"} for encrypted
 *         documents; or {@code null} if the file could not be loaded or parsed
 */
private String pdfParser(File pdfFile) {
    NDC.push("pdfParser");
    PDDocument document = null;
    try {
        try {
            document = PDDocument.load(pdfFile);
        } catch (IOException e) {
            logger.error("Could not load document", e);
            return null;
        }
        if (document.isEncrypted()) {
            return "Encrypted documents are not supported";
        }
        PDFTextStripper stripper;
        try {
            stripper = new PDFTextStripper();
        } catch (IOException e) {
            logger.error("Could not create stripper", e);
            return null;
        }
        stripper.setStartPage(1);
        stripper.setEndPage(2);
        try {
            return stripper.getText(document);
        } catch (Exception e) {
            logger.error("Could not parse PDF", e);
            return null;
        }
    } finally {
        // Release the document on every exit path; it used to be left open.
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                logger.error("Could not close document", e);
            }
        }
        NDC.pop();
    }
}

Example 92

Project: flaming-sailor-master File: PDFParser.java View source code

/**
 * Extracts the text pages of a PDF file, aggregates document-wide layout and
 * font statistics over them, and builds the per-page components.
 *
 * @param pdfFile   the PDF file to read
 * @param minHeight stored on this parser as the minimum character height
 * @return the list of text pages collected while writing the document text
 */
public List<TextPage> getTextPages(File pdfFile, float minHeight) {
    fileName = pdfFile.getName();
    outString = new StringWriter();
    this.minHeight = minHeight;
    this.textPageList = new ArrayList<>();
    document = null;
    try {
        document = PDDocument.load(pdfFile);
        catalog = document.getDocumentCatalog();
        allpages = catalog.getAllPages();
        this.writeText(document, outString);
        outString.close();
        outString = null;
    } catch (IOException ioError) {
        logger.error("I/O Error:" + pdfFile.getName(), ioError);
    } finally {
        if (document != null) {
            try {
                document.close();
                document = null;
            } catch (IOException closeError) {
                logger.error("I/O error closing file:" + pdfFile.getName(), closeError);
            }
        }
    }

    // Each page is now a set of lines made of text pieces. Remaining steps:
    //   1. strip header/footer boilerplate
    //   2. gather font and layout statistics
    //   3. build the higher-order components
    TextPage.removeBoilerplate(textPageList, TextPage.LEVENSHTEIN_DISTANCE);

    Map<String, Map<Integer, Long>> fontCounts = new HashMap<>();
    long[] histogram = null;
    for (TextPage textPage : textPageList) {
        double left = textPage.getAvgLeft();
        double right = textPage.getAvgRight();
        double width = textPage.getAvgWidth();
        long lines = textPage.getLineCount();
        Double density = textPage.getCharDensity();
        histogram = Component.mergeHistogram(textPage.getHistogram(), histogram);

        // Line-count-weighted sums; divided by the total line count below.
        if (lines > 0) {
            docAvgLeft += left * lines;
            docAvgWidth += width * lines;
            docAvgRight += right * lines;
            docCharDensity += density * lines;
            docLineCount += lines;
        }

        // Fold this page's per-font tallies into the document-wide tallies.
        Map<String, Map<Integer, Long>> perPage = textPage.getFontCounts();
        for (Map.Entry<String, Map<Integer, Long>> fontEntry : perPage.entrySet()) {
            String fontKey = fontEntry.getKey();
            Map<Integer, Long> docTally = fontCounts.get(fontKey);
            if (docTally == null) {
                docTally = new HashMap<>();
            }
            for (Map.Entry<Integer, Long> tallyEntry : fontEntry.getValue().entrySet()) {
                Integer tallyKey = tallyEntry.getKey();
                Long soFar = docTally.get(tallyKey);
                if (soFar != null) {
                    docTally.put(tallyKey, soFar + tallyEntry.getValue());
                } else {
                    docTally.put(tallyKey, tallyEntry.getValue());
                }
            }
            fontCounts.put(fontKey, docTally);
        }
    }

    docAvgLeft /= docLineCount;
    docAvgRight /= docLineCount;
    docAvgWidth /= docLineCount;
    docCharDensity /= docLineCount;
    linesPerPage = docLineCount / textPageList.size();
    normalizeFontCounts(fontCounts);
    normalizedHistogram = Component.getNormalizedHistogram(histogram);

    String histogramText = Component.normHistoGramToString(normalizedHistogram);
    String summary = String.format(" H:%5.1f W:%6.1f D:%4.2f P:%4.2f", (double) highestFreqSize, docAvgWidth, docCharDensity, 1.0);
    logger.info(histogramText + summary);

    for (TextPage textPage : textPageList) {
        textPage.constructPageComponents(highestFreqSize, this.minFontSize, this.maxFontSize, normalizedFontCounts, normalizedFonts, normalizedSizes, docAvgLeft, docAvgRight, docAvgWidth, docCharDensity, linesPerPage, normalizedHistogram);
    }
    return textPageList;
}

Example 93

Project: gsearch-master File: TransformerToText.java View source code

/**
 * Extracts the text of a PDF held in memory and replaces control characters
 * with spaces.
 * <p>
 * The work is done in separate stages (stream, parser, parse, COS document,
 * PD document, stripper, text extraction). Each stage logs at debug level and,
 * on failure, closes whatever was opened so far before rethrowing the cause
 * wrapped in a {@code GenericSearchException}.
 *
 * @param doc the raw bytes of the PDF document
 * @return the extracted text, with every character below 32 other than tab,
 *         line feed and carriage return replaced by a space
 * @throws GenericSearchException if any stage fails
 */
private StringBuffer getTextFromPDF(byte[] doc) throws GenericSearchException {
    if (logger.isDebugEnabled())
        logger.debug("getTextFromPDF");
    StringBuffer docText = new StringBuffer();
    // Stage 1: wrap the bytes in a stream.
    ByteArrayInputStream bais = null;
    try {
        bais = new ByteArrayInputStream(doc);
    } catch (Exception e) {
        closeBAIS(bais);
        if (logger.isDebugEnabled())
            logger.debug("getTextFromPDF new ByteArrayInputStream: ", e);
        throw new GenericSearchException("getTextFromPDF new ByteArrayInputStream: ", e);
    }
    if (logger.isDebugEnabled())
        logger.debug("getTextFromPDF new ByteArrayInputStream");
    // Stage 2: create the parser on top of the stream.
    PDFParser parser;
    try {
        parser = new PDFParser(bais);
    } catch (Exception e) {
        closeBAIS(bais);
        if (logger.isDebugEnabled())
            logger.debug("getTextFromPDF new PDFParser: ", e);
        throw new GenericSearchException("getTextFromPDF new PDFParser: ", e);
    }
    if (logger.isDebugEnabled())
        logger.debug("getTextFromPDF new PDFParser");
    // Stage 3: parse the document.
    try {
        parser.parse();
    } catch (Exception e) {
        closeBAIS(bais);
        if (logger.isDebugEnabled())
            logger.debug("getTextFromPDF parser.parse: ", e);
        throw new GenericSearchException("getTextFromPDF parser.parse: ", e);
    }
    if (logger.isDebugEnabled())
        logger.debug("getTextFromPDF parser.parse");
    // Stage 4: obtain the low-level COS document.
    COSDocument cosDoc = null;
    try {
        cosDoc = parser.getDocument();
    } catch (Exception e) {
        closeBAIS(bais);
        closeCOSDocument(cosDoc);
        if (logger.isDebugEnabled())
            logger.debug("getTextFromPDF parser.getDocument: ", e);
        throw new GenericSearchException("getTextFromPDF parser.getDocument: ", e);
    }
    if (logger.isDebugEnabled())
        logger.debug("getTextFromPDF parser.getDocument");
    // Stage 5: wrap the COS document in a PDDocument.
    PDDocument pdDoc = null;
    try {
        pdDoc = new PDDocument(cosDoc);
    } catch (Exception e) {
        closeBAIS(bais);
        closeCOSDocument(cosDoc);
        closePDDocument(pdDoc);
        if (logger.isDebugEnabled())
            logger.debug("getTextFromPDF new PDDocument: ", e);
        throw new GenericSearchException("getTextFromPDF new PDDocument: ", e);
    }
    if (logger.isDebugEnabled())
        logger.debug("getTextFromPDF new PDDocument isEncrypted=" + pdDoc.isEncrypted() + " getNumberOfPages=" + pdDoc.getNumberOfPages());
    // Stage 6: create the text stripper.
    PDFTextStripper stripper;
    try {
        stripper = new PDFTextStripper();
    } catch (Exception e) {
        closeBAIS(bais);
        closeCOSDocument(cosDoc);
        closePDDocument(pdDoc);
        if (logger.isDebugEnabled())
            logger.debug("getTextFromPDF new PDFTextStripper: ", e);
        throw new GenericSearchException("getTextFromPDF new PDFTextStripper: ", e);
    }
    if (logger.isDebugEnabled())
        logger.debug("getTextFromPDF new PDFTextStripper getStartPage=" + stripper.getStartPage() + " getEndPage=" + stripper.getEndPage());
    // Stage 7: extract the text; this is the last stage, so everything is
    // closed in the finally block whether or not extraction succeeded.
    String docString = "";
    try {
        docString = stripper.getText(pdDoc);
    } catch (Exception e) {
        if (logger.isDebugEnabled())
            logger.debug("getTextFromPDF stripper.getText: ", e);
        throw new GenericSearchException("getTextFromPDF stripper.getText: ", e);
    } finally {
        if (logger.isDebugEnabled())
            logger.debug("getTextFromPDF stripper.getText finally");
        closeBAIS(bais);
        closeCOSDocument(cosDoc);
        closePDDocument(pdDoc);
    }
    if (logger.isDebugEnabled())
        logger.debug("getTextFromPDF stripper.getText");
    docText = new StringBuffer(docString);
    // Put a space in place of characters not allowed in the indexing
    // stylesheet: everything below 32 except tab (9), LF (10) and CR (13).
    char c;
    for (int i = 0; i < docText.length(); i++) {
        c = docText.charAt(i);
        if (c < 32 && c != 9 && c != 10 && c != 13) {
            if (logger.isDebugEnabled())
                logger.debug("getTextFromPDF index=" + i + " char=" + c + " set to 32");
            docText.replace(i, i + 1, " ");
        }
    }
    return docText;
}

Example 94

Project: jucy-master File: TextIndexer.java View source code

/**
 * Opens a reader over the textual content of a file.
 * <p>
 * PDF files (recognised by their file ending) are run through a
 * {@code PDFTextStripper}: small files are extracted into memory, larger ones
 * into a temporary file that is deleted when the returned reader is closed.
 * Every other file is read directly.
 *
 * @param file the file to read
 * @return a reader over the text, or {@code null} if the PDF is encrypted
 * @throws IOException if the file cannot be opened or the PDF cannot be processed
 */
private Reader getReader(File file) throws IOException {
    String fileending = GH.getFileEnding(file.getName());
    if (!fileending.equalsIgnoreCase("pdf")) {
        // Plain files need no extra input stream; one used to be opened here
        // and never closed.
        return new FileReader(file);
    }
    BufferedInputStream bin = new BufferedInputStream(new FileInputStream(file));
    PDDocument pdfDocument = null;
    try {
        pdfDocument = PDDocument.load(bin, getScratchRaf(), true);
        if (pdfDocument.isEncrypted()) {
            return null;
        }
        PDFTextStripper stripper = new PDFTextStripper();
        // create a writer where to append the text content.
        Reader reader;
        if (file.length() < MAX_RAMSIZE_FOR_PDF) {
            StringWriter writer = new StringWriter();
            stripper.writeText(pdfDocument, writer);
            String contents = writer.getBuffer().toString();
            reader = new StringReader(contents);
        } else {
            final File f = new File(PI.getTempPath(), "index.tmp");
            FileWriter fw = new FileWriter(f);
            try {
                stripper.writeText(pdfDocument, fw);
            } finally {
                GH.close(fw);
            }
            // Remove the temporary file as soon as the caller closes the reader.
            FileReader fr = new FileReader(f) {

                @Override
                public void close() throws IOException {
                    super.close();
                    if (!f.delete()) {
                        f.deleteOnExit();
                    }
                }
            };
            reader = fr;
        }
        return reader;
    } finally {
        // The text is fully extracted by now, so both the document and the
        // source stream can be released; the stream is closed even if closing
        // the document fails.
        try {
            if (pdfDocument != null) {
                pdfDocument.close();
            }
        } finally {
            bin.close();
        }
    }
}

Example 95

Project: OCRaptor-master File: PDF2XHTML.java View source code

/**
 * Streams the given PDF document to a content handler by running a
 * {@code PDF2XHTML} instance over it.
 *
 * @param document
 *          PDF document
 * @param handler
 *          SAX content handler
 * @param context
 *          parse context handed to the {@code PDF2XHTML} instance
 * @param metadata
 *          PDF metadata
 * @param config
 *          parser configuration handed to the {@code PDF2XHTML} instance
 * @throws SAXException
 *           if the I/O failure raised during processing was caused by a
 *           SAX exception
 * @throws TikaException
 *           if the PDF document can not be processed
 */
public static void process(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws SAXException, TikaException {
    // The text writer API requires a Writer, but the output goes to the
    // content handler through overridden methods, so this one discards everything.
    Writer discardingWriter = new Writer() {

        @Override
        public void write(char[] cbuf, int off, int len) {
        }

        @Override
        public void flush() {
        }

        @Override
        public void close() {
        }
    };
    try {
        new PDF2XHTML(handler, context, metadata, config).writeText(document, discardingWriter);
    } catch (IOException e) {
        Throwable cause = e.getCause();
        if (!(cause instanceof SAXException)) {
            throw new TikaException("Unable to extract PDF content", e);
        }
        // Surface the SAX failure that was tunnelled through the IOException.
        throw (SAXException) cause;
    }
}

Example 96

Project: opensearchserver-master File: ViewerController.java View source code

/**
 * Loads the temporary PDF file with PDFBox, renders it through
 * {@code loadGS}, highlights the collected boxes on the current image and
 * records the page count.
 *
 * @throws IOException if the document cannot be read or closed
 * @throws CryptographyException if decrypting with the empty password fails
 * @throws SearchLibException propagated from the called helpers
 * @throws InterruptedException propagated from the called helpers
 */
private void loadPdfBox() throws IOException, CryptographyException, SearchLibException, InterruptedException {
    PDDocument document = null;
    try {
        document = PDDocument.loadNonSeq(tempFile, null);
        // Encrypted documents are tried with an empty password; that same
        // password (or null when there is no encryption) is passed to loadGS.
        final String password;
        if (document.isEncrypted()) {
            document.decrypt("");
            password = "";
        } else {
            password = null;
        }
        loadGS(password);

        List<Rectangle> highlightBoxes = new ArrayList<Rectangle>(0);
        checkPdfBoxHighlight(document, highlightBoxes);
        checkHocrHighlight(currentImage.getWidth(), currentImage.getHeight(), highlightBoxes);
        ImageUtils.yellowHighlight(currentImage, highlightBoxes, 0.1F);
        numberOfPages = document.getNumberOfPages();
    } finally {
        if (document != null) {
            IOUtils.close(document);
        }
    }
}

Example 97

Project: pdfxtk-master File: ProcessFile.java View source code

/*
    public static String STR_INFILE = "";
    public static String STR_OUTPUT_PATH = ".";
    public static int STR_CURR_PAGE_NO = -1;
    public static final String STR_IMAGE_PREFIX = "-imgPrefix";
    */
/*
     * possible conversions:
     * pdf -> xml, pdf -> xhtml,
     * gecko -> xml, gecko -> xhtml
     */
/**
 * Extracts the objects of a PDF held in memory and runs the page processor
 * over every extracted page.
 *
 * @param theFile      raw bytes of the PDF document
 * @param pp           processor applied to each extracted page
 * @param startPage    first page to extract; {@code 0} is treated as {@code 1}
 * @param endPage      last page to extract; {@code 0} is treated as no limit
 * @param encoding     encoding name; {@code null} or empty falls back to
 *                     {@code DEFAULT_ENCODING} (not otherwise used in this method)
 * @param password     password used if the document is encrypted;
 *                     {@code null} is treated as the empty password
 * @param adjGraphList if not {@code null}, receives the processor's adjacency
 *                     graph after each page
 * @param GUI          when {@code false}, document-wide processing is applied
 *                     to the page results before returning
 * @return the processed pages
 * @throws DocumentProcessingException if the document cannot be read or
 *         decrypted
 */
public static List<Page> processPDF(byte[] theFile, PageProcessor pp, int startPage, int endPage, String encoding, String password, List<AdjacencyGraph<GenericSegment>> adjGraphList, boolean GUI) throws DocumentProcessingException {
    if (password == null)
        password = "";
    // Compare string contents, not references: "==" only matched interned literals.
    if (encoding == null || encoding.isEmpty())
        encoding = DEFAULT_ENCODING;
    if (startPage == 0)
        startPage = 1;
    if (endPage == 0)
        endPage = Integer.MAX_VALUE;
    ByteArrayInputStream inStream = new ByteArrayInputStream(theFile);
    try {
        PDFObjectExtractor extractor = new PDFObjectExtractor();
        PDDocument document = PDDocument.load(inStream);
        try {
            if (document.isEncrypted()) {
                try {
                    document.decrypt(password);
                } catch (InvalidPasswordException e) {
                    // password is never null here (normalised above)
                    if (!password.isEmpty()) {
                        throw new DocumentProcessingException("Error: The supplied password is incorrect.");
                    } else {
                        throw new DocumentProcessingException("Error: The document is encrypted.");
                    }
                } catch (CryptographyException e) {
                    throw new DocumentProcessingException(e);
                }
            }
            extractor.setStartPage(startPage);
            extractor.setEndPage(endPage);
            List<PDFPage> thePages = extractor.findObjects(document);
            List<Page> theResult = new ArrayList<Page>();
            startPage = extractor.getStartPage();
            endPage = extractor.getEndPage();
            // now the DU part
            for (PDFPage thePage : thePages) {
                Page resultPage = pp.processPage(thePage);
                theResult.add(resultPage);
                if (adjGraphList != null)
                    adjGraphList.add(pp.getAdjGraph());
            }
            // 17.11.10 document-wide processing for headers, footers, etc.
            if (!GUI)
                theResult = pp.processDocPages(theResult, null);
            return theResult;
        } finally {
            // Close on every path, including failures; an IOException raised
            // here is wrapped by the outer catch, as a close failure was before.
            document.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
        throw new DocumentProcessingException(e);
    }
}

Example 98

Project: sd-dss-master File: PdfBoxSignatureService.java View source code

/**
 * Prepares the PDF for signing and returns the digest computed by
 * {@code signDocumentAndReturnDigest}.
 * <p>
 * The input is copied to a file, loaded, extended with the extra dictionaries
 * and a signature dictionary, and signed into a temporary file. Both files
 * are deleted and the document is closed before returning.
 *
 * @param toSignDocument the PDF data to sign
 * @param parameters the signature parameters
 * @param digestAlgorithm the digest algorithm to use
 * @param extraDictionariesToAddBeforeSign dictionaries added to the document before signing
 * @return the digest value
 * @throws DSSException if an I/O error occurs
 */
@Override
public byte[] digest(final InputStream toSignDocument, final SignatureParameters parameters, final DigestAlgorithm digestAlgorithm, final Map.Entry<String, PdfDict>... extraDictionariesToAddBeforeSign) throws DSSException {
    // No signature exists yet at digest time, so an empty value is passed on.
    final byte[] emptySignature = DSSUtils.EMPTY_BYTE_ARRAY;
    File sourcePdf = null;
    File signedPdf = null;
    PDDocument document = null;
    try {
        sourcePdf = DSSPDFUtils.getFileFromPdfData(toSignDocument);
        document = PDDocument.load(sourcePdf);
        addExtraDictionaries(document, extraDictionariesToAddBeforeSign);
        final PDSignature signatureDictionary = createSignatureDictionary(parameters);
        signedPdf = File.createTempFile("sd-dss-", "-signed.pdf");
        final FileOutputStream signedOutput = DSSPDFUtils.getFileOutputStream(sourcePdf, signedPdf);
        return signDocumentAndReturnDigest(parameters, emptySignature, signedPdf, signedOutput, document, signatureDictionary, digestAlgorithm);
    } catch (IOException e) {
        throw new DSSException(e);
    } finally {
        DSSUtils.delete(sourcePdf);
        DSSUtils.delete(signedPdf);
        DSSPDFUtils.close(document);
    }
}

Example 99

Project: spimedb-master File: Multimedia.java View source code

/**
 * Loads the resource named by the {@code "url_in"} property of {@code x} and
 * enriches the object with what is found there.
 * <p>
 * Steps, in order:
 * <ol>
 * <li>Return {@code x} unchanged if it has no {@code "url_in"}.</li>
 * <li>Open the resource (local file for {@code file:} URLs, URL connection
 * otherwise) and determine its expiration/modification time.</li>
 * <li>If the previous object {@code p} carries a {@code "url_cached"} stamp
 * that is not older than that time, return {@code p}.</li>
 * <li>Otherwise copy {@code x} into a new object, stamp it, and either parse
 * the stream with Tika (copying mapped metadata fields) or hand KML/KMZ and
 * GeoJSON URLs to their dedicated loaders.</li>
 * <li>For PDFs that have a page count and a description, add one object per
 * page to {@code db} with the page text and a JPEG thumbnail.</li>
 * </ol>
 * Any exception in the loading phase is logged and processing continues
 * with the original {@code x}.
 *
 * @param p the previously stored object, or {@code null}
 * @param x the incoming object
 * @return {@code x} if there is no URL, {@code p} if the cached copy is still
 *         valid, otherwise a new object named after the id with its
 *         description cleared
 */
@Override
public NObject apply(NObject p, NObject x) {
    final String url = x.get("url_in");
    String xid = x.id();
    if (url == null) {
        return x;
    }
    try {
        long exp;
        InputStream stream;
        long fileSize;
        if (url.startsWith("file:")) {
            // strip the "file:" prefix to get the local path
            File f = new File(url.substring(5));
            exp = f.lastModified();
            stream = new FileInputStream(f);
            fileSize = f.length();
        } else {
            URL uu = new URL(url);
            URLConnection con = uu.openConnection();
            exp = con.getExpiration();
            // fall back to the last-modified time when no expiration is given
            if (exp == 0)
                exp = con.getLastModified();
            fileSize = con.getContentLengthLong();
            stream = con.getInputStream();
        }
        if (stream == null) {
            throw new FileNotFoundException();
        }
        //TODO store a hashcode of the data as well as the time for additional integrity
        if (p != null) {
            String whenCached = p.get("url_cached");
            if (!(whenCached == null || Long.valueOf(whenCached) < exp)) {
                logger.debug("cached: {}", url);
                //still valid
                // NOTE(review): the stream opened above is not closed on this path.
                return p;
            }
        }
        logger.info("load: {}", url);
        GeoNObject y = new GeoNObject(x);
        y.put("url_cached", Long.toString(exp));
        boolean isKMLorKMZ = url.endsWith(".kml") || url.endsWith(".kmz");
        boolean isGeoJSON = url.endsWith(".geojson");
        if (!isKMLorKMZ && !isGeoJSON) /* handled separately below */
        {
            Metadata metadata = new Metadata();
            ParseContext context = new ParseContext();
            final RecursiveParserWrapper tikaWrapper = new RecursiveParserWrapper(tika, tikaFactory);
            if (stream instanceof FileInputStream) {
                // local file: store only a reference to it
                y.put("data", url);
            } else {
                //buffer the bytes for saving
                // NOTE(review): fileSize comes from getContentLengthLong() and is
                // narrowed to int here — confirm behaviour for unknown (-1) or very large sizes.
                byte[] bytes = IOUtils.readFully(stream, (int) fileSize);
                stream = new ByteArrayInputStream(bytes);
                y.put("data", bytes);
            }
            tikaWrapper.parse(stream, new DefaultHandler(), metadata, context);
            stream.close();
            List<Metadata> m = tikaWrapper.getMetadata();
            // copy every metadata value whose name maps to a field
            m.forEach( md -> {
                for (String k : md.names()) {
                    String[] v = md.getValues(k);
                    String kk = tikiToField(k);
                    if (kk != null) {
                        // single values are stored as a scalar, multiple values as the array
                        Object vv = v.length > 1 ? v : v[0];
                        if (vv instanceof String) {
                            // store integer-looking strings as integers; anything else stays a string
                            try {
                                int ivv = Integer.parseInt((String) vv);
                                vv = ivv;
                            } catch (Exception e) {
                            }
                        }
                        y.put(kk, vv);
                    }
                }
            });
        }
        if (isKMLorKMZ) {
            new KML(db, y).url(url).run();
        } else if (isGeoJSON) {
            GeoJSON.load(url, GeoJSON.baseGeoJSONBuilder, db);
        }
        x = y;
    } catch (Exception e) {
        // best effort: on failure the original x is kept and processing continues
        logger.error("url_in removal: {}", e);
    }
    Object mime = x.get(NObject.TYPE);
    if (mime != null && (mime.equals("image/jpeg") || mime.equals("image/png"))) {
        x = new MutableNObject(x).name(titleify(xid)).put(NObject.DESC, null).put("thumbnail", "data");
    }
    if ("application/pdf".equals(mime) && x.has("pageCount") && x.has(NObject.DESC)) /* leaf */
    {
        int pageCount = x.get("pageCount");
        //float docPri = Util.lerp(1f / (pageCount), 0.75f, 0.25f);
        String parentContent = x.get(NObject.DESC);
        String author = x.get("author");
        //db.runLater(docPri, () -> {
        // the description is parsed as HTML; each ".page" element supplies one page's text
        Document parentDOM = Jsoup.parse(parentContent);
        Elements pagesHTML = parentDOM.select(".page");
        PDDocument document = null;
        try {
            // NOTE(review): this stream is never closed explicitly in this method.
            InputStream is;
            if (url.startsWith("file:")) {
                is = fileStream(url);
            } else {
                is = new URL(url).openStream();
            }
            document = PDDocument.load(is);
            PDFRenderer renderer = new PDFRenderer(document);
            for (int _page = 0; _page < pageCount; _page++) {
                // pageActual is the zero-based index, page the one-based number used in ids and titles
                final int pageActual = _page;
                final int page = _page + 1;
                logger.info("paginate: {} {}", xid, page);
                Document pd = Document.createShell("");
                pd.body().appendChild(pagesHTML.get(pageActual).removeAttr("class"));
                Elements cc = cleaner.clean(pd).body().children();
                // keep only elements that have children or text
                String[] pdb = cc.stream().filter( xx -> !xx.children().isEmpty() || xx.hasText()).map(//just use <p> contents
                 xx -> xx.tagName().equals("p") ? xx.text() : xx).map(Object::toString).toArray(String[]::new);
                //                    List<JsonNode> jdb = new ArrayList(pdb.size());
                //                    pdb.forEach(e -> {
                //                        if (e.children().isEmpty() && e.text().isEmpty())
                //                            return;
                //                        jdb.add(html2json(e));
                //                    });
                //x.name();
                String docTitle = parentDOM.title();
                if (docTitle == null || docTitle.isEmpty()) {
                    docTitle = titleify(xid);
                }
                // render the page and encode it as a JPEG thumbnail held in memory
                BufferedImage img = renderer.renderImageWithDPI(pageActual, (float) pdfPageImageDPI, ImageType.RGB);
                //boolean result = ImageIOUtil.writeImage(img, outputFile, pdfPageImageDPI);
                ByteArrayOutputStream os = new ByteArrayOutputStream(img.getWidth() * img.getHeight() * 3);
                boolean result = ImageIOUtil.writeImage(img, "jpg", os, pdfPageImageDPI, thumbnailQuality);
                byte[] thumbnail = os.toByteArray();
                String text = pdb.length > 0 ? Joiner.on('\n').join(pdb) : null;
                // NOTE(review): the title shows "(page of pageCount + 1)" although page
                // runs from 1 to pageCount — confirm the "+ 1" is intended.
                db.add(new MutableNObject(xid + "/" + page).name(docTitle + " - (" + page + " of " + (pageCount + 1) + ")").withTags(xid).put("author", author).put("url", //HACK browser loads the specific page when using the '#' anchor
                url).put(NObject.TYPE, "application/pdf").put("data", xid + "#page=" + page).put("page", page).put(NObject.DESC, text).put(/*.putLater("textParse", 0.1f, ()-> {
                                                return (pdb.length > 0) ? Stream.of(pdb).map(
                                                        t -> NLP.toString(NLP.parse(t))
                                                ).collect(Collectors.joining("\n")) : null;
                                            })*/
                "thumbnail", thumbnail));
            }
        } catch (IOException f) {
            logger.error("error: {} {}", xid, f);
        } finally {
            // failures while closing the document are ignored
            if (document != null)
                try {
                    document.close();
                } catch (IOException e) {
                }
        }
    }
    //clean and update parent DOM
    //String xname = x.name();
    //String desc = x.get(NObject.DESC);
    x = new MutableNObject(x).name(titleify(xid)).put(NObject.DESC, null);
    return x;
}

Example 100

Project: yacy_search_server-master File: pdfParser.java View source code

/**
 * Parses a PDF from the given stream into one or more index documents.
 * <p>
 * Document metadata (title, subject, author, producer/creator, keywords,
 * modification date) is read first. Then, depending on
 * {@code individualPages}, either one document per page is produced (each with
 * a virtual URL carrying the page number) or a single document holding the
 * combined text and links. In the single-document case the first three pages
 * are always extracted synchronously; remaining pages are extracted in a
 * helper thread that is given three seconds before being interrupted.
 *
 * @param location       URL of the PDF, used for the result documents and as title fallback
 * @param mimeType       mime type stored in the result documents
 * @param charset        not used by this method
 * @param scraper        not used by this method
 * @param timezoneOffset not used by this method
 * @param source         stream the PDF is loaded from
 * @return the parsed documents; NOTE(review): any failure during extraction is
 *         swallowed, in which case this appears to return {@code null}
 * @throws Parser.Failure if there is not enough memory, the PDF cannot be
 *         loaded, or it is encrypted without permission to extract content
 * @throws InterruptedException declared by the signature; NOTE(review): an
 *         interruption raised inside the extraction block is caught by the
 *         {@code catch (Throwable)} there and does not propagate
 */
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException {
    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
        throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(), location);
    // create a pdf parser
    PDDocument pdfDoc;
    try {
        // loading is expensive, so run it at minimum priority; the finally
        // block resets the priority to NORM_PRIORITY afterwards
        Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
        MemoryUsageSetting mus = MemoryUsageSetting.setupMixed(200 * 1024 * 1024);
        pdfDoc = PDDocument.load(source, mus);
    } catch (final IOException e) {
        throw new Parser.Failure(e.getMessage(), location);
    } finally {
        Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }
    if (pdfDoc.isEncrypted()) {
        final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent()) {
            // close quietly before failing; a close error is irrelevant here
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }
    }
    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
    // defaults to the current time when the PDF has no modification date
    Date docDate = new Date();
    if (info != null) {
        docTitle = info.getTitle();
        docSubject = info.getSubject();
        docAuthor = info.getAuthor();
        docPublisher = info.getProducer();
        if (docPublisher == null || docPublisher.isEmpty())
            docPublisher = info.getCreator();
        docKeywordStr = info.getKeywords();
        if (info.getModificationDate() != null)
            docDate = info.getModificationDate().getTime();
    // unused:
    // info.getTrapped());
    }
    info = null;
    // title fallbacks: file name from the URL first, then the PDF subject
    if (docTitle == null || docTitle.isEmpty()) {
        docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
        docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
        // keywords are separated by a space or a comma
        docKeywords = docKeywordStr.split(" |,");
    }
    Document[] result = null;
    try {
        // get the links
        final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc);
        // get the fulltext (either per document or for each page)
        final PDFTextStripper stripper = new PDFTextStripper();
        if (individualPages) {
            // this is a hack which stores individual pages of the source pdf into individual index documents
            // the new documents will get a virtual link with a post argument page=X appended to the original url
            // collect text
            int pagecount = pdfDoc.getNumberOfPages();
            String[] pages = new String[pagecount];
            // stripper page numbers are 1-based, the pages array is 0-based
            for (int page = 1; page <= pagecount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(pdfDoc);
            //System.out.println("PAGE " + page + ": " + pages[page - 1]);
            }
            // create individual documents for each page
            assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size();
            result = new Document[Math.min(pages.length, pdflinks.size())];
            String loc = location.toNormalform(true);
            // NOTE(review): the guard below uses "page > pages.length" where
            // ">=" would be the exact bound; it looks unreachable because
            // result.length never exceeds pages.length — confirm.
            for (int page = 0; page < result.length; page++) {
                result[page] = new Document(// these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
                new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), mimeType, StandardCharsets.UTF_8.name(), this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0d, 0.0d, pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]), pdflinks == null || page >= pdflinks.size() ? null : pdflinks.get(page), null, null, false, docDate);
            }
        } else {
            // collect the whole text at once
            final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
            byte[] contentBytes = new byte[0];
            // get first 3 pages (always)
            stripper.setEndPage(3);
            writer.append(stripper.getText(pdfDoc));
            // remember text in case of interrupting thread
            contentBytes = writer.getBytes();
            if (pdfDoc.getNumberOfPages() > 3) {
                // spare creating/starting thread if all pages read
                // continue with page 4 (terminated, resulting in no text)
                stripper.setStartPage(4);
                // set to default
                stripper.setEndPage(Integer.MAX_VALUE);
                // we start the pdf parsing in a separate thread to ensure that it can be terminated
                final PDDocument pdfDocC = pdfDoc;
                final Thread t = new Thread() {

                    @Override
                    public void run() {
                        Thread.currentThread().setName("pdfParser.getText:" + location);
                        // failures in the helper thread are ignored; whatever
                        // text was appended so far is used
                        try {
                            writer.append(stripper.getText(pdfDocC));
                        } catch (final Throwable e) {
                        }
                    }
                };
                t.start();
                // the extraction does not always terminate, so wait at most
                // three seconds and then interrupt it
                t.join(3000);
                if (t.isAlive())
                    t.interrupt();
                // get final text before closing writer
                contentBytes = writer.getBytes();
                // free writer resources
                // NOTE(review): the writer is only closed inside this branch.
                writer.close();
            }
            // merge the per-page link collections into one set
            Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
            for (Collection<AnchorURL> pdflinksx : pdflinks) if (pdflinksx != null)
                pdflinksCombined.addAll(pdflinksx);
            result = new Document[] { new Document(location, mimeType, StandardCharsets.UTF_8.name(), this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0d, 0.0d, contentBytes, pdflinksCombined, null, null, false, docDate) };
        }
    } catch (final Throwable e) {
        // NOTE(review): every failure (including errors and interruption) is
        // dropped here without logging.
    } finally {
        try {
            pdfDoc.close();
        } catch (final Throwable e) {
        }
    }
    // Clear resources held by pdfbox. The issues below are reported as
    // resolved, but according to the original author they are not:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // pdfbox still allocates an enormous number of objects and does not
    // release them. According to the original author, the following objects
    // are stored statically and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // (visible in Java VisualVM). The explicit clean-up call below is an
    // attempt to get them out of memory.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();
    return result;
}

Example 101

Project: qi4j-sdk-master File: PDFWriter.java View source code

/**
 * Writes one page per graph display followed by the descriptor page into a
 * new PDF document and saves it to the given file.
 * <p>
 * The current content stream and the document are released afterwards, also
 * when writing fails.
 *
 * @param file          destination file
 * @param descriptor    application descriptor rendered by {@code writePage}
 * @param graphDisplays graph displays rendered by {@code writeGraphPage}
 * @throws IOException if writing, saving or closing fails
 * @throws COSVisitorException declared for the PDFBox calls made here
 */
protected void writeImpl(File file, ApplicationDetailDescriptor descriptor, List<GraphDisplay> graphDisplays) throws IOException, COSVisitorException {
    try {
        doc = new PDDocument();
        for (GraphDisplay graphDisplay : graphDisplays) {
            writeGraphPage(graphDisplay);
        }
        writePage(descriptor);
        // The last page's content stream must be closed before saving.
        if (curContentStream != null) {
            curContentStream.close();
            curContentStream = null;
        }
        doc.save(new FileOutputStream(file));
    } finally {
        // Nested finally: the document is closed even if closing a leftover
        // content stream throws.
        try {
            if (curContentStream != null) {
                curContentStream.close();
                curContentStream = null;
            }
        } finally {
            if (doc != null) {
                doc.close();
                doc = null;
            }
        }
    }
}