Java Examples for org.apache.pdfbox.io.MemoryUsageSetting
The following Java examples will help you understand the usage of org.apache.pdfbox.io.MemoryUsageSetting. These source code samples are taken from different open-source projects.
Example 1
| Project: pdfbox-master File: PDDocument.java View source code |
/**
 * Parses a PDF.
 *
 * @param file file to be loaded
 * @param password password to be used for decryption
 * @param keyStore key store to be used for decryption when using public key security
 * @param alias alias to be used for decryption when using public key security
 * @param memUsageSetting defines how memory is used for buffering PDF streams
 *
 * @return loaded document
 *
 * @throws IOException in case of a file reading or parsing error
 */
public static PDDocument load(File file, String password, InputStream keyStore, String alias, MemoryUsageSetting memUsageSetting) throws IOException {
    RandomAccessBufferedFileInputStream raFile = new RandomAccessBufferedFileInputStream(file);
    try {
        ScratchFile scratchFile = new ScratchFile(memUsageSetting);
        try {
            PDFParser parser = new PDFParser(raFile, password, keyStore, alias, scratchFile);
            parser.parse();
            // On success the returned document takes ownership of raFile and scratchFile.
            return parser.getPDDocument();
        } catch (IOException | RuntimeException e) {
            // Also catch unchecked exceptions: the original code only handled IOException,
            // so a RuntimeException from parse() leaked the scratch buffers / temp file.
            IOUtils.closeQuietly(scratchFile);
            throw e;
        }
    } catch (IOException | RuntimeException e) {
        // Close the underlying file input on any failure path (rethrown from above
        // or thrown by the ScratchFile constructor itself).
        IOUtils.closeQuietly(raFile);
        throw e;
    }
}
Example 2
| Project: DSpace-master File: PDFPackager.java View source code |
/**
 * Extracts bibliographic metadata from a PDF's Info dictionary and records it
 * as Dublin Core metadata on the given Item ("crosswalk").
 *
 * @param context DSpace context used for metadata updates
 * @param item item to receive the extracted metadata
 * @param metadata stream containing the PDF bytes
 * @throws CrosswalkException if the PDF is encrypted or has no Title
 * @throws IOException on read/parse errors
 * @throws SQLException on database errors from the item service
 * @throws AuthorizeException if the caller may not modify the item
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cos = null;
    // Hoisted out of the inner try so it can be closed in the finally block;
    // the original code never closed it, leaking its backing temp file.
    ScratchFile scratchFile = null;
    try {
        try {
            // use up to 80% of JVM free memory
            long useRAM = Runtime.getRuntime().freeMemory() * 80 / 100;
            // then fallback to temp file (unlimited size)
            scratchFile = new ScratchFile(MemoryUsageSetting.setupMixed(useRAM));
        } catch (IOException ioe) {
            // Best effort: a null scratch file is tolerated by PDFParser.
            log.warn("Error initializing scratch file: " + ioe.getMessage());
        }
        PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(metadata), scratchFile);
        parser.parse();
        cos = parser.getDocument();
        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (cos.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }
        /* PDF to DC "crosswalk":
         *
         * NOTE: This is not in a crosswalk plugin because (a) it isn't
         * useful anywhere else, and more importantly, (b) the source
         * data is not XML so it doesn't fit the plugin's interface.
         *
         * pattern of crosswalk -- PDF dict entries to DC:
         *   Title       -> title.null
         *   Author      -> contributor.author
         *   CreationDate -> date.created
         *   ModDate     -> date.created
         *   Creator     -> description.provenance (application that created orig)
         *   Producer    -> description.provenance (convertor to pdf)
         *   Subject     -> description.abstract
         *   Keywords    -> subject.other
         * date is java.util.Calendar
         */
        PDDocument pd = new PDDocument(cos);
        PDDocumentInformation docinfo = pd.getDocumentInformation();
        String title = docinfo.getTitle();
        // sanity check: item must have a title.
        if (title == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + title + "\"");
        }
        itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "title", null, "en", title);
        String value = docinfo.getAuthor();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "contributor", "author", null, value);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + value + "\"");
            }
        }
        value = docinfo.getCreator();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "description", "provenance", "en", "Application that created the original document: " + value);
        }
        value = docinfo.getProducer();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "description", "provenance", "en", "Original document converted to PDF by: " + value);
        }
        value = docinfo.getSubject();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "description", "abstract", null, value);
        }
        value = docinfo.getKeywords();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "subject", "other", null, value);
        }
        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar calValue = docinfo.getCreationDate();
        if (calValue == null) {
            calValue = docinfo.getModificationDate();
        }
        if (calValue != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "date", "created", null, (new DCDate(calValue.getTime())).toString());
        }
        itemService.update(context, item);
    } finally {
        try {
            if (cos != null) {
                cos.close();
            }
        } finally {
            // Release scratch buffers / temp file even if closing the document failed.
            if (scratchFile != null) {
                try {
                    scratchFile.close();
                } catch (IOException ioe) {
                    log.warn("Error closing scratch file: " + ioe.getMessage());
                }
            }
        }
    }
}
Example 3
| Project: yacy_search_server-master File: pdfParser.java View source code |
/**
 * Parses a PDF and converts it into one or more index Documents.
 *
 * Depending on the {@code individualPages} flag this either produces a single
 * Document for the whole PDF, or one Document per page, each addressed by a
 * virtual URL carrying a page=X query parameter.
 *
 * NOTE(review): the charset, scraper and timezoneOffset parameters are not
 * referenced in this method body; extracted text is always handled as UTF-8.
 *
 * @param location source URL of the PDF
 * @param mimeType declared MIME type of the resource
 * @param charset declared charset (unused here)
 * @param scraper vocabulary scraper (unused here)
 * @param timezoneOffset timezone offset (unused here)
 * @param source stream with the PDF bytes
 * @return the parsed Document array; may be null if extraction failed, since
 *         all extraction errors below are deliberately swallowed
 * @throws Parser.Failure when memory is low, loading fails, or the document
 *         is encrypted without extraction permission
 * @throws InterruptedException if the calling thread is interrupted
 */
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, false))
throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(), location);
// create a pdf parser
PDDocument pdfDoc;
try {
// the pdfparser is a big pain
// Lower this thread's priority while loading; restored in the finally below.
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
// Buffer up to 200 MB in RAM, then spill to a temp file.
MemoryUsageSetting mus = MemoryUsageSetting.setupMixed(200 * 1024 * 1024);
pdfDoc = PDDocument.load(source, mus);
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
} finally {
Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
}
// Give up on encrypted documents unless content extraction is permitted.
if (pdfDoc.isEncrypted()) {
final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
if (perm == null || !perm.canExtractContent()) {
try {
pdfDoc.close();
} catch (final IOException ee) {
// best-effort close before failing; secondary error intentionally ignored
}
throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
}
}
// extracting some metadata
PDDocumentInformation info = pdfDoc.getDocumentInformation();
String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
// docDate defaults to "now" and is overwritten by ModDate when present.
Date docDate = new Date();
if (info != null) {
docTitle = info.getTitle();
docSubject = info.getSubject();
docAuthor = info.getAuthor();
docPublisher = info.getProducer();
if (docPublisher == null || docPublisher.isEmpty())
docPublisher = info.getCreator();
docKeywordStr = info.getKeywords();
if (info.getModificationDate() != null)
docDate = info.getModificationDate().getTime();
// unused:
// info.getTrapped());
}
info = null;
// Fall back to the file name, then the subject, as the document title.
if (docTitle == null || docTitle.isEmpty()) {
docTitle = MultiProtocolURL.unescape(location.getFileName());
}
if (docTitle == null) {
docTitle = docSubject;
}
String[] docKeywords = null;
if (docKeywordStr != null) {
// Keywords are split on spaces or commas.
docKeywords = docKeywordStr.split(" |,");
}
Document[] result = null;
try {
// get the links
final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc);
// get the fulltext (either per document or for each page)
final PDFTextStripper stripper = new PDFTextStripper();
if (individualPages) {
// this is a hack which stores individual pages of the source pdf into individual index documents
// the new documents will get a virtual link with a post argument page=X appended to the original url
// collect text
int pagecount = pdfDoc.getNumberOfPages();
String[] pages = new String[pagecount];
for (int page = 1; page <= pagecount; page++) {
stripper.setStartPage(page);
stripper.setEndPage(page);
pages[page - 1] = stripper.getText(pdfDoc);
//System.out.println("PAGE " + page + ": " + pages[page - 1]);
}
// create individual documents for each page
assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size();
result = new Document[Math.min(pages.length, pdflinks.size())];
String loc = location.toNormalform(true);
for (int page = 0; page < result.length; page++) {
result[page] = new Document(// these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), mimeType, StandardCharsets.UTF_8.name(), this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0d, 0.0d, pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]), pdflinks == null || page >= pdflinks.size() ? null : pdflinks.get(page), null, null, false, docDate);
}
} else {
// collect the whole text at once
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = new byte[0];
// get first 3 pages (always)
stripper.setEndPage(3);
writer.append(stripper.getText(pdfDoc));
// remember text in case of interrupting thread
contentBytes = writer.getBytes();
if (pdfDoc.getNumberOfPages() > 3) {
// spare creating/starting thread if all pages read
// continue with page 4 (terminated, resulting in no text)
stripper.setStartPage(4);
// set to default
stripper.setEndPage(Integer.MAX_VALUE);
// we start the pdf parsing in a separate thread to ensure that it can be terminated
final PDDocument pdfDocC = pdfDoc;
final Thread t = new Thread() {
@Override
public void run() {
Thread.currentThread().setName("pdfParser.getText:" + location);
try {
writer.append(stripper.getText(pdfDocC));
} catch (final Throwable e) {
// best-effort: text beyond page 3 is optional, errors ignored
}
}
};
t.start();
// pdfbox likes to forget to terminate ... (quite often)
// Wait at most 3 seconds for the remaining pages, then interrupt.
t.join(3000);
if (t.isAlive())
t.interrupt();
// get final text before closing writer
contentBytes = writer.getBytes();
// free writer resources
writer.close();
}
// Merge per-page link collections into one set for the single document.
Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
for (Collection<AnchorURL> pdflinksx : pdflinks) if (pdflinksx != null)
pdflinksCombined.addAll(pdflinksx);
result = new Document[] { new Document(location, mimeType, StandardCharsets.UTF_8.name(), this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0d, 0.0d, contentBytes, pdflinksCombined, null, null, false, docDate) };
}
} catch (final Throwable e) {
// NOTE(review): all extraction errors are swallowed here, so result may
// still be null when this method returns.
} finally {
try {
pdfDoc.close();
} catch (final Throwable e) {
// best-effort close; ignore secondary failure
}
}
// clear resources in pdfbox. they say that is resolved but it's not. see:
// https://issues.apache.org/jira/browse/PDFBOX-313
// https://issues.apache.org/jira/browse/PDFBOX-351
// https://issues.apache.org/jira/browse/PDFBOX-441
// the pdfbox still generates enormeous number of object allocations and don't delete these
// the following Object are statically stored and never flushed:
// COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
// COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
// the great number of these objects can easily be seen in Java Visual VM
// we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out.
pdfDoc = null;
clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();
return result;
}