Java Examples for org.apache.pdfbox.io.MemoryUsageSetting
The following Java examples will help you understand the usage of org.apache.pdfbox.io.MemoryUsageSetting. These source code samples are taken from different open-source projects.
Example 1
| Project: pdfbox-master File: PDDocument.java View source code |
/**
 * Parses a PDF.
 *
 * @param file file to be loaded
 * @param password password to be used for decryption
 * @param keyStore key store to be used for decryption when using public key security
 * @param alias alias to be used for decryption when using public key security
 * @param memUsageSetting defines how memory is used for buffering PDF streams
 *
 * @return loaded document
 *
 * @throws IOException in case of a file reading or parsing error
 */
public static PDDocument load(File file, String password, InputStream keyStore, String alias, MemoryUsageSetting memUsageSetting) throws IOException {
    RandomAccessBufferedFileInputStream raFile = new RandomAccessBufferedFileInputStream(file);
    try {
        ScratchFile scratchFile = new ScratchFile(memUsageSetting);
        try {
            PDFParser parser = new PDFParser(raFile, password, keyStore, alias, scratchFile);
            parser.parse();
            // On success the returned document takes ownership of raFile and scratchFile.
            return parser.getPDDocument();
        } catch (IOException | RuntimeException e) {
            // Also catch unchecked exceptions: the original code only handled IOException,
            // so a RuntimeException from parse() leaked the scratch buffers / temp file.
            IOUtils.closeQuietly(scratchFile);
            throw e;
        }
    } catch (IOException | RuntimeException e) {
        // Close the underlying file input on any failure path (rethrown from above
        // or thrown by the ScratchFile constructor itself).
        IOUtils.closeQuietly(raFile);
        throw e;
    }
}
Example 2
| Project: DSpace-master File: PDFPackager.java View source code |
/**
 * Extracts bibliographic metadata from a PDF's Info dictionary and records it
 * as Dublin Core metadata on the given Item ("crosswalk").
 *
 * @param context DSpace context used for metadata updates
 * @param item item to receive the extracted metadata
 * @param metadata stream containing the PDF bytes
 * @throws CrosswalkException if the PDF is encrypted or has no Title
 * @throws IOException on read/parse errors
 * @throws SQLException on database errors from the item service
 * @throws AuthorizeException if the caller may not modify the item
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cos = null;
    // Hoisted out of the inner try so it can be closed in the finally block;
    // the original code never closed it, leaking its backing temp file.
    ScratchFile scratchFile = null;
    try {
        try {
            // use up to 80% of JVM free memory
            long useRAM = Runtime.getRuntime().freeMemory() * 80 / 100;
            // then fallback to temp file (unlimited size)
            scratchFile = new ScratchFile(MemoryUsageSetting.setupMixed(useRAM));
        } catch (IOException ioe) {
            // Best effort: a null scratch file is tolerated by PDFParser.
            log.warn("Error initializing scratch file: " + ioe.getMessage());
        }
        PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(metadata), scratchFile);
        parser.parse();
        cos = parser.getDocument();
        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (cos.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }
        /* PDF to DC "crosswalk":
         *
         * NOTE: This is not in a crosswalk plugin because (a) it isn't
         * useful anywhere else, and more importantly, (b) the source
         * data is not XML so it doesn't fit the plugin's interface.
         *
         * pattern of crosswalk -- PDF dict entries to DC:
         *   Title       -> title.null
         *   Author      -> contributor.author
         *   CreationDate -> date.created
         *   ModDate     -> date.created
         *   Creator     -> description.provenance (application that created orig)
         *   Producer    -> description.provenance (convertor to pdf)
         *   Subject     -> description.abstract
         *   Keywords    -> subject.other
         * date is java.util.Calendar
         */
        PDDocument pd = new PDDocument(cos);
        PDDocumentInformation docinfo = pd.getDocumentInformation();
        String title = docinfo.getTitle();
        // sanity check: item must have a title.
        if (title == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + title + "\"");
        }
        itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "title", null, "en", title);
        String value = docinfo.getAuthor();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "contributor", "author", null, value);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + value + "\"");
            }
        }
        value = docinfo.getCreator();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "description", "provenance", "en", "Application that created the original document: " + value);
        }
        value = docinfo.getProducer();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "description", "provenance", "en", "Original document converted to PDF by: " + value);
        }
        value = docinfo.getSubject();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "description", "abstract", null, value);
        }
        value = docinfo.getKeywords();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "subject", "other", null, value);
        }
        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar calValue = docinfo.getCreationDate();
        if (calValue == null) {
            calValue = docinfo.getModificationDate();
        }
        if (calValue != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "date", "created", null, (new DCDate(calValue.getTime())).toString());
        }
        itemService.update(context, item);
    } finally {
        try {
            if (cos != null) {
                cos.close();
            }
        } finally {
            // Release scratch buffers / temp file even if closing the document failed.
            if (scratchFile != null) {
                try {
                    scratchFile.close();
                } catch (IOException ioe) {
                    log.warn("Error closing scratch file: " + ioe.getMessage());
                }
            }
        }
    }
}
Example 3
| Project: yacy_search_server-master File: pdfParser.java View source code |
/**
 * Parses a PDF and converts it into one or more index Documents.
 *
 * Depending on the {@code individualPages} flag this either produces a single
 * Document for the whole PDF, or one Document per page, each addressed by a
 * virtual URL carrying a page=X query parameter.
 *
 * NOTE(review): the charset, scraper and timezoneOffset parameters are not
 * referenced in this method body; extracted text is always handled as UTF-8.
 *
 * @param location source URL of the PDF
 * @param mimeType declared MIME type of the resource
 * @param charset declared charset (unused here)
 * @param scraper vocabulary scraper (unused here)
 * @param timezoneOffset timezone offset (unused here)
 * @param source stream with the PDF bytes
 * @return the parsed Document array; may be null if extraction failed, since
 *         all extraction errors below are deliberately swallowed
 * @throws Parser.Failure when memory is low, loading fails, or the document
 *         is encrypted without extraction permission
 * @throws InterruptedException if the calling thread is interrupted
 */
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, false))
throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(), location);
// create a pdf parser
PDDocument pdfDoc;
try {
// the pdfparser is a big pain
// Lower this thread's priority while loading; restored in the finally below.
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
// Buffer up to 200 MB in RAM, then spill to a temp file.
MemoryUsageSetting mus = MemoryUsageSetting.setupMixed(200 * 1024 * 1024);
pdfDoc = PDDocument.load(source, mus);
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
} finally {
Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
}
// Give up on encrypted documents unless content extraction is permitted.
if (pdfDoc.isEncrypted()) {
final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
if (perm == null || !perm.canExtractContent()) {
try {
pdfDoc.close();
} catch (final IOException ee) {
// best-effort close before failing; secondary error intentionally ignored
}
throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
}
}
// extracting some metadata
PDDocumentInformation info = pdfDoc.getDocumentInformation();
String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
// docDate defaults to "now" and is overwritten by ModDate when present.
Date docDate = new Date();
if (info != null) {
docTitle = info.getTitle();
docSubject = info.getSubject();
docAuthor = info.getAuthor();
docPublisher = info.getProducer();
if (docPublisher == null || docPublisher.isEmpty())
docPublisher = info.getCreator();
docKeywordStr = info.getKeywords();
if (info.getModificationDate() != null)
docDate = info.getModificationDate().getTime();
// unused:
// info.getTrapped());
}
info = null;
// Fall back to the file name, then the subject, as the document title.
if (docTitle == null || docTitle.isEmpty()) {
docTitle = MultiProtocolURL.unescape(location.getFileName());
}
if (docTitle == null) {
docTitle = docSubject;
}
String[] docKeywords = null;
if (docKeywordStr != null) {
// Keywords are split on spaces or commas.
docKeywords = docKeywordStr.split(" |,");
}
Document[] result = null;
try {
// get the links
final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc);
// get the fulltext (either per document or for each page)
final PDFTextStripper stripper = new PDFTextStripper();
if (individualPages) {
// this is a hack which stores individual pages of the source pdf into individual index documents
// the new documents will get a virtual link with a post argument page=X appended to the original url
// collect text
int pagecount = pdfDoc.getNumberOfPages();
String[] pages = new String[pagecount];
for (int page = 1; page <= pagecount; page++) {
stripper.setStartPage(page);
stripper.setEndPage(page);
pages[page - 1] = stripper.getText(pdfDoc);
//System.out.println("PAGE " + page + ": " + pages[page - 1]);
}
// create individual documents for each page
assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size();
result = new Document[Math.min(pages.length, pdflinks.size())];
String loc = location.toNormalform(true);
for (int page = 0; page < result.length; page++) {
result[page] = new Document(// these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), mimeType, StandardCharsets.UTF_8.name(), this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0d, 0.0d, pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]), pdflinks == null || page >= pdflinks.size() ? null : pdflinks.get(page), null, null, false, docDate);
}
} else {
// collect the whole text at once
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = new byte[0];
// get first 3 pages (always)
stripper.setEndPage(3);
writer.append(stripper.getText(pdfDoc));
// remember text in case of interrupting thread
contentBytes = writer.getBytes();
if (pdfDoc.getNumberOfPages() > 3) {
// spare creating/starting thread if all pages read
// continue with page 4 (terminated, resulting in no text)
stripper.setStartPage(4);
// set to default
stripper.setEndPage(Integer.MAX_VALUE);
// we start the pdf parsing in a separate thread to ensure that it can be terminated
final PDDocument pdfDocC = pdfDoc;
final Thread t = new Thread() {
@Override
public void run() {
Thread.currentThread().setName("pdfParser.getText:" + location);
try {
writer.append(stripper.getText(pdfDocC));
} catch (final Throwable e) {
// best-effort: text beyond page 3 is optional, errors ignored
}
}
};
t.start();
// pdfbox likes to forget to terminate ... (quite often)
// Wait at most 3 seconds for the remaining pages, then interrupt.
t.join(3000);
if (t.isAlive())
t.interrupt();
// get final text before closing writer
contentBytes = writer.getBytes();
// free writer resources
writer.close();
}
// Merge per-page link collections into one set for the single document.
Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
for (Collection<AnchorURL> pdflinksx : pdflinks) if (pdflinksx != null)
pdflinksCombined.addAll(pdflinksx);
result = new Document[] { new Document(location, mimeType, StandardCharsets.UTF_8.name(), this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0d, 0.0d, contentBytes, pdflinksCombined, null, null, false, docDate) };
}
} catch (final Throwable e) {
// NOTE(review): all extraction errors are swallowed here, so result may
// still be null when this method returns.
} finally {
try {
pdfDoc.close();
} catch (final Throwable e) {
// best-effort close; ignore secondary failure
}
}
// clear resources in pdfbox. they say that is resolved but it's not. see:
// https://issues.apache.org/jira/browse/PDFBOX-313
// https://issues.apache.org/jira/browse/PDFBOX-351
// https://issues.apache.org/jira/browse/PDFBOX-441
// the pdfbox still generates enormeous number of object allocations and don't delete these
// the following Object are statically stored and never flushed:
// COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
// COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
// the great number of these objects can easily be seen in Java Visual VM
// we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out.
pdfDoc = null;
clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();
return result;
}