Java Examples for org.pdfbox.pdmodel.PDDocument
The following Java examples will help you understand the usage of org.pdfbox.pdmodel.PDDocument. These source code samples are taken from different open source projects.
Example 1
| Project: xwiki-clams-core-master File: PDFTextExtractor.java View source code |
/**
 * Extracts the text of a PDF that is held in memory.
 *
 * @param data raw bytes of the PDF file
 * @return the text that {@code PDFTextStripper} wrote for the parsed document
 * @throws Exception if parsing or text extraction fails
 */
public String getText(byte[] data) throws Exception {
    PDDocument parsed = null;
    try {
        PDFParser pdfParser = new PDFParser(new ByteArrayInputStream(data));
        pdfParser.parse();
        parsed = pdfParser.getPDDocument();

        Writer sink = new CharArrayWriter();
        new PDFTextStripper().writeText(parsed, sink);
        return sink.toString();
    } finally {
        // Close the document whether or not extraction succeeded.
        if (parsed != null) {
            parsed.close();
        }
    }
}
Example 2
| Project: carbon-registry-master File: PDFIndexer.java View source code |
/**
 * Builds an index document from the text content of a PDF.
 *
 * @param fileData the file to index; {@code fileData.data} is parsed as the PDF bytes and
 *                 {@code fileData.path} is stored alongside the extracted text
 * @return an {@code IndexDocument} carrying the path and the extracted text
 * @throws SolrException with {@code SERVER_ERROR} if parsing or text extraction raises an
 *                       {@code IOException}; the original exception is attached as the cause
 */
public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
    COSDocument cosDoc = null;
    try {
        PDFParser parser = new PDFParser(new ByteArrayInputStream(fileData.data));
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        String docText = stripper.getText(new PDDocument(cosDoc));
        return new IndexDocument(fileData.path, docText, null);
    } catch (IOException e) {
        String msg = "Failed to write to the index";
        log.error(msg, e);
        // Keep the IOException as the cause so callers can see why extraction failed.
        throw new SolrException(ErrorCode.SERVER_ERROR, msg, e);
    } finally {
        if (cosDoc != null) {
            try {
                cosDoc.close();
            } catch (IOException e) {
                // A failed close must not mask the result or the primary exception.
                log.error("Failed to close pdf doc stream ", e);
            }
        }
    }
}
Example 3
| Project: openmicroscopy-master File: PdfParser.java View source code |
/**
 * Loads the PDF from {@code file} and, unless the document is encrypted, writes its text
 * to {@code writer}. I/O failures are logged as warnings instead of being propagated.
 */
@Override
public void run() {
    try {
        document = PDDocument.load(file);
    } catch (IOException loadFailure) {
        log.warn("Could not load Pdf " + file, loadFailure);
        try {
            writer.close();
        } catch (IOException ignored) {
            // Nothing further can be done if the writer fails to close.
        }
    }

    try {
        if (document == null || document.isEncrypted()) {
            return;
        }
        try {
            // The stripper is created inside this try so that close() also runs
            // when its construction fails.
            PDFTextStripper textStripper = new PDFTextStripper();
            textStripper.writeText(document, writer);
        } finally {
            close();
        }
    } catch (IOException readFailure) {
        log.warn("Error reading pdf file", readFailure);
    }
}
Example 4
| Project: Hibernate-Search-on-action-master File: TestPDFTextExtractor.java View source code |
/**
 * Parses a sample PDF, indexes the resulting {@code Pdf} entity and checks that a
 * full-text query on the description finds exactly that entity.
 *
 * @throws Exception if parsing, indexing or querying fails; failures are propagated so
 *                   the test is reported as failed instead of silently passing
 */
@Test(groups = "ch13")
public void testPDFExtractor() throws Exception {
    FullTextSession session = Search.getFullTextSession(openSession());
    Transaction tx = session.beginTransaction();
    try {
        File f = new File("ch13/src/com/manning/hsia/dvdstore/file1.pdf");
        istream = new FileInputStream(f.getAbsolutePath());
        PDFParser p = new PDFParser(istream);
        p.parse();
        PDDocument doc = p.getPDDocument();
        Pdf pdf = getDocument(doc);
        closeInputStream(istream);
        closeDocument(doc);
        pdf.setId(1);
        buildIndex(pdf, session, tx);

        tx = session.beginTransaction();
        QueryParser parser = new QueryParser("description", analyzer);
        Query query = parser.parse("description:salesman");
        org.hibernate.search.FullTextQuery hibQuery = session.createFullTextQuery(query, Pdf.class);
        List<?> results = hibQuery.list();
        assert results.size() == 1 : "incorrect result size";
        Pdf result = (Pdf) results.get(0);
        assert result.getAuthor().startsWith("John Griffin") : "incorrect author";
        assert result.getDescription().startsWith("Keanu Reeves") : "incorrect description";

        // Remove everything that was persisted so the test leaves no data behind.
        for (Object element : session.createQuery("from " + Pdf.class.getName()).list()) {
            session.delete(element);
        }
        tx.commit();
    } finally {
        // No catch block: an exception must fail the test rather than be printed and ignored.
        session.close();
    }
}
Example 5
| Project: CORISCO2-master File: PDFPackager.java View source code |
/**
 * Copies entries of a PDF's Info dictionary into Dublin Core fields of an item and then
 * calls {@code item.update()}.
 *
 * @param context  not used by this method
 * @param item     item that receives the Dublin Core values
 * @param metadata stream containing the PDF document to parse
 * @throws MetadataValidationException if the PDF is encrypted or has no Title entry
 * @throws CrosswalkException          declared for callers
 * @throws IOException                 if the PDF cannot be parsed or closed
 * @throws SQLException                declared for callers
 * @throws AuthorizeException          declared for callers
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cos = null;
    try {
        PDFParser parser = new PDFParser(metadata);
        parser.parse();
        cos = parser.getDocument();
        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (cos.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }
        /* PDF to DC "crosswalk":
         *
         * NOTE: This is not in a crosswalk plugin because (a) it isn't
         * useful anywhere else, and more importantly, (b) the source
         * data is not XML so it doesn't fit the plugin's interface.
         *
         * pattern of crosswalk -- PDF dict entries to DC:
         * Title -> title.null
         * Author -> contributor.author
         * CreationDate -> date.created
         * ModDate -> date.created (only when CreationDate is absent)
         * Creator -> description.provenance (application that created orig)
         * Producer -> description.provenance (convertor to pdf)
         * Subject -> description.abstract
         * Keywords -> subject.other
         * date is java.util.Calendar
         */
        PDDocument pd = new PDDocument(cos);
        PDDocumentInformation docinfo = pd.getDocumentInformation();
        String title = docinfo.getTitle();
        // sanity check: item must have a title.
        if (title == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        log.debug("PDF Info dict title=\"" + title + "\"");
        item.addDC("title", null, "en", title);

        String value;
        if ((value = docinfo.getAuthor()) != null) {
            item.addDC("contributor", "author", null, value);
            log.debug("PDF Info dict author=\"" + value + "\"");
        }
        if ((value = docinfo.getCreator()) != null) {
            item.addDC("description", "provenance", "en", "Application that created the original document: " + value);
        }
        if ((value = docinfo.getProducer()) != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + value);
        }
        if ((value = docinfo.getSubject()) != null) {
            item.addDC("description", "abstract", null, value);
        }
        if ((value = docinfo.getKeywords()) != null) {
            item.addDC("subject", "other", null, value);
        }

        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar calValue;
        if ((calValue = docinfo.getCreationDate()) == null) {
            calValue = docinfo.getModificationDate();
        }
        if (calValue != null) {
            item.addDC("date", "created", null, (new DCDate(calValue.getTime())).toString());
        }
        item.update();
    } finally {
        if (cos != null) {
            cos.close();
        }
    }
}
Example 6
| Project: jinhe-tss-master File: AttachmentIndex.java View source code |
/**
 * Extracts the text of a PDF file.
 *
 * @param pdfFile the PDF file to read
 * @return the extracted text, or an empty string if loading or extraction fails
 */
private String getContentFromPDF(File pdfFile) {
    // In-memory representation of the PDF document.
    PDDocument document = null;
    String content = "";
    try {
        // Load the PDF document.
        document = PDDocument.load(pdfFile);
        // PDFTextStripper performs the text extraction.
        PDFTextStripper stripper = new PDFTextStripper();
        // Do not sort the text by position.
        stripper.setSortByPosition(false);
        // Start at the first page ...
        stripper.setStartPage(1);
        // ... and use Integer.MAX_VALUE as the end page.
        stripper.setEndPage(Integer.MAX_VALUE);
        // Extract the text with PDFTextStripper.getText().
        content = stripper.getText(document);
    } catch (Exception e) {
        log.error("发布索引时提取PDF文档:" + pdfFile.getPath() + " 内容失败!", e);
        return "";
    } finally {
        if (document != null) {
            // Close the PDF document; report a failure through the logger like the
            // extraction error above instead of printing to stderr.
            try {
                document.close();
            } catch (IOException e) {
                log.error("Failed to close PDF document: " + pdfFile.getPath(), e);
            }
        }
    }
    return content;
}
Example 7
| Project: railo-master File: PDFUtil.java View source code |
/**
 * Writes the text of a PDF through a {@code PDFText2HTML} stripper and returns the output.
 *
 * @param doc         document to convert; the result of {@code doc.toPDDocument()} is processed
 * @param pageNumbers not used by the current implementation; the whole document is written
 * @return the stripper output as a {@code String}
 * @throws IOException              declared for callers
 * @throws CryptographyException    declared for callers
 * @throws InvalidPasswordException declared for callers
 */
public static Object extractText(PDFDocument doc, Set<Integer> pageNumbers) throws IOException, CryptographyException, InvalidPasswordException {
    PDDocument source = doc.toPDDocument();
    PDFText2HTML htmlStripper = new PDFText2HTML();
    StringWriter out = new StringWriter();
    htmlStripper.writeText(source, out);
    return out.toString();
}
Example 8
| Project: openpipe-master File: PDFParser.java View source code |
/**
 * Loads a PDF from the given parse data and returns its text and title.
 *
 * @param data supplies the input stream of the PDF to load
 * @return a result holding the stripped text and the title from the document information
 * @throws IOException     if loading the document or writing its text fails
 * @throws ParserException declared for callers
 */
@Override
public ParserResult parse(ParseData data) throws IOException, ParserException {
    final PDDocument pdf = PDDocument.load(data.getInputStream(), scratchFile);
    try {
        writer.reset();
        try {
            stripper.writeText(pdf, writer);
            final ParserResultImpl parsed = new ParserResultImpl();
            parsed.setText(writer.toString());
            parsed.setTitle(pdf.getDocumentInformation().getTitle());
            return parsed;
        } finally {
            // Runs before the document is closed, on success and on failure.
            writer.trimToMaxSize(1024 * 64);
        }
    } finally {
        try {
            pdf.close();
        } catch (IOException ignored) {
            // A failure to close is deliberately not reported.
        }
    }
}
Example 9
| Project: agile-itsm-master File: Arquivo.java View source code |
/**
 * Extracts the text of a document in PDF format.
 *
 * @param caminhoDocumento path of the PDF file to read
 * @return the extracted text; empty if the file is missing or cannot be read, because
 *         those failures are printed and not propagated
 * @throws IOException if closing the parsed document or the file stream fails
 */
private StringBuilder extrairFormatoPDF(String caminhoDocumento) throws IOException {
    StringBuilder texto = new StringBuilder();
    FileInputStream fi = null;
    COSDocument cd = null;
    try {
        fi = new FileInputStream(new File(caminhoDocumento));
        PDFParser parser = new PDFParser(fi);
        parser.parse();
        cd = parser.getDocument();
        texto.append(new PDFTextStripper().getText(new PDDocument(cd)));
    } catch (FileNotFoundException e1) {
        e1.printStackTrace();
    } catch (IOException e1) {
        e1.printStackTrace();
    } finally {
        // Nested finally: the file stream is closed even if closing the document throws.
        try {
            if (cd != null) {
                cd.close();
            }
        } finally {
            if (fi != null) {
                fi.close();
            }
        }
    }
    return texto;
}
Example 10
| Project: Desktop-master File: XMPUtilTest.java View source code |
/**
 * Sets a manually constructed XMP string as the metadata of a PDF file and saves the
 * document back to the same path.
 *
 * @param tempFile  PDF file that is loaded and then overwritten
 * @param xmpString XMP text stored as the document catalog's metadata
 * @throws Exception if the file cannot be loaded, updated or saved
 */
public void writeManually(File tempFile, String xmpString) throws Exception {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(tempFile.getAbsoluteFile());
        if (pdf.isEncrypted()) {
            System.err.println("Error: Cannot add metadata to encrypted document.");
            System.exit(1);
        }
        PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();

        // Encode the XMP text as UTF8 bytes for the metadata stream.
        ByteArrayOutputStream encoded = new ByteArrayOutputStream();
        OutputStreamWriter encoder = new OutputStreamWriter(encoded, "UTF8");
        encoder.write(xmpString);
        encoder.close();

        ByteArrayInputStream xmpBytes = new ByteArrayInputStream(encoded.toByteArray());
        docCatalog.setMetadata(new PDMetadata(pdf, xmpBytes, false));
        pdf.save(tempFile.getAbsolutePath());
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}
Example 11
| Project: infoglue-master File: LuceneController.java View source code |
/**
 * Extracts indexable text from an asset file, based on the asset's content type.
 * PDF ({@code application/pdf}) and Word ({@code application/msword}) files are handled;
 * any other content type yields an empty string. Extraction errors are logged as
 * warnings and are not propagated.
 *
 * @param digitalAssetVO asset whose content type selects the extractor
 * @param file           file holding the asset's content
 * @return the extracted text, or an empty string if nothing could be extracted
 */
private String extractTextToIndex(DigitalAssetVO digitalAssetVO, File file) {
    String text = "";
    if (logger.isInfoEnabled()) {
        logger.info("Asset content type:" + digitalAssetVO.getAssetContentType());
    }
    if (digitalAssetVO.getAssetContentType().equalsIgnoreCase("application/pdf")) {
        try {
            Writer output = null;
            PDDocument document = null;
            try {
                document = PDDocument.load(file);
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                if (!document.isEncrypted()) {
                    output = new OutputStreamWriter(baos, "UTF-8");
                    PDFTextStripper stripper = new PDFTextStripper();
                    stripper.writeText(document, output);
                    // OutputStreamWriter buffers encoded bytes; flush before reading the
                    // byte array, otherwise the tail of the text can be missing.
                    output.flush();
                    text = baos.toString("UTF-8");
                    if (logger.isInfoEnabled()) {
                        logger.info("PDF Document has " + text.length() + " chars\n\n" + text);
                    }
                }
            } catch (Exception e) {
                logger.warn("Error indexing file: " + file + "\nMessage: " + e.getMessage());
            } finally {
                // Nested finally: the document is closed even if closing the writer throws.
                try {
                    if (output != null) {
                        output.close();
                    }
                } finally {
                    if (document != null) {
                        document.close();
                    }
                }
            }
        } catch (Exception e) {
            logger.warn("Error indexing:" + e.getMessage());
        }
    } else if (digitalAssetVO.getAssetContentType().equalsIgnoreCase("application/msword")) {
        try {
            // Open the file once and always close it; previously a second stream was
            // opened for POI and never closed.
            InputStream is = new FileInputStream(file);
            POIFSFileSystem fs;
            try {
                fs = new POIFSFileSystem(is);
            } finally {
                is.close();
            }
            // Create a document for this file
            HWPFDocument doc = new HWPFDocument(fs);
            // Create a WordExtractor to read the text of the word document
            WordExtractor we = new WordExtractor(doc);
            // Extract all paragraphs in the document as strings
            text = we.getText();
            // Output the document
            if (logger.isInfoEnabled()) {
                logger.info("Word Document has " + text.length() + " chars\n\n" + text);
            }
        } catch (Exception e) {
            logger.warn("Error indexing file: " + file + "\nMessage: " + e.getMessage());
        }
    }
    return text;
}
Example 12
| Project: apache-nutch-fork-master File: PdfParser.java View source code |
/**
 * Parses PDF content into text, title and metadata.
 *
 * @param content fetched content whose bytes are parsed as a PDF
 * @return a successful parse result with text, title and outlinks, or an empty parse
 *         result with a failed status if the content is truncated, cannot be decrypted
 *         or cannot be handled as a PDF
 */
public ParseResult getParse(Content content) {
    // in-memory representation of the pdf file
    PDDocument pdf = null;
    String text = null;
    String title = null;
    Metadata metadata = new Metadata();
    try {
        byte[] bytes = content.getContent();
        String declaredLength = content.getMetadata().get(Response.CONTENT_LENGTH);
        if (declaredLength != null && bytes.length != Integer.parseInt(declaredLength)) {
            ParseStatus truncated = new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + bytes.length + " bytes. Parser can't handle incomplete pdf file.");
            return truncated.getEmptyParseResult(content.getUrl(), getConf());
        }

        PDFParser pdfParser = new PDFParser(new ByteArrayInputStream(bytes));
        pdfParser.parse();
        pdf = pdfParser.getPDDocument();
        if (pdf.isEncrypted()) {
            // Just try using the default password and move on
            pdf.openProtection(new StandardDecryptionMaterial(""));
        }

        // collect text
        PDFTextStripper textStripper = new PDFTextStripper();
        text = textStripper.getText(pdf);

        // collect title
        PDDocumentInformation docInfo = pdf.getDocumentInformation();
        title = docInfo.getTitle();

        // more useful info, currently not used. please keep them for future use.
        metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getNumberOfPages()));
        metadata.add(Metadata.AUTHOR, docInfo.getAuthor());
        metadata.add(Metadata.SUBJECT, docInfo.getSubject());
        metadata.add(Metadata.KEYWORDS, docInfo.getKeywords());
        metadata.add(Metadata.CREATOR, docInfo.getCreator());
        metadata.add(Metadata.PUBLISHER, docInfo.getProducer());
        // TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
        // error here
        // metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
        // metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));
    } catch (CryptographyException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Error decrypting document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (BadSecurityHandlerException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Error decrypting document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
        // catches everything else, including run time exceptions
        if (LOG.isWarnEnabled()) {
            LOG.warn("General exception in PDF parser: " + e.getMessage());
            e.printStackTrace(LogUtil.getWarnStream(LOG));
        }
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Can't be handled as pdf document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } finally {
        try {
            if (pdf != null) {
                pdf.close();
            }
        } catch (IOException ignored) {
            // a failure to close the document is deliberately not reported
        }
    }

    if (text == null) {
        text = "";
    }
    if (title == null) {
        title = "";
    }

    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata(), metadata);
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // any filter?
    // return HtmlParseFilters.filter(content, parse, root);
}
Example 13
| Project: nutchbase-master File: PdfParser.java View source code |
/**
 * Parses PDF content into text, title and metadata.
 *
 * @param content fetched content whose bytes are parsed as a PDF
 * @return a successful parse result with text, title and outlinks, or an empty parse
 *         result with a failed status if the content is truncated, cannot be decrypted
 *         or cannot be handled as a PDF
 */
public ParseResult getParse(Content content) {
    // in-memory representation of the pdf file
    PDDocument pdf = null;
    String text = null;
    String title = null;
    Metadata metadata = new Metadata();
    try {
        byte[] bytes = content.getContent();
        String declaredLength = content.getMetadata().get(Response.CONTENT_LENGTH);
        if (declaredLength != null && bytes.length != Integer.parseInt(declaredLength)) {
            ParseStatus truncated = new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + bytes.length + " bytes. Parser can't handle incomplete pdf file.");
            return truncated.getEmptyParseResult(content.getUrl(), getConf());
        }

        PDFParser pdfParser = new PDFParser(new ByteArrayInputStream(bytes));
        pdfParser.parse();
        pdf = pdfParser.getPDDocument();
        if (pdf.isEncrypted()) {
            // Just try using the default password and move on
            pdf.openProtection(new StandardDecryptionMaterial(""));
        }

        // collect text
        PDFTextStripper textStripper = new PDFTextStripper();
        text = textStripper.getText(pdf);

        // collect title
        PDDocumentInformation docInfo = pdf.getDocumentInformation();
        title = docInfo.getTitle();

        // more useful info, currently not used. please keep them for future use.
        metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getNumberOfPages()));
        metadata.add(Metadata.AUTHOR, docInfo.getAuthor());
        metadata.add(Metadata.SUBJECT, docInfo.getSubject());
        metadata.add(Metadata.KEYWORDS, docInfo.getKeywords());
        metadata.add(Metadata.CREATOR, docInfo.getCreator());
        metadata.add(Metadata.PUBLISHER, docInfo.getProducer());
        // TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
        // error here
        // metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
        // metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));
    } catch (CryptographyException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Error decrypting document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (BadSecurityHandlerException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Error decrypting document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
        // catches everything else, including run time exceptions
        if (LOG.isWarnEnabled()) {
            LOG.warn("General exception in PDF parser: " + e.getMessage());
            e.printStackTrace(LogUtil.getWarnStream(LOG));
        }
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Can't be handled as pdf document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } finally {
        try {
            if (pdf != null) {
                pdf.close();
            }
        } catch (IOException ignored) {
            // a failure to close the document is deliberately not reported
        }
    }

    if (text == null) {
        text = "";
    }
    if (title == null) {
        title = "";
    }

    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata(), metadata);
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // any filter?
    // return HtmlParseFilters.filter(content, parse, root);
}
Example 14
| Project: gnutch-master File: PdfParser.java View source code |
/**
 * Parses PDF content into text, title and metadata.
 *
 * @param content fetched content whose bytes are parsed as a PDF
 * @return a successful parse result with text, title and outlinks, or an empty parse
 *         result with a failed status if the content is truncated, cannot be decrypted
 *         or cannot be handled as a PDF
 */
public ParseResult getParse(Content content) {
    // in-memory representation of the pdf file
    PDDocument pdf = null;
    String text = null;
    String title = null;
    Metadata metadata = new Metadata();
    try {
        byte[] bytes = content.getContent();
        String declaredLength = content.getMetadata().get(Response.CONTENT_LENGTH);
        if (declaredLength != null && bytes.length != Integer.parseInt(declaredLength)) {
            ParseStatus truncated = new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + bytes.length + " bytes. Parser can't handle incomplete pdf file.");
            return truncated.getEmptyParseResult(content.getUrl(), getConf());
        }

        PDFParser pdfParser = new PDFParser(new ByteArrayInputStream(bytes));
        pdfParser.parse();
        pdf = pdfParser.getPDDocument();
        if (pdf.isEncrypted()) {
            DocumentEncryption decryptor = new DocumentEncryption(pdf);
            // Just try using the default password and move on
            decryptor.decryptDocument("");
        }

        // collect text
        PDFTextStripper textStripper = new PDFTextStripper();
        text = textStripper.getText(pdf);

        // collect title
        PDDocumentInformation docInfo = pdf.getDocumentInformation();
        title = docInfo.getTitle();

        // more useful info, currently not used. please keep them for future use.
        metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
        metadata.add(Metadata.AUTHOR, docInfo.getAuthor());
        metadata.add(Metadata.SUBJECT, docInfo.getSubject());
        metadata.add(Metadata.KEYWORDS, docInfo.getKeywords());
        metadata.add(Metadata.CREATOR, docInfo.getCreator());
        metadata.add(Metadata.PUBLISHER, docInfo.getProducer());
        // TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
        // error here
        // metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
        // metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));
    } catch (CryptographyException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Error decrypting document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (InvalidPasswordException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Can't decrypt document - invalid password. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
        // catches everything else, including run time exceptions
        if (LOG.isWarnEnabled()) {
            LOG.warn("General exception in PDF parser: " + e.getMessage());
            e.printStackTrace(LogUtil.getWarnStream(LOG));
        }
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Can't be handled as pdf document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } finally {
        try {
            if (pdf != null) {
                pdf.close();
            }
        } catch (IOException ignored) {
            // a failure to close the document is deliberately not reported
        }
    }

    if (text == null) {
        text = "";
    }
    if (title == null) {
        title = "";
    }

    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata(), metadata);
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // any filter?
    // return HtmlParseFilters.filter(content, parse, root);
}
Example 15
| Project: Jorum-DSpace-master File: PDFPackager.java View source code |
/**
 * Copies entries of a PDF's Info dictionary into Dublin Core fields of an item and then
 * calls {@code item.update()}.
 *
 * @param context  not used by this method
 * @param item     item that receives the Dublin Core values
 * @param metadata stream containing the PDF document to parse
 * @throws MetadataValidationException if the PDF is encrypted or has no Title entry
 * @throws CrosswalkException          declared for callers
 * @throws IOException                 if the PDF cannot be parsed or closed
 * @throws SQLException                declared for callers
 * @throws AuthorizeException          declared for callers
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cos = null;
    try {
        PDFParser parser = new PDFParser(metadata);
        parser.parse();
        cos = parser.getDocument();
        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (cos.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }
        /* PDF to DC "crosswalk":
         *
         * NOTE: This is not in a crosswalk plugin because (a) it isn't
         * useful anywhere else, and more importantly, (b) the source
         * data is not XML so it doesn't fit the plugin's interface.
         *
         * pattern of crosswalk -- PDF dict entries to DC:
         * Title -> title.null
         * Author -> contributor.author
         * CreationDate -> date.created
         * ModDate -> date.created (only when CreationDate is absent)
         * Creator -> description.provenance (application that created orig)
         * Producer -> description.provenance (convertor to pdf)
         * Subject -> description.abstract
         * Keywords -> subject.other
         * date is java.util.Calendar
         */
        PDDocument pd = new PDDocument(cos);
        PDDocumentInformation docinfo = pd.getDocumentInformation();
        String title = docinfo.getTitle();
        // sanity check: item must have a title.
        if (title == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        log.debug("PDF Info dict title=\"" + title + "\"");
        item.addDC("title", null, "en", title);

        String value;
        if ((value = docinfo.getAuthor()) != null) {
            item.addDC("contributor", "author", null, value);
            log.debug("PDF Info dict author=\"" + value + "\"");
        }
        if ((value = docinfo.getCreator()) != null) {
            item.addDC("description", "provenance", "en", "Application that created the original document: " + value);
        }
        if ((value = docinfo.getProducer()) != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + value);
        }
        if ((value = docinfo.getSubject()) != null) {
            item.addDC("description", "abstract", null, value);
        }
        if ((value = docinfo.getKeywords()) != null) {
            item.addDC("subject", "other", null, value);
        }

        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar calValue;
        if ((calValue = docinfo.getCreationDate()) == null) {
            calValue = docinfo.getModificationDate();
        }
        if (calValue != null) {
            item.addDC("date", "created", null, (new DCDate(calValue.getTime())).toString());
        }
        item.update();
    } finally {
        if (cos != null) {
            cos.close();
        }
    }
}
Example 16
| Project: Docear-master File: XMPUtilTest.java View source code |
/**
 * Sets a manually constructed XMP string as the metadata of a PDF file and saves the
 * document back to the same path.
 *
 * @param tempFile  PDF file that is loaded and then overwritten
 * @param xmpString XMP text stored as the document catalog's metadata
 * @throws Exception if the file cannot be loaded, updated or saved
 */
public void writeManually(File tempFile, String xmpString) throws Exception {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(tempFile.getAbsoluteFile());
        if (pdf.isEncrypted()) {
            System.err.println("Error: Cannot add metadata to encrypted document.");
            System.exit(1);
        }
        PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();

        // Encode the XMP text as UTF8 bytes for the metadata stream.
        ByteArrayOutputStream encoded = new ByteArrayOutputStream();
        OutputStreamWriter encoder = new OutputStreamWriter(encoded, "UTF8");
        encoder.write(xmpString);
        encoder.close();

        ByteArrayInputStream xmpBytes = new ByteArrayInputStream(encoded.toByteArray());
        docCatalog.setMetadata(new PDMetadata(pdf, xmpBytes, false));
        pdf.save(tempFile.getAbsolutePath());
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}
Example 17
| Project: desktop-master File: XMPUtilTest.java View source code |
/**
 * Sets a manually constructed XMP string as the metadata of a PDF file and saves the
 * document back to the same path.
 *
 * @param tempFile  PDF file that is loaded and then overwritten
 * @param xmpString XMP text stored as the document catalog's metadata
 * @throws Exception if the file cannot be loaded, updated or saved
 */
public void writeManually(File tempFile, String xmpString) throws Exception {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(tempFile.getAbsoluteFile());
        if (pdf.isEncrypted()) {
            System.err.println("Error: Cannot add metadata to encrypted document.");
            System.exit(1);
        }
        PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();

        // Encode the XMP text as UTF8 bytes for the metadata stream.
        ByteArrayOutputStream encoded = new ByteArrayOutputStream();
        OutputStreamWriter encoder = new OutputStreamWriter(encoded, "UTF8");
        encoder.write(xmpString);
        encoder.close();

        ByteArrayInputStream xmpBytes = new ByteArrayInputStream(encoded.toByteArray());
        docCatalog.setMetadata(new PDMetadata(pdf, xmpBytes, false));
        pdf.save(tempFile.getAbsolutePath());
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}
Example 18
| Project: caelum-stella-master File: BoletoTransformerIntegrationTest.java View source code |
/**
 * Checks that the text of {@code arquivo.pdf} contains the value "40,00".
 *
 * @throws IOException if the PDF cannot be loaded, read or closed
 */
@Test
public void testPDFWriterEscreveValorCorreto() throws IOException {
    PDFTextStripper stripper = new PDFTextStripper();
    PDDocument document = PDDocument.load(new File("arquivo.pdf"));
    String text;
    try {
        text = stripper.getText(document);
    } finally {
        // Close the document even when text extraction throws.
        document.close();
    }
    assertTrue(text.contains("40,00"));
}
Example 19
| Project: novelang-master File: HttpDaemonFixture.java View source code |
/**
 * Extracts the text of a PDF given as a byte array.
 *
 * @param pdfBytes raw bytes of the PDF
 * @return the text returned by {@code PDFTextStripper.getText}
 * @throws IOException if the document cannot be loaded, read or closed
 */
public static String extractPdfText(final byte[] pdfBytes) throws IOException {
    final PDDocument loaded = PDDocument.load(new ByteArrayInputStream(pdfBytes));
    try {
        final PDFTextStripper textStripper = new PDFTextStripper();
        return textStripper.getText(loaded);
    } finally {
        loaded.close();
    }
}
Example 20
| Project: ecologylabFundamental-master File: Environment.java View source code |
/**
 * Reports whether the class {@code org.pdfbox.pdmodel.PDDocument} was found by
 * {@code checkFor}. The check runs on the first call only; later calls return the
 * stored result.
 *
 * @return the stored result of the PDFBox check
 */
public boolean hasPDFBox() {
    if (checkedForPDFBox) {
        return hasPDFBox;
    }
    checkedForPDFBox = true;
    hasPDFBox = checkFor("org.pdfbox.pdmodel.PDDocument");
    debug("hasPDFBox() = " + hasPDFBox);
    if (hasPDFBox) {
        ConsoleUtils.obtrusiveConsoleOutput("PDFBox Found");
    }
    return hasPDFBox;
}