Java Examples for org.pdfbox.pdmodel.PDDocument
The following Java examples will help you understand how org.pdfbox.pdmodel.PDDocument is used. These source code samples are taken from different open source projects.
Example 1
Project: xwiki-clams-core-master File: PDFTextExtractor.java View source code |
/**
 * Extracts the plain text of a PDF held in memory.
 *
 * @param data raw bytes of the PDF file
 * @return the text that {@link PDFTextStripper} wrote for the parsed document
 * @throws Exception if parsing, text extraction or closing the document fails
 */
public String getText(byte[] data) throws Exception {
    PDDocument parsed = null;
    try {
        PDFParser pdfParser = new PDFParser(new ByteArrayInputStream(data));
        pdfParser.parse();
        parsed = pdfParser.getPDDocument();

        Writer sink = new CharArrayWriter();
        PDFTextStripper textStripper = new PDFTextStripper();
        textStripper.writeText(parsed, sink);
        return sink.toString();
    } finally {
        // Release the document even when parsing or stripping failed part-way.
        if (parsed != null) {
            parsed.close();
        }
    }
}
Example 2
Project: carbon-registry-master File: PDFIndexer.java View source code |
/**
 * Builds an index document from the text content of a PDF.
 *
 * @param fileData holder whose {@code data} bytes are parsed as a PDF and whose {@code path}
 *     is passed on to the resulting index document
 * @return an {@code IndexDocument} created from the path and the extracted text
 * @throws SolrException with {@code SERVER_ERROR} if reading the PDF fails; the underlying
 *     {@link IOException} is attached as the cause
 */
public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
    COSDocument cosDoc = null;
    try {
        PDFParser parser = new PDFParser(new ByteArrayInputStream(fileData.data));
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        String docText = stripper.getText(new PDDocument(cosDoc));
        return new IndexDocument(fileData.path, docText, null);
    } catch (IOException e) {
        String msg = "Failed to write to the index";
        log.error(msg, e);
        // Keep the original failure attached so callers can see why extraction failed.
        throw new SolrException(ErrorCode.SERVER_ERROR, msg, e);
    } finally {
        if (cosDoc != null) {
            try {
                cosDoc.close();
            } catch (IOException e) {
                // A close failure must not mask the result or the primary exception.
                log.error("Failed to close pdf doc stream ", e);
            }
        }
    }
}
Example 3
Project: openmicroscopy-master File: PdfParser.java View source code |
/**
 * Loads the PDF in {@code file} and, unless it is encrypted, writes its text to {@code writer}.
 * I/O failures are logged as warnings rather than propagated; {@code close()} is called once
 * text extraction has been attempted.
 */
@Override
public void run() {
    try {
        document = PDDocument.load(file);
    } catch (IOException loadFailure) {
        log.warn("Could not load Pdf " + file, loadFailure);
        try {
            writer.close();
        } catch (IOException ignored) {
            // Nothing more can be done if the writer fails to close here.
        }
    }

    try {
        boolean extractable = document != null && !document.isEncrypted();
        if (extractable) {
            try {
                PDFTextStripper textStripper = new PDFTextStripper();
                textStripper.writeText(document, writer);
            } finally {
                close();
            }
        }
    } catch (IOException readFailure) {
        log.warn("Error reading pdf file", readFailure);
    }
}
Example 4
Project: Hibernate-Search-on-action-master File: TestPDFTextExtractor.java View source code |
/**
 * Parses a sample PDF, indexes the resulting {@code Pdf} entity and checks that a full-text
 * query on the description field finds exactly that entity.
 *
 * @throws Exception if parsing, indexing or querying fails; such failures are propagated so
 *     that the test fails instead of passing without having verified anything
 */
@Test(groups = "ch13")
public void testPDFExtractor() throws Exception {
    FullTextSession session = Search.getFullTextSession(openSession());
    Transaction tx = session.beginTransaction();
    try {
        File f = new File("ch13/src/com/manning/hsia/dvdstore/file1.pdf");
        istream = new FileInputStream(f.getAbsolutePath());
        PDFParser p = new PDFParser(istream);
        p.parse();
        PDDocument doc = p.getPDDocument();
        Pdf pdf = getDocument(doc);
        closeInputStream(istream);
        closeDocument(doc);

        pdf.setId(1);
        buildIndex(pdf, session, tx);

        tx = session.beginTransaction();
        QueryParser parser = new QueryParser("description", analyzer);
        Query query = parser.parse("description:salesman");
        org.hibernate.search.FullTextQuery hibQuery = session.createFullTextQuery(query, Pdf.class);
        List results = hibQuery.list();
        assert results.size() == 1 : "incorrect result size";
        Pdf result = (Pdf) results.get(0);
        assert result.getAuthor().startsWith("John Griffin") : "incorrect author";
        assert result.getDescription().startsWith("Keanu Reeves") : "incorrect description";

        // Clean up the persisted entities so the test leaves no data behind.
        for (Object element : session.createQuery("from " + Pdf.class.getName()).list()) {
            session.delete(element);
        }
        tx.commit();
    } finally {
        // No catch block here: swallowing exceptions would let a broken test pass.
        session.close();
    }
}
Example 5
Project: CORISCO2-master File: PDFPackager.java View source code |
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException { COSDocument cos = null; try { PDFParser parser = new PDFParser(metadata); parser.parse(); cos = parser.getDocument(); // sanity check: PDFBox breaks on encrypted documents, so give up. if (cos.getEncryptionDictionary() != null) throw new MetadataValidationException("This packager cannot accept an encrypted PDF document."); /* PDF to DC "crosswalk": * * NOTE: This is not in a crosswalk plugin because (a) it isn't * useful anywhere else, and more importantly, (b) the source * data is not XML so it doesn't fit the plugin's interface. * * pattern of crosswalk -- PDF dict entries to DC: * Title -> title.null * Author -> contributor.author * CreationDate -> date.created * ModDate -> date.created * Creator -> description.provenance (application that created orig) * Producer -> description.provenance (convertor to pdf) * Subject -> description.abstract * Keywords -> subject.other * date is java.util.Calendar */ PDDocument pd = new PDDocument(cos); PDDocumentInformation docinfo = pd.getDocumentInformation(); String title = docinfo.getTitle(); // sanity check: item must have a title. 
if (title == null) throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary."); log.debug("PDF Info dict title=\"" + title + "\""); item.addDC("title", null, "en", title); String value; Calendar date; if ((value = docinfo.getAuthor()) != null) { item.addDC("contributor", "author", null, value); log.debug("PDF Info dict author=\"" + value + "\""); } if ((value = docinfo.getCreator()) != null) item.addDC("description", "provenance", "en", "Application that created the original document: " + value); if ((value = docinfo.getProducer()) != null) item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + value); if ((value = docinfo.getSubject()) != null) item.addDC("description", "abstract", null, value); if ((value = docinfo.getKeywords()) != null) item.addDC("subject", "other", null, value); // Take either CreationDate or ModDate as "date.created", // Too bad there's no place to put "last modified" in the DC. Calendar calValue; if ((calValue = docinfo.getCreationDate()) == null) calValue = docinfo.getModificationDate(); if (calValue != null) item.addDC("date", "created", null, (new DCDate(calValue.getTime())).toString()); item.update(); } finally { if (cos != null) cos.close(); } }
Example 6
Project: jinhe-tss-master File: AttachmentIndex.java View source code |
private String getContentFromPDF(File pdfFile) { // 内存中存储的PDF Document PDDocument document = null; String content = ""; try { // 加载pdf文档 document = PDDocument.load(pdfFile); // PDFTextStripper来提取文本 PDFTextStripper stripper = new PDFTextStripper(); // 设置是否排序 stripper.setSortByPosition(false); // 设置起始页 stripper.setStartPage(1); // 设置结束页 stripper.setEndPage(Integer.MAX_VALUE); // 调用PDFTextStripper的getText()提取文本信息 content = stripper.getText(document); } catch (Exception e) { log.error("发布索引时提取PDF文档:" + pdfFile.getPath() + " 内容失败!", e); return ""; } finally { if (document != null) { // 关闭PDF Document try { document.close(); } catch (IOException e) { e.printStackTrace(); } } } return content; }
Example 7
Project: railo-master File: PDFUtil.java View source code |
/**
 * Writes the text of a PDF document through a {@link PDFText2HTML} stripper and returns the
 * result as a string.
 *
 * <p>NOTE(review): {@code pageNumbers} is not used by this method; no page selection is applied
 * to the stripper.
 *
 * @param doc the document, converted via {@code toPDDocument()}
 * @param pageNumbers currently ignored
 * @return the stripper output as a {@code String}
 * @throws IOException declared for the conversion and text-writing calls
 * @throws CryptographyException declared for the conversion call
 * @throws InvalidPasswordException declared for the conversion call
 */
public static Object extractText(PDFDocument doc, Set<Integer> pageNumbers) throws IOException, CryptographyException, InvalidPasswordException {
    PDDocument source = doc.toPDDocument();
    PDFText2HTML htmlStripper = new PDFText2HTML();
    StringWriter out = new StringWriter();
    htmlStripper.writeText(source, out);
    return out.toString();
}
Example 8
Project: openpipe-master File: PDFParser.java View source code |
/**
 * Loads a PDF from the given parse data and returns its text and title.
 *
 * @param data supplies the input stream the PDF is loaded from
 * @return a result carrying the extracted text and the title from the document information
 * @throws IOException if loading the document or writing its text fails
 * @throws ParserException declared by the overridden method
 */
@Override
public ParserResult parse(ParseData data) throws IOException, ParserException {
    final PDDocument pdf = PDDocument.load(data.getInputStream(), scratchFile);
    try {
        writer.reset();
        try {
            stripper.writeText(pdf, writer);

            final ParserResultImpl parsed = new ParserResultImpl();
            parsed.setText(writer.toString());
            parsed.setTitle(pdf.getDocumentInformation().getTitle());
            return parsed;
        } finally {
            // NOTE(review): presumably caps the shared writer's retained buffer at 64 KiB
            // between calls - confirm against the writer implementation.
            writer.trimToMaxSize(1024 * 64);
        }
    } finally {
        try {
            pdf.close();
        } catch (IOException ignored) {
            // Best-effort close: a failure here must not replace the result or primary error.
        }
    }
}
Example 9
Project: agile-itsm-master File: Arquivo.java View source code |
/**
 * Extracts the text of a document in PDF format.
 *
 * <p>Errors while opening, parsing or reading the file are printed via
 * {@code printStackTrace()} and do not abort the call; the returned builder then holds
 * whatever was appended before the failure (possibly nothing).
 *
 * @param caminhoDocumento path of the PDF file to read
 * @return a {@code StringBuilder} holding the extracted text
 * @throws IOException if closing the parsed document or the input stream fails
 */
private StringBuilder extrairFormatoPDF(String caminhoDocumento) throws IOException {
    StringBuilder texto = new StringBuilder();
    FileInputStream fi = null;
    COSDocument cd = null;
    try {
        fi = new FileInputStream(new File(caminhoDocumento));
        PDFParser parser = new PDFParser(fi);
        parser.parse();
        cd = parser.getDocument();
        texto.append(new PDFTextStripper().getText(new PDDocument(cd)));
    } catch (IOException e1) {
        // FileNotFoundException is an IOException, so a single handler covers both cases.
        e1.printStackTrace();
    } finally {
        try {
            if (cd != null) {
                cd.close();
            }
        } finally {
            // Close the stream even when closing the document threw, so it cannot leak.
            if (fi != null) {
                fi.close();
            }
        }
    }
    return texto;
}
Example 10
Project: Desktop-master File: XMPUtilTest.java View source code |
/** * Write a manually constructed xmp-string to file * * @param xmpString * @throws Exception */ public void writeManually(File tempFile, String xmpString) throws Exception { PDDocument document = null; try { document = PDDocument.load(tempFile.getAbsoluteFile()); if (document.isEncrypted()) { System.err.println("Error: Cannot add metadata to encrypted document."); System.exit(1); } PDDocumentCatalog catalog = document.getDocumentCatalog(); // Convert to UTF8 and make available for metadata. ByteArrayOutputStream bs = new ByteArrayOutputStream(); OutputStreamWriter os = new OutputStreamWriter(bs, "UTF8"); os.write(xmpString); os.close(); ByteArrayInputStream in = new ByteArrayInputStream(bs.toByteArray()); PDMetadata metadataStream = new PDMetadata(document, in, false); catalog.setMetadata(metadataStream); document.save(tempFile.getAbsolutePath()); } finally { if (document != null) document.close(); } }
Example 11
Project: infoglue-master File: LuceneController.java View source code |
private String extractTextToIndex(DigitalAssetVO digitalAssetVO, File file) { String text = ""; if (logger.isInfoEnabled()) logger.info("Asset content type:" + digitalAssetVO.getAssetContentType()); if (digitalAssetVO.getAssetContentType().equalsIgnoreCase("application/pdf")) { try { Writer output = null; PDDocument document = null; try { document = PDDocument.load(file); ByteArrayOutputStream baos = new ByteArrayOutputStream(); if (!document.isEncrypted()) { output = new OutputStreamWriter(baos, "UTF-8"); PDFTextStripper stripper = new PDFTextStripper(); //stripper.setSortByPosition( sort ); //stripper.setStartPage( startPage ); //stripper.setEndPage( endPage ); stripper.writeText(document, output); text = baos.toString("UTF-8"); if (logger.isInfoEnabled()) logger.info("PDF Document has " + text.length() + " chars\n\n" + text); } } catch (Exception e) { logger.warn("Error indexing file: " + file + "\nMessage: " + e.getMessage()); } finally { if (output != null) { output.close(); } if (document != null) { document.close(); } } } catch (Exception e) { logger.warn("Error indexing:" + e.getMessage()); } } else if (digitalAssetVO.getAssetContentType().equalsIgnoreCase("application/msword")) { try { InputStream is = new FileInputStream(file); POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(file)); is.close(); // Create a document for this file HWPFDocument doc = new HWPFDocument(fs); // Create a WordExtractor to read the text of the word document WordExtractor we = new WordExtractor(doc); // Extract all paragraphs in the document as strings text = we.getText(); // Output the document if (logger.isInfoEnabled()) logger.info("Word Document has " + text.length() + " chars\n\n" + text); } catch (Exception e) { logger.warn("Error indexing file: " + file + "\nMessage: " + e.getMessage()); } } return text; }
Example 12
Project: apache-nutch-fork-master File: PdfParser.java View source code |
/**
 * Parses PDF content into text, title and metadata and wraps them in a parse result.
 *
 * <p>A failed status is returned (as an empty parse result) when the content length recorded
 * in the content metadata differs from the number of bytes present, when decryption fails, or
 * when any other exception occurs during parsing.
 *
 * @param content the fetched content whose bytes are parsed as a PDF
 * @return the parse result for the content's URL
 */
public ParseResult getParse(Content content) {
    // in-memory representation of the pdf file
    PDDocument pdf = null;
    String text = null;
    String title = null;
    Metadata metadata = new Metadata();

    try {
        byte[] bytes = content.getContent();
        String declaredLength = content.getMetadata().get(Response.CONTENT_LENGTH);
        if (declaredLength != null && bytes.length != Integer.parseInt(declaredLength)) {
            ParseStatus truncated = new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + bytes.length + " bytes. Parser can't handle incomplete pdf file.");
            return truncated.getEmptyParseResult(content.getUrl(), getConf());
        }

        PDFParser pdfParser = new PDFParser(new ByteArrayInputStream(bytes));
        pdfParser.parse();
        pdf = pdfParser.getPDDocument();
        if (pdf.isEncrypted()) {
            // Just try using the default password and move on
            pdf.openProtection(new StandardDecryptionMaterial(""));
        }

        // collect text
        PDFTextStripper textStripper = new PDFTextStripper();
        text = textStripper.getText(pdf);

        // collect title
        PDDocumentInformation docInfo = pdf.getDocumentInformation();
        title = docInfo.getTitle();

        // more useful info, currently not used. please keep them for future use.
        metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getNumberOfPages()));
        metadata.add(Metadata.AUTHOR, docInfo.getAuthor());
        metadata.add(Metadata.SUBJECT, docInfo.getSubject());
        metadata.add(Metadata.KEYWORDS, docInfo.getKeywords());
        metadata.add(Metadata.CREATOR, docInfo.getCreator());
        metadata.add(Metadata.PUBLISHER, docInfo.getProducer());
        // TODO: creation and modification dates are not added yet; formatting them raised
        // java.io.IOException: Error converting date:1-Jan-3 18:15PM
    } catch (CryptographyException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Error decrypting document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (BadSecurityHandlerException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Error decrypting document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
        // also covers run time exceptions
        if (LOG.isWarnEnabled()) {
            LOG.warn("General exception in PDF parser: " + e.getMessage());
            e.printStackTrace(LogUtil.getWarnStream(LOG));
        }
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Can't be handled as pdf document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } finally {
        try {
            if (pdf != null) {
                pdf.close();
            }
        } catch (IOException ignored) {
            // a failure to close does not affect the parse outcome
        }
    }

    if (text == null) {
        text = "";
    }
    if (title == null) {
        title = "";
    }

    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata(), metadata);
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}
Example 13
Project: nutchbase-master File: PdfParser.java View source code |
/**
 * Parses PDF content into text, title and metadata and wraps them in a parse result.
 *
 * <p>A failed status is returned (as an empty parse result) when the content length recorded
 * in the content metadata differs from the number of bytes present, when decryption fails, or
 * when any other exception occurs during parsing.
 *
 * @param content the fetched content whose bytes are parsed as a PDF
 * @return the parse result for the content's URL
 */
public ParseResult getParse(Content content) {
    // in-memory representation of the pdf file
    PDDocument pdf = null;
    String text = null;
    String title = null;
    Metadata metadata = new Metadata();

    try {
        byte[] bytes = content.getContent();
        String declaredLength = content.getMetadata().get(Response.CONTENT_LENGTH);
        if (declaredLength != null && bytes.length != Integer.parseInt(declaredLength)) {
            ParseStatus truncated = new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + bytes.length + " bytes. Parser can't handle incomplete pdf file.");
            return truncated.getEmptyParseResult(content.getUrl(), getConf());
        }

        PDFParser pdfParser = new PDFParser(new ByteArrayInputStream(bytes));
        pdfParser.parse();
        pdf = pdfParser.getPDDocument();
        if (pdf.isEncrypted()) {
            // Just try using the default password and move on
            pdf.openProtection(new StandardDecryptionMaterial(""));
        }

        // collect text
        PDFTextStripper textStripper = new PDFTextStripper();
        text = textStripper.getText(pdf);

        // collect title
        PDDocumentInformation docInfo = pdf.getDocumentInformation();
        title = docInfo.getTitle();

        // more useful info, currently not used. please keep them for future use.
        metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getNumberOfPages()));
        metadata.add(Metadata.AUTHOR, docInfo.getAuthor());
        metadata.add(Metadata.SUBJECT, docInfo.getSubject());
        metadata.add(Metadata.KEYWORDS, docInfo.getKeywords());
        metadata.add(Metadata.CREATOR, docInfo.getCreator());
        metadata.add(Metadata.PUBLISHER, docInfo.getProducer());
        // TODO: creation and modification dates are not added yet; formatting them raised
        // java.io.IOException: Error converting date:1-Jan-3 18:15PM
    } catch (CryptographyException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Error decrypting document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (BadSecurityHandlerException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Error decrypting document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
        // also covers run time exceptions
        if (LOG.isWarnEnabled()) {
            LOG.warn("General exception in PDF parser: " + e.getMessage());
            e.printStackTrace(LogUtil.getWarnStream(LOG));
        }
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Can't be handled as pdf document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } finally {
        try {
            if (pdf != null) {
                pdf.close();
            }
        } catch (IOException ignored) {
            // a failure to close does not affect the parse outcome
        }
    }

    if (text == null) {
        text = "";
    }
    if (title == null) {
        title = "";
    }

    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata(), metadata);
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}
Example 14
Project: gnutch-master File: PdfParser.java View source code |
/**
 * Parses PDF content into text, title and metadata and wraps them in a parse result.
 *
 * <p>A failed status is returned (as an empty parse result) when the content length recorded
 * in the content metadata differs from the number of bytes present, when decryption with the
 * empty password fails, or when any other exception occurs during parsing.
 *
 * @param content the fetched content whose bytes are parsed as a PDF
 * @return the parse result for the content's URL
 */
public ParseResult getParse(Content content) {
    // in-memory representation of the pdf file
    PDDocument pdf = null;
    String text = null;
    String title = null;
    Metadata metadata = new Metadata();

    try {
        byte[] bytes = content.getContent();
        String declaredLength = content.getMetadata().get(Response.CONTENT_LENGTH);
        if (declaredLength != null && bytes.length != Integer.parseInt(declaredLength)) {
            ParseStatus truncated = new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + bytes.length + " bytes. Parser can't handle incomplete pdf file.");
            return truncated.getEmptyParseResult(content.getUrl(), getConf());
        }

        PDFParser pdfParser = new PDFParser(new ByteArrayInputStream(bytes));
        pdfParser.parse();
        pdf = pdfParser.getPDDocument();
        if (pdf.isEncrypted()) {
            DocumentEncryption decryptor = new DocumentEncryption(pdf);
            // Just try using the default password and move on
            decryptor.decryptDocument("");
        }

        // collect text
        PDFTextStripper textStripper = new PDFTextStripper();
        text = textStripper.getText(pdf);

        // collect title
        PDDocumentInformation docInfo = pdf.getDocumentInformation();
        title = docInfo.getTitle();

        // more useful info, currently not used. please keep them for future use.
        metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
        metadata.add(Metadata.AUTHOR, docInfo.getAuthor());
        metadata.add(Metadata.SUBJECT, docInfo.getSubject());
        metadata.add(Metadata.KEYWORDS, docInfo.getKeywords());
        metadata.add(Metadata.CREATOR, docInfo.getCreator());
        metadata.add(Metadata.PUBLISHER, docInfo.getProducer());
        // TODO: creation and modification dates are not added yet; formatting them raised
        // java.io.IOException: Error converting date:1-Jan-3 18:15PM
    } catch (CryptographyException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Error decrypting document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (InvalidPasswordException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Can't decrypt document - invalid password. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
        // also covers run time exceptions
        if (LOG.isWarnEnabled()) {
            LOG.warn("General exception in PDF parser: " + e.getMessage());
            e.printStackTrace(LogUtil.getWarnStream(LOG));
        }
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Can't be handled as pdf document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } finally {
        try {
            if (pdf != null) {
                pdf.close();
            }
        } catch (IOException ignored) {
            // a failure to close does not affect the parse outcome
        }
    }

    if (text == null) {
        text = "";
    }
    if (title == null) {
        title = "";
    }

    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata(), metadata);
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}
Example 15
Project: Jorum-DSpace-master File: PDFPackager.java View source code |
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException { COSDocument cos = null; try { PDFParser parser = new PDFParser(metadata); parser.parse(); cos = parser.getDocument(); // sanity check: PDFBox breaks on encrypted documents, so give up. if (cos.getEncryptionDictionary() != null) throw new MetadataValidationException("This packager cannot accept an encrypted PDF document."); /* PDF to DC "crosswalk": * * NOTE: This is not in a crosswalk plugin because (a) it isn't * useful anywhere else, and more importantly, (b) the source * data is not XML so it doesn't fit the plugin's interface. * * pattern of crosswalk -- PDF dict entries to DC: * Title -> title.null * Author -> contributor.author * CreationDate -> date.created * ModDate -> date.created * Creator -> description.provenance (application that created orig) * Producer -> description.provenance (convertor to pdf) * Subject -> description.abstract * Keywords -> subject.other * date is java.util.Calendar */ PDDocument pd = new PDDocument(cos); PDDocumentInformation docinfo = pd.getDocumentInformation(); String title = docinfo.getTitle(); // sanity check: item must have a title. 
if (title == null) throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary."); log.debug("PDF Info dict title=\"" + title + "\""); item.addDC("title", null, "en", title); String value; Calendar date; if ((value = docinfo.getAuthor()) != null) { item.addDC("contributor", "author", null, value); log.debug("PDF Info dict author=\"" + value + "\""); } if ((value = docinfo.getCreator()) != null) item.addDC("description", "provenance", "en", "Application that created the original document: " + value); if ((value = docinfo.getProducer()) != null) item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + value); if ((value = docinfo.getSubject()) != null) item.addDC("description", "abstract", null, value); if ((value = docinfo.getKeywords()) != null) item.addDC("subject", "other", null, value); // Take either CreationDate or ModDate as "date.created", // Too bad there's no place to put "last modified" in the DC. Calendar calValue; if ((calValue = docinfo.getCreationDate()) == null) calValue = docinfo.getModificationDate(); if (calValue != null) item.addDC("date", "created", null, (new DCDate(calValue.getTime())).toString()); item.update(); } finally { if (cos != null) cos.close(); } }
Example 16
Project: Docear-master File: XMPUtilTest.java View source code |
/** * Write a manually constructed xmp-string to file * * @param xmpString * @throws Exception */ public void writeManually(File tempFile, String xmpString) throws Exception { PDDocument document = null; try { document = PDDocument.load(tempFile.getAbsoluteFile()); if (document.isEncrypted()) { System.err.println("Error: Cannot add metadata to encrypted document."); System.exit(1); } PDDocumentCatalog catalog = document.getDocumentCatalog(); // Convert to UTF8 and make available for metadata. ByteArrayOutputStream bs = new ByteArrayOutputStream(); OutputStreamWriter os = new OutputStreamWriter(bs, "UTF8"); os.write(xmpString); os.close(); ByteArrayInputStream in = new ByteArrayInputStream(bs.toByteArray()); PDMetadata metadataStream = new PDMetadata(document, in, false); catalog.setMetadata(metadataStream); document.save(tempFile.getAbsolutePath()); } finally { if (document != null) document.close(); } }
Example 17
Project: desktop-master File: XMPUtilTest.java View source code |
/** * Write a manually constructed xmp-string to file * * @param xmpString * @throws Exception */ public void writeManually(File tempFile, String xmpString) throws Exception { PDDocument document = null; try { document = PDDocument.load(tempFile.getAbsoluteFile()); if (document.isEncrypted()) { System.err.println("Error: Cannot add metadata to encrypted document."); System.exit(1); } PDDocumentCatalog catalog = document.getDocumentCatalog(); // Convert to UTF8 and make available for metadata. ByteArrayOutputStream bs = new ByteArrayOutputStream(); OutputStreamWriter os = new OutputStreamWriter(bs, "UTF8"); os.write(xmpString); os.close(); ByteArrayInputStream in = new ByteArrayInputStream(bs.toByteArray()); PDMetadata metadataStream = new PDMetadata(document, in, false); catalog.setMetadata(metadataStream); document.save(tempFile.getAbsolutePath()); } finally { if (document != null) document.close(); } }
Example 18
Project: caelum-stella-master File: BoletoTransformerIntegrationTest.java View source code |
/**
 * Checks that the text extracted from {@code arquivo.pdf} contains the value {@code 40,00}.
 *
 * @throws IOException if the file cannot be loaded, read or closed
 */
@Test
public void testPDFWriterEscreveValorCorreto() throws IOException {
    PDFTextStripper stripper = new PDFTextStripper();
    PDDocument document = PDDocument.load(new File("arquivo.pdf"));
    String text;
    try {
        text = stripper.getText(document);
    } finally {
        // Close the document even when text extraction throws, so it is not leaked.
        document.close();
    }
    assertTrue(text.contains("40,00"));
}
Example 19
Project: novelang-master File: HttpDaemonFixture.java View source code |
/**
 * Extracts the text of a PDF given as a byte array.
 *
 * @param pdfBytes the bytes of the PDF document
 * @return the text produced by {@link PDFTextStripper#getText}
 * @throws IOException if loading the document, extracting its text or closing it fails
 */
public static String extractPdfText(final byte[] pdfBytes) throws IOException {
    final ByteArrayInputStream source = new ByteArrayInputStream(pdfBytes);
    final PDDocument loaded = PDDocument.load(source);
    try {
        final PDFTextStripper textStripper = new PDFTextStripper();
        return textStripper.getText(loaded);
    } finally {
        loaded.close();
    }
}
Example 20
Project: ecologylabFundamental-master File: Environment.java View source code |
/**
 * Reports whether the PDFBox class {@code org.pdfbox.pdmodel.PDDocument} was found.
 *
 * <p>The lookup via {@code checkFor} runs only on the first call; later calls return the
 * remembered result.
 *
 * @return the stored result of the lookup
 */
public boolean hasPDFBox() {
    if (checkedForPDFBox) {
        return hasPDFBox;
    }

    // Mark as checked before the lookup so that it is attempted only once.
    checkedForPDFBox = true;
    hasPDFBox = checkFor("org.pdfbox.pdmodel.PDDocument");
    debug("hasPDFBox() = " + hasPDFBox);
    if (hasPDFBox) {
        ConsoleUtils.obtrusiveConsoleOutput("PDFBox Found");
    }
    return hasPDFBox;
}