Java Examples for org.apache.pdfbox.pdmodel.PDDocument
The following Java examples will help you understand how to use org.apache.pdfbox.pdmodel.PDDocument. These source code samples are taken from various open source projects.
Example 1
| Project: PaperManager-master File: TestPDFBox.java View source code |
/**
 * Loads {@code test.pdf} and prints its page count, title and author to standard output.
 * An {@link IOException} raised while loading or closing the document is printed as a stack
 * trace rather than propagated.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    PDDocument document = null;
    try {
        document = PDDocument.load("test.pdf");
        PDDocumentInformation info = document.getDocumentInformation();
        System.out.println("Page Count=" + document.getNumberOfPages());
        System.out.println("Title=" + info.getTitle());
        System.out.println("Author=" + info.getAuthor());
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Always release the resources held by the loaded document.
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
Example 2
| Project: aplikator-master File: PDFLoader.java View source code |
/**
 * Renders the first page (index 0) of the PDF read from {@code stream} as an RGB image at
 * 160 DPI.
 * <p>
 * The document, if it was loaded, is closed and the stream is handed to
 * {@code IOUtils.tryClose} before this method returns, whether or not rendering succeeded.
 *
 * @param stream the PDF content
 * @return the rendered page
 * @throws IOException if the document cannot be loaded, rendered or closed
 */
public static BufferedImage load(InputStream stream) throws IOException {
    final int firstPage = 0;
    final int dpi = 160;
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(stream);
        return new PDFRenderer(pdf).renderImageWithDPI(firstPage, dpi, ImageType.RGB);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
        IOUtils.tryClose(stream);
    }
}
Example 3
| Project: dss-master File: PdfBoxSignatureService.java View source code |
/**
 * Loads the document to sign, creates its signature dictionary and delegates to
 * {@code signDocumentAndReturnDigest} to obtain the digest.
 * <p>
 * The loaded document and the intermediate output buffer are closed quietly in all cases.
 *
 * @param toSignDocument the PDF content to sign
 * @param parameters the PAdES signature parameters
 * @param digestAlgorithm the digest algorithm to use
 * @return the value returned by {@code signDocumentAndReturnDigest}
 * @throws DSSException wrapping any {@link IOException} raised while processing the document
 */
@Override
public byte[] digest(final InputStream toSignDocument, final PAdESSignatureParameters parameters, final DigestAlgorithm digestAlgorithm) throws DSSException {
    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    PDDocument document = null;
    try {
        document = PDDocument.load(toSignDocument);
        final PDSignature signatureDictionary = createSignatureDictionary(parameters);
        // An empty byte array is passed as the signature value.
        return signDocumentAndReturnDigest(parameters, DSSUtils.EMPTY_BYTE_ARRAY, buffer, document, signatureDictionary, digestAlgorithm);
    } catch (IOException e) {
        throw new DSSException(e);
    } finally {
        Utils.closeQuietly(document);
        Utils.closeQuietly(buffer);
    }
}
Example 4
| Project: padaf-master File: SynchronizedMetaDataValidation.java View source code |
/**
 * Checks that the document information entries and the XMP metadata are synchronized.
 * <p>
 * The title, author and subject are compared against the Dublin Core schema, the keywords and
 * producer against the Adobe PDF schema, and the creator tool, creation date and modification
 * date against the XMP Basic schema.
 *
 * @param document the PDF document
 * @param metadata the XMP metadata
 * @return the list of validation errors collected by the individual property checks
 * @throws ValidationException if {@code document} or {@code metadata} is {@code null}
 */
public List<ValidationError> validateMetadataSynchronization(PDDocument document, XMPMetadata metadata) throws ValidationException {
    if (document == null) {
        throw new ValidationException("Document provided is null");
    }
    // The information dictionary is fetched before the metadata argument is checked.
    final PDDocumentInformation info = document.getDocumentInformation();
    if (metadata == null) {
        throw new ValidationException("Metadata provided are null");
    }

    final List<ValidationError> errors = new ArrayList<ValidationError>();

    // Dublin Core: title, author, subject
    final DublinCoreSchema dublinCore = metadata.getDublinCoreSchema();
    analyzeTitleProperty(info, dublinCore, errors);
    analyzeAuthorProperty(info, dublinCore, errors);
    analyzeSubjectProperty(info, dublinCore, errors);

    // Adobe PDF: keywords, producer
    final AdobePDFSchema adobePdf = metadata.getAdobePDFSchema();
    analyzeKeywordsProperty(info, adobePdf, errors);
    analyzeProducerProperty(info, adobePdf, errors);

    // XMP Basic: creator tool, creation date, modification date
    final XMPBasicSchema xmpBasic = metadata.getXMPBasicSchema();
    analyzeCreatorToolProperty(info, xmpBasic, errors);
    analyzeCreationDateProperty(info, xmpBasic, errors);
    analyzeModifyDateProperty(info, xmpBasic, errors);

    return errors;
}
Example 5
| Project: preservation-tools-master File: PdfAValidator.java View source code |
/**
 * Validates the PDF/A files found in a user-selected folder and writes two XML reports into
 * that folder: {@code PdfAValidation.xml} (per-file details) and
 * {@code PdfAValidationShortSummary.xml} (per-file status plus overall counts).
 * <p>
 * For every file that passes {@code PdfAnalysis.testPdfOk} and is reported as PDF/A by
 * {@code PdfAnalysis.checkIfPdfA}, the document metadata is read and the file is validated with
 * the Preflight parser. Validation errors are grouped by the first digit of their error code
 * (1 syntax, 2 graphic, 3 font, 4 transparency, 5 annotation, 6 action, 7 metadata).
 *
 * @param args command-line arguments (not used)
 * @throws IOException if a report file cannot be written
 */
public static void main(String args[]) throws IOException {
    try {
        changecolor();
        String path = "D://Eclipse New//PDFBoxLogo.gif";
        String description = "PDFBox Logo";
        ImageIcon icon = new ImageIcon(path, description);
        JOptionPane.showMessageDialog(null, "Please choose the folder with PDF/A files to validate.", "PDFBox Validation", JOptionPane.QUESTION_MESSAGE, icon);
        examinedFolder = utilities.BrowserDialogs.chooseFolder();
        // NOTE(review): examinedFolder is only checked for null further down, after it has
        // already been used to build the report paths -- confirm what chooseFolder() returns
        // when the dialog is cancelled.
        outputfile = new PrintWriter(new FileWriter(examinedFolder + "//" + "PdfAValidation.xml"));
        shortSummary = new PrintWriter(new FileWriter(examinedFolder + "//" + "PdfAValidationShortSummary.xml"));
        String xmlVersion = "xml version='1.0'";
        String xmlEncoding = "encoding='ISO-8859-1'";
        String xsltStyleSheet = "<?xml-stylesheet type=\"text/xsl\" href=\"PdfBoxValidationStyle.xsl\"?>";
        String xsltStyleSheetSummary = "<?xml-stylesheet type=\"text/xsl\" href=\"PdfBoxSummaryStyle.xsl\"?>";
        String xsltLocation = examinedFolder + "//" + "PdfBoxValidationStyle.xsl";
        String xsltLocationSum = examinedFolder + "//" + "PdfBoxSummaryStyle.xsl";
        output.XslStyleSheets.PdfBoxCustomizedXsl(xsltLocation);
        output.XslStyleSheets.PdfBoxSummaryCustomizedXsl(xsltLocationSum);
        // XML prologue and root element of both reports
        outputfile.println("<?" + xmlVersion + " " + xmlEncoding + "?>");
        outputfile.println(xsltStyleSheet);
        outputfile.println("<PdfBoxValidation>");
        shortSummary.println("<?" + xmlVersion + " " + xmlEncoding + "?>");
        shortSummary.println(xsltStyleSheetSummary);
        shortSummary.println("<PdfBoxValidationSummary>");
        int examinedPdfa = 0;
        int validPdfa = 0;
        int invalidPdfa = 0;
        if (examinedFolder != null) {
            ArrayList<File> files = utilities.ListsFiles.getPaths(new File(examinedFolder), new ArrayList<File>());
            for (int i = 0; i < files.size(); i++) {
                if (files.get(i) != null) {
                    try {
                        if (PdfAnalysis.testPdfOk(files.get(i))) /*
                                                                   * Test if the Pdf File is ok to be examined.
                                                                   * Otherwise gives error in Console
                                                                   */
                        {
                            String PdfType = PdfAnalysis.checkIfPdfA(files.get(i));
                            if (PdfType.contains("PDF/A")) {
                                outputfile.println("<PdfAFile>");
                                shortSummary.println("<PdfAFile>");
                                // Per-file error counters, one per error-code category
                                int syntaxError = 0;
                                int graphicError = 0;
                                int fontError = 0;
                                int transparencyError = 0;
                                int annotationError = 0;
                                int actionError = 0;
                                int metadataError = 0;
                                examinedPdfa++;
                                outputfile.println("<FileName>" + utilities.fileStringUtilities.getFileName(files.get(i)) + "</FileName>");
                                shortSummary.println("<FileName>" + utilities.fileStringUtilities.getFileName(files.get(i)) + "</FileName>");
                                // Load the document only to read its information dictionary,
                                // and close it even when reading the metadata fails.
                                PDDocument pd = PDDocument.load(files.get(i));
                                try {
                                    PDDocumentInformation info = pd.getDocumentInformation();
                                    getsomeMetadata(info);
                                } finally {
                                    pd.close();
                                }
                                /*
                                 * the actual PdfAValidation starts here
                                 */
                                ValidationResult result = null;
                                FileDataSource fd = new FileDataSource(files.get(i).toString());
                                PreflightParser parser = new PreflightParser(fd);
                                try {
                                    parser.parse();
                                    PreflightDocument document = parser.getPreflightDocument();
                                    try {
                                        document.validate();
                                        result = document.getResult();
                                        document.close();
                                    } catch (NullPointerException e) {
                                        outputfile.println("<Error>" + e + "</Error>");
                                        shortSummary.println("<Error>" + e + "</Error>");
                                        logger.error("Error analyzing " + files.get(i).getAbsolutePath(), e);
                                    }
                                } catch (SyntaxValidationException e) {
                                    // A syntax failure still carries a validation result to report.
                                    result = e.getResult();
                                    logger.error("Error analyzing " + files.get(i).getAbsolutePath(), e);
                                }
                                if (result != null) {
                                    if (result.isValid()) {
                                        outputfile.println("<Status>" + "Valid" + "</Status>");
                                        shortSummary.println("<Status>" + "Valid" + "</Status>");
                                        validPdfa++;
                                    } else {
                                        int errorslen = 0;
                                        outputfile.println("<Status>" + "Invalid" + "</Status>");
                                        shortSummary.println("<Status>" + "Invalid" + "</Status>");
                                        invalidPdfa++;
                                        for (ValidationError error : result.getErrorsList()) {
                                            errorslen++;
                                            String errorCode = error.getErrorCode().toString();
                                            outputfile.println("<Code>" + error.getErrorCode() + "</Code>");
                                            String errorDetails = utilities.fileStringUtilities.reduceXmlEscapors(error.getDetails());
                                            // The first digit of the error code selects the category.
                                            if (errorCode.startsWith("1")) {
                                                outputfile.println("<Details Category=\"SyntaxError\">" + errorDetails + "</Details>");
                                                syntaxError++;
                                            }
                                            if (errorCode.startsWith("2")) {
                                                outputfile.println("<Details Category=\"GraphicError\">" + errorDetails + "</Details>");
                                                graphicError++;
                                            }
                                            if (errorCode.startsWith("3")) {
                                                outputfile.println("<Details Category=\"FontError\">" + errorDetails + "</Details>");
                                                fontError++;
                                            }
                                            if (errorCode.startsWith("4")) {
                                                outputfile.println("<Details Category=\"TransparencyError\">" + errorDetails + "</Details>");
                                                transparencyError++;
                                            }
                                            if (errorCode.startsWith("5")) {
                                                outputfile.println("<Details Category=\"AnnotationError\">" + errorDetails + "</Details>");
                                                annotationError++;
                                            }
                                            if (errorCode.startsWith("6")) {
                                                outputfile.println("<Details Category=\"ActionError\">" + errorDetails + "</Details>");
                                                actionError++;
                                            }
                                            if (errorCode.startsWith("7")) {
                                                outputfile.println("<Details Category=\"MetadataError\">" + errorDetails + "</Details>");
                                                metadataError++;
                                            }
                                        }
                                        outputfile.println("<SyntaxErrors>" + syntaxError + "</SyntaxErrors>");
                                        outputfile.println("<GraphicErrors>" + graphicError + "</GraphicErrors>");
                                        outputfile.println("<FontErrors>" + fontError + "</FontErrors>");
                                        outputfile.println("<TransparencyErrors>" + transparencyError + "</TransparencyErrors>");
                                        outputfile.println("<AnnotationErrors>" + annotationError + "</AnnotationErrors>");
                                        outputfile.println("<ActionErrors>" + actionError + "</ActionErrors>");
                                        outputfile.println("<MetadataErrors>" + metadataError + "</MetadataErrors>");
                                        shortSummary.println("<ErrorsCount>" + errorslen + "</ErrorsCount>");
                                    }
                                }
                                outputfile.println("</PdfAFile>");
                                shortSummary.println("</PdfAFile>");
                            }
                        }
                    } catch (IOException e) {
                        // A failure on one file is reported and the loop moves on to the next file.
                        outputfile.println("<Error>" + e + "</Error>");
                        JOptionPane.showMessageDialog(null, e, "error message", JOptionPane.ERROR_MESSAGE);
                    }
                }
            }
        }
        // Overall counts and closing root elements
        shortSummary.println("<Summary>");
        shortSummary.println("<ExaminedPdfAFiles>" + examinedPdfa + "</ExaminedPdfAFiles>");
        shortSummary.println("<ValidPdfAFiles>" + validPdfa + "</ValidPdfAFiles>");
        shortSummary.println("<InvalidPdfAFiles>" + invalidPdfa + "</InvalidPdfAFiles>");
        shortSummary.println("</Summary>");
        outputfile.println("</PdfBoxValidation>");
        shortSummary.println("</PdfBoxValidationSummary>");
        shortSummary.close();
        outputfile.close();
    } catch (FileNotFoundException e) {
        logger.error("Error analyzing " + e);
        JOptionPane.showMessageDialog(null, e, "error message", JOptionPane.ERROR_MESSAGE);
    }
}
Example 6
| Project: tizzit-master File: PdfPreviewFrameTest.java View source code |
/**
 * Verifies that {@code setDocumentContent} stores a {@code PDFFile} in the frame's
 * {@code pdffile} field when given the bytes of a one-page PDF.
 */
@Test
public void testSetDocumentContent() throws Exception {
    // Build a PDF with a single blank page in memory.
    ByteArrayOutputStream pdfBytes = new ByteArrayOutputStream();
    PDDocument singlePageDocument = new PDDocument();
    singlePageDocument.addPage(new PDPage());
    singlePageDocument.save(pdfBytes);
    singlePageDocument.close();

    // Only setDocumentContent runs for real; every other method is mocked.
    PdfPreviewFrame frame = createPartialMockForAllMethodsExcept(PdfPreviewFrame.class, "setDocumentContent");
    frame.setDocumentContent(pdfBytes.toByteArray());

    Object storedFile = Whitebox.getInternalState(frame, "pdffile");
    assertNotNull(storedFile);
    assertTrue(storedFile instanceof PDFFile);
}
Example 7
| Project: brigen-base-master File: PagesAppender.java View source code |
@Override public PDDocument append(int end, int start, PDDocument document) throws IOException { PDPageable pageable; try { pageable = new PDPageable(document); } catch (IllegalArgumentExceptionPrinterException | e) { throw new IOException(e); } int pages = pageable.getNumberOfPages(); if (0 < pages) { PDDocument overDoc = new PDDocument(); PDDocument underDoc = new PDDocument(); for (int i = 0; i < pages; i++) { int page = i + 1; PDPageContentStream overStream; { PDPage overPage = new PDPage(); overDoc.addPage(overPage); overStream = new PDPageContentStream(overDoc, overPage, true, true); } PDPageContentStream underStream; { PDPage underPage = new PDPage(); underDoc.addPage(underPage); underStream = new PDPageContentStream(underDoc, underPage, true, true); } PDRectangle rect; { PDPage pdPage = (PDPage) document.getDocumentCatalog().getAllPages().get(i); rect = pdPage.getMediaBox(); } appendUnderContent(end, start, pages, page, underStream, rect); appendOverContent(end, start, pages, page, overStream, rect); underStream.close(); overStream.close(); } { Overlay overlay = new Overlay(); document = overlay.overlay(document, underDoc); } { Overlay overlay = new Overlay(); document = overlay.overlay(overDoc, document); } } return document; }
Example 8
| Project: com.revolsys.open-master File: PdfViewport.java View source code |
/**
 * Returns the font cached for {@code path}, loading and caching it on first use.
 * <p>
 * The font data is always read from the classpath resource
 * {@code /org/apache/pdfbox/resources/ttf/ArialMT.ttf}, whatever {@code path} is.
 *
 * @param path the key under which the font is cached
 * @return the cached or newly loaded font
 * @throws IOException if the TrueType font cannot be loaded
 */
private PDFont getFont(final String path) throws IOException {
    PDFont font = this.fonts.get(path);
    if (font == null) {
        final InputStream fontStream = PDDocument.class.getResourceAsStream("/org/apache/pdfbox/resources/ttf/ArialMT.ttf");
        font = PDTrueTypeFont.loadTTF(this.document, fontStream);
        // Store under the same key that is used for the lookup above, so that the next call
        // with this path hits the cache instead of loading the font again.
        this.fonts.put(path, font);
    }
    return font;
}
Example 9
| Project: GeoBI-master File: ImageOutputScalableFactory.java View source code |
/**
 * Renders every page of the PDF in {@code tmpFile} to an RGB image and writes each image to
 * its own temporary TIFF file.
 *
 * @param jsonSpec the print specification, passed to {@code calculateDPI}
 * @param tmpFile the PDF file to convert
 * @param context the rendering context, passed to {@code calculateDPI}
 * @return one {@code ImageInfo} per page, holding the image file and its pixel dimensions
 * @throws IOException if the PDF cannot be read, a page cannot be rendered, or no image writer
 *         is available for the TIF format
 */
private List<ImageInfo> createImages(PJsonObject jsonSpec, File tmpFile, RenderingContext context) throws IOException {
    List<ImageInfo> images = new ArrayList<ImageInfo>();
    PDDocument pdf = PDDocument.load(tmpFile);
    try {
        List<PDPage> pages = pdf.getDocumentCatalog().getAllPages();
        for (PDPage page : pages) {
            BufferedImage img = page.convertToImage(BufferedImage.TYPE_INT_RGB, calculateDPI(context, jsonSpec));
            // The suffix must include the dot to give the temporary file a real extension.
            File file = File.createTempFile("pdfToImage", ".tiff");
            // ImageIO.write returns false, without throwing, when no suitable writer exists.
            if (!ImageIO.write(img, "TIF", file)) {
                throw new IOException("No ImageIO writer available for format TIF; cannot write " + file);
            }
            images.add(new ImageInfo(file, img.getWidth(), img.getHeight()));
        }
    } finally {
        pdf.close();
    }
    return images;
}
Example 10
| Project: java-wkhtmltopdf-wrapper-master File: PdfTest.java View source code |
/**
 * Verifies that a PDF generated from an HTML string keeps non-ASCII characters intact.
 */
@Test
public void testPdfFromStringTo() throws Exception {
    // GIVEN a html template containing special characters that java stores in utf-16 internally
    Pdf pdf = new Pdf();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Müller</h1></html>", PageType.htmlAsString);
    // WHEN
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // that is a valid PDF (otherwise an IOException occurs)
    parser.parse();
    PDFTextStripper pdfTextStripper = new PDFTextStripper();
    String pdfText;
    PDDocument document = new PDDocument(parser.getDocument());
    try {
        pdfText = pdfTextStripper.getText(document);
    } finally {
        // Release the parsed document once its text has been extracted.
        document.close();
    }
    // THEN the extracted text contains the special characters
    Assert.assertThat("document should contain the creditorName", pdfText, containsString("Müller"));
}
Example 11
| Project: josm-plugins-master File: PdfBoxParser.java View source code |
/**
 * Parses a single-page, unencrypted PDF file, draws its page through a
 * {@code GraphicsProcessor} bound to {@code target}, and stores the page's media box as the
 * target bounds.
 *
 * @param file the PDF file to parse
 * @param maxPaths passed to the {@code GraphicsProcessor}
 * @param monitor progress monitor; its task is always finished, even when parsing fails
 * @throws IOException if the file cannot be read or drawn
 * @throws IllegalArgumentException if the document is encrypted or does not have exactly one page
 */
public void parse(File file, int maxPaths, ProgressMonitor monitor) throws IOException {
    // NOTE(review): the 1 is passed to tr(), not to beginTask() -- possibly a misplaced
    // parenthesis; left unchanged because it alters the monitor's tick count.
    monitor.beginTask(tr("Parsing PDF", 1));
    try (PDDocument document = PDDocument.load(file)) {
        if (document.isEncrypted()) {
            throw new IllegalArgumentException(tr("Encrypted documents not supported."));
        }
        List<?> allPages = document.getDocumentCatalog().getAllPages();
        if (allPages.size() != 1) {
            throw new IllegalArgumentException(tr("The PDF file must have exactly one page."));
        }
        PDPage page = (PDPage) allPages.get(0);
        PDRectangle pageSize = page.findMediaBox();
        // A missing rotation entry is treated as 0.
        Integer rotationVal = page.getRotation();
        int rotation = (rotationVal != null) ? rotationVal.intValue() : 0;
        new PageDrawer().drawPage(new GraphicsProcessor(target, rotation, maxPaths, monitor), page);
        this.target.bounds = new Rectangle2D.Double(pageSize.getLowerLeftX(), pageSize.getLowerLeftY(), pageSize.getWidth(), pageSize.getHeight());
    } finally {
        // Finish the task on every exit path so the monitor is not left open after an error.
        monitor.finishTask();
    }
}
Example 12
| Project: ontopia-master File: PDFFormatModule.java View source code |
/**
 * Extracts the text of the PDF held in {@code cc} and passes it to {@code handler} as a
 * single region named {@code "document"}.
 *
 * @param cc the content to read; its bytes are parsed as a PDF document
 * @param handler receives the extracted text
 * @throws OntopiaRuntimeException wrapping any exception raised while reading the content
 */
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
    try {
        PDDocument pdoc = PDDocument.load(new BufferedInputStream(new ByteArrayInputStream(cc.getContent())));
        String s;
        try {
            PDFTextStripper stripper = new PDFTextStripper();
            s = stripper.getText(pdoc);
        } finally {
            // Close the document even when text extraction fails.
            pdoc.close();
        }
        char[] c = s.toCharArray();
        handler.startRegion("document");
        handler.text(c, 0, c.length);
        handler.endRegion();
    } catch (Exception e) {
        throw new OntopiaRuntimeException(e);
    }
}
Example 13
| Project: OpenLegislation-master File: TranscriptPdfView.java View source code |
/**
 * Renders a transcript as a PDF document and writes it to {@code outputStream}.
 * <p>
 * The transcript text is split into pages by {@code TranscriptTextUtils.getPdfFormattedPages};
 * each page is drawn on a letter-size PDF page in Courier with a border, the page text and a
 * stenographer line.
 *
 * @param transcript the transcript to convert; must not be {@code null}
 * @param outputStream receives the generated PDF
 * @throws IllegalArgumentException if {@code transcript} is {@code null}
 * @throws IOException if a page cannot be written
 * @throws COSVisitorException if the document cannot be saved
 */
public static void writeTranscriptPdf(Transcript transcript, OutputStream outputStream) throws IOException, COSVisitorException {
    if (transcript == null) {
        throw new IllegalArgumentException("Supplied transcript cannot be null when converting to pdf.");
    }
    try (PDDocument doc = new PDDocument()) {
        PDFont font = PDType1Font.COURIER;
        List<List<String>> pages = TranscriptTextUtils.getPdfFormattedPages(transcript.getText());
        for (List<String> page : pages) {
            PDPage pg = new PDPage(PDPage.PAGE_SIZE_LETTER);
            PDPageContentStream contentStream = new PDPageContentStream(doc, pg);
            try {
                drawBorder(contentStream);
                contentStream.beginText();
                contentStream.setFont(font, fontSize);
                moveStreamToTopOfPage(contentStream);
                int lineCount = drawPageText(page, contentStream);
                drawStenographer(transcript, contentStream, lineCount);
                contentStream.endText();
            } finally {
                // Close the page stream even when drawing fails part-way through.
                contentStream.close();
            }
            doc.addPage(pg);
        }
        doc.save(outputStream);
    }
}
Example 14
| Project: pdfbox-master File: CatalogValidationProcess.java View source code |
/**
 * Validates the catalog of the document held by {@code ctx}.
 * <p>
 * When the document has no catalog, a {@code ERROR_SYNTAX_NOCATALOG} error is recorded and no
 * further check runs; otherwise the actions, language, names, optional-content properties and
 * output intent are validated in that order.
 *
 * @param ctx the preflight context holding the document and collecting validation errors
 * @throws ValidationException if one of the catalog checks fails to run
 */
@Override
public void validate(PreflightContext ctx) throws ValidationException {
    this.catalog = ctx.getDocument().getDocumentCatalog();
    if (this.catalog == null) {
        ctx.addValidationError(new ValidationError(ERROR_SYNTAX_NOCATALOG, "There are no Catalog entry in the Document"));
        return;
    }
    validateActions(ctx);
    validateLang(ctx);
    validateNames(ctx);
    validateOCProperties(ctx);
    validateOutputIntent(ctx);
}
Example 15
| Project: Plain-of-JARs-master File: webcomic2pdf.java View source code |
/**
 * Interactive command-line tool that downloads a range of pages of a web comic and combines
 * the downloaded JPEG images into a single PDF named after the comic.
 * <p>
 * The supported comics are read from the {@code jar_files/comics.json} resource. Images are
 * downloaded into the {@code pages} directory, which is emptied at start-up and removed once
 * the PDF has been written.
 *
 * @param args command-line arguments (not used)
 * @throws Exception if user input cannot be parsed or a helper fails with a non-I/O error
 */
public static void main(String[] args) throws Exception {
    String version = "1.0.3";
    String program = "Webcomic2PDF";
    System.out.println(program + " " + version);
    File directory = new File("pages");
    if (!directory.exists()) {
        directory.mkdir();
    } else {
        // Start from an empty download directory.
        File[] files = directory.listFiles();
        if (files != null) {
            for (int i = 0; i < files.length; i++) {
                files[i].delete();
            }
        }
    }
    // end of if-else
    try {
        int pages_count = 0;
        int current_page = 1;
        int i = 1;
        int current_pages_count = 0;
        System.out.println("");
        System.out.println("List of all supported comics:");
        ObjectMapper mapper = new ObjectMapper();
        BufferedReader fileReader = new BufferedReader(new InputStreamReader(webcomic2pdf.class.getResourceAsStream("jar_files/comics.json")));
        JsonNode rootNode = mapper.readTree(fileReader);
        JsonNode comics = rootNode.get("comics");
        System.out.println("#\tName");
        for (int comic = 0; comic < comics.size(); comic++) {
            System.out.println(comic + "\t" + comics.get(comic).get("name").textValue());
        }
        System.out.println("");
        System.out.print("Select comic #:");
        // NOTE(review): System.console() returns null when no interactive console is attached.
        int comic = Integer.parseInt(System.console().readLine());
        String comic_name = comics.get(comic).get("name").textValue();
        String comic_url = comics.get(comic).get("url").textValue();
        String comic_image_selector = comics.get(comic).get("image_selector").textValue();
        String comic_last_page_selector = comics.get(comic).get("last_page_selector").textValue();
        String comic_first_page = comics.get(comic).get("first_page").textValue();
        String comic_url_parameter = comics.get(comic).get("url_parameter").textValue();
        pages_count = getPages(comic_first_page, comic_last_page_selector, comic_url_parameter);
        // Turn off every handler of the root logger.
        Logger log = LogManager.getLogManager().getLogger("");
        //Logger.getLogger( webcomic2pdf.class.getName() );
        for (Handler h : log.getHandlers()) {
            h.setLevel(Level.OFF);
        }
        System.out.println(comic_name);
        System.out.println("Total available pages:" + pages_count);
        System.out.print("From page #:");
        int from = Integer.parseInt(System.console().readLine());
        System.out.print("To page #:");
        int to = Integer.parseInt(System.console().readLine());
        current_page = from;
        current_pages_count = to - from + 1;
        while (current_page <= to) {
            String image_real = getPage(current_page, comic_url, comic_image_selector);
            HttpURLConnection conn = (HttpURLConnection) (new URL(image_real.toString()).openConnection());
            conn.setConnectTimeout(60000);
            conn.setReadTimeout(60000);
            conn.connect();
            // The content-length header is optional; -1 means the size is unknown, in which
            // case no percentage is shown.
            long fileSize = -1;
            String content_length = conn.getHeaderField("content-length");
            if (content_length != null) {
                try {
                    fileSize = Long.parseLong(content_length.trim());
                } catch (NumberFormatException ignored) {
                    // Malformed header: treat the size as unknown.
                }
            }
            String[] array = image_real.toString().split("/");
            String image_local = array[array.length - 1];
            InputStream is = null;
            OutputStream outstream = null;
            try {
                is = conn.getInputStream();
                outstream = new FileOutputStream(new File("pages/" + current_page + "_" + image_local));
                long bytesRead = 0;
                byte[] buffer = new byte[4096];
                int len;
                while ((len = is.read(buffer)) > 0) {
                    outstream.write(buffer, 0, len);
                    bytesRead += len;
                    String n_perct = "";
                    if (fileSize > 0) {
                        int n = (int) (100 * bytesRead / fileSize);
                        n_perct = n + "% ";
                    }
                    System.out.print("\rDownloading page " + i + " of " + current_pages_count + " " + n_perct + "");
                }
            } finally {
                // Release the file, the response stream and the connection on every path.
                if (outstream != null) {
                    outstream.close();
                }
                if (is != null) {
                    is.close();
                }
                conn.disconnect();
            }
            i++;
            current_page++;
        }
        // NOTE(review): File.listFiles() gives no ordering guarantee, so the pages may not
        // end up in download order in the PDF.
        File[] myarray = directory.listFiles(new FileFilter() {
            public boolean accept(File dir) {
                return dir.toString().endsWith(".jpg") && dir.isFile();
            }
        });
        System.out.println("");
        if (myarray != null && myarray.length > 0) {
            System.out.println("Generating PDF");
            PDDocument document = new PDDocument();
            try {
                for (int k = 0; k < myarray.length; k++) {
                    // First pass: decode the image only to learn its dimensions.
                    BufferedImage bimg;
                    InputStream in = new FileInputStream(myarray[k]);
                    try {
                        bimg = ImageIO.read(in);
                    } finally {
                        in.close();
                    }
                    if (bimg == null) {
                        throw new IOException("Cannot decode image " + myarray[k]);
                    }
                    float width = bimg.getWidth();
                    float height = bimg.getHeight();
                    PDPage page = new PDPage(new PDRectangle(width, height));
                    document.addPage(page);
                    // Second pass: embed the original JPEG bytes into the page.
                    InputStream jpegStream = new FileInputStream(myarray[k]);
                    try {
                        PDXObjectImage img = new PDJpeg(document, jpegStream);
                        PDPageContentStream contentStream = new PDPageContentStream(document, page);
                        try {
                            contentStream.drawImage(img, 0, 0);
                        } finally {
                            contentStream.close();
                        }
                    } finally {
                        jpegStream.close();
                    }
                }
                document.save(comic_name + ".pdf");
            } finally {
                document.close();
            }
        }
        // Remove the downloaded images and the working directory.
        File[] files_del = directory.listFiles();
        if (files_del != null) {
            for (int k = 0; k < files_del.length; k++) {
                files_del[k].delete();
            }
        }
        directory.delete();
        System.out.println("Done");
    } catch (IOException e) {
        // Report the failure instead of silently ignoring it.
        System.err.println("Error: " + e.getMessage());
    }
}
Example 16
| Project: stocks-master File: CreateTextFromPDFHandler.java View source code |
/**
 * Lets the user pick a PDF file, extracts its text sorted by position and shows the text in a
 * dialog. Nothing happens when no file is chosen; an {@link IOException} raised while reading
 * the PDF is logged and reported in an error dialog.
 *
 * @param part the active part
 * @param shell the active shell, used as parent for all dialogs
 * @throws IOException declared by the handler signature
 */
@Execute
public void execute(@Named(IServiceConstants.ACTIVE_PART) MPart part, @Named(IServiceConstants.ACTIVE_SHELL) Shell shell) throws IOException {
    // Single-selection open dialog restricted to PDF files
    FileDialog dialog = new FileDialog(shell, SWT.OPEN | SWT.SINGLE);
    dialog.setText(Messages.PDFImportDebugTextExtraction);
    dialog.setFilterNames(new String[] { Messages.PDFImportFilterName });
    dialog.setFilterExtensions(new String[] { "*.pdf" }); //$NON-NLS-1$
    dialog.open();

    String selectedName = dialog.getFileName();
    boolean nothingSelected = selectedName == null || selectedName.isEmpty();
    if (nothingSelected) {
        return;
    }

    File pdfFile = new File(dialog.getFilterPath(), selectedName);
    try (PDDocument doc = PDDocument.load(pdfFile)) {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(true);
        new DisplayTextDialog(shell, stripper.getText(doc)).open();
    } catch (IOException e) {
        PortfolioPlugin.log(e);
        MessageDialog.openError(shell, Messages.LabelError, e.getMessage());
    }
}
Example 17
| Project: with-aes-master File: SynchronizedMetaDataValidation.java View source code |
/**
 * Checks that the document information entries and the XMP metadata are synchronized.
 * <p>
 * The title, author and subject are compared against the Dublin Core schema, the keywords and
 * producer against the Adobe PDF schema, and the creator tool, creation date and modification
 * date against the XMP Basic schema.
 *
 * @param document the PDF document
 * @param metadata the XMP metadata
 * @return the list of validation errors collected by the individual property checks
 * @throws ValidationException if {@code document} or {@code metadata} is {@code null}
 */
public List<ValidationError> validateMetadataSynchronization(PDDocument document, XMPMetadata metadata) throws ValidationException {
    if (document == null) {
        throw new ValidationException("Document provided is null");
    }
    // The information dictionary is fetched before the metadata argument is checked.
    final PDDocumentInformation info = document.getDocumentInformation();
    if (metadata == null) {
        throw new ValidationException("Metadata provided are null");
    }

    final List<ValidationError> errors = new ArrayList<ValidationError>();

    // Dublin Core: title, author, subject
    final DublinCoreSchema dublinCore = metadata.getDublinCoreSchema();
    analyzeTitleProperty(info, dublinCore, errors);
    analyzeAuthorProperty(info, dublinCore, errors);
    analyzeSubjectProperty(info, dublinCore, errors);

    // Adobe PDF: keywords, producer
    final AdobePDFSchema adobePdf = metadata.getAdobePDFSchema();
    analyzeKeywordsProperty(info, adobePdf, errors);
    analyzeProducerProperty(info, adobePdf, errors);

    // XMP Basic: creator tool, creation date, modification date
    final XMPBasicSchema xmpBasic = metadata.getXMPBasicSchema();
    analyzeCreatorToolProperty(info, xmpBasic, errors);
    analyzeCreationDateProperty(info, xmpBasic, errors);
    analyzeModifyDateProperty(info, xmpBasic, errors);

    return errors;
}
Example 18
| Project: wkhtmltopdf-master File: PdfTest.java View source code |
/**
 * Verifies that a PDF generated from an HTML string keeps non-ASCII characters intact.
 */
@Test
public void testPdfFromStringTo() throws Exception {
    // GIVEN a html template containing special characters that java stores in utf-16 internally
    Pdf pdf = new Pdf();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Müller</h1></html>", PageType.htmlAsString);
    // WHEN
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // that is a valid PDF (otherwise an IOException occurs)
    parser.parse();
    PDFTextStripper pdfTextStripper = new PDFTextStripper();
    String pdfText;
    PDDocument document = new PDDocument(parser.getDocument());
    try {
        pdfText = pdfTextStripper.getText(document);
    } finally {
        // Release the parsed document once its text has been extracted.
        document.close();
    }
    // THEN the extracted text contains the special characters
    Assert.assertThat("document should contain the creditorName", pdfText, containsString("Müller"));
}
Example 19
| Project: AGIA-master File: SplitPDFTasklet.java View source code |
/**
 * Splits the PDF behind {@code sSourceResource} into parts and writes every part to its own
 * destination file.
 * <p>
 * For each part the destination factory is asked for a resource up to ten times; unless
 * {@code forceReplace} is set, asking stops early as soon as it yields no resource or one that
 * does not exist yet.
 *
 * @param sSourceResource the PDF resource to split
 * @param sChunkContext the chunk context whose step execution is handed to the destination
 *        factory; may be {@code null}
 * @return the number of parts written
 * @throws Exception if no destination can be obtained or a part cannot be written
 */
private int splitFile(Resource sSourceResource, ChunkContext sChunkContext) throws Exception {
    // Parameters handed to the destination factory for every part
    Map<String, Object> factoryParams = new HashMap<String, Object>();
    factoryParams.put(ResourceFactoryConstants.PARAM_SOURCE, sSourceResource);
    Object stepExecution = null;
    if ((sChunkContext != null) && (sChunkContext.getStepContext() != null)) {
        stepExecution = sChunkContext.getStepContext().getStepExecution();
    }
    factoryParams.put(ResourceFactoryConstants.PARAM_STEP_EXEC, stepExecution);

    int writtenParts = 0;
    PDDocumentContainer container = null;
    try {
        container = documentFactory.getDocument(sSourceResource.getFile());
        for (PDDocument part : container.getParts()) {
            // Output file factory
            Resource destination;
            int attemptsLeft = 10;
            do {
                destination = destinationFactory.getResource(factoryParams);
                attemptsLeft--;
            } while (!forceReplace && (attemptsLeft > 0) && (destination != null) && destination.exists());

            if ((attemptsLeft == 0) && !forceReplace) {
                throw new SplitPDFException("Cannot create a new destination filename");
            }
            if (destination == null) {
                throw new SplitPDFException("No destination specified");
            }
            if (destination.exists() && LOGGER.isWarnEnabled()) {
                LOGGER.warn("Replacing {}", destination.getFile().getAbsolutePath());
            }
            writeDocument(part, destination.getFile().getAbsolutePath());
            part.close();
            writtenParts++;
        }
    } finally {
        if (container != null) {
            container.close();
        }
    }
    return writtenParts;
}
Example 20
| Project: batchers-master File: MonthlyTaxReportServiceTest.java View source code |
/**
 * Verifies that the generated monthly report contains the success line, the failure line and
 * the period line.
 */
@Test
public void generateReportWithCorrectData() throws PDFGenerationException, IOException {
    final byte[] reportBytes = monthlyTaxReportService.generateReport(3L, TEST_YEAR, TEST_MONTH);

    // The report is loaded afresh from the same bytes for each assertion.
    assertThat(PDDocument.load(new ByteArrayInputStream(reportBytes)))
            .containsText("WEBSERVICE RETURNS SUCCESS " + SUCCESS_AMOUNT + " euro");
    assertThat(PDDocument.load(new ByteArrayInputStream(reportBytes)))
            .containsText("WEBSERVICE RETURNS FAILURE " + FAILED_AMOUNT + " euro");
    assertThat(PDDocument.load(new ByteArrayInputStream(reportBytes)))
            .containsText("PERIOD: " + 5 + " " + TEST_YEAR);
}
Example 21
| Project: BBAW_CMS-master File: PdfParserImpl.java View source code |
/**
 * Parses a PDF document page by page and returns the object produced by the
 * {@link ISaveStrategy} from the extracted page texts.
 *
 * @param startUri not used by this implementation
 * @param uri the location of the PDF document; also passed twice to the save strategy
 * @return the document model returned by the {@link ISaveStrategy}, with a fresh
 *         {@code MetadataRecord} set on it
 * @throws ApplicationException
 *             if reading or parsing the document fails with an {@link IOException}
 * @throws IllegalArgumentException
 *             if the uri is null or empty.
 * @throws IllegalStateException
 *             if the {@link ISaveStrategy} wasn't set before.
 */
public Object parse(final String startUri, final String uri) throws ApplicationException {
    if (uri == null || uri.isEmpty()) {
        throw new IllegalArgumentException("The value for the parameter parser in the method parse() in PdfParserImpl mustn't be empty.");
    }
    if (this.saveStrategy == null) {
        throw new IllegalStateException("You must define a saveStategy before calling the parse()-method in ResourceParser.");
    }
    try {
        List<String> pagesTexts = new ArrayList<String>();
        InputStream input = this.resourceReader.read(uri);
        try {
            PDDocument document = PDDocument.load(input);
            try {
                PDFTextStripper stripper = new PDFTextStripper();
                // Page numbers are 1-based; restrict the stripper to one page at a time.
                for (int i = 1; i <= document.getNumberOfPages(); i++) {
                    stripper.setStartPage(i);
                    stripper.setEndPage(i);
                    pagesTexts.add(stripper.getText(document));
                }
            } finally {
                // Close the document even when text extraction fails.
                document.close();
            }
        } finally {
            if (input != null) {
                input.close();
            }
        }
        PdfDocument doc = (PdfDocument) this.saveStrategy.generateDocumentModel(uri, uri, pagesTexts);
        // Set the standard metadata
        doc.setMetadata(new MetadataRecord());
        return doc;
    } catch (IOException e) {
        throw new ApplicationException("Problem while parsing file " + uri + " -- exception: " + e.getMessage() + "\n");
    }
}
Example 22
| Project: camel-master File: PdfProducer.java View source code |
/**
 * Appends the message body as text to the PDF document supplied in the
 * {@code PDF_DOCUMENT_HEADER_NAME} header and returns the saved document.
 * <p>
 * An encrypted document is opened with the decryption material from the
 * {@code DECRYPTION_MATERIAL_HEADER_NAME} header and marked to have all security removed.
 *
 * @param exchange the exchange carrying the text body and the document headers
 * @return an output stream holding the saved document
 * @throws IllegalArgumentException if the document header is missing, or the document is
 *         encrypted and the decryption material header is missing
 */
private Object doAppend(Exchange exchange) throws IOException, BadSecurityHandlerException, CryptographyException, InvalidPasswordException, COSVisitorException {
    LOG.debug("Got {} operation, going to append text to provided pdf.", pdfConfiguration.getOperation());
    final String text = exchange.getIn().getBody(String.class);
    final PDDocument pdf = exchange.getIn().getHeader(PDF_DOCUMENT_HEADER_NAME, PDDocument.class);
    if (pdf == null) {
        throw new IllegalArgumentException(String.format("%s header is expected for append operation", PDF_DOCUMENT_HEADER_NAME));
    }

    if (pdf.isEncrypted()) {
        final DecryptionMaterial material = exchange.getIn().getHeader(DECRYPTION_MATERIAL_HEADER_NAME, DecryptionMaterial.class);
        if (material == null) {
            throw new IllegalArgumentException(String.format("%s header is expected for %s operation " + "on encrypted document", DECRYPTION_MATERIAL_HEADER_NAME, pdfConfiguration.getOperation()));
        }
        pdf.openProtection(material);
        pdf.setAllSecurityToBeRemoved(true);
    }

    final ProtectionPolicy policy = exchange.getIn().getHeader(PROTECTION_POLICY_HEADER_NAME, ProtectionPolicy.class);
    appendToPdfDocument(text, pdf, policy);

    final OutputStream saved = new ByteArrayOutputStream();
    pdf.save(saved);
    return saved;
}
Example 23
| Project: ddf-master File: GeoPdfParserImpl.java View source code |
/**
 * Generates a WKT compliant String from a PDF Document if it contains GeoPDF information.
 * Currently, only WGS84 Projections are supported (GEOGRAPHIC GeoPDF ProjectionType).
 * <p>
 * Every page is inspected for an LGIDict entry, which may be an array of map frames or a
 * single map frame. Frames without a projection, or with a projection type other than
 * GEOGRAPHIC, are skipped. A single-frame page without a neatline falls back to a neatline
 * generated from the page dimensions.
 *
 * @param pdfDocument - The PDF document
 * @return a polygon WKT String when exactly one map frame was found, a multi-polygon WKT
 *         String when several were found, or {@code null} when none was found
 * @throws IOException
 */
@Override
public String apply(PDDocument pdfDocument) throws IOException {
    final ToDoubleVisitor toDoubleVisitor = new ToDoubleVisitor();
    final LinkedList<String> wktPolygons = new LinkedList<>();

    for (PDPage pdPage : pdfDocument.getPages()) {
        final COSDictionary pageDictionary = pdPage.getCOSObject();
        final COSBase lgiDictObject = pageDictionary.getObjectFromPath(LGIDICT);

        if (lgiDictObject instanceof COSArray) {
            // Handle Multiple Map Frames
            final COSArray mapFrames = (COSArray) lgiDictObject;
            for (int i = 0; i < mapFrames.size(); i++) {
                final String framePath = LGIDICT + "/[" + i + "]";
                final COSDictionary lgidict = (COSDictionary) pageDictionary.getObjectFromPath(framePath);
                final COSDictionary projection = (COSDictionary) lgidict.getDictionaryObject(PROJECTION);
                if (projection == null) {
                    LOGGER.debug("No projection array found on the map frame. Map Frame will be skipped.");
                    continue;
                }
                final String projectionType = ((COSString) projection.getItem(PROJECTION_TYPE)).getString();
                if (!GEOGRAPHIC.equals(projectionType)) {
                    LOGGER.debug("Unsupported projection type {}. Map Frame will be skipped.", projectionType);
                    continue;
                }
                final COSArray neatline = (COSArray) pageDictionary.getObjectFromPath(framePath + "/" + NEATLINE);
                wktPolygons.add(getWktFromNeatLine(lgidict, neatline, toDoubleVisitor));
            }
        } else if (lgiDictObject instanceof COSDictionary) {
            // Handle One Map Frame
            final COSDictionary lgidict = (COSDictionary) lgiDictObject;
            final COSDictionary projection = (COSDictionary) lgidict.getDictionaryObject(PROJECTION);
            if (projection == null) {
                LOGGER.debug("No projection array found on the map frame. Map Frame will be skipped.");
                continue;
            }
            final String projectionType = ((COSString) projection.getItem(PROJECTION_TYPE)).getString();
            if (!GEOGRAPHIC.equals(projectionType)) {
                LOGGER.debug("Unsupported projection type {}. Map Frame will be skipped.", projectionType);
                continue;
            }
            COSArray neatline = (COSArray) pageDictionary.getObjectFromPath(LGIDICT + "/" + NEATLINE);
            if (neatline == null) {
                // No explicit neatline: derive one from the page dimensions.
                neatline = generateNeatLineFromPDFDimensions(pdPage);
            }
            wktPolygons.add(getWktFromNeatLine(lgidict, neatline, toDoubleVisitor));
        }
    }

    if (wktPolygons.isEmpty()) {
        LOGGER.debug("No GeoPDF information found on PDF during transformation. Metacard location will not be set.");
        return null;
    }
    if (wktPolygons.size() == 1) {
        return POLYGON + wktPolygons.get(0) + "))";
    }
    return wktPolygons.stream()
            .map(polygon -> "((" + polygon + "))")
            .collect(Collectors.joining(",", MULTIPOLYGON, ")"));
}
Example 24
| Project: dlibrary-master File: CitationDocument.java View source code |
/**
 * Creates a cited document from the given bitstream. The item is taken from the
 * bitstream's parent object, so the bitstream must be contained in an item.
 * <p>
 * The process for adding a cover page is as follows:
 * </p>
 * <ol>
 * <li>Load the source bitstream into a PDDocument and create a second, empty
 * PDDocument to hold the result.</li>
 * <li>Create the cover page and add content to it.</li>
 * <li>Concatenate the cover page and the source document.</li>
 * </ol>
 *
 * @param bitstream The source bitstream being cited. This must be a PDF.
 * @return The file {@code bitstream.cover.pdf} inside {@code tempDir} holding the finished, cited document.
 * @throws IOException
 * @throws SQLException
 * @throws org.dspace.authorize.AuthorizeException
 * @throws COSVisitorException
 */
public File makeCitedDocument(Bitstream bitstream) throws IOException, SQLException, AuthorizeException, COSVisitorException {
    PDDocument document = new PDDocument();
    // Starts as null: load(...) is a static factory, so pre-creating an instance
    // only produced a second document that was overwritten and never closed.
    PDDocument sourceDocument = null;
    try {
        Item item = (Item) bitstream.getParentObject();
        sourceDocument = PDDocument.load(bitstream.retrieve());
        PDPage coverPage = new PDPage(PDPage.PAGE_SIZE_LETTER);
        generateCoverPage(document, coverPage, item);
        addCoverPageToDocument(document, sourceDocument, coverPage);
        String citedPath = tempDir.getAbsolutePath() + "/bitstream.cover.pdf";
        document.save(citedPath);
        return new File(citedPath);
    } finally {
        // Nested so that a failure closing the source cannot leave the output document open.
        try {
            if (sourceDocument != null) {
                sourceDocument.close();
            }
        } finally {
            document.close();
        }
    }
}Example 25
| Project: DSpace-master File: CitationDocumentServiceImpl.java View source code |
/**
 * Produces a copy of the given PDF bitstream with a generated cover page added,
 * saved as {@code bitstream.cover.pdf} inside {@code tempDir}.
 *
 * @param context the context passed to the bitstream service and cover page generator
 * @param bitstream the source bitstream; its content is loaded as a PDF
 * @return the saved file
 * @throws IOException
 * @throws SQLException
 * @throws AuthorizeException
 */
@Override
public File makeCitedDocument(Context context, Bitstream bitstream) throws IOException, SQLException, AuthorizeException {
    PDDocument document = new PDDocument();
    // Starts as null: load(...) is a static factory, so pre-creating an instance
    // only produced a second document that was overwritten and never closed.
    PDDocument sourceDocument = null;
    try {
        Item item = (Item) bitstreamService.getParentObject(context, bitstream);
        sourceDocument = PDDocument.load(bitstreamService.retrieve(context, bitstream));
        // TODO: needs to be configurable
        PDPage coverPage = new PDPage(PDRectangle.LETTER);
        generateCoverPage(context, document, coverPage, item);
        addCoverPageToDocument(document, sourceDocument, coverPage);
        String citedPath = tempDir.getAbsolutePath() + "/bitstream.cover.pdf";
        document.save(citedPath);
        return new File(citedPath);
    } finally {
        // Nested so that a failure closing the source cannot leave the output document open.
        try {
            if (sourceDocument != null) {
                sourceDocument.close();
            }
        } finally {
            document.close();
        }
    }
}Example 26
| Project: DSpace-SVN-Deprecated-master File: PDFPackager.java View source code |
/**
 * Parses the given stream as a PDF and copies entries of its Info dictionary
 * onto the item as Dublin Core values, then calls {@code item.update()}.
 * Encrypted PDFs and PDFs without a Title are rejected with a
 * MetadataValidationException.
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cosDocument = null;
    try {
        PDFParser pdfParser = new PDFParser(metadata);
        pdfParser.parse();
        cosDocument = pdfParser.getDocument();
        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (cosDocument.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }
        /* PDF to DC "crosswalk":
         *
         * NOTE: This is not in a crosswalk plugin because (a) it isn't
         * useful anywhere else, and more importantly, (b) the source
         * data is not XML so it doesn't fit the plugin's interface.
         *
         * Mapping applied below -- PDF Info dict entry to DC:
         *   Title        -> title.null
         *   Author       -> contributor.author
         *   CreationDate -> date.created
         *   ModDate      -> date.created (only when CreationDate is absent)
         *   Creator      -> description.provenance (application that created orig)
         *   Producer     -> description.provenance (convertor to pdf)
         *   Subject      -> description.abstract
         *   Keywords     -> subject.other
         * Dates arrive as java.util.Calendar.
         */
        PDDocumentInformation info = new PDDocument(cosDocument).getDocumentInformation();

        // sanity check: item must have a title.
        String title = info.getTitle();
        if (title == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + title + "\"");
        }
        item.addDC("title", null, "en", title);

        String author = info.getAuthor();
        if (author != null) {
            item.addDC("contributor", "author", null, author);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + author + "\"");
            }
        }

        String creator = info.getCreator();
        if (creator != null) {
            item.addDC("description", "provenance", "en", "Application that created the original document: " + creator);
        }

        String producer = info.getProducer();
        if (producer != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + producer);
        }

        String subject = info.getSubject();
        if (subject != null) {
            item.addDC("description", "abstract", null, subject);
        }

        String keywords = info.getKeywords();
        if (keywords != null) {
            item.addDC("subject", "other", null, keywords);
        }

        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar created = info.getCreationDate();
        if (created == null) {
            created = info.getModificationDate();
        }
        if (created != null) {
            item.addDC("date", "created", null, (new DCDate(created.getTime())).toString());
        }

        item.update();
    } finally {
        if (cosDocument != null) {
            cosDocument.close();
        }
    }
}Example 27
| Project: Europeana-Cloud-master File: PdfBoxExtractor.java View source code |
/**
 * Extracts the text of a PDF read from the given stream and stores the
 * document information entries in {@code extractedMetadata}.
 *
 * @param is the PDF content; may be {@code null}
 * @return the extracted text, or {@code null} when {@code is} is {@code null}
 *         or an IOException occurred during parsing or extraction
 */
@Override
public String extractText(InputStream is) {
    if (is == null) {
        LOGGER.warn("No data for extraction.");
        return null;
    }
    String parsedText = null;
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    try {
        PDFParser parser = new PDFParser(is);
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        PDDocumentInformation info = pdDoc.getDocumentInformation();
        Set<String> mdKeys = info.getMetadataKeys();
        extractedMetadata = new HashMap<>();
        for (String key : mdKeys) {
            String value = (String) info.getPropertyStringValue(key);
            extractedMetadata.put(key, value);
        }
        //possible NULL pointer if document is encrypted
        parsedText = pdfStripper.getText(pdDoc);
    } catch (IOException ex) {
        // Pass the exception itself so the stack trace is not lost.
        LOGGER.warn("Can not extract text from pdf because: " + ex.getMessage(), ex);
    } finally {
        // Close each document in its own try so one failure cannot skip the other.
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (IOException ex) {
            LOGGER.warn("Can not close COS document: " + ex.getMessage(), ex);
        }
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (IOException ex) {
            LOGGER.warn("Can not close PD document: " + ex.getMessage(), ex);
        }
    }
    return parsedText;
}Example 28
| Project: grobid-master File: FigureTableVisualizer.java View source code |
/**
 * Runs the full-text engine on one PDF, annotates figures and tables, and saves
 * the annotated PDF to /tmp/testFigures.pdf. When {@code outputFolder} is given
 * and the run set the {@code annotated} flag, the result is also copied there.
 *
 * @param input the PDF to process
 * @param outputFolder destination folder for annotated copies; may be {@code null}
 * @throws Exception if loading, processing or file handling fails
 */
private static void processPdfFile(File input, File outputFolder) throws Exception {
    inputPdf = input;
    annotated = false;
    annotatedFigure = false;
    final PDDocument document = PDDocument.load(input);
    PDDocument out = null;
    try {
        File outPdf = new File("/tmp/testFigures.pdf");
        final Engine engine = setupEngine();
        File contentDir = new File("/tmp/contentDir");
        FileUtils.deleteDirectory(contentDir);
        File assetPath = new File(contentDir, "tei");
        GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder().pdfAssetPath(assetPath).withPreprocessImages(false).withProcessVectorGraphics(true).build();
        DocumentSource documentSource = DocumentSource.fromPdf(input);
        File pdf2xmlDirectory = new File(contentDir, "pdf2xml");
        pdf2xmlDirectory.mkdirs();
        FileUtils.copyFileToDirectory(input, contentDir);
        FileUtils.copyFile(documentSource.getXmlFile(), new File(pdf2xmlDirectory, "input.xml"));
        FileUtils.copyDirectory(new File(documentSource.getXmlFile().getAbsolutePath() + "_data"), new File(pdf2xmlDirectory, documentSource.getXmlFile().getName() + "_data"));
        System.out.println(documentSource.getXmlFile());
        blacklistedPages = getVectorGraphicPages(pdf2xmlDirectory);
        Document teiDoc = engine.fullTextToTEIDoc(documentSource, config);
        out = annotateFigureAndTables(document, documentSource.getXmlFile(), teiDoc, false, false, true, true);
        if (out != null) {
            out.save(outPdf);
            if (singleFile && Desktop.isDesktopSupported()) {
                Desktop.getDesktop().open(outPdf);
            }
        }
        if (outputFolder != null && annotated) {
            Engine.getCntManager().i("TABLES_TEST", "ANNOTATED_PDFS");
            // annotated is known to be true here, so only the suffix needs choosing.
            String suffix = annotatedFigure ? "_annotatedFigure.pdf" : "_annotated.pdf";
            FileUtils.copyFile(outPdf, new File(outputFolder, input.getName() + suffix));
        }
    } finally {
        // The documents were previously never closed. The annotated result may be the
        // same instance as the input, so it is closed separately only when it differs.
        try {
            if (out != null && out != document) {
                out.close();
            }
        } finally {
            document.close();
        }
    }
}Example 29
| Project: iswc2012metadata-master File: TaskParsePdf.java View source code |
/**
 * Extracts the text of a PDF file and feeds it line by line to a
 * DataPaperInPdf parser until the parser reaches its {@code content} state,
 * then prints the parser's report.
 *
 * @param f the PDF file to read
 * @throws IOException if the file cannot be loaded or its text cannot be extracted
 */
private static void extractText(File f) throws IOException {
    PDFTextStripper textStripper;
    String content;
    PDDocument pddDocument = PDDocument.load(f);
    try {
        textStripper = new PDFTextStripper();
        content = textStripper.getText(pddDocument);
    } finally {
        // The document is only needed for text extraction; it was previously never closed.
        pddDocument.close();
    }
    TreeMap<PROP, String> temp = new TreeMap<PROP, String>();
    temp.put(PROP.lineSeparator, textStripper.getLineSeparator());
    temp.put(PROP.paragraphStart, textStripper.getParagraphStart());
    System.out.println(temp);
    DataPaperInPdf parser = new DataPaperInPdf(f.getName());
    for (String line : content.split(temp.get(PROP.lineSeparator))) {
        parser.processLine(line);
        if (DataPaperInPdf.STATE.content.equals(parser.state)) {
            break;
        }
    }
    parser.printReport();
    System.out.println("-----");
    //System.out.println(content.substring(0, 500));
    /*
    PDDocumentInformation info = pddDocument.getDocumentInformation();
    System.out.println( "Page Count=" + pddDocument.getNumberOfPages() );
    System.out.println( "Title=" + info.getTitle() );
    System.out.println( "Author=" + info.getAuthor() );
    System.out.println( "Subject=" + info.getSubject() );
    System.out.println( "Keywords=" + info.getKeywords() );
    System.out.println( "Creator=" + info.getCreator() );
    System.out.println( "Producer=" + info.getProducer() );
    System.out.println( "Creation Date=" + info.getCreationDate() );
    System.out.println( "Modification Date=" + info.getModificationDate());
    System.out.println( "Trapped=" + info.getTrapped() );
    */
}Example 30
| Project: liferay-portal-master File: PDFProcessorImpl.java View source code |
/**
 * Tries each configured decrypt password, followed by the blank password, to
 * open the encrypted PDF. On the first success the document is saved to
 * {@code decryptedFile} with all security removed.
 *
 * @return the page count of the decrypted document, or 0 when no password worked
 */
private int _getPreviewFilesCount(File encryptedFile, File decryptedFile) {
    String[] candidatePasswords = ArrayUtil.append(PropsValues.DL_FILE_ENTRY_PREVIEW_GENERATION_DECRYPT_PASSWORDS_PDFBOX, StringPool.BLANK);
    for (String candidatePassword : candidatePasswords) {
        try (PDDocument pdDocument = PDDocument.load(encryptedFile, candidatePassword)) {
            pdDocument.setAllSecurityToBeRemoved(true);
            pdDocument.save(decryptedFile);
            return pdDocument.getNumberOfPages();
        } catch (IOException ioe) {
            // A wrong password is expected while probing; move on to the next candidate.
            if (ioe instanceof InvalidPasswordException) {
                continue;
            }
            _log.error(ioe, ioe);
        }
    }
    return 0;
}Example 31
| Project: modeshape-master File: PdfBasicMetadata.java View source code |
/*
 * Check that given file is supported by this sequencer.
 *
 * Loads the PDF from the input, records document-level metadata in this
 * object's fields, and collects file attachments found in page annotations.
 * Returns true when the document was read without an exception.
 */
public boolean check() throws Exception {
    try (PDDocument pdf = PDDocument.load(in)) {
        PDDocumentCatalog catalog = pdf.getDocumentCatalog();
        PageFormat firstPage = new PDFPageable(pdf).getPageFormat(0);
        encrypted = pdf.isEncrypted();
        pageCount = pdf.getNumberOfPages();
        orientation = ORIENTATION_STRINGS[firstPage.getOrientation()];

        // Start from the header version; a non-empty catalog version overrides it.
        version = String.valueOf(pdf.getDocument().getVersion());
        String catalogVersion = catalog.getVersion();
        if (catalogVersion != null && !catalogVersion.isEmpty()) {
            // According to specs version saved here should be determining instead
            // the version in header. It is barely used, though.
            version = catalogVersion;
        }

        // Document information is only read for unencrypted documents.
        if (!encrypted) {
            PDDocumentInformation info = pdf.getDocumentInformation();
            author = info.getAuthor();
            creationDate = info.getCreationDate();
            creator = info.getCreator();
            keywords = info.getKeywords();
            modificationDate = info.getModificationDate();
            producer = info.getProducer();
            subject = info.getSubject();
            title = info.getTitle();
        }

        // extract all attached files from all pages
        int pageNumber = 0;
        for (Object pageObject : catalog.getPages()) {
            pageNumber++;
            PdfPageMetadata pageMetadata = new PdfPageMetadata();
            pageMetadata.setPageNumber(pageNumber);
            for (PDAnnotation annotation : ((PDPage) pageObject).getAnnotations()) {
                if (!(annotation instanceof PDAnnotationFileAttachment)) {
                    continue;
                }
                PdfAttachmentMetadata attachment = new PdfAttachmentMetadata();
                PDAnnotationFileAttachment fileAnnotation = (PDAnnotationFileAttachment) annotation;
                PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fileAnnotation.getFile();
                PDEmbeddedFile embedded = fileSpec.getEmbeddedFile();
                attachment.setSubject(fileAnnotation.getSubject());
                attachment.setName(fileSpec.getFilename());
                attachment.setCreationDate(embedded.getCreationDate());
                attachment.setModificationDate(embedded.getModDate());
                attachment.setMimeType(embedded.getSubtype());
                attachment.setData(embedded.toByteArray());
                pageMetadata.addAttachment(attachment);
            }
            pages.add(pageMetadata);
        }
        return true;
    }
}Example 32
| Project: NeighborNote-master File: PDFPreview.java View source code |
/**
 * Returns the number of pages in the PDF at the given path.
 * On Windows, backslashes in the path are replaced by forward slashes before loading.
 *
 * @param filePath path of the PDF file
 * @return the page count, or 0 when the document cannot be loaded or any other exception occurs
 */
public int getPageCount(String filePath) {
    PDDocument document = null;
    try {
        String whichOS = System.getProperty("os.name");
        if (whichOS.contains("Windows")) {
            filePath = filePath.replace("\\", "/");
        }
        document = PDDocument.load(filePath);
        return document.getNumberOfPages();
    } catch (Exception e) {
        return 0;
    } finally {
        // The document was previously never closed.
        if (document != null) {
            try {
                document.close();
            } catch (Exception ignored) {
                // The page count is already determined; a close failure does not change it.
            }
        }
    }
}Example 33
| Project: nevernote-master File: PDFPreview.java View source code |
/**
 * Returns the number of pages in the PDF at the given path.
 * On Windows, backslashes in the path are replaced by forward slashes before loading.
 *
 * @param filePath path of the PDF file
 * @return the page count, or 0 when the document cannot be loaded or any other exception occurs
 */
public int getPageCount(String filePath) {
    PDDocument document = null;
    try {
        String whichOS = System.getProperty("os.name");
        if (whichOS.contains("Windows")) {
            filePath = filePath.replace("\\", "/");
        }
        document = PDDocument.load(filePath);
        return document.getNumberOfPages();
    } catch (Exception e) {
        return 0;
    } finally {
        // The document was previously never closed.
        if (document != null) {
            try {
                document.close();
            } catch (Exception ignored) {
                // The page count is already determined; a close failure does not change it.
            }
        }
    }
}Example 34
| Project: pdf-image-compare-master File: PdfToImageConverter.java View source code |
/**
 * Renders a range of pages of a PDF document as images.
 * Pages beyond the end of the document are silently ignored.
 *
 * @param pdDocument the source document
 * @param imageFormat the requested image format, e.g. "jpeg" (not read by this method)
 * @param startPage the first extracted page (1-based)
 * @param endPage the last extracted page (inclusive)
 * @param resolution the requested resolution, passed to calculateResolution together with the crop box size
 * @param color the color model, e.g. "rgb", "gray"
 * @return a list of images, one per rendered page
 * @throws Exception the conversion failed
 */
@SuppressWarnings("unchecked")
public List<BufferedImage> toImages(PDDocument pdDocument, String imageFormat, int startPage, int endPage, int resolution, String color) throws Exception {
    /**
    Validate.notNull(pdDocument, "pdDocument is null");
    Validate.notEmpty(imageFormat, "imageFormat is null");
    Validate.isTrue(startPage > 0, "invalid start page : " + startPage);
    Validate.isTrue(endPage >= startPage, "invalid end page : " + endPage);
    Validate.isTrue(resolution >= 0, "invalid resolution : " + resolution);
    */
    List<BufferedImage> images = new ArrayList<BufferedImage>();
    int imageType = getImageType(color);
    List<PDPage> allPages = pdDocument.getDocumentCatalog().getAllPages();
    // Stop at whichever comes first: the requested end page or the end of the document.
    int endExclusive = Math.min(endPage, allPages.size());
    for (int index = startPage - 1; index < endExclusive; index++) {
        PDPage current = allPages.get(index);
        PDRectangle crop = current.findCropBox();
        int pageResolution = calculateResolution(resolution, crop.getWidth(), crop.getHeight());
        images.add(current.convertToImage(imageType, pageResolution));
    }
    return images;
}Example 35
| Project: PDF-to-unusual-HTML-master File: PDTrueTypeFont.java View source code |
/**
 * Loads a TrueType font from a stream so that it can be embedded into a document.
 * The font data is stored in a compressed PDStream referenced by the font
 * descriptor, and the resulting font uses WinAnsi encoding.
 *
 * @param doc The PDF document that will hold the embedded font.
 * @param stream a ttf input stream.
 * @return a PDTrueTypeFont instance.
 * @throws IOException If there is an error loading the data.
 */
public static PDTrueTypeFont loadTTF(PDDocument doc, InputStream stream) throws IOException {
    PDTrueTypeFont font = new PDTrueTypeFont();
    PDFontDescriptorDictionary descriptor = new PDFontDescriptorDictionary();
    font.setFontDescriptor(descriptor);

    PDStream embeddedFont = new PDStream(doc, stream, false);
    embeddedFont.getStream().setInt(COSName.LENGTH1, embeddedFont.getByteArray().length);
    embeddedFont.addCompression();
    descriptor.setFontFile2(embeddedFont);

    // As the stream was close within the PDStream constructor, we have to recreate it
    InputStream reopened = embeddedFont.createInputStream();
    try {
        font.loadDescriptorDictionary(descriptor, reopened);
    } finally {
        reopened.close();
    }

    //only support winansi encoding right now, should really
    //just use Identity-H with unicode mapping
    font.setFontEncoding(new WinAnsiEncoding());
    font.setEncoding(COSName.WIN_ANSI_ENCODING);
    return font;
}Example 36
| Project: rdf-indexer-master File: RdfTextSpider.java View source code |
/**
 * Downloads the PDF at the given URI and returns its extracted text as bytes
 * (encoded with the platform default charset, as {@code String.getBytes()} does).
 *
 * @param uri location of the PDF to fetch over HTTP
 * @return the bytes of the extracted text
 * @throws IOException if the response status is not 200, or fetching/parsing fails
 */
private byte[] scrapeExternalPDF(final String uri) throws IOException {
    GetMethod request = new GetMethod(uri);
    InputStream responseBody = null;
    PDDocument pdf = null;
    try {
        int status = httpClient.executeMethod(request);
        if (status != 200) {
            throw new IOException(status + " code returned for URL: " + uri);
        }
        responseBody = request.getResponseBodyAsStream();
        pdf = PDDocument.load(responseBody);
        String text = new PDFTextStripper().getText(pdf);
        return text.getBytes();
    } finally {
        try {
            request.releaseConnection();
            IOUtils.closeQuietly(responseBody);
            if (pdf != null) {
                pdf.close();
            }
        } catch (Exception ignored) {
            // Cleanup is best-effort; failures here are deliberately not reported.
        }
    }
}Example 37
| Project: streamflow-core-master File: TaskFormDraftSummaryContext.java View source code |
/**
 * Generates the PDF for a submitted form. The template is looked up in three
 * places, in order: the selected form template, the organization's form
 * template, and the organization's default template. When none is found, a
 * {@code null} template URI is passed to the generator.
 */
private PDDocument generatePdf(SubmittedFormValue submittedFormValue) throws Throwable {
    FormDraftDTO form = role(FormDraftDTO.class);
    FormPdfTemplate.Data selected = role(FormPdfTemplate.Data.class);
    AttachedFile.Data template = (AttachedFile.Data) selected.formPdfTemplate().get();
    if (template == null) {
        // Fall back to the templates configured on the proxy user's organization.
        ProxyUser proxyUser = role(ProxyUser.class);
        template = (AttachedFile.Data) ((FormPdfTemplate.Data) proxyUser.organization().get()).formPdfTemplate().get();
        if (template == null) {
            template = (AttachedFile.Data) ((DefaultPdfTemplate.Data) proxyUser.organization().get()).defaultPdfTemplate().get();
        }
    }
    String templateUri = (template != null) ? template.uri().get() : null;
    CaseId.Data caseId = role(CaseId.Data.class);
    return pdfGenerator.generateSubmittedFormPdf(submittedFormValue, caseId, templateUri, locale);
}Example 38
| Project: vtechworks-master File: PDFPackager.java View source code |
/**
 * Parses the given stream as a PDF and copies entries of its Info dictionary
 * onto the item as Dublin Core values, then calls {@code item.update()}.
 * Encrypted PDFs and PDFs without a Title are rejected with a
 * MetadataValidationException.
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cosDocument = null;
    try {
        PDFParser pdfParser = new PDFParser(metadata);
        pdfParser.parse();
        cosDocument = pdfParser.getDocument();
        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (cosDocument.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }
        /* PDF to DC "crosswalk":
         *
         * NOTE: This is not in a crosswalk plugin because (a) it isn't
         * useful anywhere else, and more importantly, (b) the source
         * data is not XML so it doesn't fit the plugin's interface.
         *
         * Mapping applied below -- PDF Info dict entry to DC:
         *   Title        -> title.null
         *   Author       -> contributor.author
         *   CreationDate -> date.created
         *   ModDate      -> date.created (only when CreationDate is absent)
         *   Creator      -> description.provenance (application that created orig)
         *   Producer     -> description.provenance (convertor to pdf)
         *   Subject      -> description.abstract
         *   Keywords     -> subject.other
         * Dates arrive as java.util.Calendar.
         */
        PDDocumentInformation info = new PDDocument(cosDocument).getDocumentInformation();

        // sanity check: item must have a title.
        String title = info.getTitle();
        if (title == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + title + "\"");
        }
        item.addDC("title", null, "en", title);

        String author = info.getAuthor();
        if (author != null) {
            item.addDC("contributor", "author", null, author);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + author + "\"");
            }
        }

        String creator = info.getCreator();
        if (creator != null) {
            item.addDC("description", "provenance", "en", "Application that created the original document: " + creator);
        }

        String producer = info.getProducer();
        if (producer != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + producer);
        }

        String subject = info.getSubject();
        if (subject != null) {
            item.addDC("description", "abstract", null, subject);
        }

        String keywords = info.getKeywords();
        if (keywords != null) {
            item.addDC("subject", "other", null, keywords);
        }

        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar created = info.getCreationDate();
        if (created == null) {
            created = info.getModificationDate();
        }
        if (created != null) {
            item.addDC("date", "created", null, (new DCDate(created.getTime())).toString());
        }

        item.update();
    } finally {
        if (cosDocument != null) {
            cosDocument.close();
        }
    }
}Example 39
| Project: catma-core-master File: PDFContentHandler.java View source code |
/* (non-Javadoc)
 * @see de.catma.document.source.contenthandler.SourceContentHandler#load(java.io.InputStream)
 *
 * Extracts the text of the given PDF and stores it via setContent(...).
 * Characters outside the accepted ranges in the filter below are replaced by "?".
 * Throws IOException when the document is encrypted, when text extraction
 * is not permitted, or when reading fails.
 */
public void load(InputStream is) throws IOException {
    PDDocument document = null;
    try {
        document = PDDocument.load(is, false);
        if (document.isEncrypted()) {
            throw new IOException("can not open pdf document because it is encrypted");
        }
        AccessPermission ap = document.getCurrentAccessPermission();
        if (!ap.canExtractContent()) {
            throw new IOException("You do not have permission to extract text");
        }
        PDFTextStripper stripper = new PDFTextStripper("UTF-8");
        stripper.setForceParsing(false);
        stripper.setSortByPosition(false);
        stripper.setShouldSeparateByBeads(true);
        stripper.setStartPage(1);
        stripper.setEndPage(Integer.MAX_VALUE);
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        // Encode and decode with an explicit charset; the platform default can
        // silently drop characters it cannot represent.
        Writer w = new OutputStreamWriter(os, "UTF-8");
        try {
            stripper.writeText(document, w);
        } finally {
            w.close();
        }
        // some pdfs seem to include non valid unicode characters
        // and this causes problems when converting text to HTML
        // for GUI delivery and during indexing
        //
        // The supplementary range is written as \x{10000}-\x{10FFFF} (Java 7+):
        // a \\u escape takes exactly four hex digits, so the previous
        // "\\u10000-\\u10FFFF" did not describe that range and every
        // supplementary character was replaced by "?".
        setContent(os.toString("UTF-8").replaceAll("[^\\x09\\x0A\\x0D\\x20-\\uD7FF\\uE000-\\uFFFD\\x{10000}-\\x{10FFFF}]", "?"));
    } finally {
        if (document != null) {
            document.close();
        }
    }
}Example 40
| Project: extension-aws-master File: PdfParser.java View source code |
/**
 * Parses a PDF from the given stream and collects its text, page count, title
 * and the size of its first page into a Parse result. Encrypted documents are
 * decrypted with the empty password. Failures are logged and whatever was
 * collected up to that point is returned.
 *
 * @param inContent the PDF content
 * @return the parse result (never {@code null})
 */
public Parse parse(InputStream inContent) {
    Parse results = new Parse();
    PDDocument pdf = null;
    try {
        PDFParser parser = new PDFParser(inContent);
        // new ByteArrayInputStream(inContent));
        parser.parse();
        pdf = parser.getPDDocument();
        if (pdf.isEncrypted()) {
            // Just try using the default password and move on
            new DocumentEncryption(pdf).decryptDocument("");
        }

        // collect text
        PDFTextStripper stripper = new PDFTextStripper();
        //TODO: Write this out to a temp file that will be indexed seperately
        String text;
        try {
            text = stripper.getText(pdf);
        } catch (Throwable e) {
            log.error("Could not parse", e);
            text = "";
        }
        results.setText(scrubChars(text));
        results.setPages(pdf.getNumberOfPages());

        // collect title
        results.setTitle(pdf.getDocumentInformation().getTitle());

        // collect first page dimensions: media box, falling back to the art box
        if (pdf.getNumberOfPages() > 0) {
            PDPage firstPage = (PDPage) pdf.getDocumentCatalog().getAllPages().get(0);
            PDRectangle box = firstPage.getMediaBox();
            if (box == null) {
                box = firstPage.getArtBox();
            }
            if (box != null) {
                results.put("width", String.valueOf(Math.round(box.getWidth())));
                results.put("height", String.valueOf(Math.round(box.getHeight())));
            }
        }
        //Thread.sleep(500); // Slow down PDF's loading
    } catch (CryptographyException e) {
        log.error("Error decrypting document. " + e);
    } catch (InvalidPasswordException e) {
        log.error("Can't decrypt document - invalid password. " + e);
    } catch (Exception e) {
        log.error("Can't be handled as pdf document. " + e);
    } finally {
        try {
            if (pdf != null) {
                pdf.close();
            }
        } catch (IOException ignored) {
            // Closing is best-effort; the result has already been assembled.
        }
    }
    return results;
}Example 41
| Project: FXDesktopSearch-master File: PDFPreviewGenerator.java View source code |
/**
 * Renders the first page of a PDF file into a THUMB_WIDTH x THUMB_HEIGHT RGB
 * image, scaled so that the page's media box width maps to THUMB_WIDTH.
 *
 * @param aFile the PDF file
 * @return the preview, or {@code null} when the document has no pages or an
 *         exception occurs (the exception is logged)
 */
@Override
public Preview createPreviewFor(File aFile) {
    try (PDDocument theDocument = PDDocument.load(aFile)) {
        PDPageTree thePages = theDocument.getPages();
        if (thePages.getCount() == 0) {
            return null;
        }
        PDPage theFirstPage = (PDPage) thePages.get(0);
        PDRectangle mBox = theFirstPage.getMediaBox();
        float theScaling = THUMB_WIDTH / mBox.getWidth();
        BufferedImage theImage = new BufferedImage(THUMB_WIDTH, THUMB_HEIGHT, BufferedImage.TYPE_INT_RGB);
        Graphics2D theGraphics = (Graphics2D) theImage.getGraphics();
        try {
            theGraphics.setBackground(new Color(255, 255, 255, 0));
            theGraphics.clearRect(0, 0, theImage.getWidth(), theImage.getHeight());
            PDFRenderer theRenderer = new PDFRenderer(theDocument);
            theRenderer.renderPageToGraphics(0, theGraphics, theScaling);
        } finally {
            // Release the graphics context even when rendering fails.
            theGraphics.dispose();
        }
        // NOTE(review): an earlier version built a rotated copy for pages rotated by
        // 90/270 degrees but never returned it and never disposed its Graphics2D.
        // That dead work is removed; the returned image is unchanged. Confirm whether
        // extra rotation was ever intended on top of what the renderer already does.
        return new Preview(theImage);
    } catch (Exception e) {
        LOGGER.error("Error creating preview for " + aFile, e);
        return null;
    }
}Example 42
| Project: java-image-processing-survival-guide-master File: PdfBoxPreviewTest.java View source code |
/**
 * Converts the pages START_PAGE..LAST_PAGE of a sample PDF to images at 72 DPI,
 * checks that images were produced with the requested type, and writes each
 * one out as a JPEG file.
 */
@Test
public void shouldCreatePdfPreviewImages() throws Exception {
    final int imageType = TYPE_INT_RGB;
    // final PDDocument pdDocument = PDDocument.load("./../../pdf/test-large-scan.pdf");
    final PDDocument pdDocument = PDDocument.load("./../../pdf/erste-document-01.pdf");
    try {
        final List<BufferedImage> images = toImages(pdDocument, START_PAGE, LAST_PAGE, DPI_72, imageType);
        assertNotNull(images);
        assertFalse(images.isEmpty());
        assertEquals(images.get(0).getType(), imageType);
        for (int i = 0; i < images.size(); i++) {
            File targetImageFile = createOutputFileName("shouldCreatePdfPreviewImages", "page-" + i, "jpeg");
            writeBufferedImage(images.get(i), "jpeg", targetImageFile);
        }
    } finally {
        // The document was previously left open, also when an assertion failed.
        pdDocument.close();
    }
}Example 43
| Project: knowledge_vault-master File: MetadataExtractor.java View source code |
/**
 * Extract metadata from PDF: the page count plus the entries of the
 * document information dictionary.
 *
 * @param is the PDF content
 * @return the populated metadata object
 * @throws IOException if the document cannot be loaded or closed
 */
public static PdfMetadata pdfExtractor(InputStream is) throws IOException {
    PDDocument doc = PDDocument.load(is);
    try {
        PDDocumentInformation info = doc.getDocumentInformation();
        PdfMetadata md = new PdfMetadata();
        md.setNumberOfPages(doc.getNumberOfPages());
        md.setTitle(info.getTitle());
        md.setAuthor(info.getAuthor());
        md.setSubject(info.getSubject());
        md.setKeywords(info.getKeywords());
        md.setCreator(info.getCreator());
        md.setProducer(info.getProducer());
        md.setTrapped(info.getTrapped());
        md.setCreationDate(info.getCreationDate());
        md.setModificationDate(info.getModificationDate());
        log.info("pdfExtractor: {}", md);
        return md;
    } finally {
        // The document was previously never closed.
        doc.close();
    }
}Example 44
| Project: nuxeo-versions-difference-master File: TestPdfBoxN.java View source code |
/**
 * Opens and parses the named PDF file, initialising the {@code file},
 * {@code parser}, {@code cosDoc}, {@code pdfStripper} and {@code pdDoc} fields.
 *
 * @param FileName path of the PDF file to open
 * @return {@code true} when the file was parsed; {@code false} when it does not
 *         exist, cannot be opened, or cannot be parsed (a message is written to
 *         standard error in each case)
 */
private boolean setMain(String FileName) throws Exception {
    file = new File(FileName);
    if (!file.isFile()) {
        // Report the file that was actually requested, not a hard-coded name.
        System.err.println("File " + FileName + " does not exist.");
        return false;
    }
    FileInputStream input = null;
    try {
        input = new FileInputStream(file);
        parser = new PDFParser(input);
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        closeAfterFailure(input);
        return false;
    }
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
    } catch (Exception e) {
        // Previously swallowed silently; report it like the other failure paths.
        System.err.println("Unable to parse PDF. " + e.getMessage());
        closeAfterFailure(input);
        return false;
    }
    return true;
}

// Closes the input stream after a failed open/parse; a close failure is ignored
// because the original failure has already been reported.
private static void closeAfterFailure(FileInputStream input) {
    if (input != null) {
        try {
            input.close();
        } catch (IOException ignored) {
            // nothing more to do
        }
    }
}Example 45
| Project: OmegaT-master File: PdfFilter.java View source code |
/**
 * Extracts the text of the PDF file (sorted by position, with "\n" as line
 * separator) and returns a reader over that text. The {@code encoding}
 * argument is not read here.
 *
 * @throws TranslationException wrapping a NoClassDefFoundError raised during
 *         loading or extraction, after logging the PDFFILTER_ENCRYPTED_FILE warning
 */
@Override
public BufferedReader createReader(File infile, String encoding) throws IOException, TranslationException {
    PDFTextStripper textStripper = new PDFTextStripper();
    textStripper.setLineSeparator("\n");
    textStripper.setSortByPosition(true);
    try (PDDocument pdf = PDDocument.load(infile)) {
        return new BufferedReader(new StringReader(textStripper.getText(pdf)));
    } catch (NoClassDefFoundError missingClass) {
        Logger logger = Logger.getLogger(getClass().getName());
        logger.log(Level.WARNING, OStrings.getString("PDFFILTER_ENCRYPTED_FILE"), infile);
        throw new TranslationException(missingClass);
    }
}Example 46
| Project: openolat-master File: ImageHelperImpl.java View source code |
/**
 * Renders the first page of a PDF at 72 DPI and scales it into the thumbnail file.
 * An encrypted document is decrypted with the empty password first.
 *
 * @return the size returned by scaleImage, or {@code null} when the document
 *         cannot be decrypted or any other exception occurs
 */
@Override
public Size thumbnailPDF(VFSLeaf pdfFile, VFSLeaf thumbnailFile, int maxWidth, int maxHeight) {
    InputStream pdfStream = null;
    PDDocument pdf = null;
    try {
        WorkThreadInformations.setInfoFiles(null, pdfFile);
        WorkThreadInformations.set("Generate thumbnail VFSLeaf=" + pdfFile);
        pdfStream = pdfFile.getInputStream();
        pdf = PDDocument.load(pdfStream);
        if (pdf.isEncrypted()) {
            try {
                pdf.decrypt("");
            } catch (Exception e) {
                log.info("PDF document is encrypted: " + pdfFile);
                throw new CannotGenerateThumbnailException("PDF document is encrypted: " + pdfFile);
            }
        }
        PDPage firstPage = (PDPage) pdf.getDocumentCatalog().getAllPages().get(0);
        BufferedImage rendered = firstPage.convertToImage(BufferedImage.TYPE_INT_BGR, 72);
        // scaleImage may return null; that null is passed through to the caller.
        return scaleImage(rendered, thumbnailFile, maxWidth, maxHeight);
    } catch (CannotGenerateThumbnailException e) {
        return null;
    } catch (Exception e) {
        log.warn("Unable to create image from pdf file.", e);
        return null;
    } finally {
        WorkThreadInformations.unset();
        FileUtils.closeSafely(pdfStream);
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException ignored) {
                // Closing is best-effort.
            }
        }
    }
}Example 47
| Project: sakai-cle-master File: PDFContentDigester.java View source code |
/**
 * Extracts the text of a PDF content resource for indexing.
 *
 * @param contentResource the resource to read; must not be {@code null}
 * @return the cleaned text, or {@code null} when the parser yields no document
 * @throws RuntimeException when {@code contentResource} is {@code null}, or
 *         wrapping a ServerOverloadException or IOException raised while reading
 */
public String getContent(ContentResource contentResource) {
    if (contentResource == null) {
        throw new RuntimeException("Null contentResource passed to getContent");
    }
    InputStream resourceStream = null;
    PDDocument pdf = null;
    try {
        resourceStream = contentResource.streamContent();
        PDFParser pdfParser = new PDFParser(new BufferedInputStream(resourceStream));
        pdfParser.parse();
        pdf = pdfParser.getPDDocument();
        if (pdf == null) {
            return null;
        }
        PDFTextStripper textStripper = new PDFTextStripper();
        textStripper.setLineSeparator("\n");
        CharArrayWriter buffer = new CharArrayWriter();
        textStripper.writeText(pdf, buffer);
        return SearchUtils.appendCleanString(buffer.toCharArray(), null).toString();
    } catch (ServerOverloadException e) {
        String detail = (e.getMessage() != null) ? e.getMessage() : e.toString();
        throw new RuntimeException("Failed to get content for indexing: cause: ServerOverloadException: " + detail, e);
    } catch (IOException e) {
        String detail = (e.getMessage() != null) ? e.getMessage() : e.toString();
        throw new RuntimeException("Failed to get content for indexing: cause: IOException: " + detail, e);
    } finally {
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException e) {
                log.debug(e);
            }
        }
        if (resourceStream != null) {
            try {
                resourceStream.close();
            } catch (IOException e) {
                log.debug(e);
            }
        }
    }
}Example 48
| Project: tabula-java-master File: Debug.java View source code |
/**
 * Renders one page of a PDF to an image, overlays the requested debug layers and
 * writes the result to {@code outPath}.
 * <p>
 * The document is opened with try-with-resources and the graphics context is disposed
 * in a {@code finally} block, so neither leaks when extraction or drawing throws.
 *
 * @param pdfPath    path of the PDF file to load
 * @param outPath    path the rendered image is written to
 * @param pageNumber zero-based page index ({@code pageNumber + 1} is passed to the extractor)
 * @param area       optional sub-area of the page; when non-null the page is restricted to it
 * @throws IOException if loading, rendering or writing fails
 */
public static void renderPage(String pdfPath, String outPath, int pageNumber, Rectangle area, boolean drawTextChunks, boolean drawSpreadsheets, boolean drawRulings, boolean drawIntersections, boolean drawColumns, boolean drawCharacters, boolean drawArea, boolean drawCells, boolean drawUnprocessedRulings, boolean drawProjectionProfile, boolean drawClippingPaths, boolean drawDetectedTables) throws IOException {
    BufferedImage image;
    try (PDDocument document = PDDocument.load(new File(pdfPath))) {
        ObjectExtractor oe = new ObjectExtractor(document);
        Page page = oe.extract(pageNumber + 1);
        if (area != null) {
            page = page.getArea(area);
        }
        PDPage p = (PDPage) document.getPage(pageNumber);
        image = Utils.pageConvertToImage(p, 72, ImageType.RGB);
        Graphics2D g = (Graphics2D) image.getGraphics();
        try {
            if (drawTextChunks) {
                debugTextChunks(g, page);
            }
            if (drawSpreadsheets) {
                debugSpreadsheets(g, page);
            }
            if (drawRulings) {
                debugRulings(g, page);
            }
            if (drawIntersections) {
                debugIntersections(g, page);
            }
            if (drawColumns) {
                debugColumns(g, page);
            }
            if (drawCharacters) {
                debugCharacters(g, page);
            }
            if (drawArea) {
                g.setColor(Color.ORANGE);
                drawShape(g, area);
            }
            if (drawCells) {
                debugCells(g, area, page);
            }
            if (drawUnprocessedRulings) {
                debugNonCleanRulings(g, page);
            }
            if (drawProjectionProfile) {
                debugProjectionProfile(g, page);
            }
            if (drawClippingPaths) {
                // TODO: Enable when oe.clippingPaths is done
                //drawShapes(g, oe.clippingPaths,
                // new BasicStroke(2f, BasicStroke.CAP_BUTT, BasicStroke.JOIN_MITER, 10f, new float[] { 3f }, 0f));
            }
            if (drawDetectedTables) {
                debugDetectedTables(g, page);
            }
        } finally {
            // Release the native graphics resources even if a debug layer fails.
            g.dispose();
        }
    }
    // As before, the image is written only after the document has been closed.
    ImageIOUtil.writeImage(image, outPath, 72);
}Example 49
| Project: tika-master File: PDF2XHTML.java View source code |
/**
 * Converts the given PDF document (and related metadata) to a stream
 * of XHTML SAX events sent to the given content handler.
 *
 * @param document PDF document
 * @param handler SAX content handler
 * @param context parse context handed to the {@code PDF2XHTML} constructor
 * @param metadata PDF metadata
 * @param config parser configuration; passed to the constructor and applied via {@code config.configure}
 * @throws SAXException if the content handler fails to process SAX events
 * @throws TikaException if there was an exception outside of per page processing,
 *         or if any exception was collected in {@code exceptions} (the first one is reported)
 */
public static void process(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws SAXException, TikaException {
    // The text itself is discarded: the overridden key methods emit to the
    // content handler, so the Writer only has to exist.
    Writer discardingWriter = new Writer() {
        @Override
        public void write(char[] cbuf, int off, int len) {
        }
        @Override
        public void flush() {
        }
        @Override
        public void close() {
        }
    };
    PDF2XHTML extractor = null;
    try {
        extractor = new PDF2XHTML(document, handler, context, metadata, config);
        config.configure(extractor);
        extractor.writeText(document, discardingWriter);
    } catch (IOException e) {
        // A SAXException wrapped in the IOException is unwrapped and rethrown as-is.
        Throwable cause = e.getCause();
        if (cause instanceof SAXException) {
            throw (SAXException) cause;
        }
        throw new TikaException("Unable to extract PDF content", e);
    }
    if (extractor.exceptions.size() > 0) {
        //throw the first
        throw new TikaException("Unable to extract PDF content", extractor.exceptions.get(0));
    }
}Example 50
| Project: xcmis-master File: PDFDocumentRenditionProvider.java View source code |
/**
 * {@inheritDoc}
 * <p>
 * Renders the first page of the PDF, scales it down so that neither dimension exceeds
 * {@code maxWidth} / {@code maxHeight}, and returns it encoded as PNG.
 *
 * @param stream content stream holding the PDF
 * @return rendition stream containing the PNG bytes together with the scaled height and width
 * @throws IOException if the PDF cannot be loaded or rendered
 */
public RenditionContentStream getRenditionStream(ContentStream stream) throws IOException {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(stream.getStream());
        PDPage page = (PDPage) pdf.getDocumentCatalog().getAllPages().get(0);
        BufferedImage image = page.convertToImage();
        // Determine scale and be sure both width and height are not greater the max
        int scale = (int) Math.max(Math.floor((image.getHeight() / maxHeight) + 1.0d), Math.floor((image.getWidth() / maxWidth) + 1.0d));
        // Clamp to 1 pixel: BufferedImage rejects a zero width or height, which the
        // integer division can produce for images smaller than the scale factor.
        int height = Math.max(1, image.getHeight() / scale);
        int width = Math.max(1, image.getWidth() / scale);
        BufferedImage scaledImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
        Graphics2D graphics2D = scaledImage.createGraphics();
        graphics2D.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BILINEAR);
        graphics2D.drawImage(image, 0, 0, width, height, null);
        graphics2D.dispose();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        ImageIO.write(scaledImage, "png", out);
        // Subtype was " png" (leading space), which is not a valid MIME subtype.
        RenditionContentStream renditionStream = new RenditionContentStream(out.toByteArray(), null, new MimeType("image", "png"), getKind(), height, width);
        return renditionStream;
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}Example 51
| Project: CZ3003_Backend-master File: CReport.java View source code |
/**
 * Builds a one-page A4 PDF report ("Hello World.pdf") containing a title, a
 * "Statistics" heading, an image and a two-column table of the supplied values.
 *
 * @param pObjAry array of JSON objects; every key/value pair of every object becomes one table row
 * @throws IOException if the image cannot be read or the document cannot be written
 * @throws COSVisitorException if saving the document fails
 */
public static void genReport(JSONArray pObjAry) throws IOException, COSVisitorException {
    String imagePath = "C:\\Users\\Bryden\\Desktop\\pie-sample.png";
    // First table row is the header.
    List<String> headerRow = new ArrayList<>();
    headerRow.add("Incident Type");
    headerRow.add("");
    List<List<String>> tableRows = new ArrayList<>();
    tableRows.add(headerRow);
    for (Object element : pObjAry) {
        JSONObject json = (JSONObject) element;
        // Keys are dynamic, so walk whatever the object contains.
        for (Object rawKey : json.keySet()) {
            String key = (String) rawKey;
            String value = (String) json.get(key);
            List<String> row = new ArrayList<>();
            row.add(key);
            row.add(value);
            tableRows.add(row);
        }
    }
    try (PDDocument document = new PDDocument()) {
        // Single A4 page that receives all content.
        PDPage page = new PDPage(PDPage.PAGE_SIZE_A4);
        document.addPage(page);
        PDFont font = PDType1Font.HELVETICA_BOLD;
        InputStream in = Files.newInputStream(Paths.get(imagePath));
        PDJpeg img = new PDJpeg(document, in);
        try (PDPageContentStream contentStream = new PDPageContentStream(document, page)) {
            // Report title with the current date.
            contentStream.beginText();
            contentStream.setFont(font, 20);
            contentStream.moveTextPositionByAmount(70, 720);
            contentStream.drawString("Incident Summary " + new Date());
            contentStream.endText();
            // Section heading.
            contentStream.beginText();
            contentStream.setFont(font, 20);
            contentStream.moveTextPositionByAmount(100, 670);
            contentStream.drawString("Statistics");
            contentStream.endText();
            contentStream.drawImage(img, 10, 10);
            drawTable(page, contentStream, 650, 100, tableRows);
            // The content stream is closed on leaving this block.
        }
        img.clear();
        // Save the results; the document is closed on leaving the outer block.
        document.save("Hello World.pdf");
    }
}Example 52
| Project: drc-master File: PdfContentExtractor.java View source code |
/**
 * Loads a PDF and converts it to a {@code PageInfo}.
 * <p>
 * The document is opened with try-with-resources so it is closed even when
 * extractor initialisation or conversion throws.
 *
 * @param pdfName
 *            The full path to the PDF file to extract content from
 * @return The PageInfo object for the PDF, or {@code null} if an {@code IOException}
 *         occurred (its stack trace is printed)
 */
public static PageInfo extractContentFromPdf(String pdfName) {
    location = pdfName;
    try (PDDocument document = PDDocument.load(new File(pdfName))) {
        PdfContentExtractor x = initExtractor(document);
        return x.toPageInfo();
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}Example 53
| Project: gcs-master File: PdfRenderer.java View source code |
/**
 * Renders one page of a PDF to an image and optionally runs a highlighting pass over it.
 *
 * @param pdf             the document to render
 * @param pageIndex       zero-based page index ({@code pageIndex + 1} is used as start and end page of the text pass)
 * @param scale           requested scale; multiplied by the screen resolution over 72
 * @param textToHighlight text handed to the highlighting processor, or {@code null} to skip that pass
 * @return the rendered image, or {@code null} if any exception occurred (it is logged)
 */
public static BufferedImage create(PDDocument pdf, int pageIndex, float scale, String textToHighlight) {
    try {
        PDFRenderer renderer = new PDFRenderer(pdf);
        scale = scale * Toolkit.getDefaultToolkit().getScreenResolution() / 72f;
        BufferedImage img = renderer.renderImage(pageIndex, scale);
        if (textToHighlight != null) {
            Graphics2D gc = img.createGraphics();
            try {
                gc.setStroke(new BasicStroke(0.1f));
                gc.scale(scale, scale);
                PdfRenderer processor = new PdfRenderer(gc, textToHighlight);
                processor.setSortByPosition(true);
                processor.setStartPage(pageIndex + 1);
                processor.setEndPage(pageIndex + 1);
                try (DummyWriter writer = new DummyWriter()) {
                    processor.writeText(pdf, writer);
                }
            } finally {
                // Dispose the graphics context even when the text pass throws.
                gc.dispose();
            }
        }
        return img;
    } catch (Exception exception) {
        Log.error(exception);
        return null;
    }
}Example 54
| Project: infoLink-master File: TextExtractor.java View source code |
/**
 * Extracts the text of a PDF file and writes it to a ".txt" output file.
 * <p>
 * If the execution has no output directory, a temporary directory (deleted on exit) is
 * used. When overwriting is disabled and the output file already exists, extraction is
 * skipped and the existing text is only hashed.
 * <p>
 * Resources are released in nested {@code finally} blocks so that a failing
 * {@code close()} on one of them no longer prevents the others from being closed.
 *
 * @param inFile    the PDF file to read
 * @param startPage page passed on to {@code extractText}
 * @param tokenize  not read by this method; tokenization is decided by {@code getExecution().isTokenize()}
 * @return descriptor of the text file (name, original name, media type, tags, MD5, status)
 * @throws IOException if reading the PDF or writing the text fails, or if no text could be extracted
 */
public InfolisFile extract(InfolisFile inFile, int startPage, boolean tokenize) throws IOException {
    String asText = null;
    // TODO make configurable
    String outFileName = SerializationUtils.changeFileExtension(inFile.getFileName(), "txt");
    // if no output directory is given, create temporary output files
    if (null == getExecution().getOutputDirectory() || getExecution().getOutputDirectory().equals("")) {
        String EXTRACTED_DIR_PREFIX = "extracted-";
        String tempDir = Files.createTempDirectory(InfolisConfig.getTmpFilePath().toAbsolutePath(), EXTRACTED_DIR_PREFIX).toString();
        FileUtils.forceDeleteOnExit(new File(tempDir));
        outFileName = SerializationUtils.changeBaseDir(outFileName, tempDir);
    } else {
        outFileName = SerializationUtils.changeBaseDir(outFileName, getExecution().getOutputDirectory());
    }
    InfolisFile outFile = new InfolisFile();
    outFile.setFileName(outFileName);
    outFile.setOriginalName(inFile.getFileName());
    outFile.setMediaType("text/plain");
    if (getExecution().getOverwriteTextfiles() == false) {
        File _outFile = new File(outFileName);
        if (_outFile.exists()) {
            debug(log, "File exists: {}, skipping text extraction for {}", _outFile, inFile);
            asText = FileUtils.readFileToString(_outFile, "utf-8");
            outFile.setMd5(SerializationUtils.getHexMd5(asText));
            outFile.setFileStatus("AVAILABLE");
            return outFile;
        }
    }
    InputStream inStream = null;
    OutputStream outStream = null;
    PDDocument pdfIn = null;
    try {
        inStream = getInputFileResolver().openInputStream(inFile);
        try {
            pdfIn = PDDocument.load(inStream);
            asText = extractText(pdfIn, startPage);
            if (null == asText) {
                throw new IOException("extractText returned null!");
            }
            if (getExecution().isRemoveBib()) {
                asText = removeBibSection(asText);
            }
            if (getExecution().isTokenize()) {
                asText = tokenizeText(asText);
            }
            Set<String> tagsToSet = getExecution().getTags();
            tagsToSet.addAll(inFile.getTags());
            tagsToSet.addAll(executionTags);
            outFile.setTags(tagsToSet);
            outFile.setMd5(SerializationUtils.getHexMd5(asText));
            outFile.setFileStatus("AVAILABLE");
            try {
                outStream = getOutputFileResolver().openOutputStream(outFile);
                try {
                    IOUtils.write(asText, outStream);
                } catch (IOException e) {
                    warn(log, "Error copying text to output stream: " + e);
                    throw e;
                }
            } catch (IOException e) {
                warn(log, "Error opening output stream to text file: " + e);
                throw e;
            }
            return outFile;
        } catch (IOException e) {
            warn(log, "Error reading PDF from stream: " + e);
            throw e;
        }
    } catch (IOException e) {
        warn(log, "Error opening input stream: " + e);
        throw e;
    } catch (Exception e) {
        warn(log, "Error converting PDF to text: " + e);
        throw e;
    } finally {
        // Close each resource in its own finally so one failing close()
        // cannot leak the remaining ones; close failures still propagate.
        try {
            if (null != outStream) {
                outStream.close();
            }
        } finally {
            try {
                if (null != inStream) {
                    inStream.close();
                }
            } finally {
                if (null != pdfIn) {
                    pdfIn.close();
                }
            }
        }
    }
}Example 55
| Project: jabref-2.9.2-master File: PdfContentImporter.java View source code |
/**
 * Creates a single BibTeX entry by heuristically parsing the text of the first page of a PDF.
 * <p>
 * Only page 1 is stripped. If a DOI is detected in that text, the DOI fetcher is queried
 * first and its result is returned when it yields at least one entry. Otherwise the text
 * is split into lines and scanned top-down (optional conference/copyright line, title,
 * authors, abstract, keywords, technical-report number) and then bottom-up (Springer
 * "(Eds.)" line, DOI, IEEE footer).
 * <p>
 * The method relies on state that is not declared locally ({@code split}, {@code i},
 * {@code curString}, {@code year}) and that is advanced by the helper methods it calls,
 * so statement order matters.
 *
 * @param in     stream containing the PDF
 * @param status printer used to show a message when the Bouncy Castle library is missing
 * @return a list with at most one parsed entry (or the fetched entries); empty if the
 *         document cannot be loaded or contains no non-empty line
 * @throws IOException if text extraction or closing the document fails
 */
@Override
public List<BibtexEntry> importEntries(InputStream in, OutputPrinter status) throws IOException {
    final ArrayList<BibtexEntry> res = new ArrayList<BibtexEntry>(1);
    PDDocument document = null;
    try {
        document = PDDocument.load(in);
    } catch (IOException e) {
        logger.log(Level.SEVERE, "Could not load document", e);
        return res;
    }
    try {
        if (document.isEncrypted()) {
            logger.log(Level.INFO, Globals.lang("Encrypted documents are not supported"));
            //return res;
        }
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setStartPage(1);
        stripper.setEndPage(1);
        stripper.setSortByPosition(true);
        stripper.setParagraphEnd(System.getProperty("line.separator"));
        StringWriter writer = new StringWriter();
        stripper.writeText(document, writer);
        String textResult = writer.toString();
        String doi = Util.getDOI(textResult);
        if (doi.length() < textResult.length()) {
            // A DOI was found in the text
            // We do NO parsing of the text, but use the DOI fetcher
            ImportInspector i = new ImportInspector() {
                @Override
                public void toFront() {
                }
                @Override
                public void setProgress(int current, int max) {
                }
                @Override
                public void addEntry(BibtexEntry entry) {
                    // add the entry to the result object
                    res.add(entry);
                }
            };
            doiToBibTeXFetcher.processQuery(doi, i, status);
            if (res.size() != 0) {
                // if something has been found, return the result
                return res;
            } else {
                // otherwise, we just parse the PDF
            }
        }
        String author = null;
        String editor = null;
        String institution = null;
        String abstractT = null;
        String keywords = null;
        String title = null;
        String conference = null;
        String DOI = null;
        String series = null;
        String volume = null;
        String number = null;
        String pages = null;
        // year is a class variable as the method extractYear() uses it;
        String publisher = null;
        BibtexEntryType type = BibtexEntryType.INPROCEEDINGS;
        final String lineBreak = System.getProperty("line.separator");
        split = textResult.split(lineBreak);
        // idea: split[] contains the different lines
        // blocks are separated by empty lines
        // treat each block
        // or do special treatment at authors (which are not broken)
        // therefore, we do a line-based and not a block-based splitting
        // i points to the current line
        // curString (mostly) contains the current block
        // the different lines are joined into one and thereby separated by " "
        proceedToNextNonEmptyLine();
        if (i >= split.length) {
            // return empty list
            return res;
        }
        curString = split[i];
        i = i + 1;
        if (curString.length() > 4) {
            // special case: possibly conference as first line on the page
            extractYear();
            if (curString.contains("Conference")) {
                fillCurStringWithNonEmptyLines();
                conference = curString;
                curString = "";
            } else {
                // e.g. Copyright (c) 1998 by the Genetics Society of America
                // future work: get year using RegEx
                String lower = curString.toLowerCase();
                if (lower.contains("copyright")) {
                    fillCurStringWithNonEmptyLines();
                    publisher = curString;
                    curString = "";
                }
            }
        }
        // start: title
        fillCurStringWithNonEmptyLines();
        title = streamlineTitle(curString);
        curString = "";
        //i points to the next non-empty line
        // after title: authors
        author = null;
        while ((i < split.length) && (!split[i].equals(""))) {
            // author names are unlikely to be split among different lines
            // treat them line by line
            curString = streamlineNames(split[i]);
            if (author == null) {
                author = curString;
            } else {
                if (curString.equals("")) {
                    // if split[i] is "and" then "" is returned by streamlineNames -> do nothing
                } else {
                    author = author.concat(" and ").concat(curString);
                }
            }
            i++;
        }
        curString = "";
        i++;
        // then, abstract and keywords follow
        while (i < split.length) {
            curString = split[i];
            if ((curString.length() >= "Abstract".length()) && (curString.substring(0, "Abstract".length()).equalsIgnoreCase("Abstract"))) {
                if (curString.length() == "Abstract".length()) {
                    // only word "abstract" found -- skip line
                    curString = "";
                } else {
                    curString = curString.substring("Abstract".length() + 1).trim().concat(lineBreak);
                }
                i++;
                // whereas we need linebreak as separator
                while ((i < split.length) && (!split[i].equals(""))) {
                    curString = curString.concat(split[i]).concat(lineBreak);
                    i++;
                }
                abstractT = curString;
                i++;
            } else if ((curString.length() >= "Keywords".length()) && (curString.substring(0, "Keywords".length()).equalsIgnoreCase("Keywords"))) {
                if (curString.length() == "Keywords".length()) {
                    // only word "Keywords" found -- skip line
                    curString = "";
                } else {
                    curString = curString.substring("Keywords".length() + 1).trim();
                }
                i++;
                fillCurStringWithNonEmptyLines();
                keywords = removeNonLettersAtEnd(curString);
            } else {
                String lower = curString.toLowerCase();
                int pos = lower.indexOf("technical");
                if (pos >= 0) {
                    type = BibtexEntryType.TECHREPORT;
                    pos = curString.trim().lastIndexOf(' ');
                    if (pos >= 0) {
                        // assumption: last character of curString is NOT ' '
                        // otherwise pos+1 leads to an out-of-bounds exception
                        number = curString.substring(pos + 1);
                    }
                }
                i++;
                proceedToNextNonEmptyLine();
            }
        }
        // second pass: walk the blocks from the end of the page backwards
        i = split.length - 1;
        while (i >= 0) {
            readLastBlock();
            // i now points to the block before or is -1
            // curString contains the last block, separated by " "
            extractYear();
            int pos = curString.indexOf("(Eds.)");
            if ((pos >= 0) && (publisher == null)) {
                // looks like a Springer last line
                // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
                publisher = "Springer";
                editor = streamlineNames(curString.substring(0, pos - 1));
                //+2 because of ":" after (Eds.) and the subsequent space
                curString = curString.substring(pos + "(Eds.)".length() + 2);
                String[] springerSplit = curString.split(", ");
                if (springerSplit.length >= 4) {
                    conference = springerSplit[0];
                    String seriesData = springerSplit[1];
                    int lastSpace = seriesData.lastIndexOf(' ');
                    series = seriesData.substring(0, lastSpace);
                    volume = seriesData.substring(lastSpace + 1);
                    pages = springerSplit[2].substring(4);
                    if (springerSplit[3].length() >= 4) {
                        year = springerSplit[3].substring(0, 4);
                    }
                }
            } else {
                if (DOI == null) {
                    pos = curString.indexOf("DOI");
                    if (pos < 0) {
                        pos = curString.indexOf("doi");
                    }
                    if (pos >= 0) {
                        pos += 3;
                        // Guard: the marker may be the very last token of the block, in
                        // which case there is no delimiter character to inspect and
                        // charAt(pos) would throw StringIndexOutOfBoundsException.
                        if (pos < curString.length()) {
                            char delimiter = curString.charAt(pos);
                            if ((delimiter == ':') || (delimiter == ' ')) {
                                pos++;
                            }
                            int nextSpace = curString.indexOf(' ', pos);
                            if (nextSpace > 0) {
                                DOI = curString.substring(pos, nextSpace);
                            } else {
                                DOI = curString.substring(pos);
                            }
                        }
                    }
                }
                if ((publisher == null) && (curString.indexOf("IEEE") >= 0)) {
                    // IEEE has the conference things at the end
                    publisher = "IEEE";
                    if (conference == null) {
                        pos = curString.indexOf('$');
                        if (pos > 0) {
                            // we found the price
                            // before the price, the ISSN is stated
                            // skip that
                            pos -= 2;
                            while ((pos >= 0) && (curString.charAt(pos) != ' ')) pos--;
                            if (pos > 0) {
                                conference = curString.substring(0, pos);
                            }
                        }
                    }
                }
                // String lower = curString.toLowerCase();
                // if (institution == null) {
                //
                // }
            }
        }
        BibtexEntry entry = new BibtexEntry();
        entry.setType(type);
        if (author != null)
            entry.setField("author", author);
        if (editor != null)
            entry.setField("editor", editor);
        if (institution != null)
            entry.setField("institution", institution);
        if (abstractT != null)
            entry.setField("abstract", abstractT);
        if (keywords != null)
            entry.setField("keywords", keywords);
        if (title != null)
            entry.setField("title", title);
        if (conference != null)
            entry.setField("booktitle", conference);
        if (DOI != null)
            entry.setField("doi", DOI);
        if (series != null)
            entry.setField("series", series);
        if (volume != null)
            entry.setField("volume", volume);
        if (number != null)
            entry.setField("number", number);
        if (pages != null)
            entry.setField("pages", pages);
        if (year != null)
            entry.setField("year", year);
        if (publisher != null)
            entry.setField("publisher", publisher);
        // the raw first-page text is kept in the review field
        entry.setField("review", textResult);
        res.add(entry);
    } catch (NoClassDefFoundError e) {
        // Constant-first comparison: getMessage() may be null.
        if ("org/bouncycastle/jce/provider/BouncyCastleProvider".equals(e.getMessage())) {
            status.showMessage(Globals.lang("Java Bouncy Castle library not found. Please download and install it. For more information see http://www.bouncycastle.org/."));
        } else {
            logger.log(Level.SEVERE, e.getLocalizedMessage(), e);
        }
    } finally {
        document.close();
    }
    return res;
}Example 56
| Project: jabref-master File: PdfContentImporter.java View source code |
/**
 * Builds a single entry by heuristically parsing the first page of a PDF file.
 * <p>
 * If a DOI is found in the first-page text, the entry is fetched via {@code DoiFetcher}
 * and returned directly. Otherwise the text is split into lines and scanned top-down
 * (optional conference/copyright line, title, authors, abstract, keywords,
 * technical-report number) and then bottom-up (Springer "(Eds.)" line, DOI, IEEE footer).
 * <p>
 * The method relies on state that is not declared locally ({@code lines}, {@code i},
 * {@code curString}, {@code year}) and that is advanced by the helper methods it calls,
 * so statement order matters.
 *
 * @param filePath        path of the PDF file
 * @param defaultEncoding not read by this method
 * @return the parser result holding the parsed or fetched entry, an empty result if the
 *         page has no non-empty line, or an error result if decryption, reading or
 *         fetching fails
 */
@Override
public ParserResult importDatabase(Path filePath, Charset defaultEncoding) {
    final ArrayList<BibEntry> result = new ArrayList<>(1);
    try (FileInputStream fileStream = new FileInputStream(filePath.toFile());
        PDDocument document = XMPUtil.loadWithAutomaticDecryption(fileStream)) {
        String firstPageContents = getFirstPageContents(document);
        Optional<DOI> doi = DOI.findInText(firstPageContents);
        if (doi.isPresent()) {
            ParserResult parserResult = new ParserResult(result);
            Optional<BibEntry> entry = new DoiFetcher(importFormatPreferences).performSearchById(doi.get().getDOI());
            entry.ifPresent(parserResult.getDatabase()::insertEntry);
            return parserResult;
        }
        // idea: split[] contains the different lines
        // blocks are separated by empty lines
        // treat each block
        // or do special treatment at authors (which are not broken)
        // therefore, we do a line-based and not a block-based splitting
        // i points to the current line
        // curString (mostly) contains the current block
        // the different lines are joined into one and thereby separated by " "
        lines = firstPageContents.split(System.lineSeparator());
        proceedToNextNonEmptyLine();
        if (i >= lines.length) {
            // return empty list
            return new ParserResult();
        }
        // we start at the current line
        curString = lines[i];
        // i might get incremented later and curString modified, too
        i = i + 1;
        String author;
        String editor = null;
        String abstractT = null;
        String keywords = null;
        String title;
        String conference = null;
        String DOI = null;
        String series = null;
        String volume = null;
        String number = null;
        String pages = null;
        // year is a class variable as the method extractYear() uses it;
        String publisher = null;
        EntryType type = BibtexEntryTypes.INPROCEEDINGS;
        if (curString.length() > 4) {
            // special case: possibly conference as first line on the page
            extractYear();
            if (curString.contains("Conference")) {
                fillCurStringWithNonEmptyLines();
                conference = curString;
                curString = "";
            } else {
                // e.g. Copyright (c) 1998 by the Genetics Society of America
                // future work: get year using RegEx
                String lower = curString.toLowerCase(Locale.ROOT);
                if (lower.contains("copyright")) {
                    fillCurStringWithNonEmptyLines();
                    publisher = curString;
                    curString = "";
                }
            }
        }
        // start: title
        fillCurStringWithNonEmptyLines();
        title = streamlineTitle(curString);
        curString = "";
        //i points to the next non-empty line
        // after title: authors
        author = null;
        while ((i < lines.length) && !"".equals(lines[i])) {
            // author names are unlikely to be lines among different lines
            // treat them line by line
            curString = streamlineNames(lines[i]);
            if (author == null) {
                author = curString;
            } else {
                if ("".equals(curString)) {
                    // if lines[i] is "and" then "" is returned by streamlineNames -> do nothing
                } else {
                    author = author.concat(" and ").concat(curString);
                }
            }
            i++;
        }
        curString = "";
        i++;
        // then, abstract and keywords follow
        while (i < lines.length) {
            curString = lines[i];
            if ((curString.length() >= "Abstract".length()) && "Abstract".equalsIgnoreCase(curString.substring(0, "Abstract".length()))) {
                if (curString.length() == "Abstract".length()) {
                    // only word "abstract" found -- skip line
                    curString = "";
                } else {
                    curString = curString.substring("Abstract".length() + 1).trim().concat(System.lineSeparator());
                }
                i++;
                // whereas we need linebreak as separator
                while ((i < lines.length) && !"".equals(lines[i])) {
                    curString = curString.concat(lines[i]).concat(System.lineSeparator());
                    i++;
                }
                abstractT = curString.trim();
                i++;
            } else if ((curString.length() >= "Keywords".length()) && "Keywords".equalsIgnoreCase(curString.substring(0, "Keywords".length()))) {
                if (curString.length() == "Keywords".length()) {
                    // only word "Keywords" found -- skip line
                    curString = "";
                } else {
                    curString = curString.substring("Keywords".length() + 1).trim();
                }
                i++;
                fillCurStringWithNonEmptyLines();
                keywords = removeNonLettersAtEnd(curString);
            } else {
                String lower = curString.toLowerCase(Locale.ROOT);
                int pos = lower.indexOf("technical");
                if (pos >= 0) {
                    type = BibtexEntryTypes.TECHREPORT;
                    pos = curString.trim().lastIndexOf(' ');
                    if (pos >= 0) {
                        // assumption: last character of curString is NOT ' '
                        // otherwise pos+1 leads to an out-of-bounds exception
                        number = curString.substring(pos + 1);
                    }
                }
                i++;
                proceedToNextNonEmptyLine();
            }
        }
        // second pass: walk the blocks from the end of the page backwards
        i = lines.length - 1;
        while (i >= 0) {
            readLastBlock();
            // i now points to the block before or is -1
            // curString contains the last block, separated by " "
            extractYear();
            int pos = curString.indexOf("(Eds.)");
            if ((pos >= 0) && (publisher == null)) {
                // looks like a Springer last line
                // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
                publisher = "Springer";
                editor = streamlineNames(curString.substring(0, pos - 1));
                //+2 because of ":" after (Eds.) and the subsequent space
                curString = curString.substring(pos + "(Eds.)".length() + 2);
                String[] springerSplit = curString.split(", ");
                if (springerSplit.length >= 4) {
                    conference = springerSplit[0];
                    String seriesData = springerSplit[1];
                    int lastSpace = seriesData.lastIndexOf(' ');
                    series = seriesData.substring(0, lastSpace);
                    volume = seriesData.substring(lastSpace + 1);
                    pages = springerSplit[2].substring(4);
                    if (springerSplit[3].length() >= 4) {
                        year = springerSplit[3].substring(0, 4);
                    }
                }
            } else {
                if (DOI == null) {
                    pos = curString.indexOf("DOI");
                    if (pos < 0) {
                        pos = curString.indexOf(FieldName.DOI);
                    }
                    if (pos >= 0) {
                        pos += 3;
                        // Guard: the marker may be the very last token of the block, in
                        // which case there is no delimiter character to inspect and
                        // charAt(pos) would throw StringIndexOutOfBoundsException.
                        if (pos < curString.length()) {
                            char delimiter = curString.charAt(pos);
                            if ((delimiter == ':') || (delimiter == ' ')) {
                                pos++;
                            }
                            int nextSpace = curString.indexOf(' ', pos);
                            if (nextSpace > 0) {
                                DOI = curString.substring(pos, nextSpace);
                            } else {
                                DOI = curString.substring(pos);
                            }
                        }
                    }
                }
                if ((publisher == null) && curString.contains("IEEE")) {
                    // IEEE has the conference things at the end
                    publisher = "IEEE";
                    if (conference == null) {
                        pos = curString.indexOf('$');
                        if (pos > 0) {
                            // we found the price
                            // before the price, the ISSN is stated
                            // skip that
                            pos -= 2;
                            while ((pos >= 0) && (curString.charAt(pos) != ' ')) {
                                pos--;
                            }
                            if (pos > 0) {
                                conference = curString.substring(0, pos);
                            }
                        }
                    }
                }
            }
        }
        BibEntry entry = new BibEntry();
        entry.setType(type);
        if (author != null) {
            entry.setField(FieldName.AUTHOR, author);
        }
        if (editor != null) {
            entry.setField(FieldName.EDITOR, editor);
        }
        if (abstractT != null) {
            entry.setField(FieldName.ABSTRACT, abstractT);
        }
        if (!Strings.isNullOrEmpty(keywords)) {
            entry.setField(FieldName.KEYWORDS, keywords);
        }
        if (title != null) {
            entry.setField(FieldName.TITLE, title);
        }
        if (conference != null) {
            entry.setField(FieldName.BOOKTITLE, conference);
        }
        if (DOI != null) {
            entry.setField(FieldName.DOI, DOI);
        }
        if (series != null) {
            entry.setField(FieldName.SERIES, series);
        }
        if (volume != null) {
            entry.setField(FieldName.VOLUME, volume);
        }
        if (number != null) {
            entry.setField(FieldName.NUMBER, number);
        }
        if (pages != null) {
            entry.setField(FieldName.PAGES, pages);
        }
        if (year != null) {
            entry.setField(FieldName.YEAR, year);
        }
        if (publisher != null) {
            entry.setField(FieldName.PUBLISHER, publisher);
        }
        result.add(entry);
    } catch (EncryptedPdfsNotSupportedException e) {
        return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported."));
    } catch (IOException exception) {
        return ParserResult.fromError(exception);
    } catch (FetcherException e) {
        return ParserResult.fromErrorMessage(e.getMessage());
    }
    return new ParserResult(result);
}Example 57
| Project: java-thumbnailer-master File: PDFBoxThumbnailer.java View source code |
/**
 * Writes a PNG thumbnail of the first page of a PDF file.
 * <p>
 * The rendered page is written unchanged when its width already equals
 * {@code thumbWidth}; otherwise it goes through {@code ResizeImage} in crop-only mode.
 *
 * @param input  the PDF file; must exist and be non-empty
 * @param output the thumbnail file; any existing file is deleted first
 * @throws IOException if the image cannot be written
 * @throws ThumbnailerException if the PDF cannot be loaded
 */
@Override
public void generateThumbnail(File input, File output) throws IOException, ThumbnailerException {
    FileDoesNotExistException.check(input);
    if (input.length() == 0) {
        throw new FileDoesNotExistException("File is empty");
    }
    FileUtils.deleteQuietly(output);
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(input);
    } catch (IOException e) {
        throw new ThumbnailerException("Could not load PDF File", e);
    }
    try {
        BufferedImage firstPage = writeImageFirstPage(pdf, BufferedImage.TYPE_INT_RGB);
        if (firstPage.getWidth() != thumbWidth) {
            ResizeImage cropper = new ResizeImage(thumbWidth, thumbHeight);
            cropper.resizeMethod = ResizeImage.NO_RESIZE_ONLY_CROP;
            cropper.setInputImage(firstPage);
            cropper.writeOutput(output);
        } else {
            ImageIO.write(firstPage, "PNG", output);
        }
    } finally {
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException e) {
                // best effort: a failure to close is deliberately ignored
            }
        }
    }
}Example 58
| Project: lucene-solr-master File: ReducePDFSize.java View source code |
/**
 * Rewrites a PDF with security removed and every stream re-encoded with Flate.
 * <p>
 * Both the document and each per-stream output are managed with try-with-resources,
 * so they are closed even when decoding or writing fails.
 *
 * @param args {@code args[0]} is the input file, {@code args[1]} the output file
 * @throws IOException if the PDF cannot be loaded, rewritten or saved
 * @throws RuntimeException if the argument count is not 2 or a stream cannot be decoded
 */
public static void main(String[] args) throws IOException {
    if (2 != args.length) {
        // Message previously said "org1"; the second argument is arg1.
        throw new RuntimeException("arg0 must be input file, arg1 must be output file");
    }
    String in = args[0];
    String out = args[1];
    try (PDDocument doc = PDDocument.load(new File(in))) {
        doc.setAllSecurityToBeRemoved(true);
        for (COSObject cosObject : doc.getDocument().getObjects()) {
            COSBase base = cosObject.getObject();
            // if it's a stream: decode it, then re-write it using FLATE_DECODE
            if (base instanceof COSStream) {
                COSStream stream = (COSStream) base;
                byte[] bytes;
                try {
                    bytes = new PDStream(stream).toByteArray();
                } catch (IOException ex) {
                    throw new RuntimeException("can't serialize byte[] from: " + cosObject.getObjectNumber() + " " + cosObject.getGenerationNumber() + " obj: " + ex.getMessage(), ex);
                }
                stream.removeItem(COSName.FILTER);
                try (OutputStream streamOut = stream.createOutputStream(COSName.FLATE_DECODE)) {
                    streamOut.write(bytes);
                }
            }
        }
        doc.getDocumentCatalog();
        doc.save(out);
    }
}Example 59
| Project: MEditor-master File: KrameriusImageSupport.java View source code |
/**
 * Reads an image from a URL, choosing the decoder by MIME type.
 * <p>
 * Natively supported types go through {@code ImageIO}, DjVu types through the DjVu
 * library, and PDF through a PDFBox renderer at 96 DPI. Streams opened from the URL
 * are closed here: {@code ImageIO.read(InputStream)} does not close its argument, so
 * they were previously left open.
 *
 * @param url
 *        the location of the image or document
 * @param type
 *        the MIME type that selects the decoder
 * @param page
 *        the page index; for DjVu a non-zero index at or beyond the document size falls back to 0
 * @return the decoded image, or {@code null} if the DjVu decoder does not return exactly one image
 * @throws IOException
 *         Signals that an I/O exception has occurred.
 * @throws IllegalArgumentException
 *         if the MIME type is not supported
 */
public static Image readImage(URL url, ImageMimeType type, int page) throws IOException {
    if (type.javaNativeSupport()) {
        try (java.io.InputStream in = url.openStream()) {
            return ImageIO.read(in);
        }
    } else if ((type.equals(ImageMimeType.DJVU)) || (type.equals(ImageMimeType.VNDDJVU)) || (type.equals(ImageMimeType.XDJVU))) {
        com.lizardtech.djvu.Document doc = new com.lizardtech.djvu.Document(url);
        doc.setAsync(false);
        DjVuPage[] p = new DjVuPage[1];
        // read page from the document - index 0, priority 1, favorFast true
        int size = doc.size();
        if ((page != 0) && (page >= size)) {
            page = 0;
        }
        p[0] = doc.getPage(page, 1, true);
        p[0].setAsync(false);
        DjVuImage djvuImage = new DjVuImage(p, true);
        Rectangle pageBounds = djvuImage.getPageBounds(0);
        Image[] images = djvuImage.getImage(new JPanel(), new Rectangle(pageBounds.width, pageBounds.height));
        if (images.length == 1) {
            Image img = images[0];
            return img;
        } else {
            return null;
        }
    } else if (type.equals(ImageMimeType.PDF)) {
        // Resources close in reverse order: the document first, then the stream.
        try (java.io.InputStream in = url.openStream();
            PDDocument document = PDDocument.load(in)) {
            PDFRenderer pdfRenderer = new PDFRenderer(document);
            int resolution = 96;
            BufferedImage image = pdfRenderer.renderImageWithDPI(page, resolution, ImageType.RGB);
            return image;
        }
    } else {
        throw new IllegalArgumentException("unsupported mimetype '" + type.getValue() + "'");
    }
}Example 60
| Project: nuxeo-core-master File: PDF2TextConverter.java View source code |
/**
 * Converts the PDF held by the blob holder to plain text.
 * <p>
 * When the document's access permission forbids content extraction an empty string blob
 * is returned; otherwise the text is written to a temporary file that backs the result.
 *
 * @param blobHolder holder of the PDF blob
 * @param parameters not read by this method
 * @return a blob holder with the extracted text, or with an empty string
 * @throws ConversionException wrapping any exception raised during extraction
 */
@Override
public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {
    PDDocument pdf = null;
    File tmpFile = null;
    OutputStream tmpOut = null;
    try {
        pdf = PDDocument.load(blobHolder.getBlob().getStream());
        // NXP-1556: a protected document would raise an IOException during
        // stripping. Rather than matching on that exception's message, check the
        // permission up front (logic taken from PDFTextStripper.writeText). Only
        // standard encryption decrypted with the user password is of concern.
        if (!pdf.getCurrentAccessPermission().canExtractContent()) {
            return new SimpleCachableBlobHolder(new StringBlob(""));
        }
        PatchedPDFTextStripper stripper = new PatchedPDFTextStripper();
        // Position information is used to heuristically order the extracted
        // paragraphs; this also matters for right-to-left languages.
        stripper.setSortByPosition(true);
        String extracted = stripper.getText(pdf);
        tmpFile = File.createTempFile("pdfboplugin", ".txt");
        tmpOut = new FileOutputStream(tmpFile);
        tmpOut.write(extracted.getBytes("UTF-8"));
        return new SimpleCachableBlobHolder(new FileBlob(new FileInputStream(tmpFile), "text/plain", "UTF-8"));
    } catch (Exception e) {
        throw new ConversionException("Error during text extraction with PDFBox", e);
    } finally {
        if (pdf != null) {
            try {
                pdf.close();
            } catch (Exception e) {
                log.error("Error while closing PDFBox document", e);
            }
        }
        if (tmpOut != null) {
            try {
                tmpOut.close();
            } catch (IOException e) {
                log.error(e);
            }
        }
        if (tmpFile != null) {
            tmpFile.delete();
        }
    }
}Example 61
| Project: nuxeo-master File: PDF2TextConverter.java View source code |
/**
 * Converts the PDF held by the blob holder to plain text.
 * <p>
 * When the document's access permission forbids content extraction an empty blob is
 * returned; otherwise the text is written to a temporary file, read back into a blob,
 * and the temporary file is deleted.
 *
 * @param blobHolder holder of the PDF blob
 * @param parameters not read by this method
 * @return a blob holder with the extracted text, or with an empty string
 * @throws ConversionException wrapping any {@code IOException} raised during extraction
 */
@Override
public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {
    PDDocument pdf = null;
    File scratch = null;
    OutputStream scratchOut = null;
    try {
        pdf = PDDocument.load(blobHolder.getBlob().getStream());
        // NXP-1556: a protected document would raise an IOException during
        // stripping. Rather than matching on that exception's message, check the
        // permission up front (logic taken from PDFTextStripper.writeText). Only
        // standard encryption decrypted with the user password is of concern.
        AccessPermission access = pdf.getCurrentAccessPermission();
        if (!access.canExtractContent()) {
            return new SimpleCachableBlobHolder(Blobs.createBlob(""));
        }
        PatchedPDFTextStripper stripper = new PatchedPDFTextStripper();
        // Position information is used to heuristically order the extracted
        // paragraphs; this also matters for right-to-left languages.
        stripper.setSortByPosition(true);
        String extracted = stripper.getText(pdf);
        scratch = Framework.createTempFile("pdfboplugin", ".txt");
        scratchOut = new FileOutputStream(scratch);
        scratchOut.write(extracted.getBytes("UTF-8"));
        try (FileInputStream scratchIn = new FileInputStream(scratch)) {
            Blob textBlob = Blobs.createBlob(scratchIn, "text/plain", "UTF-8");
            return new SimpleCachableBlobHolder(textBlob);
        }
    } catch (IOException e) {
        throw new ConversionException("Error during text extraction with PDFBox", e);
    } finally {
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException e) {
                log.error("Error while closing PDFBox document", e);
            }
        }
        if (scratchOut != null) {
            try {
                scratchOut.close();
            } catch (IOException e) {
                log.error(e);
            }
        }
        if (scratch != null) {
            scratch.delete();
        }
    }
}Example 62
| Project: nuxeo-services-master File: BaseConverterTest.java View source code |
/**
 * Tells whether the XMP metadata of {@code pdfFile} declares PDF/A conformance level "A".
 *
 * @param pdfFile the PDF file to inspect
 * @return {@code true} if the first {@code pdfaid:conformance} element has the text
 *         {@code "A"}; {@code false} if the document has no XMP metadata, no such element,
 *         or a different value
 * @throws Exception if the file cannot be loaded or its metadata cannot be parsed
 */
public static boolean isPDFA(File pdfFile) throws Exception {
    PDDocument pddoc = PDDocument.load(pdfFile);
    try {
        // A document without an XMP metadata stream cannot declare PDF/A conformance.
        if (pddoc.getDocumentCatalog().getMetadata() == null) {
            return false;
        }
        XMPMetadata xmp = pddoc.getDocumentCatalog().getMetadata().exportXMPMetadata();
        Document doc = xmp.getXMPDocument();
        // <rdf:Description xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/"
        // rdf:about="">
        // <pdfaid:part>1</pdfaid:part>
        // <pdfaid:conformance>A</pdfaid:conformance>
        // </rdf:Description>
        NodeList list = doc.getElementsByTagName("pdfaid:conformance");
        // item(0) is null on an empty list, so check the length before dereferencing.
        return list != null && list.getLength() > 0 && "A".equals(list.item(0).getTextContent());
    } finally {
        // Previously the document was never closed.
        pddoc.close();
    }
}Example 63
| Project: paper2ebook-master File: Transformer.java View source code |
/**
 * Builds a new PDF containing one page per interesting area found in the pages
 * {@code startPage..endPage} (1-based, inclusive) of the source document.
 * <p>
 * Each output page is an import of the source page, cropped to the fragment and keeping
 * the source page's media box, resources and rotation.
 *
 * @return the newly created document; the caller is responsible for closing it
 * @throws IOException if a page cannot be imported
 */
@Override
public PDDocument extract() throws IOException {
    PDDocument result = new PDDocument();
    result.setDocumentInformation(sourceDocument.getDocumentInformation());
    result.getDocumentCatalog().setViewerPreferences(sourceDocument.getDocumentCatalog().getViewerPreferences());

    @SuppressWarnings("unchecked")
    List<PDPage> sourcePages = sourceDocument.getDocumentCatalog().getAllPages();
    for (int index = 0; index < sourcePages.size(); index++) {
        // Page numbers are 1-based.
        int pageNumber = index + 1;
        if (pageNumber < startPage || pageNumber > endPage) {
            continue;
        }
        PDPage sourcePage = sourcePages.get(index);
        for (PDRectangle cropArea : getFragments(sourcePage)) {
            PDPage copy = result.importPage(sourcePage);
            copy.setCropBox(cropArea);
            copy.setMediaBox(sourcePage.getMediaBox());
            copy.setResources(sourcePage.findResources());
            copy.setRotation(sourcePage.findRotation());
            // TODO: rotate the page in landscape mode is width > height
        }
    }
    return result;
}Example 64
| Project: PDFExtract-master File: PDFBoxSource.java View source code |
// -------------------------- STATIC METHODS -------------------------- @NotNull protected static PDDocument openPdfDocument(@NotNull final File pdfFile, @Nullable final String password) { long t0 = System.currentTimeMillis(); MDC.put("doc", pdfFile.getName()); log.info("LOG00120:Opening PDF file " + pdfFile + "."); try { final PDDocument document = PDDocument.load(pdfFile); if (document.isEncrypted()) { if (password != null) { try { document.decrypt(password); } catch (Exception e) { throw new RuntimeException("Error while reading encrypted PDF:", e); } } else { log.warn("File claims to be encrypted, a password should be provided"); } } log.debug("load()took" + (System.currentTimeMillis() - t0) + "ms"); return document; } catch (IOException e) { MDC.put("doc", ""); throw new RuntimeException("Error while reading " + pdfFile + ".", e); } }
Example 65
| Project: seng310-ebookme-master File: PdfExtractor.java View source code |
/**
 * Parses a PDF from {@code stream} and passes the resulting document to
 * {@code processDocument}, which fills {@code result}.
 *
 * @param stream   the PDF content
 * @param charset  not read by this implementation
 * @param mimeType not read by this implementation
 * @param result   map handed to {@code processDocument}
 * @throws ExtractorException if the PDF cannot be parsed, processing fails, or closing the
 *             document fails after otherwise successful processing
 */
public void extract(InputStream stream, Charset charset, String mimeType, Map result) throws ExtractorException {
    // setup a PDDocument
    PDDocument document = null;
    // Tracks whether the main work finished, so that a failure while closing never
    // replaces (masks) an exception already propagating out of the try block.
    boolean processed = false;
    try {
        try {
            PDFParser parser = new PDFParser(stream);
            parser.parse();
            document = parser.getPDDocument();
        } catch (IOException e) {
            throw new ExtractorException(e);
        }
        // decrypt and extract info from this document
        processDocument(document, result);
        processed = true;
    } finally {
        if (document != null) {
            // close the document
            try {
                document.close();
            } catch (IOException e) {
                if (processed) {
                    throw new ExtractorException(e);
                }
                // Otherwise keep the primary exception; throwing from here would hide it.
            }
        }
    }
}Example 66
| Project: syncope-master File: BinaryPDFPreviewer.java View source code |
/**
 * Renders the first page of the uploaded PDF into the {@code firstPage} field and builds
 * the preview container around it.
 * <p>
 * Encrypted documents and documents that fail to load produce the "no preview" fragment.
 *
 * @param uploadedBytes raw bytes of the uploaded PDF
 * @return the result of adding or replacing the preview container on this component
 */
@Override
public Component preview(final byte[] uploadedBytes) {
    firstPage = null;

    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(new ByteArrayInputStream(uploadedBytes));
        if (!pdf.isEncrypted()) {
            firstPage = new PDFRenderer(pdf).renderImage(0, RESOLUTION, IMAGE_TYPE);
        } else {
            LOG.info("Document is encrypted, no preview is possible");
        }
    } catch (IOException e) {
        LOG.error("While generating thumbnail from first page", e);
    } finally {
        IOUtils.closeQuietly(pdf);
    }

    // Pick the markup fragment depending on whether a thumbnail could be rendered.
    final boolean rendered = firstPage != null;
    Fragment fragment = new Fragment("preview", rendered ? "previewFragment" : "noPreviewFragment", this);
    if (rendered) {
        fragment.add(new NonCachingImage("previewImage", new ThumbnailImageResource(firstPage)));
    }

    WebMarkupContainer container = new WebMarkupContainer("previewContainer");
    container.setOutputMarkupId(true);
    container.add(fragment);
    return this.addOrReplace(container);
}Example 67
| Project: amos-ss15-proj4-master File: ZipGenerator.java View source code |
/**
 * Writes an AES-256 encrypted ZIP archive to {@code out} containing two entries:
 * {@code employee.txt} (plain-text listing) and {@code employee.pdf} (the same data
 * rendered on a single PDF page). The ZIP stream, and with it {@code out}, is closed on
 * success.
 *
 * @param out         destination of the ZIP archive
 * @param locale      locale used to resolve the section headings
 * @param height      line height used for vertical spacing in the PDF
 * @param employee    source of the personal-data and tax fields
 * @param fontSize    font size of the field lines in the PDF
 * @param zipPassword archive password; {@code "fragebogen"} is used when {@code null}
 * @throws ZipException               if a ZIP entry cannot be written
 * @throws NoSuchMessageException     if a heading message key cannot be resolved
 * @throws IOException                on I/O failure while writing the text or PDF
 * @throws COSVisitorException        if the PDF cannot be serialized
 * @throws CloneNotSupportedException if the ZIP parameters cannot be cloned
 */
public void generate(OutputStream out, Locale locale, float height, Employee employee, int fontSize, String zipPassword) throws ZipException, NoSuchMessageException, IOException, COSVisitorException, CloneNotSupportedException {
    final ZipOutputStream zout = new ZipOutputStream(out);
    if (zipPassword == null) {
        // Use default password if none is set.
        zipPassword = "fragebogen";
    }
    ZipParameters params = new ZipParameters();
    params.setFileNameInZip("employee.txt");
    // COMP_DEFLATE is a compression *method*; it used to be passed to
    // setCompressionLevel and was immediately overwritten by the next call.
    params.setCompressionMethod(Zip4jConstants.COMP_DEFLATE);
    params.setCompressionLevel(Zip4jConstants.DEFLATE_LEVEL_ULTRA);
    params.setEncryptFiles(true);
    params.setReadHiddenFiles(false);
    params.setEncryptionMethod(Zip4jConstants.ENC_METHOD_AES);
    params.setAesKeyStrength(Zip4jConstants.AES_STRENGTH_256);
    params.setPassword(zipPassword);
    params.setSourceExternalStream(true);

    String header = AppContext.getApplicationContext().getMessage("HEADER", null, locale);
    String personalDataTitle = AppContext.getApplicationContext().getMessage("print.section.personalData", null, locale);
    String taxesTitle = AppContext.getApplicationContext().getMessage("print.section.taxes", null, locale);

    // ---- entry 1: plain text ----
    // NOTE(review): getBytes() uses the platform default charset, as before.
    zout.putNextEntry(null, params);
    zout.write((header + "\n\n").getBytes());
    zout.write((personalDataTitle + "\n\n").getBytes());
    writeFieldsAsText(zout, employee.getPersonalDataFields().entrySet());
    zout.write(("\n\n" + taxesTitle + "\n\n").getBytes());
    writeFieldsAsText(zout, employee.getTaxesFields().entrySet());
    zout.closeEntry();

    // ---- entry 2: PDF ----
    ByteArrayOutputStream pdfout = new ByteArrayOutputStream();
    PDDocument document = new PDDocument();
    try {
        PDPage page = new PDPage();
        document.addPage(page);
        int margin = 100;
        // Base font used for the field lines
        PDFont font = PDType1Font.TIMES_ROMAN;
        PDPageContentStream contentStream = new PDPageContentStream(document, page);
        try {
            contentStream.beginText();
            float y = page.getMediaBox().getHeight() - margin + height;
            contentStream.moveTextPositionByAmount(margin, y);
            contentStream.setFont(PDType1Font.TIMES_BOLD, 36);
            contentStream.drawString(header);
            contentStream.setFont(PDType1Font.TIMES_BOLD, 14);
            contentStream.moveTextPositionByAmount(0, -4 * height);
            contentStream.drawString(personalDataTitle);
            contentStream.moveTextPositionByAmount(0, -2 * height);
            contentStream.setFont(font, fontSize);
            drawFields(contentStream, employee.getPersonalDataFields().entrySet(), height);
            contentStream.setFont(PDType1Font.TIMES_BOLD, 14);
            contentStream.moveTextPositionByAmount(0, -2 * height);
            contentStream.drawString(taxesTitle);
            contentStream.moveTextPositionByAmount(0, -2 * height);
            contentStream.setFont(font, fontSize);
            drawFields(contentStream, employee.getTaxesFields().entrySet(), height);
            contentStream.endText();
        } finally {
            // Make sure that the content stream is closed, even on failure
            contentStream.close();
        }
        document.save(pdfout);
    } finally {
        // Release the document even if rendering or saving failed
        document.close();
    }

    ZipParameters params2 = (ZipParameters) params.clone();
    params2.setFileNameInZip("employee.pdf");
    zout.putNextEntry(null, params2);
    zout.write(pdfout.toByteArray());
    zout.closeEntry();
    // Write the zip to client
    zout.finish();
    zout.flush();
    zout.close();
}

/**
 * Writes each map entry as a {@code "key: value"} line to the current ZIP entry.
 * The entries are only read: read-only iteration cannot raise a
 * ConcurrentModificationException, so the previous Iterator.remove() calls, which
 * emptied the employee's maps, are gone.
 */
private static void writeFieldsAsText(ZipOutputStream zout, Iterable<?> entries) throws IOException {
    for (Object element : entries) {
        Map.Entry<?, ?> field = (Map.Entry<?, ?>) element;
        zout.write((field.getKey() + ": " + field.getValue() + '\n').getBytes());
    }
}

/** Draws each map entry as a {@code "key: value"} line, moving down by {@code height} after each one. */
private static void drawFields(PDPageContentStream contentStream, Iterable<?> entries, float height) throws IOException {
    for (Object element : entries) {
        Map.Entry<?, ?> field = (Map.Entry<?, ?>) element;
        contentStream.drawString(field.getKey() + ": " + field.getValue());
        contentStream.moveTextPositionByAmount(0, -height);
    }
}Example 68
| Project: ARX-master File: ARXCertificate.java View source code |
/**
 * Renders the document into the given output stream.
 * <p>
 * The certificate is first rendered to a temporary file, then re-loaded with PDFBox,
 * watermarked and written to {@code stream}. The temporary file is deleted and the
 * PDFBox document closed whether or not an error occurs.
 *
 * @param stream destination of the watermarked PDF
 * @throws IOException if rendering, loading or saving fails
 */
public void save(OutputStream stream) throws IOException {
    // Render
    Document document = new Document(style.gethMargin(), style.gethMargin(), style.getvMargin(), style.getvMargin());
    for (Element element : this.elements) {
        element.render(document, 0, this.style);
    }
    // Save to temp file
    File tmp = File.createTempFile("arx", "certificate");
    try {
        document.save(tmp);
        // Load and watermark
        PDDocument pdDocument = PDDocument.load(tmp);
        try {
            Watermark watermark = new Watermark(pdDocument);
            watermark.mark(pdDocument);
            // Save
            pdDocument.save(stream);
        } finally {
            // Previously skipped when watermarking or saving threw
            pdDocument.close();
        }
    } finally {
        // Previously the temp file was left behind on any failure
        tmp.delete();
    }
}Example 69
| Project: cider-master File: pdfIdiom.java View source code |
/**
 * Parses a PDF data source into an RDF model holding its author, subject, title,
 * keywords and full text content.
 * <p>
 * Encrypted documents are opened with an empty password; if that fails, or the resulting
 * permissions forbid content extraction, a {@link ParserException} is thrown. The PDF
 * document is closed on every exit path once it has been opened.
 *
 * @param source the PDF data source
 * @return a model describing the document
 * @throws ParserException if the PDF cannot be read, decrypted or text-extracted
 */
@Override
public Model parse(DataSource source) throws ParserException {
    // create an empty Model
    Model model = ModelFactory.createDefaultModel();
    Resource resource = source.hasURI() ? model.createResource(source.getURI().toNormalform(true, true)) : model.createResource();
    // open pdf document
    final PDDocument theDocument;
    final PDFParser parser;
    try {
        parser = new PDFParser(source.getStream());
        parser.parse();
        theDocument = parser.getPDDocument();
    } catch (IOException e) {
        log.error(e.getMessage(), e);
        throw new ParserException(e.getMessage(), source.getURI());
    }
    // Tracks the regular close so the finally block only acts on failure paths,
    // where the document used to be leaked.
    boolean documentClosed = false;
    try {
        if (theDocument.isEncrypted()) {
            try {
                theDocument.openProtection(new StandardDecryptionMaterial(""));
            } catch (BadSecurityHandlerException e) {
                throw new ParserException("PDF Encrypted (BadSecurityHandlerException): " + e.getMessage(), source.getURI(), e);
            } catch (IOException e) {
                throw new ParserException("PDF Encrypted (IOException): " + e.getMessage(), source.getURI(), e);
            } catch (CryptographyException e) {
                throw new ParserException("PDF Encrypted (CryptographyException): " + e.getMessage(), source.getURI(), e);
            }
            final AccessPermission perm = theDocument.getCurrentAccessPermission();
            if (perm == null || !perm.canExtractContent()) {
                throw new ParserException("PDF cannot be decrypted", source.getURI());
            }
        }
        // get metadata
        final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
        String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
        if (theDocInfo != null) {
            docTitle = theDocInfo.getTitle();
            docSubject = theDocInfo.getSubject();
            docAuthor = theDocInfo.getAuthor();
            docKeywordStr = theDocInfo.getKeywords();
        }
        if (docAuthor != null && docAuthor.length() > 0) {
            resource.addProperty(VCARD.FN, docAuthor);
            resource.addProperty(DC.creator, docAuthor);
        }
        if (docSubject != null && docSubject.length() > 0) {
            resource.addProperty(DC.subject, docSubject);
        }
        if (docTitle != null && docTitle.length() > 0) {
            resource.addProperty(DC.title, docTitle);
        }
        if (docKeywordStr != null && docKeywordStr.length() > 0) {
            // keywords are separated by a space or a comma
            String[] docKeywords = docKeywordStr.split(" |,");
            resource.addProperty(DC.coverage, concat(docKeywords));
        }
        // get the content
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        Writer writer;
        try {
            writer = new OutputStreamWriter(baos, "UTF-8");
        } catch (UnsupportedEncodingException e1) {
            writer = new OutputStreamWriter(baos);
        }
        try {
            final PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText(theDocument, writer);
            theDocument.close();
            documentClosed = true;
            writer.close();
        } catch (IOException e) {
            try {
                writer.close();
            } catch (final Exception ignored) {
                // best effort: the original failure is reported below
            }
            throw new ParserException("PDF content reader", source.getURI(), e);
        }
        String content;
        try {
            content = new String(baos.toByteArray(), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            content = new String(baos.toByteArray());
        }
        if (content.length() > 0) {
            resource.addProperty(CIDER.data_content_text, content);
        }
        return model;
    } finally {
        if (!documentClosed) {
            try {
                theDocument.close();
            } catch (IOException e) {
                log.error(e.getMessage(), e);
            }
        }
    }
}Example 70
| Project: dkpro-core-master File: Pdf2CasConverter.java View source code |
/**
 * Loads a PDF from {@code aIs} and writes its text into {@code aCas} by delegating to
 * {@code writeText(PDDocument)}. The document is always closed afterwards.
 *
 * @param aCas the CAS that receives the extracted text
 * @param aIs  stream providing the PDF content
 * @throws IOException if the PDF cannot be read or is encrypted
 */
public void writeText(final CAS aCas, final InputStream aIs) throws IOException {
    final PDDocument pdf = PDDocument.load(aIs);
    try {
        final boolean encrypted = pdf.isEncrypted();
        if (encrypted) {
            throw new IOException("Encrypted documents currently not supported");
        }
        // Reset the per-document state before the extraction starts.
        text = new StringBuilder();
        cas = aCas;
        writeText(pdf);
    } finally {
        pdf.close();
    }
}Example 71
| Project: DrakkarKeel-master File: PdfParser.java View source code |
/**
 * Extracts metadata and text content from a PDF file into this object's fields.
 * <p>
 * The document is processed only if it is not encrypted, allows content extraction and
 * has at least one page. Errors are reported through {@code OutputMonitor} rather than
 * thrown.
 *
 * @param f the PDF file to analyze
 * @return {@code true} if the document was processed, {@code false} if it was rejected
 *         or an error occurred
 */
public boolean analyzePdfDocument(File f) {
    try {
        pdoc = PDDocument.load(f);
        if (!pdoc.isEncrypted() && pdoc.getCurrentAccessPermission().canExtractContent() && pdoc.getNumberOfPages() != 0) {
            this.numberPages = pdoc.getNumberOfPages();
            pdfText = new PDFTextStripper();
            swriter = new StringWriter();
            // document information
            pinf = pdoc.getDocumentInformation();
            if (pinf == null) {
                OutputMonitor.printLine("The document does not have available information.", OutputMonitor.INFORMATION_MESSAGE);
            } else {
                setTitle(pinf.getTitle());
                setAuthor(pinf.getAuthor());
                setNumberpages(pdoc.getNumberOfPages());
                setCalCreation(pinf.getCreationDate());
                setCalModification(pinf.getModificationDate());
                // NOTE(review): text is only extracted when document information is
                // available - confirm this coupling is intended.
                pdfText.writeText(pdoc, swriter);
                allContent = swriter.getBuffer().toString();
            }
            // The document itself is closed exactly once, in the finally block below
            // (it used to be closed here as well, i.e. twice on the success path).
            swriter.close();
            return true;
        } else {
            OutputMonitor.printLine("Encrypted document.", OutputMonitor.INFORMATION_MESSAGE);
        }
    } catch (Exception ex) {
        OutputMonitor.printStream("", ex);
    } finally {
        if (pdoc != null) {
            try {
                pdoc.close();
            } catch (IOException ex) {
                OutputMonitor.printStream("IO", ex);
            }
        }
    }
    return false;
}Example 72
| Project: EasySendToKindle-master File: FileUtil.java View source code |
/**
 * Extracts the text of a PDF into a UTF-8 {@code .txt} file.
 * <p>
 * {@code file} may be a URL or a local path. For a URL the output file is the URL's file
 * name (without directories) with its last four characters replaced by {@code .txt},
 * created in the working directory; for a local path the output is written next to the
 * PDF.
 *
 * @param file URL or path of the PDF; must be longer than four characters so that an
 *             output name can be derived
 * @throws IllegalArgumentException if no output file name can be derived from {@code file}
 * @throws Exception                if the PDF cannot be loaded or the text cannot be written
 */
public static void pdf2txt(String file) throws Exception {
    final String encoding = "UTF-8";
    String textFile = null;
    Writer output = null;
    PDDocument document = null;
    try {
        try {
            URL url = new URL(file);
            // Load from the URL itself; passing the raw string would treat it as a
            // local file name and fail for anything that is actually a URL.
            document = PDDocument.load(url);
            String fileName = url.getFile();
            if (fileName.length() > 4) {
                File outputFile = new File(fileName.substring(0, fileName.length() - 4) + ".txt");
                textFile = outputFile.getName();
            }
        } catch (MalformedURLException e) {
            // Not a URL: treat the argument as a local file path.
            document = PDDocument.load(file);
            if (file.length() > 4) {
                textFile = file.substring(0, file.length() - 4) + ".txt";
            }
        }
        if (textFile == null) {
            // Previously this surfaced as an opaque NullPointerException.
            throw new IllegalArgumentException("Cannot derive a .txt output name from: " + file);
        }
        output = new OutputStreamWriter(new FileOutputStream(textFile), encoding);
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(false);
        stripper.setStartPage(1);
        stripper.setEndPage(Integer.MAX_VALUE);
        stripper.writeText(document, output);
    } finally {
        // Nested so that a failing writer close cannot skip closing the document.
        try {
            if (output != null) {
                output.close();
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }
}Example 73
| Project: eid-applet-master File: PdfSpikeTest.java View source code |
/**
 * Spike test: creates a two-page sample PDF, signs it with the signing certificate chain
 * obtained from a {@code PcscEid} instance, then reads the signature back twice - once
 * through {@code AcroFields} and once through the PDFBox COS object model. Results are
 * only logged; the method contains no assertions.
 * <p>
 * NOTE(review): presumably needs a card reader with an eID card, since the test waits
 * for one when none is present - confirm before running unattended.
 */
@Test
public void testSignPDF() throws Exception {
// create a sample PDF file
Document document = new Document();
ByteArrayOutputStream baos = new ByteArrayOutputStream();
PdfWriter.getInstance(document, baos);
document.open();
Paragraph titleParagraph = new Paragraph("This is a test.");
titleParagraph.setAlignment(Paragraph.ALIGN_CENTER);
document.add(titleParagraph);
document.newPage();
Paragraph textParagraph = new Paragraph("Hello world.");
document.add(textParagraph);
document.close();
File tmpFile = File.createTempFile("test-", ".pdf");
LOG.debug("tmp file: " + tmpFile.getAbsolutePath());
FileUtils.writeByteArrayToFile(tmpFile, baos.toByteArray());
// eID
PcscEid pcscEid = new PcscEid(new TestView(), new Messages(Locale.getDefault()));
if (false == pcscEid.isEidPresent()) {
LOG.debug("insert eID card");
pcscEid.waitForEidPresent();
}
// copy the signing certificate chain into the array form setCrypto() is given below
List<X509Certificate> signCertificateChain = pcscEid.getSignCertificateChain();
Certificate[] certs = new Certificate[signCertificateChain.size()];
for (int idx = 0; idx < certs.length; idx++) {
certs[idx] = signCertificateChain.get(idx);
}
// open the pdf
// NOTE(review): the input/output streams opened in this method are never closed explicitly
FileInputStream pdfInputStream = new FileInputStream(tmpFile);
File signedTmpFile = File.createTempFile("test-signed-", ".pdf");
PdfReader reader = new PdfReader(pdfInputStream);
FileOutputStream pdfOutputStream = new FileOutputStream(signedTmpFile);
PdfStamper stamper = PdfStamper.createSignature(reader, pdfOutputStream, '\0', null, true);
// add extra page
Rectangle pageSize = reader.getPageSize(1);
int pageCount = reader.getNumberOfPages();
int extraPageIndex = pageCount + 1;
stamper.insertPage(extraPageIndex, pageSize);
// calculate unique signature field name
int signatureNameIndex = 1;
String signatureName;
AcroFields existingAcroFields = reader.getAcroFields();
List<String> existingSignatureNames = existingAcroFields.getSignatureNames();
do {
signatureName = "Signature" + signatureNameIndex;
signatureNameIndex++;
} while (existingSignatureNames.contains(signatureName));
LOG.debug("new unique signature name: " + signatureName);
// configure the signature appearance; the visible signature goes on the extra page
PdfSignatureAppearance signatureAppearance = stamper.getSignatureAppearance();
signatureAppearance.setCrypto(null, certs, null, PdfSignatureAppearance.SELF_SIGNED);
signatureAppearance.setCertificationLevel(PdfSignatureAppearance.CERTIFIED_NO_CHANGES_ALLOWED);
signatureAppearance.setReason("PDF Signature Test");
signatureAppearance.setLocation("Belgium");
signatureAppearance.setVisibleSignature(new Rectangle(54, 440, 234, 566), extraPageIndex, signatureName);
// placeholder digest/signature arrays; the real values are set after preClose()
signatureAppearance.setExternalDigest(new byte[128], new byte[20], "RSA");
signatureAppearance.preClose();
// hash the range stream with SHA-1 and have the eID sign that hash
byte[] content = IOUtils.toByteArray(signatureAppearance.getRangeStream());
byte[] hash = MessageDigest.getInstance("SHA-1").digest(content);
byte[] signatureBytes = pcscEid.sign(hash, "SHA-1");
pcscEid.close();
// put the externally computed signature into the signature dictionary and close
PdfSigGenericPKCS sigStandard = signatureAppearance.getSigStandard();
PdfPKCS7 signature = sigStandard.getSigner();
signature.setExternalDigest(signatureBytes, hash, "RSA");
PdfDictionary dictionary = new PdfDictionary();
dictionary.put(PdfName.CONTENTS, new PdfString(signature.getEncodedPKCS1()).setHexWriting(true));
signatureAppearance.close(dictionary);
LOG.debug("signed tmp file: " + signedTmpFile.getAbsolutePath());
// verify the signature
reader = new PdfReader(new FileInputStream(signedTmpFile));
AcroFields acroFields = reader.getAcroFields();
ArrayList<String> signatureNames = acroFields.getSignatureNames();
for (String signName : signatureNames) {
LOG.debug("signature name: " + signName);
LOG.debug("signature covers whole document: " + acroFields.signatureCoversWholeDocument(signName));
LOG.debug("document revision " + acroFields.getRevision(signName) + " of " + acroFields.getTotalRevisions());
PdfPKCS7 pkcs7 = acroFields.verifySignature(signName);
Calendar signDate = pkcs7.getSignDate();
LOG.debug("signing date: " + signDate.getTime());
LOG.debug("Subject: " + PdfPKCS7.getSubjectFields(pkcs7.getSigningCertificate()));
LOG.debug("Document modified: " + !pkcs7.verify());
Certificate[] verifyCerts = pkcs7.getCertificates();
for (Certificate certificate : verifyCerts) {
X509Certificate x509Certificate = (X509Certificate) certificate;
LOG.debug("cert subject: " + x509Certificate.getSubjectX500Principal());
}
}
/*
* Reading the signature using Apache PDFBox.
*/
// NOTE(review): pdDocument is never closed
PDDocument pdDocument = PDDocument.load(signedTmpFile);
COSDictionary trailer = pdDocument.getDocument().getTrailer();
/*
* PDF Reference - third edition - Adobe Portable Document Format -
* Version 1.4 - 3.6.1 Document Catalog
*/
COSDictionary documentCatalog = (COSDictionary) trailer.getDictionaryObject(COSName.ROOT);
/*
* 8.6.1 Interactive Form Dictionary
*/
COSDictionary acroForm = (COSDictionary) documentCatalog.getDictionaryObject(COSName.ACRO_FORM);
COSArray fields = (COSArray) acroForm.getDictionaryObject(COSName.FIELDS);
// log the optional entries of every field whose type (FT) is "Sig"
for (int fieldIdx = 0; fieldIdx < fields.size(); fieldIdx++) {
COSDictionary field = (COSDictionary) fields.getObject(fieldIdx);
String fieldType = field.getNameAsString("FT");
if ("Sig".equals(fieldType)) {
COSDictionary signatureDictionary = (COSDictionary) field.getDictionaryObject(COSName.V);
/*
* TABLE 8.60 Entries in a signature dictionary
*/
COSString signatoryName = (COSString) signatureDictionary.getDictionaryObject(COSName.NAME);
if (null != signatoryName) {
LOG.debug("signatory name: " + signatoryName.getString());
}
COSString reason = (COSString) signatureDictionary.getDictionaryObject(COSName.REASON);
if (null != reason) {
LOG.debug("reason: " + reason.getString());
}
COSString location = (COSString) signatureDictionary.getDictionaryObject(COSName.LOCATION);
if (null != location) {
LOG.debug("location: " + location.getString());
}
Calendar signingTime = signatureDictionary.getDate(COSName.M);
if (null != signingTime) {
LOG.debug("signing time: " + signingTime.getTime());
}
String signatureHandler = signatureDictionary.getNameAsString(COSName.FILTER);
LOG.debug("signature handler: " + signatureHandler);
}
}
}Example 74
| Project: elexis-3-base-master File: PrintVaccinationEntriesHandler.java View source code |
/**
 * Creates a one-page A4 vaccination-card PDF for {@code patient}, writes it to the
 * configured output directory as {@code impfplan_<patCode>.pdf} and opens it with the
 * desktop's default viewer.
 * <p>
 * The page shows the patient label, an issue line with today's date and the selected
 * mandator, and {@code image} scaled into the remaining area.
 *
 * @param patient the patient the card is issued for
 * @param image   the image of the vaccination entries to embed
 * @throws IOException         if the PDF cannot be written or opened
 * @throws COSVisitorException if the PDF cannot be serialized
 */
private void createPDF(Patient patient, Image image) throws IOException, COSVisitorException {
    PDDocumentInformation pdi = new PDDocumentInformation();
    Mandant mandant = (Mandant) ElexisEventDispatcher.getSelected(Mandant.class);
    pdi.setAuthor(mandant.getName() + " " + mandant.getVorname());
    pdi.setCreationDate(new GregorianCalendar());
    pdi.setTitle("Impfausweis " + patient.getLabel());
    File pdf;
    PDDocument document = new PDDocument();
    try {
        document.setDocumentInformation(pdi);
        PDPage page = new PDPage();
        page.setMediaBox(PDPage.PAGE_SIZE_A4);
        document.addPage(page);
        PDRectangle pageSize = page.findMediaBox();
        PDFont font = PDType1Font.HELVETICA_BOLD;
        PDFont subFont = PDType1Font.HELVETICA;
        PDPageContentStream contentStream = new PDPageContentStream(document, page);
        try {
            // title line: patient label
            contentStream.beginText();
            contentStream.setFont(font, 14);
            contentStream.moveTextPositionByAmount(40, pageSize.getUpperRightY() - 40);
            contentStream.drawString(patient.getLabel());
            contentStream.endText();
            // subtitle line: issue date and mandator
            String dateLabel = sdf.format(Calendar.getInstance().getTime());
            String title = Person.load(mandant.getId()).get(Person.TITLE);
            String mandantLabel = title + " " + mandant.getName() + " " + mandant.getVorname();
            contentStream.beginText();
            contentStream.setFont(subFont, 10);
            contentStream.moveTextPositionByAmount(40, pageSize.getUpperRightY() - 55);
            contentStream.drawString("Ausstellung " + dateLabel + ", " + mandantLabel);
            contentStream.endText();
            // vaccination image below the two text lines
            BufferedImage imageAwt = convertToAWT(image.getImageData());
            PDXObjectImage pdPixelMap = new PDPixelMap(document, imageAwt);
            contentStream.drawXObject(pdPixelMap, 40, 30, pageSize.getWidth() - 80, pageSize.getHeight() - 100);
        } finally {
            // previously left open when drawing failed
            contentStream.close();
        }
        String outputPath = CoreHub.userCfg.get(PreferencePage.VAC_PDF_OUTPUTDIR, CoreHub.getWritableUserDir().getAbsolutePath());
        if (outputPath.equals(CoreHub.getWritableUserDir().getAbsolutePath())) {
            SWTHelper.showInfo("Kein Ausgabeverzeichnis definiert", "Ausgabe erfolgt in: " + outputPath + "\nDas Ausgabeverzeichnis kann unter Einstellungen\\Klinische Hilfsmittel\\Impfplan definiert werden.");
        }
        File outputDir = new File(outputPath);
        pdf = new File(outputDir, "impfplan_" + patient.getPatCode() + ".pdf");
        document.save(pdf);
    } finally {
        // previously skipped on any exception, leaking the document
        document.close();
    }
    Desktop.getDesktop().open(pdf);
}Example 75
| Project: geoserver-master File: PDFGetMapTest.java View source code |
/**
 * Processes the first page of the given PDF and returns the last tiling pattern that was
 * set as non-stroking color while doing so. Intended for documents that contain a single
 * tiling pattern.
 *
 * @param pdfDocument the PDF document bytes
 * @return the last tiling pattern encountered, or {@code null} if there was none
 * @throws InvalidPasswordException
 * @throws IOException
 */
PDTilingPattern getTilingPattern(byte[] pdfDocument) throws InvalidPasswordException, IOException {
    // PDFBox is used because iText could not parse tiling patterns (it mostly works
    // for text and image extraction).
    PDDocument pdf = PDDocument.load(pdfDocument);
    PDPage firstPage = pdf.getPage(0);

    // Holder written by the operator trap below.
    final AtomicReference<PDTilingPattern> pattern = new AtomicReference<>();

    // A graphics stream engine is the only thing found that parses the PDF deep enough
    // to expose the tiling pattern in parsed form; all drawing callbacks are no-ops.
    PDFStreamEngine engine = new PDFGraphicsStreamEngine(firstPage) {
        @Override
        public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException {
            // no-op
        }

        @Override
        public void drawImage(PDImage pdImage) throws IOException {
            // no-op
        }

        @Override
        public void clip(int windingRule) throws IOException {
            // no-op
        }

        @Override
        public void moveTo(float x, float y) throws IOException {
            // no-op
        }

        @Override
        public void lineTo(float x, float y) throws IOException {
            // no-op
        }

        @Override
        public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException {
            // no-op
        }

        @Override
        public Point2D getCurrentPoint() throws IOException {
            return null;
        }

        @Override
        public void closePath() throws IOException {
            // no-op
        }

        @Override
        public void endPath() throws IOException {
            // no-op
        }

        @Override
        public void strokePath() throws IOException {
            // no-op
        }

        @Override
        public void fillPath(int windingRule) throws IOException {
            // no-op
        }

        @Override
        public void fillAndStrokePath(int windingRule) throws IOException {
            // no-op
        }

        @Override
        public void shadingFill(COSName shadingName) throws IOException {
            // no-op
        }
    };

    // The trap: whenever the non-stroking color is set, remember it if it resolves to
    // a tiling pattern.
    engine.addOperator(new SetNonStrokingColorN() {
        @Override
        public void process(Operator operator, List<COSBase> arguments) throws IOException {
            super.process(operator, arguments);
            PDColor fillColor = context.getGraphicsState().getNonStrokingColor();
            if (!(context.getGraphicsState().getNonStrokingColorSpace() instanceof PDPattern)) {
                return;
            }
            PDPattern patternSpace = (PDPattern) context.getGraphicsState().getNonStrokingColorSpace();
            PDAbstractPattern candidate = patternSpace.getPattern(fillColor);
            if (candidate instanceof PDTilingPattern) {
                pattern.set((PDTilingPattern) candidate);
            }
        }
    });

    // run it
    engine.processPage(firstPage);
    return pattern.get();
}Example 76
| Project: is.idega.idegaweb.marathon-master File: PDFTester.java View source code |
/**
 * Creates a three-page A4 PDF demonstrating text transformations and saves it to
 * {@code outfile}: page 1 shows {@code message} rotated in 45-degree steps, page 2 shows
 * it at increasing scale, page 3 combines scaling and rotation through text matrices.
 *
 * @param message the text to draw (a running index is appended to each occurrence)
 * @param outfile path of the PDF file to write
 * @throws IOException         if drawing or writing fails
 * @throws COSVisitorException if the document cannot be serialized
 */
public void doIt(String message, String outfile) throws IOException, COSVisitorException {
    PDDocument pdf = null;
    try {
        pdf = new PDDocument();
        PDFont helvetica = PDType1Font.HELVETICA;

        // ---- Page 1: rotation ----
        PDPage rotationPage = new PDPage();
        rotationPage.setMediaBox(PDPage.PAGE_SIZE_A4);
        pdf.addPage(rotationPage);
        float fontSize = 12.0f;
        PDRectangle mediaBox = rotationPage.findMediaBox();
        float centeredXPosition = (mediaBox.getWidth() - fontSize / 1000f) / 2f;
        float stringWidth = helvetica.getStringWidth(message);
        float centeredYPosition = (mediaBox.getHeight() - (stringWidth * fontSize) / 1000f) / 3f;
        PDPageContentStream rotationStream = new PDPageContentStream(pdf, rotationPage, false, false);
        rotationStream.setFont(helvetica, fontSize);
        rotationStream.beginText();
        // counterclockwise, anchored in the upper part of the page
        for (int step = 0; step < 8; step++) {
            rotationStream.setTextRotation(step * Math.PI * 0.25, centeredXPosition, mediaBox.getHeight() - centeredYPosition);
            rotationStream.drawString(message + " " + step);
        }
        // clockwise, anchored in the lower part of the page
        for (int step = 0; step < 8; step++) {
            rotationStream.setTextRotation(-step * Math.PI * 0.25, centeredXPosition, centeredYPosition);
            rotationStream.drawString(message + " " + step);
        }
        rotationStream.endText();
        rotationStream.close();

        // ---- Page 2: scaling ----
        PDPage scalingPage = new PDPage();
        scalingPage.setMediaBox(PDPage.PAGE_SIZE_A4);
        pdf.addPage(scalingPage);
        PDPageContentStream scalingStream = new PDPageContentStream(pdf, scalingPage, false, false);
        // unit font size: the visible size comes from the scaling factors
        scalingStream.setFont(helvetica, 1.0f);
        scalingStream.beginText();
        for (int step = 0; step < 10; step++) {
            int scale = 12 + (step * 6);
            scalingStream.setTextScaling(scale, scale, 100, 100 + step * 50);
            scalingStream.drawString(message + " " + step);
        }
        scalingStream.endText();
        scalingStream.close();

        // ---- Page 3: scaling combined with rotation ----
        PDPage matrixPage = new PDPage();
        matrixPage.setMediaBox(PDPage.PAGE_SIZE_A4);
        pdf.addPage(matrixPage);
        PDPageContentStream matrixStream = new PDPageContentStream(pdf, matrixPage, false, false);
        matrixStream.setFont(helvetica, 1.0f);
        matrixStream.beginText();
        double anchorY = centeredYPosition * 1.5;
        matrixStream.setTextMatrix(12, 0, 0, 12, centeredXPosition, anchorY);
        matrixStream.drawString(message + " " + 0);
        matrixStream.setTextMatrix(0, 18, -18, 0, centeredXPosition, anchorY);
        matrixStream.drawString(message + " " + 1);
        matrixStream.setTextMatrix(-24, 0, 0, -24, centeredXPosition, anchorY);
        matrixStream.drawString(message + " " + 2);
        matrixStream.setTextMatrix(0, -30, 30, 0, centeredXPosition, anchorY);
        matrixStream.drawString(message + " " + 3);
        matrixStream.endText();
        matrixStream.close();

        pdf.save(outfile);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}Example 77
| Project: pdf2alto-master File: PrintWordLocations.java View source code |
/**
 * Writes the word locations of one PDF to standard output as an ALTO XML document.
 *
 * Exactly one argument (the PDF path) is expected; otherwise the usage text is shown.
 * Encrypted documents are decrypted with the empty password, and the JVM exits with
 * status 1 when that password is rejected.
 *
 * @param args command-line arguments; must contain exactly the PDF file name
 * @throws Exception if loading, decrypting or processing the document fails
 */
public void processDocuments(String[] args) throws Exception {
    if (args.length != 1) {
        usage();
        return;
    }
    PDDocument document = null;
    try {
        document = PDDocument.load(args[0]);
        if (document.isEncrypted()) {
            try {
                document.decrypt("");
            } catch (InvalidPasswordException e) {
                System.err.println("Error: Document is encrypted with a password.");
                System.exit(1);
            }
        }
        PrintWordLocations printer = new PrintWordLocations();
        List allPages = document.getDocumentCatalog().getAllPages();
        System.out.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?><alto xmlns=\"http://www.loc.gov/standards/alto/alto-v2.0.xsd\"><Description><MeasurementUnit>inch1200</MeasurementUnit></Description><Layout>");
        for (Object entry : allPages) {
            PDPage page = (PDPage) entry;
            // Shift coordinates by the crop box origin relative to the media box.
            if (page.getCropBox() != null) {
                PDRectangle mediaBox = (PDRectangle) page.getMediaBox();
                PDRectangle cropBox = (PDRectangle) page.getCropBox();
                printer.setOffset(new MarginOffset(cropBox.getLowerLeftX() - mediaBox.getLowerLeftX(), cropBox.getLowerLeftY() - mediaBox.getLowerLeftY()));
            }
            System.out.println("<Page>");
            System.out.println("<PrintSpace>");
            System.out.println("<TextBlock>");
            System.out.println("<TextLine>");
            PDStream contents = page.getContents();
            if (contents != null) {
                printer.processStream(page, page.findResources(), page.getContents().getStream());
            }
            endOfPage();
            System.out.println("</TextLine>");
            System.out.println("</TextBlock>");
            System.out.println("</PrintSpace>");
            System.out.println("</Page>");
        }
        System.out.println("</Layout></alto>");
    } finally {
        if (document != null) {
            document.close();
        }
    }
}Example 78
| Project: s2robot-master File: PdfExtractor.java View source code |
/*
* (non-Javadoc)
*
* Extracts the text of a PDF read from the given stream, running the PDFBox
* text stripper on a separate daemon thread that is bounded by `timeout`.
*
* The whole extraction is serialized on pdfBoxLockObj. Any exception raised
* inside the method body (including the ExtractException thrown on timeout and
* an InterruptedException from join/sleep) is caught by the outer catch and
* wrapped in a new ExtractException.
*
* @param in the PDF input stream; a RobotSystemException is thrown when null
* @param params optional parameters; when the document is encrypted they are
* consulted for a password (ExtractData.PDF_PASSWORD, then getPassword(...))
* @return the extracted text together with the metadata added by extractMetadata
*
* @see org.seasar.robot.extractor.Extractor#getText(java.io.InputStream,
* java.util.Map)
*/
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
if (in == null) {
throw new RobotSystemException("The inputstream is null.");
}
// Only one extraction runs at a time.
synchronized (pdfBoxLockObj) {
PDDocument document = null;
try {
document = PDDocument.load(in, null, force);
// Decryption is only attempted when parameters were supplied.
if (document.isEncrypted() && params != null) {
String password = params.get(ExtractData.PDF_PASSWORD);
if (password == null) {
// Fall back to a password looked up by URL and resource name.
password = getPassword(params.get(ExtractData.URL), params.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
}
if (password != null) {
final StandardDecryptionMaterial sdm = new StandardDecryptionMaterial(password);
document.openProtection(sdm);
final AccessPermission ap = document.getCurrentAccessPermission();
if (!ap.canExtractContent()) {
throw new IOException("You do not have permission to extract text.");
}
}
}
final ByteArrayOutputStream baos = new ByteArrayOutputStream();
final Writer output = new OutputStreamWriter(baos, encoding);
final PDFTextStripper stripper = new PDFTextStripper(encoding);
stripper.setForceParsing(force);
// `done` is set by the worker in its finally block, whether it succeeded or failed.
final AtomicBoolean done = new AtomicBoolean(false);
final PDDocument doc = document;
// Collects the exception (if any) raised by the worker thread.
final Set<Exception> exceptionSet = new HashSet<>();
Thread task = new Thread(new Runnable() {
@Override
public void run() {
try {
stripper.writeText(doc, output);
} catch (Exception e) {
exceptionSet.add(e);
} finally {
done.set(true);
}
}
});
task.setDaemon(true);
task.start();
// NOTE(review): Thread.join takes milliseconds, while the message below reports
// the same value as "sec." — confirm the unit of `timeout`.
task.join(timeout);
if (!done.get()) {
// Worker still running: interrupt it repeatedly (at most 100 times, 50 ms apart)
// until it reports completion, then fail the extraction.
for (int i = 0; i < 100 && !done.get(); i++) {
task.interrupt();
Thread.sleep(50);
}
throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec.");
} else if (!exceptionSet.isEmpty()) {
// Re-raise the worker's failure on the calling thread.
throw exceptionSet.iterator().next();
}
output.flush();
final ExtractData extractData = new ExtractData(baos.toString(encoding));
extractMetadata(document, extractData);
return extractData;
} catch (final Exception e) {
// NOTE(review): an InterruptedException ends up here without the interrupt
// flag being restored — confirm whether that is intended.
throw new ExtractException(e);
} finally {
if (document != null) {
try {
document.close();
} catch (final IOException e) {
// Failure to close is ignored; nothing is logged.
}
}
}
}
}Example 79
| Project: sad-analyzer-master File: PdfParser.java View source code |
/**
 * Parses the PDF located at {@code urlSad} into a section tree.
 *
 * When the document has an outline, the sections are built from its first outline
 * item; if a template was loaded, the outline must validate against it, otherwise
 * {@code null} is returned. Without an outline, the whole text is placed in a single
 * item named after the file. The document is always closed before returning.
 *
 * @param pathTemplate path of the XML structure template; ignored when empty
 * @param urlSad path of the PDF file to parse
 * @return the parsed section, or {@code null} when the template validation fails or
 *         an I/O error occurs
 * @see input.parser.SadParser#getSad(java.lang.String)
 */
public Section getSad(String pathTemplate, String urlSad) {
    Section section = new CompositeSection();
    if (!pathTemplate.isEmpty()) {
        structureXml = new XmlReader(pathTemplate);
    }
    PDDocument doc = null;
    try {
        File input = new File(urlSad);
        doc = PDDocument.load(input);
        PDDocumentOutline root = doc.getDocumentCatalog().getDocumentOutline();
        if (root != null) {
            // Request the first node of the outline tree.
            PDOutlineItem item = root.getFirstChild();
            if (structureXml != null) {
                if (validateTemplate(item)) {
                    section = parserSections(item, doc);
                } else {
                    section = null;
                }
            } else {
                section = parserSections(item, doc);
            }
        } else {
            Item s = new Item();
            s.setText(extractText(0, doc, doc.getNumberOfPages()));
            s.setName(input.getName());
            ((CompositeSection) section).addSection(s);
        }
        return section;
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the PDF resources on every path; a close failure must not
        // replace the already computed result.
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return null;
}Example 80
| Project: shr5rcp-master File: SourceBookView.java View source code |
/**
 * Extracts the text of the requested page, caches it in {@code pageMap} and hands it
 * to the UI thread for display. Always reports {@code Status.OK_STATUS}; I/O errors
 * are only logged.
 */
@Override
protected IStatus run(IProgressMonitor monitor) {
    try {
        final PDDocument pdDocument = getpdfDoc(file, monitor);
        if (pdDocument != null) {
            final String pageText = getTextFromPage(src, pdDocument);
            pageMap.put(key, pageText);
            // Widget updates have to happen on the display thread.
            Display.getDefault().asyncExec(new Runnable() {
                @Override
                public void run() {
                    displayedText.setValue(pageText);
                    processText(pageText, src);
                }
            });
        }
    } catch (IOException e) {
        Activator.logError(e);
    }
    return Status.OK_STATUS;
}Example 81
| Project: smartly-master File: PDFUtils.java View source code |
/**
 * Invokes {@code callback} for the pages of a PDF file, in document order.
 *
 * The callback receives the page, a 1-based counter and the total page count.
 * Iteration stops as soon as the callback returns {@code false}. With a
 * {@code null} callback the pages are walked without doing anything.
 *
 * @param pdfFile the PDF file to open
 * @param nonSequential whether to load with {@code PDDocument.loadNonSeq} instead of {@code PDDocument.load}
 * @param callback handler called per page; may be {@code null}
 * @throws IOException if the document cannot be loaded or closed
 */
public static void forEachPage(final File pdfFile, final boolean nonSequential, Delegates.Function<Boolean> callback) throws IOException {
    final PDDocument doc;
    if (nonSequential) {
        doc = PDDocument.loadNonSeq(pdfFile, null);
    } else {
        doc = PDDocument.load(pdfFile);
    }
    try {
        final List<PDPage> pages = doc.getDocumentCatalog().getAllPages();
        final int total = pages.size();
        int position = 1;
        for (final PDPage page : pages) {
            if (null == callback) {
                continue;
            }
            if (!callback.handle(page, position, total)) {
                break;
            }
            position++;
        }
    } finally {
        doc.close();
    }
}Example 82
| Project: trantor-pdf-converter-master File: PdfDoc.java View source code |
/**
 * Renders the first page of a PDF as a PNG preview image.
 *
 * Failures while loading, rendering or writing are logged at SEVERE level and
 * not rethrown; only a failure to close the document propagates.
 *
 * @param pdf path of the source PDF
 * @param output path of the PNG file to write
 * @throws IOException if closing the document fails
 */
public static void pdfToPngPreview(String pdf, String output) throws IOException {
    PDDocument pdDoc = null;
    try {
        pdDoc = PDDocument.load(pdf);
        final List allPages = pdDoc.getDocumentCatalog().getAllPages();
        final PDPage cover = (PDPage) allPages.listIterator().next();
        final BufferedImage preview = cover.convertToImage(BufferedImage.TYPE_INT_RGB, Consts.PREVIEW_DPI);
        ImageIO.write(preview, Consts.PNG, new File(output));
    } catch (Exception ex) {
        Logger.getLogger(PdfDoc.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
        if (pdDoc != null) {
            pdDoc.close();
        }
    }
}Example 83
| Project: webarchive-discovery-master File: PDFParser.java View source code |
/**
 * Parses a PDF stream, records its metadata and emits its content as XHTML events.
 *
 * Parsing problems are logged and not propagated; the document and any temporary
 * resources are released in all cases.
 *
 * @param stream the PDF input; it is shielded so that PDFBox cannot close it
 * @param handler receiver of the XHTML SAX events
 * @param metadata metadata to read the password from and to populate
 * @param context parse context, optionally carrying a {@code PasswordProvider}
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    PDDocument pdfDocument = null;
    TemporaryResources tmp = new TemporaryResources();
    try {
        // A file-backed input is taken as a hint to unpack into a temporary
        // scratch file; anything else is parsed entirely in memory.
        TikaInputStream fileBacked = TikaInputStream.cast(stream);
        boolean useScratchFile = fileBacked != null && fileBacked.hasFile();
        if (useScratchFile) {
            RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
            pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true);
        } else {
            pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
        }
        if (pdfDocument.isEncrypted()) {
            // Password lookup order: new-style provider, old-style metadata entry,
            // and finally the empty string.
            PasswordProvider passwordProvider = context.get(PasswordProvider.class);
            String password = (passwordProvider == null) ? null : passwordProvider.getPassword(metadata);
            if (password == null) {
                password = metadata.get(PASSWORD);
            }
            if (password == null) {
                password = "";
            }
            try {
                pdfDocument.decrypt(password);
            } catch (Exception e) {
                // Decryption failures are ignored here; extraction is still attempted.
            }
        }
        metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
        extractMetadata(pdfDocument, metadata);
        PDF2XHTML.process(pdfDocument, handler, metadata, extractAnnotationText, enableAutoSpace, suppressDuplicateOverlappingText, sortByPosition);
    } catch (Exception e) {
        log.error("Exception while parsing PDF: " + e);
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
        tmp.dispose();
    }
}Example 84
| Project: Xponents-master File: PDFConverter.java View source code |
/**
* Converts a PDF file into a {@code ConvertedDocument} holding its text and
* selected document-information properties. Implementation is informed by
* PDFBox authors.
*
* The method is synchronized and resets the shared {@code stripper} before each
* extraction.
*
* @param doc the PDF file to convert
* @return the converted document with text, author, creation date, creator tool,
* keywords, subject and title populated where available
* @throws IOException if the PDF cannot be loaded, stripped or closed
*/
@Override
public synchronized ConvertedDocument convert(java.io.File doc) throws IOException {
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Adapted from LucenePDFDocument.java from PDFBox lucene project
*
* This class is used to create a document for the lucene search engine.
* This should easily plug into the IndexHTML or IndexFiles that comes
* with the lucene project. This class will populate the following
* fields.
* <table> <tr> <th>Lucene Field Name</th> <th>Description</th> </tr>
* <tr>
* <td>path</td> <td>File system path if loaded from a file</td> </tr>
* <tr>
* <td>url</td> <td>URL to PDF document</td> </tr> <tr>
* <td>contents</td>
* <td>Entire contents of PDF document, indexed but not stored</td>
* </tr>
* <tr> <td>summary</td> <td>First 500 characters of content</td> </tr>
* <tr>
* <td>modified</td> <td>The modified date/time according to the url or
* path</td> </tr> <tr> <td>uid</td> <td>A unique identifier for the
* Lucene document.</td> </tr> <tr> <td>CreationDate</td> <td>From PDF
* meta-data if available</td> </tr> <tr> <td>Creator</td> <td>From PDF
* meta-data if available</td> </tr> <tr> <td>Keywords</td> <td>From PDF
* meta-data if available</td> </tr> <tr> <td>ModificationDate</td>
* <td>From PDF meta-data if available</td> </tr> <tr> <td>Producer</td>
* <td>From PDF meta-data if available</td> </tr> <tr> <td>Subject</td>
* <td>From PDF meta-data if available</td> </tr> <tr> <td>Trapped</td>
* <td>From PDF meta-data if available</td> </tr> <tr>
* <td>Encrypted</td> <td>From PDF meta-data if available</td> </tr>
* </table>
*
* @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
* @version $Revision: 1.23 $
*
* @throws IOException If there is an error parsing the document.
*/
PDDocument pdfDocument = null;
ConvertedDocument textdoc = new ConvertedDocument(doc);
try {
pdfDocument = PDDocument.load(doc);
if (pdfDocument.isEncrypted()) {
// No decryption is attempted here: the document is only flagged as encrypted
// and text extraction below is still tried.
// NOTE(review): presumably extraction throws if the content cannot be read — confirm.
textdoc.addProperty("encrypted", "YES");
}
//create a writer where to append the text content.
StringWriter writer = new StringWriter();
// The stripper is shared across calls, so it is reset before each document.
stripper.resetEngine();
stripper.writeText(pdfDocument, writer);
PDDocumentInformation info = pdfDocument.getDocumentInformation();
if (info != null) {
textdoc.addAuthor(info.getAuthor());
try {
textdoc.addCreateDate(info.getCreationDate());
} catch (IOException io) {
// Creation date could not be read: skipped, the remaining properties are still recorded.
}
textdoc.addProperty("creator_tool", info.getCreator());
textdoc.addProperty("keywords", info.getKeywords());
/* try {
metadata.add("ModificationDate", info.getModificationDate());
} catch (IOException io) {
//ignore, bad date but continue with indexing
} */
//metadata.add("Producer", info.getProducer());
textdoc.addProperty("subject", info.getSubject());
// Fall back to the file name when the title is missing or the placeholder "untitled".
String ttl = info.getTitle();
if (ttl == null || "untitled".equalsIgnoreCase(ttl)) {
ttl = textdoc.filename;
}
textdoc.addTitle(ttl);
// metadata.add("Trapped", info.getTrapped());
// TODO: Character set is what?
textdoc.setEncoding("UTF-8");
}
// Note: the buffer to string operation is costless;
// the char array value of the writer buffer and the content string
// is shared as long as the buffer content is not modified, which will
// not occur here.
textdoc.setText(writer.getBuffer().toString());
return textdoc;
} finally {
if (pdfDocument != null) {
pdfDocument.close();
}
}
}Example 85
| Project: xtf-dsc-master File: PDFToString.java View source code |
//////////////////////////////////////////////////////////////////////////////
/** Convert a PDF stream into an XML string with one <code>pdfPage</code>
 *  element per page, wrapped in a <code>pdfDocument</code> element.
 *
 *  @param  PDFInputStream  The stream of PDF data to convert to an
 *                          XML string.
 *
 *  @return
 *      The contents of the XML formatter. Errors (including an encrypted
 *      document) are reported through <code>Trace</code> and are not rethrown,
 *      so after a failure the result holds whatever was formatted up to that
 *      point.
 *
 */
static String convert(InputStream PDFInputStream) throws IOException {
    // Create the shared stripper on first use.
    if (stripper == null) {
        stripper = new PDFTextStripper();
    }
    // PDFTextStripper would normally cause a Window to be created, which is
    // undesirable (and often impossible) inside a servlet container. Telling
    // AWT to run headless prevents that.
    System.setProperty("java.awt.headless", "true");
    XMLFormatter formatter = new XMLFormatter();
    try {
        PDDocument source = null;
        try {
            source = PDDocument.load(PDFInputStream);
            // Encrypted documents are skipped; the exception is reported by
            // the handler below.
            if (source.isEncrypted()) {
                Trace.info("*** PDF File is Encrypted. File Skipped.");
                throw new Exception();
            }
            // XML declaration plus formatter layout settings.
            formatter.procInstr("xml version=\"1.0\" encoding=\"utf-8\"");
            formatter.tabSize(4);
            formatter.blankLineAfterTag(false);
            // The enclosing tag records the total number of pages.
            int pageCount = source.getNumberOfPages();
            formatter.beginTag("pdfDocument");
            formatter.attr("pageCount", pageCount);
            int pageNumber = 1;
            while (pageNumber <= pageCount) {
                formatter.beginTag("pdfPage");
                formatter.attr("number", pageNumber);
                // Restrict the stripper to the current page only.
                stripper.setStartPage(pageNumber);
                stripper.setEndPage(pageNumber);
                String pageText = XMLIndexSource.normalize(stripper.getText(source));
                // Emit the text wrapped into lines of at most 128 characters.
                formatter.text(pageText, 128);
                formatter.newLineAfterText();
                formatter.endTag();
                pageNumber++;
            }
            // Closes whatever is still open (normally just pdfDocument).
            formatter.endAllTags();
        } catch (Throwable t) {
            Trace.error("*** PDFToXML.convert() Exception: " + t.getClass());
            Trace.error(" With message: " + t.getMessage());
        } finally {
            if (source != null) {
                source.close();
            }
        }
    } catch (Throwable t) {
        // Reached when closing the document itself fails.
        Trace.error("*** PDFToXML.convert() Exception: " + t.getClass());
        Trace.error(" With message: " + t.getMessage());
    }
    return formatter.toString();
}Example 86
| Project: xtf-master File: PDFToString.java View source code |
//////////////////////////////////////////////////////////////////////////////
/** Turn the PDF data read from a stream into an XML string: a
 *  <code>pdfDocument</code> element containing one <code>pdfPage</code>
 *  element for each page.
 *
 *  @param  PDFInputStream  The stream of PDF data to convert to an
 *                          XML string.
 *
 *  @return
 *      The text held by the XML formatter. Failures (an encrypted document
 *      included) are logged via <code>Trace</code> rather than rethrown, so
 *      the result may be partial or empty after an error.
 *
 */
static String convert(InputStream PDFInputStream) throws IOException {
    // The stripper is created lazily and then reused.
    if (stripper == null) {
        stripper = new PDFTextStripper();
    }
    // Run AWT headless: PDFTextStripper otherwise leads to a Window being
    // created, which a servlet container usually cannot or should not do.
    System.setProperty("java.awt.headless", "true");
    XMLFormatter formatter = new XMLFormatter();
    try {
        PDDocument loaded = null;
        try {
            loaded = PDDocument.load(PDFInputStream);
            if (loaded.isEncrypted()) {
                // Not supported: report and abort through the handler below.
                Trace.info("*** PDF File is Encrypted. File Skipped.");
                throw new Exception();
            }
            formatter.procInstr("xml version=\"1.0\" encoding=\"utf-8\"");
            formatter.tabSize(4);
            formatter.blankLineAfterTag(false);
            int totalPages = loaded.getNumberOfPages();
            // Outer tag summarizing the number of pages.
            formatter.beginTag("pdfDocument");
            formatter.attr("pageCount", totalPages);
            for (int current = 1; current <= totalPages; current++) {
                formatter.beginTag("pdfPage");
                formatter.attr("number", current);
                // Extract just this one page.
                stripper.setStartPage(current);
                stripper.setEndPage(current);
                String rawText = stripper.getText(loaded);
                String cleanText = XMLIndexSource.normalize(rawText);
                // Lines of 128 characters or less.
                formatter.text(cleanText, 128);
                formatter.newLineAfterText();
                formatter.endTag();
            }
            // Only the pdfDocument tag should still be open at this point.
            formatter.endAllTags();
        } catch (Throwable t) {
            Trace.error("*** PDFToXML.convert() Exception: " + t.getClass());
            Trace.error(" With message: " + t.getMessage());
        } finally {
            if (loaded != null) {
                loaded.close();
            }
        }
    } catch (Throwable t) {
        // Covers a failure while closing the document.
        Trace.error("*** PDFToXML.convert() Exception: " + t.getClass());
        Trace.error(" With message: " + t.getMessage());
    }
    return formatter.toString();
}Example 87
| Project: DynamicSpotter-master File: ResourceViewer.java View source code |
// Creates image data from the first page of the PDF file.
// Throws SWTException(ERROR_INVALID_IMAGE) when the PDF has no pages and
// SWTException(ERROR_IO), carrying the original cause, when reading fails.
// The document is closed on every path once the page has been rendered.
private ImageData createImageDataFromPdf(String resourceFile) {
    PDDocument document = null;
    try {
        document = PDDocument.load(resourceFile);
        @SuppressWarnings("unchecked") List<PDPage> pages = document.getDocumentCatalog().getAllPages();
        if (pages.isEmpty()) {
            throw new SWTException(SWT.ERROR_INVALID_IMAGE);
        }
        BufferedImage bufferedImage = pages.get(0).convertToImage(PDF_IMAGE_TYPE, PDF_VIEW_RESOLUTION);
        return ImageUtils.convertToImageData(bufferedImage);
    } catch (IOException e) {
        SWTException failure = new SWTException(SWT.ERROR_IO);
        // Keep the underlying I/O error available to callers and logs.
        failure.throwable = e;
        throw failure;
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException ignored) {
                // The document was only read; a close failure must not mask the
                // rendered image or the primary exception.
            }
        }
    }
}Example 88
| Project: ServerDevelopmentGuideV2-master File: CmisCustomPdfWatermarkServiceWrapper.java View source code |
/**
 * Delegates to the wrapped service and, for the configured user, replaces a PDF
 * content stream with a watermarked copy.
 *
 * Non-PDF content, content without a mime type, and requests from other users are
 * returned unchanged. The watermarked PDF document is closed whether or not saving
 * succeeds.
 *
 * @return the original or the watermarked content stream
 * @throws CmisRuntimeException if the watermarked PDF cannot be written
 */
@Override
public ContentStream getContentStream(String repositoryId, String objectId, String streamId, BigInteger offset, BigInteger length, ExtensionsData extension) {
    slflog("getContentStream override from Chameleon module --------------", repositoryId);
    long startTime = System.currentTimeMillis();
    CallContext sharedContext = this.getCallContext();
    // Get the native domain object from the call context if one is shared
    // by the vendor (example only)
    // Your CMIS vendor's documentation must expose the name of any shared
    // objects they place here for extensions.
    // Object objShared = sharedContext.get("shared_key_name_from_vendor");
    ContentStream retVal = getWrappedService().getContentStream(repositoryId, objectId, streamId, offset, length, extension);
    if (sharedContext.getUsername().equalsIgnoreCase(userToWatermark)) {
        // A stream without a mime type cannot be identified as PDF and is passed through.
        if ((retVal != null) && (retVal.getMimeType() != null) && (retVal.getMimeType().contains("pdf"))) {
            InputStream rawStream = retVal.getStream();
            // return a pdfbox document object
            // for debugging only - load to pdfbox and stream out
            // PDDocument modifiedPDF = watermarkPDF_loadOnly(rawStream);
            // actual watermark code
            PDDocument modifiedPDF = watermarkPDF(rawStream);
            // Extra credit here. Replace with TempStoreOutputStream or find
            // another way to handle very large objects in a small memory
            // footprint.
            // ByteArrayOutputStream out = new ByteArrayOutputStream();
            TempStoreOutputStream out;
            TempStoreOutputStreamFactory outFactory = (TempStoreOutputStreamFactory) sharedContext.get(CallContext.STREAM_FACTORY);
            if (outFactory != null) {
                // reuse the server factory configuration
                out = outFactory.newOutputStream();
            } else {
                // there is no default ThresholdOutputStreamFactory
                // -> create a stream manually:
                // default temp directory, max 4MiB in main memory,
                // unlimited content size
                out = new ThresholdOutputStream(null, 4 * 1024 * 1024, -1);
            }
            try {
                try {
                    modifiedPDF.save(out);
                } finally {
                    // Release the PDF even when saving fails.
                    modifiedPDF.close();
                }
                InputStream modifiedInputStream = out.getInputStream();
                // Extra credit here. Handle offset and length if provided
                // by the client.
                // now write the stream back to the ContentStream object
                retVal = new ContentStreamImpl(retVal.getFileName(), null, "application/pdf", modifiedInputStream);
            } catch (Exception e) {
                slflog("error transposing stream getContentStream ", e.getMessage());
                LOG.error("Could not watermark PDF document: {}", e.getMessage(), e);
                // Keep the cause so that the failure can be diagnosed by the caller.
                throw new CmisRuntimeException("Could not watermark PDF document!", e);
            }
        }
        // if pdf stream
    }
    // if user matches filter param
    LOG.info("[CmisCustomServiceWrapper] Exiting method getContentStream. time (ms):" + (System.currentTimeMillis() - startTime));
    return retVal;
}Example 89
| Project: xwiki-enterprise-master File: PDFTest.java View source code |
/**
 * Downloads the PDF at the given URL and returns its content as HTML text.
 *
 * The network stream is closed on every path, including when the PDF cannot be
 * loaded or when closing the document fails.
 *
 * @param url location of the PDF
 * @return the HTML produced by {@code PDFText2HTML}
 * @throws Exception if the download, the parsing or the text extraction fails
 */
private String getPDFContent(URL url) throws Exception {
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    InputStream is = connection.getInputStream();
    try {
        PDDocument pdd = PDDocument.load(is);
        try {
            PDFText2HTML stripper = new PDFText2HTML();
            return stripper.getText(pdd);
        } finally {
            pdd.close();
        }
    } finally {
        is.close();
    }
}Example 90
| Project: converge-1.x-master File: MetaDataService.java View source code |
/**
 * {@inheritDoc }
 *
 * Supports PDF and Word renditions; any other content type, a missing content type
 * or a failed extraction yields an empty string. Documents and input streams opened
 * here are always closed again.
 */
@Override
public String extractContent(MediaItemRendition mir) {
    String contentType = mir.getContentType();
    String story = "";
    if (contentType == null) {
        LOG.log(Level.WARNING, "Content type is null");
        return story;
    }
    if (contentType.equals("application/pdf")) {
        // Extract text in PDF
        try {
            URL originalFile = new URL(mir.getAbsoluteFilename());
            java.io.InputStream in = null;
            PDDocument doc = null;
            try {
                // Read PDF
                in = originalFile.openStream();
                PDFParser parser = new PDFParser(in);
                parser.parse();
                COSDocument cosDoc = parser.getDocument();
                // Assigned to the variable the finally block closes.
                doc = new PDDocument(cosDoc);
                PDFTextStripper stripper = new PDFTextStripper();
                story = stripper.getText(doc);
            } catch (IOException ex) {
                LOG.log(Level.SEVERE, ex.getMessage());
                LOG.log(Level.FINEST, "", ex);
            } finally {
                if (doc != null) {
                    try {
                        doc.close();
                    } catch (IOException ex) {
                        LOG.log(Level.SEVERE, ex.getMessage());
                        LOG.log(Level.FINEST, "", ex);
                    }
                }
                if (in != null) {
                    try {
                        in.close();
                    } catch (IOException ex) {
                        LOG.log(Level.SEVERE, ex.getMessage());
                        LOG.log(Level.FINEST, "", ex);
                    }
                }
            }
        } catch (MalformedURLException ex) {
            // An unusable file location is reported instead of being dropped silently.
            LOG.log(Level.SEVERE, ex.getMessage());
            LOG.log(Level.FINEST, "", ex);
        }
    } else if (contentType.equals("application/msword") || contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
        try {
            URL originalFile = new URL(mir.getAbsoluteFilename());
            java.io.InputStream in = originalFile.openStream();
            try {
                HWPFDocument doc = new HWPFDocument(in);
                WordExtractor extractor = new WordExtractor(doc);
                story = extractor.getText();
            } finally {
                in.close();
            }
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, ex.getMessage());
            LOG.log(Level.FINEST, "", ex);
        }
    }
    return story;
}Example 91
| Project: CrossRefX-master File: PlainTextDialog.java View source code |
/**
 * Reads the text of the first two pages of a PDF file and returns it as a String.
 *
 * The document is closed and the logging context is popped on every path.
 *
 * @param pdfFile the PDF file to read
 * @return the extracted text; a fixed notice when the document is encrypted; or
 *         {@code null} when the file cannot be loaded or parsed
 */
private String pdfParser(File pdfFile) {
    NDC.push("pdfParser");
    try {
        PDDocument document;
        try {
            document = PDDocument.load(pdfFile);
        } catch (IOException e) {
            logger.error("Could not load document", e);
            return null;
        }
        try {
            if (document.isEncrypted()) {
                return "Encrypted documents are not supported";
            }
            PDFTextStripper stripper;
            try {
                stripper = new PDFTextStripper();
            } catch (IOException e) {
                logger.error("Could not create stripper", e);
                return null;
            }
            stripper.setStartPage(1);
            stripper.setEndPage(2);
            try {
                return stripper.getText(document);
            } catch (Exception e) {
                logger.error("Could not parse PDF", e);
                return null;
            }
        } finally {
            // Release the PDF resources regardless of the outcome.
            try {
                document.close();
            } catch (IOException e) {
                logger.error("Could not close document", e);
            }
        }
    } finally {
        // Single exit point for the diagnostic context pushed above.
        NDC.pop();
    }
}Example 92
| Project: flaming-sailor-master File: PDFParser.java View source code |
/**
* Extracts the text pages of a PDF, then removes boilerplate, accumulates
* document-wide statistics and builds the higher-order components of each page.
*
* The method resets and fills several instance fields (fileName, outString,
* minHeight, textPageList, document, catalog, allpages) and adds to the running
* document totals (docAvgLeft, docAvgWidth, docAvgRight, docCharDensity,
* docLineCount).
* NOTE(review): those totals are not reset here, so repeated calls on the same
* instance appear to accumulate — confirm against the callers.
*
* @param pdfFile the File to extract it out of
* @param minHeight minimum character height; stored in the instance for the extraction
* @return the list of text pages collected for the document
*/
public List<TextPage> getTextPages(File pdfFile, float minHeight) {
fileName = pdfFile.getName();
outString = new StringWriter();
this.minHeight = minHeight;
this.textPageList = new ArrayList<>();
// Per font name: tally of occurrences keyed by an integer (presumably the font size — TODO confirm).
Map<String, Map<Integer, Long>> fontCounts = new HashMap<>();
document = null;
try {
document = PDDocument.load(pdfFile);
catalog = document.getDocumentCatalog();
allpages = catalog.getAllPages();
// Runs the text extraction; presumably this is what fills textPageList — verify in the stripper callbacks.
this.writeText(document, outString);
outString.close();
outString = null;
// document.close();
} catch (IOException e) {
// A failed load/extraction is only logged; processing continues with whatever pages were collected.
logger.error("I/O Error:" + pdfFile.getName(), e);
} finally {
if (document != null) {
try {
document.close();
document = null;
} catch (IOException e) {
logger.error("I/O error closing file:" + pdfFile.getName(), e);
}
}
}
// the page is currently a set of lines with text pieces.
// next steps
// 1. remove header/footer boilerplate
// 2. get font stats
// 3. construct higher order components
//
TextPage.removeBoilerplate(textPageList, TextPage.LEVENSHTEIN_DISTANCE);
long histogram[] = null;
for (TextPage page : textPageList) {
double avgLeft = page.getAvgLeft();
double avgRight = page.getAvgRight();
double avgWidth = page.getAvgWidth();
long lineCount = page.getLineCount();
Double charDensity = page.getCharDensity();
histogram = Component.mergeHistogram(page.getHistogram(), histogram);
// Weight each page's averages by its line count; they are normalized after the loop.
if (lineCount > 0) {
docAvgLeft += avgLeft * lineCount;
docAvgWidth += avgWidth * lineCount;
docAvgRight += avgRight * lineCount;
docCharDensity += charDensity * lineCount;
docLineCount += lineCount;
}
// Merge this page's font tallies into the document-wide tallies.
Map<String, Map<Integer, Long>> pageFontCounts = page.getFontCounts();
for (Map.Entry<String, Map<Integer, Long>> e : pageFontCounts.entrySet()) {
Map<Integer, Long> fontTally = fontCounts.get(e.getKey());
if (fontTally == null) {
fontTally = new HashMap<>();
}
for (Map.Entry<Integer, Long> pageFontTally : e.getValue().entrySet()) {
Long tally = fontTally.get(pageFontTally.getKey());
if (tally == null) {
fontTally.put(pageFontTally.getKey(), pageFontTally.getValue());
} else {
fontTally.put(pageFontTally.getKey(), tally + pageFontTally.getValue());
}
}
fontCounts.put(e.getKey(), fontTally);
}
}
// NOTE(review): when no lines or no pages were collected (e.g. after a load failure)
// the divisors below are zero — confirm the field types and whether this can throw or yield NaN.
docAvgLeft /= docLineCount;
docAvgRight /= docLineCount;
docAvgWidth /= docLineCount;
docCharDensity /= docLineCount;
linesPerPage = docLineCount / textPageList.size();
normalizeFontCounts(fontCounts);
normalizedHistogram = Component.getNormalizedHistogram(histogram);
logger.info(Component.normHistoGramToString(normalizedHistogram) + String.format(" H:%5.1f W:%6.1f D:%4.2f P:%4.2f", (double) highestFreqSize, docAvgWidth, docCharDensity, 1.0));
for (TextPage page : textPageList) {
page.constructPageComponents(highestFreqSize, this.minFontSize, this.maxFontSize, normalizedFontCounts, normalizedFonts, normalizedSizes, docAvgLeft, docAvgRight, docAvgWidth, docCharDensity, linesPerPage, normalizedHistogram);
}
return textPageList;
}Example 93
| Project: gsearch-master File: TransformerToText.java View source code |
/**
 * Extracts the text of a PDF held in memory and blanks out control characters.
 *
 * Each stage (stream, parser, parse, COS document, PD document, stripper, text)
 * is traced at debug level. A failing stage closes whatever has been opened so far
 * and is reported as a {@code GenericSearchException}. Characters below 32, other
 * than tab, line feed and carriage return, are replaced by a space.
 *
 * @param doc the raw PDF bytes
 * @return the cleaned text of the document
 * @throws GenericSearchException if any stage of the extraction fails
 */
private StringBuffer getTextFromPDF(byte[] doc) throws GenericSearchException {
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF");
    }
    ByteArrayInputStream pdfBytes = null;
    try {
        pdfBytes = new ByteArrayInputStream(doc);
    } catch (Exception e) {
        closeBAIS(pdfBytes);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF new ByteArrayInputStream: ", e);
        }
        throw new GenericSearchException("getTextFromPDF new ByteArrayInputStream: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF new ByteArrayInputStream");
    }
    PDFParser parser;
    try {
        parser = new PDFParser(pdfBytes);
    } catch (Exception e) {
        closeBAIS(pdfBytes);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF new PDFParser: ", e);
        }
        throw new GenericSearchException("getTextFromPDF new PDFParser: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF new PDFParser");
    }
    try {
        parser.parse();
    } catch (Exception e) {
        closeBAIS(pdfBytes);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF parser.parse: ", e);
        }
        throw new GenericSearchException("getTextFromPDF parser.parse: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF parser.parse");
    }
    COSDocument lowLevelDoc = null;
    try {
        lowLevelDoc = parser.getDocument();
    } catch (Exception e) {
        closeBAIS(pdfBytes);
        closeCOSDocument(lowLevelDoc);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF parser.getDocument: ", e);
        }
        throw new GenericSearchException("getTextFromPDF parser.getDocument: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF parser.getDocument");
    }
    PDDocument highLevelDoc = null;
    try {
        highLevelDoc = new PDDocument(lowLevelDoc);
    } catch (Exception e) {
        closeBAIS(pdfBytes);
        closeCOSDocument(lowLevelDoc);
        closePDDocument(highLevelDoc);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF new PDDocument: ", e);
        }
        throw new GenericSearchException("getTextFromPDF new PDDocument: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF new PDDocument isEncrypted=" + highLevelDoc.isEncrypted() + " getNumberOfPages=" + highLevelDoc.getNumberOfPages());
    }
    PDFTextStripper stripper;
    try {
        stripper = new PDFTextStripper();
    } catch (Exception e) {
        closeBAIS(pdfBytes);
        closeCOSDocument(lowLevelDoc);
        closePDDocument(highLevelDoc);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF new PDFTextStripper: ", e);
        }
        throw new GenericSearchException("getTextFromPDF new PDFTextStripper: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF new PDFTextStripper getStartPage=" + stripper.getStartPage() + " getEndPage=" + stripper.getEndPage());
    }
    String extracted = "";
    try {
        extracted = stripper.getText(highLevelDoc);
    } catch (Exception e) {
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF stripper.getText: ", e);
        }
        throw new GenericSearchException("getTextFromPDF stripper.getText: ", e);
    } finally {
        // Everything opened above is released here, on success and on failure.
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF stripper.getText finally");
        }
        closeBAIS(pdfBytes);
        closeCOSDocument(lowLevelDoc);
        closePDDocument(highLevelDoc);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF stripper.getText");
    }
    StringBuffer docText = new StringBuffer(extracted);
    // Characters the indexing stylesheet does not allow are replaced by a space.
    for (int i = 0; i < docText.length(); i++) {
        char c = docText.charAt(i);
        boolean allowed = c >= 32 || c == 9 || c == 10 || c == 13;
        if (!allowed) {
            if (logger.isDebugEnabled()) {
                logger.debug("getTextFromPDF index=" + i + " char=" + c + " set to 32");
            }
            docText.setCharAt(i, ' ');
        }
    }
    return docText;
}Example 94
| Project: jucy-master File: TextIndexer.java View source code |
/**
 * Opens a character reader over the indexable text of {@code file}.
 * <p>
 * Files whose ending is "pdf" (case-insensitive) are parsed and their extracted
 * text is returned; every other file is read directly. For PDFs smaller than
 * {@code MAX_RAMSIZE_FOR_PDF} the text is held in memory, larger ones are
 * spooled to a temporary file that is removed when the returned reader is closed.
 *
 * @param file the file to read
 * @return a reader over the file's text, or {@code null} if the PDF is encrypted
 * @throws IOException if the file cannot be opened or the PDF cannot be parsed
 */
private Reader getReader(File file) throws IOException {
    String fileending = GH.getFileEnding(file.getName());
    if (!fileending.equalsIgnoreCase("pdf")) {
        // Plain files never need the raw byte stream, so none is opened for them.
        return new FileReader(file);
    }
    BufferedInputStream bin = new BufferedInputStream(new FileInputStream(file));
    PDDocument pdfDocument = null;
    try {
        pdfDocument = PDDocument.load(bin, getScratchRaf(), true);
        if (pdfDocument.isEncrypted()) {
            return null;
        }
        PDFTextStripper stripper = new PDFTextStripper();
        if (file.length() < MAX_RAMSIZE_FOR_PDF) {
            // Small document: keep the extracted text in memory.
            StringWriter writer = new StringWriter();
            stripper.writeText(pdfDocument, writer);
            return new StringReader(writer.getBuffer().toString());
        }
        // Large document: spool the extracted text to a temp file instead.
        final File f = new File(PI.getTempPath(), "index.tmp");
        FileWriter fw = new FileWriter(f);
        try {
            stripper.writeText(pdfDocument, fw);
        } finally {
            GH.close(fw);
        }
        return new FileReader(f) {
            @Override
            public void close() throws IOException {
                super.close();
                // The spool file is only needed while this reader is open.
                if (!f.delete()) {
                    f.deleteOnExit();
                }
            }
        };
    } finally {
        try {
            if (pdfDocument != null) {
                pdfDocument.close();
            }
        } finally {
            // Release the source file handle on every path, including the
            // encrypted-document early return and parse failures.
            bin.close();
        }
    }
}Example 95
| Project: OCRaptor-master File: PDF2XHTML.java View source code |
/**
 * Feeds the content of a PDF document to a SAX content handler by running a
 * {@code PDF2XHTML} instance over it.
 *
 * @param document
 *          PDF document
 * @param handler
 *          SAX content handler
 * @param context
 *          parse context handed to the {@code PDF2XHTML} constructor
 * @param metadata
 *          PDF metadata
 * @param config
 *          parser configuration handed to the {@code PDF2XHTML} constructor
 * @throws SAXException
 *           if the I/O failure raised during extraction was caused by a SAXException
 * @throws TikaException
 *           if the PDF document can not be processed for any other I/O reason
 */
public static void process(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws SAXException, TikaException {
    // The key output methods are overridden to write to the content handler,
    // so the Writer handed to writeText() only has to discard whatever it gets.
    final Writer discardingWriter = new Writer() {
        @Override
        public void write(char[] cbuf, int off, int len) {
        }
        @Override
        public void flush() {
        }
        @Override
        public void close() {
        }
    };
    try {
        new PDF2XHTML(handler, context, metadata, config).writeText(document, discardingWriter);
    } catch (IOException e) {
        final Throwable cause = e.getCause();
        if (!(cause instanceof SAXException)) {
            throw new TikaException("Unable to extract PDF content", e);
        }
        // Unwrap SAX failures so the caller sees the original exception.
        throw (SAXException) cause;
    }
}Example 96
| Project: opensearchserver-master File: ViewerController.java View source code |
/**
 * Loads {@code tempFile} with PDFBox, renders it through {@code loadGS}, applies
 * the PDFBox and hOCR highlight boxes to {@code currentImage} and records the
 * document's page count in {@code numberOfPages}.
 */
private void loadPdfBox() throws IOException, CryptographyException, SearchLibException, InterruptedException {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.loadNonSeq(tempFile, null);
        // An encrypted document is tried with the empty password; the same
        // password (or null when not encrypted) is then passed on to loadGS.
        final boolean encrypted = pdf.isEncrypted();
        String password = null;
        if (encrypted) {
            pdf.decrypt("");
            password = "";
        }
        loadGS(password);
        final List<Rectangle> highlights = new ArrayList<Rectangle>(0);
        checkPdfBoxHighlight(pdf, highlights);
        checkHocrHighlight(currentImage.getWidth(), currentImage.getHeight(), highlights);
        ImageUtils.yellowHighlight(currentImage, highlights, 0.1F);
        numberOfPages = pdf.getNumberOfPages();
    } finally {
        if (pdf != null) {
            IOUtils.close(pdf);
        }
    }
}Example 97
| Project: pdfxtk-master File: ProcessFile.java View source code |
/*
public static String STR_INFILE = "";
public static String STR_OUTPUT_PATH = ".";
public static int STR_CURR_PAGE_NO = -1;
public static final String STR_IMAGE_PREFIX = "-imgPrefix";
*/
/*
* possible conversions:
* pdf -> xml, pdf -> xhtml,
* gecko -> xml, gecko -> xhtml
*/
/**
 * Loads a PDF from memory, extracts its page objects and runs the page
 * processor over every extracted page.
 *
 * @param theFile      raw bytes of the PDF document
 * @param pp           processor applied to each extracted page and, unless
 *                     {@code GUI} is set, to the whole list of result pages
 * @param startPage    first page to extract; 0 is treated as 1
 * @param endPage      last page to extract; 0 is treated as {@code Integer.MAX_VALUE}
 * @param encoding     null or empty selects {@code DEFAULT_ENCODING}; the value is
 *                     not otherwise consumed inside this method
 * @param password     password used if the document is encrypted; null means empty
 * @param adjGraphList if non-null, receives {@code pp.getAdjGraph()} after each page
 * @param GUI          when false, {@code pp.processDocPages} is run on the result
 * @return the processed pages
 * @throws DocumentProcessingException if the document cannot be read or decrypted
 */
public static List<Page> processPDF(byte[] theFile, PageProcessor pp, int startPage, int endPage, String encoding, String password, List<AdjacencyGraph<GenericSegment>> adjGraphList, boolean GUI) throws DocumentProcessingException {
    if (password == null)
        password = "";
    // Compare by content: an empty string built at runtime is not == "".
    if (encoding == null || encoding.isEmpty())
        encoding = DEFAULT_ENCODING;
    if (startPage == 0)
        startPage = 1;
    if (endPage == 0)
        endPage = Integer.MAX_VALUE;
    ByteArrayInputStream inStream = new ByteArrayInputStream(theFile);
    PDDocument document = null;
    try {
        PDFObjectExtractor extractor = new PDFObjectExtractor();
        document = PDDocument.load(inStream);
        if (document.isEncrypted()) {
            try {
                document.decrypt(password);
            } catch (InvalidPasswordException e) {
                // password is never null here (normalised above).
                if (!password.isEmpty()) {
                    throw new DocumentProcessingException("Error: The supplied password is incorrect.");
                } else {
                    throw new DocumentProcessingException("Error: The document is encrypted.");
                }
            } catch (CryptographyException e) {
                throw new DocumentProcessingException(e);
            }
        }
        extractor.setStartPage(startPage);
        extractor.setEndPage(endPage);
        List<PDFPage> thePages = extractor.findObjects(document);
        List<Page> theResult = new ArrayList<Page>();
        startPage = extractor.getStartPage();
        endPage = extractor.getEndPage();
        // now the DU part
        for (PDFPage thePage : thePages) {
            theResult.add(pp.processPage(thePage));
            if (adjGraphList != null)
                adjGraphList.add(pp.getAdjGraph());
        }
        // 17.11.10 document-wide processing for headers, footers, etc.
        if (!GUI)
            theResult = pp.processDocPages(theResult, null);
        // Success path: close here so a failing close is still reported to the
        // caller; clearing the variable first keeps the finally block from
        // closing the document a second time.
        PDDocument finished = document;
        document = null;
        finished.close();
        return theResult;
    } catch (IOException e) {
        e.printStackTrace();
        throw new DocumentProcessingException(e);
    } finally {
        // Failure paths (I/O errors, bad password, processor exceptions) used
        // to leave the document open; release it without masking the error.
        if (document != null) {
            try {
                document.close();
            } catch (IOException ignored) {
                // the original failure is the one worth reporting
            }
        }
    }
}Example 98
| Project: sd-dss-master File: PdfBoxSignatureService.java View source code |
/**
 * Prepares the given PDF for signing and returns the value produced by
 * {@code signDocumentAndReturnDigest} for it.
 * <p>
 * The input is turned into a file via {@code DSSPDFUtils.getFileFromPdfData},
 * loaded, extended with the extra dictionaries, and signed into a temporary
 * "sd-dss-*-signed.pdf" file using an empty signature value. Both files are
 * deleted and the document is closed before returning, whether or not signing
 * succeeded.
 *
 * @throws DSSException wrapping any IOException raised along the way
 */
@Override
public byte[] digest(final InputStream toSignDocument, final SignatureParameters parameters, final DigestAlgorithm digestAlgorithm, final Map.Entry<String, PdfDict>... extraDictionariesToAddBeforeSign) throws DSSException {
    final byte[] emptySignatureValue = DSSUtils.EMPTY_BYTE_ARRAY;
    File sourceFile = null;
    File targetFile = null;
    PDDocument loadedDocument = null;
    try {
        sourceFile = DSSPDFUtils.getFileFromPdfData(toSignDocument);
        loadedDocument = PDDocument.load(sourceFile);
        addExtraDictionaries(loadedDocument, extraDictionariesToAddBeforeSign);
        final PDSignature signatureDictionary = createSignatureDictionary(parameters);
        targetFile = File.createTempFile("sd-dss-", "-signed.pdf");
        final FileOutputStream targetStream = DSSPDFUtils.getFileOutputStream(sourceFile, targetFile);
        return signDocumentAndReturnDigest(parameters, emptySignatureValue, targetFile, targetStream, loadedDocument, signatureDictionary, digestAlgorithm);
    } catch (IOException e) {
        throw new DSSException(e);
    } finally {
        // Same cleanup order as before: temp files first, then the document.
        DSSUtils.delete(sourceFile);
        DSSUtils.delete(targetFile);
        DSSPDFUtils.close(loadedDocument);
    }
}Example 99
| Project: spimedb-master File: Multimedia.java View source code |
/**
 * Loads the resource named by {@code x}'s "url_in" field and derives index objects from it.
 * <p>
 * Returns {@code x} untouched when it has no "url_in". Returns the previous object {@code p}
 * when its "url_cached" timestamp is not older than the resource's expiration / last-modified
 * time. Otherwise the resource is parsed (Tika for generic content, dedicated loaders for
 * KML/KMZ and GeoJSON) into a copy of {@code x}. For PDFs that carry "pageCount" and a
 * description, one extra object per page (page text plus a JPEG thumbnail) is added to
 * {@code db}. Any exception in the loading phase is logged and processing continues with
 * {@code x} as it was.
 *
 * @param p the previously stored version of the object, may be null
 * @param x the incoming object
 * @return {@code x} (no url), {@code p} (cache still valid), or a copy of the processed
 *         object renamed to {@code titleify(xid)} with its description cleared
 */
@Override
public NObject apply(NObject p, NObject x) {
final String url = x.get("url_in");
String xid = x.id();
if (url == null) {
return x;
}
try {
// exp: freshness timestamp of the resource, compared against the cached copy below
long exp;
InputStream stream;
long fileSize;
if (url.startsWith("file:")) {
// strip the "file:" prefix (5 characters) to get the local path
File f = new File(url.substring(5));
exp = f.lastModified();
stream = new FileInputStream(f);
fileSize = f.length();
} else {
URL uu = new URL(url);
URLConnection con = uu.openConnection();
exp = con.getExpiration();
// no expiration header: fall back to the last-modified time
if (exp == 0)
exp = con.getLastModified();
fileSize = con.getContentLengthLong();
stream = con.getInputStream();
}
if (stream == null) {
throw new FileNotFoundException();
}
//TODO store a hashcode of the data as well as the time for additional integrity
if (p != null) {
String whenCached = p.get("url_cached");
if (!(whenCached == null || Long.valueOf(whenCached) < exp)) {
logger.debug("cached: {}", url);
//still valid
// NOTE(review): the stream opened above is not closed on this return path
return p;
}
}
logger.info("load: {}", url);
GeoNObject y = new GeoNObject(x);
y.put("url_cached", Long.toString(exp));
boolean isKMLorKMZ = url.endsWith(".kml") || url.endsWith(".kmz");
boolean isGeoJSON = url.endsWith(".geojson");
if (!isKMLorKMZ && !isGeoJSON) /* handled separately below */
{
Metadata metadata = new Metadata();
ParseContext context = new ParseContext();
final RecursiveParserWrapper tikaWrapper = new RecursiveParserWrapper(tika, tikaFactory);
if (stream instanceof FileInputStream) {
// local file: store the url itself as the "data" value
y.put("data", url);
} else {
//buffer the bytes for saving
// NOTE(review): fileSize comes from getContentLengthLong(), which is -1 when the
// length is unknown, and is narrowed to int here — confirm readFully copes with that
byte[] bytes = IOUtils.readFully(stream, (int) fileSize);
stream = new ByteArrayInputStream(bytes);
y.put("data", bytes);
}
tikaWrapper.parse(stream, new DefaultHandler(), metadata, context);
// NOTE(review): this is the only close of the stream; it is skipped if parse() throws
// and on the KML/KMZ/GeoJSON paths
stream.close();
List<Metadata> m = tikaWrapper.getMetadata();
// copy every metadata entry that maps to a known field into y
m.forEach( md -> {
for (String k : md.names()) {
String[] v = md.getValues(k);
String kk = tikiToField(k);
if (kk != null) {
// multi-valued entries are stored as the array, single values as the value itself
Object vv = v.length > 1 ? v : v[0];
if (vv instanceof String) {
// store integer-looking strings as Integer; anything else stays a String
try {
int ivv = Integer.parseInt((String) vv);
vv = ivv;
} catch (Exception e) {
}
}
y.put(kk, vv);
}
}
});
}
if (isKMLorKMZ) {
new KML(db, y).url(url).run();
} else if (isGeoJSON) {
GeoJSON.load(url, GeoJSON.baseGeoJSONBuilder, db);
}
x = y;
} catch (Exception e) {
logger.error("url_in removal: {}", e);
}
Object mime = x.get(NObject.TYPE);
// images: rename, clear the description, and set the "thumbnail" field to the string "data"
if (mime != null && (mime.equals("image/jpeg") || mime.equals("image/png"))) {
x = new MutableNObject(x).name(titleify(xid)).put(NObject.DESC, null).put("thumbnail", "data");
}
if ("application/pdf".equals(mime) && x.has("pageCount") && x.has(NObject.DESC)) /* leaf */
{
int pageCount = x.get("pageCount");
//float docPri = Util.lerp(1f / (pageCount), 0.75f, 0.25f);
String parentContent = x.get(NObject.DESC);
String author = x.get("author");
//db.runLater(docPri, () -> {
// the description is parsed as HTML; each ".page" element supplies one page's text
Document parentDOM = Jsoup.parse(parentContent);
Elements pagesHTML = parentDOM.select(".page");
PDDocument document = null;
try {
// the resource is opened a second time here, independently of the stream used above
InputStream is;
if (url.startsWith("file:")) {
is = fileStream(url);
} else {
is = new URL(url).openStream();
}
// NOTE(review): 'is' is never closed explicitly in this method — confirm whether
// PDDocument.load / document.close() releases it
document = PDDocument.load(is);
PDFRenderer renderer = new PDFRenderer(document);
for (int _page = 0; _page < pageCount; _page++) {
// pageActual: 0-based index used for lookups and rendering; page: 1-based number used in ids and labels
final int pageActual = _page;
final int page = _page + 1;
logger.info("paginate: {} {}", xid, page);
Document pd = Document.createShell("");
pd.body().appendChild(pagesHTML.get(pageActual).removeAttr("class"));
Elements cc = cleaner.clean(pd).body().children();
// keep only non-empty elements and turn each into a string
String[] pdb = cc.stream().filter( xx -> !xx.children().isEmpty() || xx.hasText()).map(//just use <p> contents
xx -> xx.tagName().equals("p") ? xx.text() : xx).map(Object::toString).toArray(String[]::new);
// List<JsonNode> jdb = new ArrayList(pdb.size());
// pdb.forEach(e -> {
// if (e.children().isEmpty() && e.text().isEmpty())
// return;
// jdb.add(html2json(e));
// });
//x.name();
String docTitle = parentDOM.title();
if (docTitle == null || docTitle.isEmpty()) {
docTitle = titleify(xid);
}
BufferedImage img = renderer.renderImageWithDPI(pageActual, (float) pdfPageImageDPI, ImageType.RGB);
//boolean result = ImageIOUtil.writeImage(img, outputFile, pdfPageImageDPI);
// buffer pre-sized to width * height * 3 bytes for the JPEG thumbnail
ByteArrayOutputStream os = new ByteArrayOutputStream(img.getWidth() * img.getHeight() * 3);
// NOTE(review): the boolean returned by writeImage is never checked
boolean result = ImageIOUtil.writeImage(img, "jpg", os, pdfPageImageDPI, thumbnailQuality);
byte[] thumbnail = os.toByteArray();
String text = pdb.length > 0 ? Joiner.on('\n').join(pdb) : null;
// NOTE(review): 'page' is already 1-based, so "(pageCount + 1)" labels the last page
// "N of N+1" — confirm whether that is intended
db.add(new MutableNObject(xid + "/" + page).name(docTitle + " - (" + page + " of " + (pageCount + 1) + ")").withTags(xid).put("author", author).put("url", //HACK browser loads the specific page when using the '#' anchor
url).put(NObject.TYPE, "application/pdf").put("data", xid + "#page=" + page).put("page", page).put(NObject.DESC, text).put(/*.putLater("textParse", 0.1f, ()-> {
return (pdb.length > 0) ? Stream.of(pdb).map(
t -> NLP.toString(NLP.parse(t))
).collect(Collectors.joining("\n")) : null;
})*/
"thumbnail", thumbnail));
}
} catch (IOException f) {
logger.error("error: {} {}", xid, f);
} finally {
// best-effort close; a failure here is deliberately ignored
if (document != null)
try {
document.close();
} catch (IOException e) {
}
}
}
//clean and update parent DOM
//String xname = x.name();
//String desc = x.get(NObject.DESC);
// applied on every remaining path, including images handled above: rename and clear the description
x = new MutableNObject(x).name(titleify(xid)).put(NObject.DESC, null);
return x;
}Example 100
| Project: yacy_search_server-master File: pdfParser.java View source code |
/**
 * Parses a PDF stream into one or more index documents.
 * <p>
 * The document's information dictionary supplies title, subject, author, publisher, keywords
 * and date. When {@code individualPages} is set, one Document per page is produced, each with
 * a virtual URL carrying the page number as a query parameter; otherwise a single Document
 * holding the combined text and links is produced.
 *
 * @param location       URL of the PDF; used in failure reports, as title fallback and as base
 *                       of the virtual per-page URLs
 * @param mimeType       mime type passed on to the created Documents
 * @param charset        not referenced in this method
 * @param scraper        not referenced in this method
 * @param timezoneOffset not referenced in this method
 * @param source         stream the PDF is loaded from
 * @return the created documents; {@code null} if text or link extraction failed, because
 *         the extraction phase swallows every Throwable
 * @throws Parser.Failure if memory is short, the PDF cannot be loaded, or it is encrypted
 *         without permission to extract content
 */
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, false))
throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(), location);
// create a pdf parser
PDDocument pdfDoc;
try {
// the pdfparser is a big pain
// loading runs at minimum thread priority; the finally block resets it to NORM_PRIORITY
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
MemoryUsageSetting mus = MemoryUsageSetting.setupMixed(200 * 1024 * 1024);
pdfDoc = PDDocument.load(source, mus);
} catch (final IOException e) {
// NOTE(review): only the message is forwarded; the IOException is not attached as cause
throw new Parser.Failure(e.getMessage(), location);
} finally {
Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
}
if (pdfDoc.isEncrypted()) {
final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
if (perm == null || !perm.canExtractContent()) {
// close before failing so the document is not left open; close errors are ignored
try {
pdfDoc.close();
} catch (final IOException ee) {
}
throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
}
}
// extracting some metadata
PDDocumentInformation info = pdfDoc.getDocumentInformation();
String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
// defaults to "now" when the PDF has no modification date
Date docDate = new Date();
if (info != null) {
docTitle = info.getTitle();
docSubject = info.getSubject();
docAuthor = info.getAuthor();
// publisher: producer first, creator as fallback
docPublisher = info.getProducer();
if (docPublisher == null || docPublisher.isEmpty())
docPublisher = info.getCreator();
docKeywordStr = info.getKeywords();
if (info.getModificationDate() != null)
docDate = info.getModificationDate().getTime();
// unused:
// info.getTrapped());
}
info = null;
// title fallbacks: file name from the URL, then the subject
if (docTitle == null || docTitle.isEmpty()) {
docTitle = MultiProtocolURL.unescape(location.getFileName());
}
if (docTitle == null) {
docTitle = docSubject;
}
String[] docKeywords = null;
if (docKeywordStr != null) {
// keywords are separated by a single space or comma
docKeywords = docKeywordStr.split(" |,");
}
Document[] result = null;
try {
// get the links
final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc);
// get the fulltext (either per document or for each page)
final PDFTextStripper stripper = new PDFTextStripper();
if (individualPages) {
// this is a hack which stores individual pages of the source pdf into individual index documents
// the new documents will get a virtual link with a post argument page=X appended to the original url
// collect text
int pagecount = pdfDoc.getNumberOfPages();
String[] pages = new String[pagecount];
// stripper page numbers are 1-based; the pages array is 0-based
for (int page = 1; page <= pagecount; page++) {
stripper.setStartPage(page);
stripper.setEndPage(page);
pages[page - 1] = stripper.getText(pdfDoc);
//System.out.println("PAGE " + page + ": " + pages[page - 1]);
}
// create individual documents for each page
assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size();
// guard against a page/link count mismatch when assertions are disabled
result = new Document[Math.min(pages.length, pdflinks.size())];
String loc = location.toNormalform(true);
for (int page = 0; page < result.length; page++) {
result[page] = new Document(// these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), mimeType, StandardCharsets.UTF_8.name(), this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0d, 0.0d, pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]), pdflinks == null || page >= pdflinks.size() ? null : pdflinks.get(page), null, null, false, docDate);
}
} else {
// collect the whole text at once
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = new byte[0];
// get first 3 pages (always)
stripper.setEndPage(3);
writer.append(stripper.getText(pdfDoc));
// remember text in case of interrupting thread
contentBytes = writer.getBytes();
if (pdfDoc.getNumberOfPages() > 3) {
// spare creating/starting thread if all pages read
// continue with page 4 (terminated, resulting in no text)
stripper.setStartPage(4);
// set to default
stripper.setEndPage(Integer.MAX_VALUE);
// we start the pdf parsing in a separate thread to ensure that it can be terminated
final PDDocument pdfDocC = pdfDoc;
final Thread t = new Thread() {
@Override
public void run() {
Thread.currentThread().setName("pdfParser.getText:" + location);
try {
writer.append(stripper.getText(pdfDocC));
} catch (final Throwable e) {
}
}
};
t.start();
// pdfbox likes to forget to terminate ... (quite often)
// wait at most 3 seconds for the remaining pages, then interrupt the worker
t.join(3000);
if (t.isAlive())
t.interrupt();
// get final text before closing writer
contentBytes = writer.getBytes();
// free writer resources
// NOTE(review): the writer is only closed in this branch, not for documents of 3 pages or fewer
writer.close();
}
Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
for (Collection<AnchorURL> pdflinksx : pdflinks) if (pdflinksx != null)
pdflinksCombined.addAll(pdflinksx);
result = new Document[] { new Document(location, mimeType, StandardCharsets.UTF_8.name(), this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0d, 0.0d, contentBytes, pdflinksCombined, null, null, false, docDate) };
}
// NOTE(review): every failure in the block above is swallowed here, including an
// InterruptedException from t.join(); the method then returns whatever 'result' holds,
// which is null unless a result array was already assigned
} catch (final Throwable e) {
} finally {
try {
pdfDoc.close();
} catch (final Throwable e) {
}
}
// clear resources in pdfbox. they say that is resolved but it's not. see:
// https://issues.apache.org/jira/browse/PDFBOX-313
// https://issues.apache.org/jira/browse/PDFBOX-351
// https://issues.apache.org/jira/browse/PDFBOX-441
// the pdfbox still generates enormeous number of object allocations and don't delete these
// the following Object are statically stored and never flushed:
// COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
// COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
// the great number of these objects can easily be seen in Java Visual VM
// we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out.
pdfDoc = null;
clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();
return result;
}Example 101
| Project: qi4j-sdk-master File: PDFWriter.java View source code |
/**
 * Renders one page per graph display plus the descriptor page into a new
 * PDF document and saves it to {@code file}.
 * <p>
 * The {@code doc} and {@code curContentStream} fields are always released and
 * reset to {@code null} before this method returns, even when closing one of
 * them fails.
 *
 * @param file          destination file
 * @param descriptor    application descriptor passed to {@code writePage}
 * @param graphDisplays graph displays, each passed to {@code writeGraphPage}
 * @throws IOException          if writing or closing any resource fails
 * @throws COSVisitorException  if PDFBox fails while saving the document
 */
protected void writeImpl(File file, ApplicationDetailDescriptor descriptor, List<GraphDisplay> graphDisplays) throws IOException, COSVisitorException {
    try {
        doc = new PDDocument();
        for (GraphDisplay graphDisplay : graphDisplays) {
            writeGraphPage(graphDisplay);
        }
        writePage(descriptor);
        // The last content stream must be closed before the document is saved.
        if (curContentStream != null) {
            curContentStream.close();
            curContentStream = null;
        }
        FileOutputStream out = new FileOutputStream(file);
        try {
            doc.save(out);
        } finally {
            // Release the file handle even if save() fails part-way through.
            out.close();
        }
    } finally {
        try {
            if (curContentStream != null) {
                curContentStream.close();
            }
        } finally {
            curContentStream = null;
            // Nested so that a failing content-stream close can no longer
            // prevent the document from being closed.
            try {
                if (doc != null) {
                    doc.close();
                }
            } finally {
                doc = null;
            }
        }
    }
}