Java Examples for org.pdfbox.pdmodel.PDDocument

The following Java examples will help you understand the usage of org.pdfbox.pdmodel.PDDocument. These source code samples are taken from different open source projects.

Example 1
Project: xwiki-clams-core-master  File: PDFTextExtractor.java View source code
/**
 * Extracts the text of a PDF supplied as raw bytes.
 *
 * @param data the bytes of the PDF file
 * @return the text written by a {@code PDFTextStripper} for the parsed document
 * @throws Exception if parsing, text extraction, or closing the document fails
 */
public String getText(byte[] data) throws Exception {
    PDDocument parsed = null;
    try {
        PDFParser pdfParser = new PDFParser(new ByteArrayInputStream(data));
        pdfParser.parse();
        parsed = pdfParser.getPDDocument();

        // Collect the stripped text in memory and hand it back as a String.
        Writer sink = new CharArrayWriter();
        PDFTextStripper textStripper = new PDFTextStripper();
        textStripper.writeText(parsed, sink);
        return sink.toString();
    } finally {
        // The document is only non-null once parsing has succeeded.
        if (parsed != null) {
            parsed.close();
        }
    }
}
Example 2
Project: carbon-registry-master  File: PDFIndexer.java View source code
/**
 * Builds an index document from the text of a PDF file.
 *
 * @param fileData the file to index; its {@code data} bytes are parsed as a PDF and its
 *                 {@code path} is passed on to the resulting document
 * @return an {@code IndexDocument} created from the file path and the extracted text
 * @throws SolrException with {@code SERVER_ERROR} if an {@link IOException} occurs while
 *                       parsing or stripping the PDF; the original exception is attached
 *                       as the cause
 */
public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
    COSDocument cosDoc = null;
    try {
        PDFParser parser = new PDFParser(new ByteArrayInputStream(fileData.data));
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        String docText = stripper.getText(new PDDocument(cosDoc));
        return new IndexDocument(fileData.path, docText, null);
    } catch (IOException e) {
        String msg = "Failed to write to the index";
        log.error(msg, e);
        // Keep the root cause attached so callers can see why extraction failed.
        throw new SolrException(ErrorCode.SERVER_ERROR, msg, e);
    } finally {
        if (cosDoc != null) {
            try {
                cosDoc.close();
            } catch (IOException e) {
                // A close failure is logged only; it must not mask the result or the
                // exception already in flight.
                log.error("Failed to close pdf doc stream ", e);
            }
        }
    }
}
Example 3
Project: openmicroscopy-master  File: PdfParser.java View source code
/**
 * Loads the PDF from {@code file} and, unless it is encrypted, writes its text to
 * {@code writer}.
 *
 * <p>If loading fails, the failure is logged and {@code writer} is closed. Once a document
 * is available, {@code close()} is always invoked afterwards, including for encrypted
 * documents that are skipped, so the loaded document is not left open.
 * NOTE(review): this assumes {@code close()} releases the document (and writer) and is
 * safe to call when nothing was written; confirm against its definition.
 */
@Override
public void run() {
    try {
        document = PDDocument.load(file);
    } catch (IOException e) {
        log.warn("Could not load Pdf " + file, e);
        try {
            writer.close();
        } catch (IOException ioe) {
            // Best effort: the load failure has already been reported above.
        }
    }
    if (document == null) {
        return;
    }
    try {
        try {
            if (!document.isEncrypted()) {
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.writeText(document, writer);
            }
        } finally {
            // Runs for encrypted documents too; previously they were never closed.
            close();
        }
    } catch (IOException e) {
        log.warn("Error reading pdf file", e);
    }
}
Example 4
Project: Hibernate-Search-on-action-master  File: TestPDFTextExtractor.java View source code
/**
 * Parses a sample PDF, indexes the resulting {@code Pdf} entity, and checks that a
 * full-text query on {@code description:salesman} finds exactly that entity.
 *
 * <p>All {@code Pdf} entities are deleted again before the final commit.
 *
 * @throws Exception if parsing, indexing, or querying fails; the exception is propagated
 *                   so the test is reported as failed instead of silently passing
 */
@Test(groups = "ch13")
public void testPDFExtractor() throws Exception {
    FullTextSession session = Search.getFullTextSession(openSession());
    Transaction tx = session.beginTransaction();
    try {
        File f = new File("ch13/src/com/manning/hsia/dvdstore/file1.pdf");
        istream = new FileInputStream(f.getAbsolutePath());
        PDFParser p = new PDFParser(istream);
        p.parse();
        PDDocument doc = p.getPDDocument();
        Pdf pdf = getDocument(doc);
        closeInputStream(istream);
        closeDocument(doc);
        pdf.setId(1);
        buildIndex(pdf, session, tx);

        tx = session.beginTransaction();
        QueryParser parser = new QueryParser("description", analyzer);
        Query query = parser.parse("description:salesman");
        org.hibernate.search.FullTextQuery hibQuery = session.createFullTextQuery(query, Pdf.class);
        List results = hibQuery.list();
        assert results.size() == 1 : "incorrect result size";
        Pdf result = (Pdf) results.get(0);
        assert result.getAuthor().startsWith("John Griffin") : "incorrect author";
        assert result.getDescription().startsWith("Keanu Reeves") : "incorrect description";

        // Clean up the entities created by this test.
        for (Object element : session.createQuery("from " + Pdf.class.getName()).list()) {
            session.delete(element);
        }
        tx.commit();
    } finally {
        // No catch block: swallowing exceptions here would let a broken test pass.
        session.close();
    }
}
Example 5
Project: CORISCO2-master  File: PDFPackager.java View source code
/**
 * Copies entries of a PDF's Info dictionary onto an item as Dublin Core values and then
 * updates the item.
 *
 * <p>Mapping of PDF Info entries to DC fields:
 * <pre>
 *   Title        -> title.null
 *   Author       -> contributor.author
 *   CreationDate -> date.created
 *   ModDate      -> date.created (only when CreationDate is absent)
 *   Creator      -> description.provenance (application that created the original)
 *   Producer     -> description.provenance (converter to PDF)
 *   Subject      -> description.abstract
 *   Keywords     -> subject.other
 * </pre>
 * This is not implemented as a crosswalk plugin because the source data is not XML and so
 * does not fit the plugin interface.
 *
 * @param context not used by this method
 * @param item the item receiving the metadata
 * @param metadata stream containing the PDF document
 * @throws MetadataValidationException if the PDF is encrypted or has no Title entry
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cos = null;
    try {
        PDFParser parser = new PDFParser(metadata);
        parser.parse();
        cos = parser.getDocument();
        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (cos.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }
        PDDocument pd = new PDDocument(cos);
        PDDocumentInformation docinfo = pd.getDocumentInformation();
        String title = docinfo.getTitle();
        // sanity check: item must have a title.
        if (title == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        log.debug("PDF Info dict title=\"" + title + "\"");
        item.addDC("title", null, "en", title);
        String value;
        if ((value = docinfo.getAuthor()) != null) {
            item.addDC("contributor", "author", null, value);
            log.debug("PDF Info dict author=\"" + value + "\"");
        }
        if ((value = docinfo.getCreator()) != null) {
            item.addDC("description", "provenance", "en", "Application that created the original document: " + value);
        }
        if ((value = docinfo.getProducer()) != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + value);
        }
        if ((value = docinfo.getSubject()) != null) {
            item.addDC("description", "abstract", null, value);
        }
        if ((value = docinfo.getKeywords()) != null) {
            item.addDC("subject", "other", null, value);
        }
        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar calValue;
        if ((calValue = docinfo.getCreationDate()) == null) {
            calValue = docinfo.getModificationDate();
        }
        if (calValue != null) {
            item.addDC("date", "created", null, (new DCDate(calValue.getTime())).toString());
        }
        item.update();
    } finally {
        if (cos != null) {
            cos.close();
        }
    }
}
Example 6
Project: jinhe-tss-master  File: AttachmentIndex.java View source code
/**
 * Extracts the text content of a PDF file.
 *
 * @param pdfFile the PDF file to read
 * @return the extracted text, or an empty string if loading or extraction fails (the
 *         failure is logged)
 */
private String getContentFromPDF(File pdfFile) {
    // In-memory representation of the PDF document
    PDDocument document = null;
    String content = "";
    try {
        // Load the PDF document
        document = PDDocument.load(pdfFile);
        // PDFTextStripper performs the text extraction
        PDFTextStripper stripper = new PDFTextStripper();
        // Do not sort text by position
        stripper.setSortByPosition(false);
        // Cover every page: first page through the largest possible page number
        stripper.setStartPage(1);
        stripper.setEndPage(Integer.MAX_VALUE);
        content = stripper.getText(document);
    } catch (Exception e) {
        log.error("发布索引时提取PDF文档:" + pdfFile.getPath() + " 内容失败!", e);
        return "";
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                // Report through the same logger as extraction failures rather than
                // dumping the stack trace to stderr.
                log.error("Failed to close PDF document: " + pdfFile.getPath(), e);
            }
        }
    }
    return content;
}
Example 7
Project: railo-master  File: PDFUtil.java View source code
/**
 * Writes the text of the given document through a {@code PDFText2HTML} stripper and
 * returns the result.
 *
 * @param doc the document to read; converted with {@code toPDDocument()}
 * @param pageNumbers NOTE(review): accepted but not consulted by this implementation
 * @return the stripper output as a {@code String}
 * @throws IOException declared by the signature; this is a reviewer summary, not a
 *                     statement of which call raises it
 */
public static Object extractText(PDFDocument doc, Set<Integer> pageNumbers) throws IOException, CryptographyException, InvalidPasswordException {
    // NOTE(review): the PDDocument obtained here is not closed in this method; confirm
    // who owns its lifecycle.
    final PDDocument source = doc.toPDDocument();
    final PDFText2HTML htmlStripper = new PDFText2HTML();
    final StringWriter markup = new StringWriter();
    htmlStripper.writeText(source, markup);
    return markup.toString();
}
Example 8
Project: openpipe-master  File: PDFParser.java View source code
/**
 * Loads the PDF from the parse data's input stream and returns its text and title.
 *
 * <p>The shared {@code writer} is reset before use and trimmed to 64 KiB afterwards; the
 * loaded document is always closed, and a failure while closing is ignored.
 *
 * @param data supplies the input stream containing the PDF
 * @return a result holding the stripped text and the document title
 * @throws IOException if loading or text extraction fails
 */
@Override
public ParserResult parse(ParseData data) throws IOException, ParserException {
    final PDDocument pdf = PDDocument.load(data.getInputStream(), scratchFile);
    try {
        writer.reset();
        try {
            stripper.writeText(pdf, writer);
            final ParserResultImpl parsed = new ParserResultImpl();
            parsed.setText(writer.toString());
            parsed.setTitle(pdf.getDocumentInformation().getTitle());
            return parsed;
        } finally {
            // Cap the buffer retained between calls at 64 KiB.
            writer.trimToMaxSize(64 * 1024);
        }
    } finally {
        try {
            pdf.close();
        } catch (IOException ignored) {
            // Closing is best effort; the result (or the original failure) takes precedence.
        }
    }
}
Example 9
Project: agile-itsm-master  File: Arquivo.java View source code
/**
 * Extracts the text of a document in PDF format.
 *
 * <p>If the file cannot be opened, parsed, or read, the stack trace is printed and an
 * empty builder is returned.
 *
 * @param caminhoDocumento path of the PDF file to read
 * @return a {@code StringBuilder} holding the extracted text (empty on failure)
 * @throws IOException if closing the parsed document or the file stream fails
 */
private StringBuilder extrairFormatoPDF(String caminhoDocumento) throws IOException {
    StringBuilder texto = new StringBuilder();
    FileInputStream fi = null;
    COSDocument cd = null;
    try {
        fi = new FileInputStream(new File(caminhoDocumento));
        PDFParser parser = new PDFParser(fi);
        parser.parse();
        cd = parser.getDocument();
        texto.append(new PDFTextStripper().getText(new PDDocument(cd)));
    } catch (IOException e1) {
        // FileNotFoundException is an IOException, so one handler covers both cases.
        e1.printStackTrace();
    } finally {
        // Nested so the file stream is still closed when closing the document throws.
        try {
            if (cd != null) {
                cd.close();
            }
        } finally {
            if (fi != null) {
                fi.close();
            }
        }
    }
    return texto;
}
Example 10
Project: Desktop-master  File: XMPUtilTest.java View source code
/**
 * Stores a hand-built XMP string as the metadata stream of a PDF file and saves the
 * file in place.
 *
 * <p>If the document is encrypted, an error is printed and the JVM exits with status 1.
 *
 * @param tempFile the PDF file to load and overwrite
 * @param xmpString the XMP content to attach
 * @throws Exception if loading, writing, or saving fails
 */
public void writeManually(File tempFile, String xmpString) throws Exception {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(tempFile.getAbsoluteFile());
        if (pdf.isEncrypted()) {
            System.err.println("Error: Cannot add metadata to encrypted document.");
            System.exit(1);
        }
        PDDocumentCatalog catalog = pdf.getDocumentCatalog();

        // Encode the XMP text as UTF-8 bytes for the metadata stream.
        ByteArrayOutputStream utf8Bytes = new ByteArrayOutputStream();
        OutputStreamWriter encoder = new OutputStreamWriter(utf8Bytes, "UTF8");
        encoder.write(xmpString);
        encoder.close();

        ByteArrayInputStream xmpInput = new ByteArrayInputStream(utf8Bytes.toByteArray());
        catalog.setMetadata(new PDMetadata(pdf, xmpInput, false));
        pdf.save(tempFile.getAbsolutePath());
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}
Example 11
Project: infoglue-master  File: LuceneController.java View source code
/**
 * Extracts indexable text from a digital asset file.
 *
 * <p>Only {@code application/pdf} (unencrypted) and {@code application/msword} assets are
 * handled. Failures are logged as warnings, and the text gathered so far (possibly empty)
 * is returned.
 *
 * @param digitalAssetVO describes the asset; its content type selects the extractor
 * @param file the asset's file on disk
 * @return the extracted text, or an empty string if nothing could be extracted
 */
private String extractTextToIndex(DigitalAssetVO digitalAssetVO, File file) {
    String text = "";
    if (logger.isInfoEnabled())
        logger.info("Asset content type:" + digitalAssetVO.getAssetContentType());
    if (digitalAssetVO.getAssetContentType().equalsIgnoreCase("application/pdf")) {
        try {
            Writer output = null;
            PDDocument document = null;
            try {
                document = PDDocument.load(file);
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                if (!document.isEncrypted()) {
                    output = new OutputStreamWriter(baos, "UTF-8");
                    PDFTextStripper stripper = new PDFTextStripper();
                    stripper.writeText(document, output);
                    // OutputStreamWriter buffers encoded bytes; flush so the byte array
                    // holds the complete text before it is read back.
                    output.flush();
                    text = baos.toString("UTF-8");
                    if (logger.isInfoEnabled())
                        logger.info("PDF Document has " + text.length() + " chars\n\n" + text);
                }
            } catch (Exception e) {
                logger.warn("Error indexing file: " + file + "\nMessage: " + e.getMessage());
            } finally {
                // Nested so the document is closed even when closing the writer throws.
                try {
                    if (output != null) {
                        output.close();
                    }
                } finally {
                    if (document != null) {
                        document.close();
                    }
                }
            }
        } catch (Exception e) {
            logger.warn("Error indexing:" + e.getMessage());
        }
    } else if (digitalAssetVO.getAssetContentType().equalsIgnoreCase("application/msword")) {
        try {
            // Read through a single stream and always close it; a second, never-closed
            // FileInputStream used to be opened here.
            POIFSFileSystem fs;
            InputStream is = new FileInputStream(file);
            try {
                fs = new POIFSFileSystem(is);
            } finally {
                is.close();
            }
            // Create a document for this file
            HWPFDocument doc = new HWPFDocument(fs);
            // Create a WordExtractor to read the text of the word document
            WordExtractor we = new WordExtractor(doc);
            // Extract all paragraphs in the document as strings
            text = we.getText();
            // Output the document
            if (logger.isInfoEnabled())
                logger.info("Word Document has " + text.length() + " chars\n\n" + text);
        } catch (Exception e) {
            logger.warn("Error indexing file: " + file + "\nMessage: " + e.getMessage());
        }
    }
    return text;
}
Example 12
Project: apache-nutch-fork-master  File: PdfParser.java View source code
/**
 * Parses PDF content into text, title, and outlinks.
 *
 * <p>Returns an empty, failed parse result when the content length does not match the
 * {@code Content-Length} metadata, when decryption fails, or when any other exception is
 * raised while reading the document.
 *
 * @param content the fetched content holding the raw PDF bytes
 * @return the parse result for the content's URL
 */
public ParseResult getParse(Content content) {
    // in memory representation of pdf file
    PDDocument pdf = null;
    String text = null;
    String title = null;
    Metadata metadata = new Metadata();
    try {
        byte[] bytes = content.getContent();
        String declaredLength = content.getMetadata().get(Response.CONTENT_LENGTH);
        if (declaredLength != null && bytes.length != Integer.parseInt(declaredLength)) {
            ParseStatus truncated = new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + bytes.length + " bytes. Parser can't handle incomplete pdf file.");
            return truncated.getEmptyParseResult(content.getUrl(), getConf());
        }
        PDFParser pdfParser = new PDFParser(new ByteArrayInputStream(bytes));
        pdfParser.parse();
        pdf = pdfParser.getPDDocument();
        if (pdf.isEncrypted()) {
            // Just try using the default (empty) password and move on
            pdf.openProtection(new StandardDecryptionMaterial(""));
        }
        // collect text
        text = new PDFTextStripper().getText(pdf);
        // collect title
        PDDocumentInformation docInfo = pdf.getDocumentInformation();
        title = docInfo.getTitle();
        // more useful info, currently not used. please keep them for future use.
        metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getNumberOfPages()));
        metadata.add(Metadata.AUTHOR, docInfo.getAuthor());
        metadata.add(Metadata.SUBJECT, docInfo.getSubject());
        metadata.add(Metadata.KEYWORDS, docInfo.getKeywords());
        metadata.add(Metadata.CREATOR, docInfo.getCreator());
        metadata.add(Metadata.PUBLISHER, docInfo.getProducer());
        // TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
        // error here
        // metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
        // metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));
    } catch (CryptographyException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Error decrypting document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (BadSecurityHandlerException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Error decrypting document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
        // anything else, including run time exceptions
        if (LOG.isWarnEnabled()) {
            LOG.warn("General exception in PDF parser: " + e.getMessage());
            e.printStackTrace(LogUtil.getWarnStream(LOG));
        }
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Can't be handled as pdf document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } finally {
        try {
            if (pdf != null) {
                pdf.close();
            }
        } catch (IOException ignored) {
            // closing is best effort
        }
    }
    if (text == null) {
        text = "";
    }
    if (title == null) {
        title = "";
    }
    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata(), metadata);
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // any filter?
    // return HtmlParseFilters.filter(content, parse, root);
}
Example 13
Project: nutchbase-master  File: PdfParser.java View source code
/**
 * Turns fetched PDF content into a parse result containing its text, title, and outlinks.
 *
 * <p>A failed, empty parse result is returned if the byte count disagrees with the
 * {@code Content-Length} metadata, if the document cannot be decrypted, or if any other
 * exception occurs during parsing.
 *
 * @param content the fetched content whose bytes are a PDF file
 * @return the parse result keyed by the content's URL
 */
public ParseResult getParse(Content content) {
    // in memory representation of pdf file
    PDDocument pdf = null;
    String text = null;
    String title = null;
    Metadata metadata = new Metadata();
    try {
        byte[] payload = content.getContent();
        String expectedLength = content.getMetadata().get(Response.CONTENT_LENGTH);
        if (expectedLength != null && payload.length != Integer.parseInt(expectedLength)) {
            return new ParseStatus(
                    ParseStatus.FAILED,
                    ParseStatus.FAILED_TRUNCATED,
                    "Content truncated at " + payload.length + " bytes. Parser can't handle incomplete pdf file.")
                    .getEmptyParseResult(content.getUrl(), getConf());
        }
        PDFParser reader = new PDFParser(new ByteArrayInputStream(payload));
        reader.parse();
        pdf = reader.getPDDocument();
        if (pdf.isEncrypted()) {
            // Just try using the default (empty) password and move on
            pdf.openProtection(new StandardDecryptionMaterial(""));
        }
        // collect text
        PDFTextStripper textStripper = new PDFTextStripper();
        text = textStripper.getText(pdf);
        // collect title
        PDDocumentInformation properties = pdf.getDocumentInformation();
        title = properties.getTitle();
        // more useful info, currently not used. please keep them for future use.
        metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getNumberOfPages()));
        metadata.add(Metadata.AUTHOR, properties.getAuthor());
        metadata.add(Metadata.SUBJECT, properties.getSubject());
        metadata.add(Metadata.KEYWORDS, properties.getKeywords());
        metadata.add(Metadata.CREATOR, properties.getCreator());
        metadata.add(Metadata.PUBLISHER, properties.getProducer());
        // TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
        // error here
        // metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
        // metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));
    } catch (CryptographyException e) {
        return new ParseStatus(ParseStatus.FAILED, "Error decrypting document. " + e)
                .getEmptyParseResult(content.getUrl(), getConf());
    } catch (BadSecurityHandlerException e) {
        return new ParseStatus(ParseStatus.FAILED, "Error decrypting document. " + e)
                .getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
        // anything else, including run time exceptions
        if (LOG.isWarnEnabled()) {
            LOG.warn("General exception in PDF parser: " + e.getMessage());
            e.printStackTrace(LogUtil.getWarnStream(LOG));
        }
        return new ParseStatus(ParseStatus.FAILED, "Can't be handled as pdf document. " + e)
                .getEmptyParseResult(content.getUrl(), getConf());
    } finally {
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException ignored) {
                // closing is best effort
            }
        }
    }
    final String safeText = (text == null) ? "" : text;
    final String safeTitle = (title == null) ? "" : title;
    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(safeText, getConf());
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, safeTitle, outlinks, content.getMetadata(), metadata);
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(safeText, parseData));
    // any filter?
    // return HtmlParseFilters.filter(content, parse, root);
}
Example 14
Project: gnutch-master  File: PdfParser.java View source code
/**
 * Parses PDF content and returns its text, title, and outlinks as a parse result.
 *
 * <p>Encrypted documents are decrypted with an empty password. A failed, empty parse
 * result is returned when the content length disagrees with the {@code Content-Length}
 * metadata, when decryption fails, or when any other exception is raised.
 *
 * @param content the fetched content containing the raw PDF bytes
 * @return the parse result for the content's URL
 */
public ParseResult getParse(Content content) {
    // in memory representation of pdf file
    PDDocument pdf = null;
    String text = null;
    String title = null;
    Metadata metadata = new Metadata();
    try {
        byte[] body = content.getContent();
        String lengthHeader = content.getMetadata().get(Response.CONTENT_LENGTH);
        if (lengthHeader != null && body.length != Integer.parseInt(lengthHeader)) {
            ParseStatus truncated = new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + body.length + " bytes. Parser can't handle incomplete pdf file.");
            return truncated.getEmptyParseResult(content.getUrl(), getConf());
        }
        PDFParser pdfParser = new PDFParser(new ByteArrayInputStream(body));
        pdfParser.parse();
        pdf = pdfParser.getPDDocument();
        if (pdf.isEncrypted()) {
            // Just try using the default (empty) password and move on
            new DocumentEncryption(pdf).decryptDocument("");
        }
        // collect text
        text = new PDFTextStripper().getText(pdf);
        // collect title
        PDDocumentInformation details = pdf.getDocumentInformation();
        title = details.getTitle();
        // more useful info, currently not used. please keep them for future use.
        metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
        metadata.add(Metadata.AUTHOR, details.getAuthor());
        metadata.add(Metadata.SUBJECT, details.getSubject());
        metadata.add(Metadata.KEYWORDS, details.getKeywords());
        metadata.add(Metadata.CREATOR, details.getCreator());
        metadata.add(Metadata.PUBLISHER, details.getProducer());
        // TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
        // error here
        // metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
        // metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));
    } catch (CryptographyException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Error decrypting document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (InvalidPasswordException e) {
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Can't decrypt document - invalid password. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
        // anything else, including run time exceptions
        if (LOG.isWarnEnabled()) {
            LOG.warn("General exception in PDF parser: " + e.getMessage());
            e.printStackTrace(LogUtil.getWarnStream(LOG));
        }
        ParseStatus failed = new ParseStatus(ParseStatus.FAILED, "Can't be handled as pdf document. " + e);
        return failed.getEmptyParseResult(content.getUrl(), getConf());
    } finally {
        try {
            if (pdf != null) {
                pdf.close();
            }
        } catch (IOException ignored) {
            // closing is best effort
        }
    }
    if (text == null) {
        text = "";
    }
    if (title == null) {
        title = "";
    }
    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata(), metadata);
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // any filter?
    // return HtmlParseFilters.filter(content, parse, root);
}
Example 15
Project: Jorum-DSpace-master  File: PDFPackager.java View source code
/**
 * Maps the Info dictionary of a PDF onto Dublin Core fields of an item, then updates
 * the item.
 *
 * <p>PDF Info entry to DC field:
 * <pre>
 *   Title        -> title.null
 *   Author       -> contributor.author
 *   CreationDate -> date.created
 *   ModDate      -> date.created (fallback when there is no CreationDate)
 *   Creator      -> description.provenance (application that created the original)
 *   Producer     -> description.provenance (converter to PDF)
 *   Subject      -> description.abstract
 *   Keywords     -> subject.other
 * </pre>
 * Not a crosswalk plugin: the source data is not XML, so it does not fit that interface.
 *
 * @param context not used by this method
 * @param item the item that receives the metadata
 * @param metadata stream holding the PDF document
 * @throws MetadataValidationException if the PDF is encrypted or lacks a Title entry
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cos = null;
    try {
        PDFParser pdfParser = new PDFParser(metadata);
        pdfParser.parse();
        cos = pdfParser.getDocument();

        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (cos.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }

        PDDocumentInformation docinfo = new PDDocument(cos).getDocumentInformation();

        // sanity check: item must have a title.
        String title = docinfo.getTitle();
        if (title == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        log.debug("PDF Info dict title=\"" + title + "\"");
        item.addDC("title", null, "en", title);

        String author = docinfo.getAuthor();
        if (author != null) {
            item.addDC("contributor", "author", null, author);
            log.debug("PDF Info dict author=\"" + author + "\"");
        }

        String creator = docinfo.getCreator();
        if (creator != null) {
            item.addDC("description", "provenance", "en", "Application that created the original document: " + creator);
        }

        String producer = docinfo.getProducer();
        if (producer != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + producer);
        }

        String subject = docinfo.getSubject();
        if (subject != null) {
            item.addDC("description", "abstract", null, subject);
        }

        String keywords = docinfo.getKeywords();
        if (keywords != null) {
            item.addDC("subject", "other", null, keywords);
        }

        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar created = docinfo.getCreationDate();
        if (created == null) {
            created = docinfo.getModificationDate();
        }
        if (created != null) {
            item.addDC("date", "created", null, (new DCDate(created.getTime())).toString());
        }

        item.update();
    } finally {
        if (cos != null) {
            cos.close();
        }
    }
}
Example 16
Project: Docear-master  File: XMPUtilTest.java View source code
/**
 * Attaches a manually constructed XMP string to a PDF file as its metadata stream and
 * writes the file back to the same path.
 *
 * <p>For an encrypted document an error is printed and the JVM exits with status 1.
 *
 * @param tempFile the PDF file to load and overwrite
 * @param xmpString the XMP text to store
 * @throws Exception if loading, writing, or saving fails
 */
public void writeManually(File tempFile, String xmpString) throws Exception {
    PDDocument target = null;
    try {
        target = PDDocument.load(tempFile.getAbsoluteFile());
        if (target.isEncrypted()) {
            System.err.println("Error: Cannot add metadata to encrypted document.");
            System.exit(1);
        }
        PDDocumentCatalog docCatalog = target.getDocumentCatalog();

        // Convert to UTF8 and make available for metadata.
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        OutputStreamWriter xmpWriter = new OutputStreamWriter(buffer, "UTF8");
        xmpWriter.write(xmpString);
        xmpWriter.close();
        byte[] xmpBytes = buffer.toByteArray();

        PDMetadata xmpStream = new PDMetadata(target, new ByteArrayInputStream(xmpBytes), false);
        docCatalog.setMetadata(xmpStream);
        target.save(tempFile.getAbsolutePath());
    } finally {
        if (target != null) {
            target.close();
        }
    }
}
Example 17
Project: desktop-master  File: XMPUtilTest.java View source code
/**
 * Writes a manually constructed XMP string into a PDF file as its metadata and saves
 * the document over the original file.
 *
 * <p>An encrypted document causes an error message and a JVM exit with status 1.
 *
 * @param tempFile the PDF file to modify
 * @param xmpString the XMP content to embed
 * @throws Exception if the document cannot be loaded, updated, or saved
 */
public void writeManually(File tempFile, String xmpString) throws Exception {
    PDDocument loaded = null;
    try {
        loaded = PDDocument.load(tempFile.getAbsoluteFile());
        if (loaded.isEncrypted()) {
            System.err.println("Error: Cannot add metadata to encrypted document.");
            System.exit(1);
        }
        PDDocumentCatalog root = loaded.getDocumentCatalog();

        // Convert to UTF8 and make available for metadata.
        ByteArrayOutputStream encoded = new ByteArrayOutputStream();
        OutputStreamWriter out = new OutputStreamWriter(encoded, "UTF8");
        out.write(xmpString);
        out.close();

        ByteArrayInputStream metadataBytes = new ByteArrayInputStream(encoded.toByteArray());
        root.setMetadata(new PDMetadata(loaded, metadataBytes, false));
        loaded.save(tempFile.getAbsolutePath());
    } finally {
        if (loaded != null) {
            loaded.close();
        }
    }
}
Example 18
Project: caelum-stella-master  File: BoletoTransformerIntegrationTest.java View source code
/**
 * Verifies that the text of {@code arquivo.pdf} contains the value {@code 40,00}.
 *
 * @throws IOException if the PDF cannot be loaded, read, or closed
 */
@Test
public void testPDFWriterEscreveValorCorreto() throws IOException {
    PDFTextStripper stripper = new PDFTextStripper();
    PDDocument document = PDDocument.load(new File("arquivo.pdf"));
    String text;
    try {
        text = stripper.getText(document);
    } finally {
        // Close the document even when text extraction throws.
        document.close();
    }
    assertTrue(text.contains("40,00"));
}
Example 19
Project: novelang-master  File: HttpDaemonFixture.java View source code
/**
 * Extracts the text of a PDF given as a byte array.
 *
 * @param pdfBytes the bytes of the PDF document
 * @return the text produced by a {@code PDFTextStripper}
 * @throws IOException if loading, stripping, or closing the document fails
 */
public static String extractPdfText(final byte[] pdfBytes) throws IOException {
    final ByteArrayInputStream source = new ByteArrayInputStream(pdfBytes);
    final PDDocument loaded = PDDocument.load(source);
    try {
        final PDFTextStripper textStripper = new PDFTextStripper();
        return textStripper.getText(loaded);
    } finally {
        loaded.close();
    }
}
Example 20
Project: ecologylabFundamental-master  File: Environment.java View source code
/**
 * Reports whether the class {@code org.pdfbox.pdmodel.PDDocument} was found by
 * {@code checkFor}.
 *
 * <p>The check runs on the first call only; later calls return the stored answer.
 *
 * @return the stored result of the check
 */
public boolean hasPDFBox() {
    if (checkedForPDFBox) {
        return hasPDFBox;
    }
    checkedForPDFBox = true;
    hasPDFBox = checkFor("org.pdfbox.pdmodel.PDDocument");
    debug("hasPDFBox() = " + hasPDFBox);
    if (hasPDFBox) {
        ConsoleUtils.obtrusiveConsoleOutput("PDFBox Found");
    }
    return hasPDFBox;
}