Java Examples for org.apache.pdfbox.pdfparser.PDFParser

The following Java examples will help you understand how to use org.apache.pdfbox.pdfparser.PDFParser. These source code samples are taken from various open source projects.

Example 1
Project: java-wkhtmltopdf-wrapper-master  File: PdfTest.java View source code
/**
 * Renders an HTML string containing a non-ASCII character to PDF and checks that
 * the character can be read back from the resulting document's text.
 *
 * @throws Exception if rendering, parsing or text extraction fails
 */
@Test
public void testPdfFromStringTo() throws Exception {
    // GIVEN a html template containing special characters that java stores in utf-16 internally
    Pdf pdf = new Pdf();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Müller</h1></html>", PageType.htmlAsString);
    // WHEN the page is rendered to PDF bytes
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // THEN the bytes are a valid PDF (otherwise an IOException occurs)
    parser.parse();
    PDDocument document = new PDDocument(parser.getDocument());
    try {
        String pdfText = new PDFTextStripper().getText(document);
        Assert.assertThat("document should contain the creditorName", pdfText, containsString("Müller"));
    } finally {
        // Close the parsed document even when extraction or the assertion fails.
        document.close();
    }
}
Example 2
Project: wkhtmltopdf-master  File: PdfTest.java View source code
/**
 * Converts an HTML snippet with an umlaut into a PDF, parses the bytes back with
 * PDFBox and asserts that the extracted text still contains the umlaut.
 *
 * @throws Exception if the PDF cannot be produced, parsed or stripped
 */
@Test
public void testPdfFromStringTo() throws Exception {
    // GIVEN a html template containing special characters that java stores in utf-16 internally
    Pdf pdf = new Pdf();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Müller</h1></html>", PageType.htmlAsString);
    // WHEN
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // that is a valid PDF (otherwise an IOException occurs)
    parser.parse();
    PDDocument parsedDocument = new PDDocument(parser.getDocument());
    String pdfText;
    try {
        PDFTextStripper pdfTextStripper = new PDFTextStripper();
        pdfText = pdfTextStripper.getText(parsedDocument);
    } finally {
        // The document is only needed for extraction; close it before asserting.
        parsedDocument.close();
    }
    Assert.assertThat("document should contain the creditorName", pdfText, containsString("Müller"));
}
Example 3
Project: Europeana-Cloud-master  File: PdfBoxExtractor.java View source code
/**
 * Extracts the text of a PDF read from the given stream and stores the document's
 * information-dictionary entries in {@code extractedMetadata}.
 *
 * @param is stream containing the PDF; may be {@code null}
 * @return the extracted text, or {@code null} when {@code is} is {@code null} or an
 *         {@link IOException} occurred while parsing or extracting
 */
@Override
public String extractText(InputStream is) {
    if (is == null) {
        LOGGER.warn("No data for extraction.");
        return null;
    }
    String parsedText = null;
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    try {
        PDFParser parser = new PDFParser(is);
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        PDDocumentInformation info = pdDoc.getDocumentInformation();
        Set<String> mdKeys = info.getMetadataKeys();
        extractedMetadata = new HashMap<>();
        for (String key : mdKeys) {
            String value = (String) info.getPropertyStringValue(key);
            extractedMetadata.put(key, value);
        }
        //possible NULL pointer if document is encrypted
        parsedText = pdfStripper.getText(pdDoc);
    } catch (IOException ex) {
        LOGGER.warn("Can not extract text from pdf because: " + ex.getMessage());
    } finally {
        // Each close gets its own try so that a failure in the first one cannot
        // prevent the second one from running.
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (IOException ex) {
            LOGGER.warn("Can not close pdf document because: " + ex.getMessage());
        }
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (IOException ex) {
            LOGGER.warn("Can not close pdf document because: " + ex.getMessage());
        }
    }
    return parsedText;
}
Example 4
Project: extension-aws-master  File: PdfParser.java View source code
/**
 * Parses a PDF from the given stream and collects its text, page count, title and
 * the rounded width/height of the first page into a {@code Parse} result.
 *
 * <p>Failures are logged rather than thrown; whatever was collected before the
 * failure is still returned.
 *
 * @param inContent stream containing the PDF
 * @return the (possibly partially filled) result; never {@code null}
 */
public Parse parse(InputStream inContent) {
    Parse results = new Parse();
    PDDocument pdf = null;
    try {
        PDFParser parser = new PDFParser(inContent);
        parser.parse();
        pdf = parser.getPDDocument();
        if (pdf.isEncrypted()) {
            DocumentEncryption decryptor = new DocumentEncryption(pdf);
            // Just try using the default password and move on
            decryptor.decryptDocument("");
        }
        // collect text
        PDFTextStripper stripper = new PDFTextStripper();
        //TODO: Write this out to a temp file that will be indexed seperately
        String text;
        try {
            text = stripper.getText(pdf);
        } catch (Throwable e) {
            // Text extraction is best effort: fall back to empty text so that the
            // page count, title and dimensions below are still collected.
            log.error("Could not parse", e);
            text = "";
        }
        text = scrubChars(text);
        results.setText(text);
        results.setPages(pdf.getNumberOfPages());
        // collect title
        PDDocumentInformation info = pdf.getDocumentInformation();
        results.setTitle(info.getTitle());
        if (pdf.getNumberOfPages() > 0) {
            PDPage page = (PDPage) pdf.getDocumentCatalog().getAllPages().get(0);
            PDRectangle mediaBox = page.getMediaBox();
            if (mediaBox == null) {
                mediaBox = page.getArtBox();
            }
            if (mediaBox != null) {
                results.put("width", String.valueOf(Math.round(mediaBox.getWidth())));
                results.put("height", String.valueOf(Math.round(mediaBox.getHeight())));
            }
        }
    } catch (CryptographyException e) {
        // Pass the exception as the throwable argument so the stack trace is kept.
        log.error("Error decrypting document. ", e);
    } catch (InvalidPasswordException e) {
        log.error("Can't decrypt document - invalid password. ", e);
    } catch (Exception e) {
        log.error("Can't be handled as pdf document. ", e);
    } finally {
        try {
            if (pdf != null) {
                pdf.close();
            }
        } catch (IOException e) {
            log.error("Could not close pdf document. ", e);
        }
    }
    return results;
}
Example 5
Project: knowledge_vault-master  File: PdfTextExtractor.java View source code
//-------------------------------------------------------< TextExtractor >
/**
 * {@inheritDoc}
 *
 * <p>Strips the text layer of the PDF. When OCR is forced through
 * {@code Config.SYSTEM_PDF_FORCE_OCR}, or the stripped text is at most one character
 * long, every image found in the pages' resources is written to a temporary file,
 * run through {@code CuneiformTextExtractor} and the OCR results are returned instead.
 *
 * <p>Any exception is logged and turned into an empty reader. The given stream is
 * always closed.
 */
@SuppressWarnings("rawtypes")
public Reader extractText(InputStream stream, String type, String encoding) throws IOException {
    try {
        PDFParser parser = new PDFParser(new BufferedInputStream(stream));
        try {
            parser.parse();
            PDDocument document = parser.getPDDocument();
            CharArrayWriter writer = new CharArrayWriter();
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
            stripper.writeText(document, writer);
            String st = writer.toString().trim();
            log.debug("TextStripped: '{}'", st);
            if (Config.SYSTEM_PDF_FORCE_OCR || st.length() <= 1) {
                log.warn("PDF does not contains text layer");
                // Extract images from PDF
                List pages = document.getDocumentCatalog().getAllPages();
                StringBuilder sb = new StringBuilder();
                for (Iterator itPg = pages.iterator(); itPg.hasNext(); ) {
                    PDPage page = (PDPage) itPg.next();
                    PDResources resources = page.getResources();
                    // A page without its own resources is skipped instead of failing
                    // with a NullPointerException that would discard all OCR output.
                    Map images = (resources != null) ? resources.getImages() : null;
                    if (images != null) {
                        for (Iterator itImg = images.keySet().iterator(); itImg.hasNext(); ) {
                            String key = (String) itImg.next();
                            PDXObjectImage image = (PDXObjectImage) images.get(key);
                            // File.createTempFile requires a prefix of at least three
                            // characters, and image keys can be shorter than that.
                            File pdfImg = File.createTempFile("img" + key, "." + image.getSuffix());
                            try {
                                log.debug("Writing image: {}", pdfImg.getPath());
                                image.write2file(pdfImg);
                                String txt = new CuneiformTextExtractor().doOcr(pdfImg);
                                sb.append(txt).append(" ");
                                log.debug("OCR Extracted: {}", txt);
                            } finally {
                                // Remove the temp file even when writing or OCR fails.
                                FileUtils.deleteQuietly(pdfImg);
                            }
                        }
                    }
                }
                return new StringReader(sb.toString());
            } else {
                return new CharArrayReader(writer.toCharArray());
            }
        } finally {
            try {
                PDDocument doc = parser.getPDDocument();
                if (doc != null) {
                    doc.close();
                }
            } catch (IOException ignored) {
                // Best-effort cleanup: a failure here must not replace the result or
                // the exception produced by the extraction above.
            }
        }
    } catch (Exception e) {
        log.warn("Failed to extract PDF text content", e);
        return new StringReader("");
    } finally {
        stream.close();
    }
}
Example 6
Project: nuxeo-versions-difference-master  File: TestPdfBoxN.java View source code
/**
 * Opens and parses the PDF at the given path, storing the file, parser, COS document,
 * text stripper and PD document in the corresponding fields.
 *
 * @param FileName path of the PDF file to load
 * @return {@code true} when the file was parsed; {@code false} when it does not exist,
 *         cannot be opened or cannot be parsed (the reason is printed to stderr)
 * @throws Exception declared for callers; failures handled here are reported via the return value
 */
private boolean setMain(String FileName) throws Exception {
    file = new File(FileName);
    if (!file.isFile()) {
        // Report the path that was actually requested, not a fixed placeholder name.
        System.err.println("File " + FileName + " does not exist.");
        return false;
    }
    FileInputStream input = null;
    try {
        input = new FileInputStream(file);
        parser = new PDFParser(input);
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        if (input != null) {
            // The parser never took over the stream, so release it here.
            try {
                input.close();
            } catch (IOException ignored) {
                // Nothing more can be done; the open failure was already reported.
            }
        }
        return false;
    }
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
    } catch (Exception e) {
        System.err.println("Unable to parse PDF. " + e.getMessage());
        return false;
    }
    return true;
}
Example 7
Project: PDF-to-unusual-HTML-master  File: Overlay.java View source code
/**
 * Parses the PDF file at the given path and returns the resulting document.
 * The file stream is closed before this method returns.
 *
 * @param filename path of the PDF file
 * @return the document produced by the parser
 * @throws IOException if the file cannot be opened, parsed or closed
 */
private static PDDocument getDocument(String filename) throws IOException {
    // Opened outside the try: if this throws there is no stream to close.
    final FileInputStream pdfStream = new FileInputStream(filename);
    try {
        final PDFParser pdfParser = new PDFParser(pdfStream);
        pdfParser.parse();
        return pdfParser.getPDDocument();
    } finally {
        pdfStream.close();
    }
}
Example 8
Project: sakai-cle-master  File: PDFContentDigester.java View source code
/**
 * Returns the text of a PDF content resource.
 *
 * @param contentResource the resource whose stream holds the PDF; must not be {@code null}
 * @return the stripped text passed through {@code SearchUtils.appendCleanString}, or
 *         {@code null} when the parser yields no document
 * @throws RuntimeException if {@code contentResource} is {@code null}, or wrapping a
 *         {@code ServerOverloadException} or {@link IOException} raised while reading
 */
public String getContent(ContentResource contentResource) {
    if (contentResource == null) {
        throw new RuntimeException("Null contentResource passed to getContent");
    }
    InputStream resourceStream = null;
    PDDocument document = null;
    try {
        resourceStream = contentResource.streamContent();
        PDFParser pdfParser = new PDFParser(new BufferedInputStream(resourceStream));
        pdfParser.parse();
        document = pdfParser.getPDDocument();
        if (document == null) {
            return null;
        }
        PDFTextStripper textStripper = new PDFTextStripper();
        textStripper.setLineSeparator("\n");
        CharArrayWriter buffer = new CharArrayWriter();
        textStripper.writeText(document, buffer);
        return SearchUtils.appendCleanString(buffer.toCharArray(), null).toString();
    } catch (ServerOverloadException e) {
        // Fall back to toString() when the exception carries no message.
        final String message = e.getMessage();
        final String detail = (message != null) ? message : e.toString();
        throw new RuntimeException("Failed to get content for indexing: cause: ServerOverloadException: " + detail, e);
    } catch (IOException e) {
        final String message = e.getMessage();
        final String detail = (message != null) ? message : e.toString();
        throw new RuntimeException("Failed to get content for indexing: cause: IOException:  " + detail, e);
    } finally {
        // Close the document first, then the underlying stream; failures are only logged.
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                log.debug(e);
            }
        }
        if (resourceStream != null) {
            try {
                resourceStream.close();
            } catch (IOException e) {
                log.debug(e);
            }
        }
    }
}
Example 9
Project: with-aes-master  File: Overlay.java View source code
/**
 * Reads the named file through a {@code PDFParser} and hands back the parsed document.
 * The underlying file stream is closed on every exit path.
 *
 * @param filename location of the PDF on disk
 * @return the parsed document
 * @throws IOException when opening, parsing or closing the file fails
 */
private static PDDocument getDocument(String filename) throws IOException {
    final FileInputStream fileStream = new FileInputStream(filename);
    final PDDocument parsed;
    try {
        final PDFParser fileParser = new PDFParser(fileStream);
        fileParser.parse();
        parsed = fileParser.getPDDocument();
    } finally {
        // No null check needed: the stream exists whenever this block is reached.
        fileStream.close();
    }
    return parsed;
}
Example 10
Project: leech-master  File: LeechConfig.java View source code
/**
 * Builds the composite parser and detector for this configuration and switches off
 * a fixed set of PDFBox loggers.
 */
protected void init() {
    LinkedList<Parser> parserChain = new LinkedList<Parser>();
    // the default parser from the TikaConfig
    parserChain.add(super.getParser());
    // the Leech datasource crawler parsers - later entries take priority, so this is
    // where e.g. the original HTML parser can be overridden
    parserChain.add(new DirectoryCrawlerParser());
    parserChain.add(new HtmlCrawlerParser());
    parserChain.add(new ImapCrawlerParser());
    m_parser = new CompositeParser(this.getMediaTypeRegistry(), parserChain);
    m_detector = new LeechDefaultDetector(m_parser);

    // The loggers are kept in a field because the level setting presumably only
    // lasts as long as a live reference to these objects exists.
    final String[] pdfBoxLoggerNames = {
        "org.apache.pdfbox.util.PDFStreamEngine",
        "org.apache.pdfbox.encoding.Encoding",
        "org.apache.pdfbox.pdfparser.BaseParser",
        "org.apache.pdfbox.pdmodel.font.PDSimpleFont",
        "org.apache.pdfbox.pdfparser.XrefTrailerResolver",
        "org.apache.pdfbox.filter.FlateFilter",
        "org.apache.pdfbox.pdfparser.PDFParser",
        "org.apache.pdfbox.util.operator.SetTextFont",
        "org.apache.pdfbox.*"
    };
    for (String loggerName : pdfBoxLoggerNames) {
        m_llPdfBoxLogger.add(Logger.getLogger(loggerName));
    }
    for (Logger pdfBoxLogger : m_llPdfBoxLogger) {
        pdfBoxLogger.setLevel(Level.OFF);
    }
}
Example 11
Project: MEditor-master  File: GetOcrFromPdfHandler.java View source code
/**
 * Extracts the text of the PDF file at the given path.
 *
 * @param fileName path of the PDF file
 * @return the text produced by {@code PDFTextStripper}
 * @throws ActionException if the path is not a regular file, or the parser cannot be
 *         created, or parsing/extraction fails
 */
private String pdftoText(String fileName) throws ActionException {
    final File source = new File(fileName);
    if (!source.isFile()) {
        LOGGER.error("The file: " + fileName + " does not exist.");
        throw new ActionException("Unable to parse the pdf file.");
    }

    final PDFParser pdfParser;
    try {
        pdfParser = new PDFParser(new RandomAccessBufferedFileInputStream(new FileInputStream(source)));
    } catch (Exception e) {
        LOGGER.error("Unable to open PDF Parser.: " + e);
        e.printStackTrace();
        throw new ActionException("Unable to parse the pdf file.");
    }

    COSDocument lowLevelDoc = null;
    PDDocument highLevelDoc = null;
    try {
        pdfParser.parse();
        lowLevelDoc = pdfParser.getDocument();
        final PDFTextStripper textStripper = new PDFTextStripper();
        highLevelDoc = new PDDocument(lowLevelDoc);
        return textStripper.getText(highLevelDoc);
    } catch (Exception e) {
        LOGGER.error("An exception occured in parsing the PDF Document.");
        e.printStackTrace();
        throw new ActionException("Unable to parse the pdf file. " + e);
    } finally {
        // Cleanup failures are printed but never replace the result or the exception above.
        try {
            if (lowLevelDoc != null) {
                lowLevelDoc.close();
            }
            if (highLevelDoc != null) {
                highLevelDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Example 12
Project: seng310-ebookme-master  File: PdfExtractor.java View source code
/**
 * Parses the stream as a PDF and passes the resulting document to
 * {@code processDocument} together with the result map.
 *
 * @param stream   stream containing the PDF
 * @param charset  not used by this method
 * @param mimeType not used by this method
 * @param result   map handed on to {@code processDocument}
 * @throws ExtractorException wrapping any {@link IOException} raised while parsing
 *         or while closing the document
 */
public void extract(InputStream stream, Charset charset, String mimeType, Map result) throws ExtractorException {
    // Step 1: build the PDDocument; nothing needs closing if this step fails.
    final PDDocument pdf;
    try {
        PDFParser pdfParser = new PDFParser(stream);
        pdfParser.parse();
        pdf = pdfParser.getPDDocument();
    } catch (IOException e) {
        throw new ExtractorException(e);
    }

    // Step 2: decrypt and extract info, always closing the document afterwards.
    try {
        processDocument(pdf, result);
    } finally {
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException e) {
                throw new ExtractorException(e);
            }
        }
    }
}
Example 13
Project: cider-master  File: pdfIdiom.java View source code
/**
 * Parses a PDF data source into a model holding the document's author, subject,
 * title, keywords and text content as properties of a single resource.
 *
 * <p>Encrypted documents are opened with an empty password and rejected when the
 * resulting permissions do not allow content extraction. The PDF document is closed
 * on every exit path once it has been opened.
 *
 * @param source the data source providing the PDF stream and, optionally, its URI
 * @return the populated model
 * @throws ParserException if the PDF cannot be read, decrypted or its text extracted
 */
@Override
public Model parse(DataSource source) throws ParserException {
    // create an empty Model
    Model model = ModelFactory.createDefaultModel();
    Resource resource = source.hasURI() ? model.createResource(source.getURI().toNormalform(true, true)) : model.createResource();
    // open pdf document
    final PDDocument theDocument;
    try {
        final PDFParser parser = new PDFParser(source.getStream());
        parser.parse();
        theDocument = parser.getPDDocument();
    } catch (IOException e) {
        log.error(e.getMessage(), e);
        throw new ParserException(e.getMessage(), source.getURI());
    }
    // Set once the regular close below has been attempted, so that the finally block
    // only closes the document on the paths that would otherwise leak it.
    boolean closeAttempted = false;
    try {
        if (theDocument.isEncrypted()) {
            try {
                theDocument.openProtection(new StandardDecryptionMaterial(""));
            } catch (BadSecurityHandlerException e) {
                throw new ParserException("PDF Encrypted (BadSecurityHandlerException): " + e.getMessage(), source.getURI(), e);
            } catch (IOException e) {
                throw new ParserException("PDF Encrypted (IOException): " + e.getMessage(), source.getURI(), e);
            } catch (CryptographyException e) {
                throw new ParserException("PDF Encrypted (CryptographyException): " + e.getMessage(), source.getURI(), e);
            }
            final AccessPermission perm = theDocument.getCurrentAccessPermission();
            if (perm == null || !perm.canExtractContent()) {
                throw new ParserException("PDF cannot be decrypted", source.getURI());
            }
        }
        // get metadata
        final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
        String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
        if (theDocInfo != null) {
            docTitle = theDocInfo.getTitle();
            docSubject = theDocInfo.getSubject();
            docAuthor = theDocInfo.getAuthor();
            docKeywordStr = theDocInfo.getKeywords();
        }
        if (docAuthor != null && docAuthor.length() > 0) {
            resource.addProperty(VCARD.FN, docAuthor);
            resource.addProperty(DC.creator, docAuthor);
        }
        if (docSubject != null && docSubject.length() > 0) {
            resource.addProperty(DC.subject, docSubject);
        }
        if (docTitle != null && docTitle.length() > 0) {
            resource.addProperty(DC.title, docTitle);
        }
        if (docKeywordStr != null && docKeywordStr.length() > 0) {
            // keywords may be separated by spaces or commas
            String[] docKeywords = docKeywordStr.split(" |,");
            resource.addProperty(DC.coverage, concat(docKeywords));
        }
        // get the content
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        Writer writer;
        try {
            writer = new OutputStreamWriter(baos, "UTF-8");
        } catch (UnsupportedEncodingException e1) {
            writer = new OutputStreamWriter(baos);
        }
        try {
            final PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText(theDocument, writer);
            closeAttempted = true;
            theDocument.close();
            writer.close();
        } catch (IOException e) {
            try {
                writer.close();
            } catch (final Exception ignored) {
                // The IOException reported below is the failure that matters.
            }
            throw new ParserException("PDF content reader", source.getURI(), e);
        }
        String content;
        try {
            content = new String(baos.toByteArray(), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            content = new String(baos.toByteArray());
        }
        if (content != null && content.length() > 0) {
            resource.addProperty(CIDER.data_content_text, content);
        }
        return model;
    } finally {
        if (!closeAttempted) {
            // Reached only on failure paths (decryption error, missing permission,
            // text extraction error): release the document without masking the cause.
            try {
                theDocument.close();
            } catch (IOException e) {
                log.error("Failed to close PDF document", e);
            }
        }
    }
}
Example 14
Project: streamflow-core-master  File: Underlay.java View source code
/**
 * Loads a PDF document from a file.
 *
 * @param filename name of the file to parse
 * @return the document obtained from the parser after parsing
 * @throws IOException if the file cannot be opened or parsed, or the stream cannot be closed
 */
private static PDDocument getDocument(String filename) throws IOException {
    FileInputStream in = new FileInputStream(filename);
    try {
        PDFParser documentParser = new PDFParser(in);
        documentParser.parse();
        PDDocument loaded = documentParser.getPDDocument();
        return loaded;
    } finally {
        // Runs before the return completes, so the stream is closed either way.
        in.close();
    }
}
Example 15
Project: converge-1.x-master  File: MetaDataService.java View source code
/**
 * {@inheritDoc}
 *
 * <p>Extracts text from PDF renditions via PDFBox and from Word renditions via
 * {@code WordExtractor}. Renditions with any other content type, or a {@code null}
 * content type, yield an empty string. Failures are logged and also yield whatever
 * text was extracted before the failure (normally the empty string).
 */
@Override
public String extractContent(MediaItemRendition mir) {
    String contentType = mir.getContentType();
    String story = "";
    if (contentType == null) {
        LOG.log(Level.WARNING, "Content type is null");
        return story;
    }
    if (contentType.equals("application/pdf")) {
        // Extract text in PDF
        try {
            URL originalFile = new URL(mir.getAbsoluteFilename());
            PDDocument doc = null;
            try {
                // Read PDF
                PDFParser parser = new PDFParser(originalFile.openStream());
                parser.parse();
                COSDocument cosDoc = parser.getDocument();
                // Assign to the variable the finally block checks, so the document
                // really is closed once the text has been extracted.
                doc = new PDDocument(cosDoc);
                PDFTextStripper stripper = new PDFTextStripper();
                story = stripper.getText(doc);
            } catch (IOException ex) {
                LOG.log(Level.SEVERE, ex.getMessage());
                LOG.log(Level.FINEST, "", ex);
            } finally {
                if (doc != null) {
                    try {
                        doc.close();
                    } catch (IOException ex) {
                        LOG.log(Level.SEVERE, ex.getMessage());
                        LOG.log(Level.FINEST, "", ex);
                    }
                }
            }
        } catch (MalformedURLException ex) {
            // Report a bad file URL the same way the Word branch below reports it.
            LOG.log(Level.SEVERE, ex.getMessage());
            LOG.log(Level.FINEST, "", ex);
        }
    } else if (contentType.equals("application/msword") || contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
        try {
            URL originalFile = new URL(mir.getAbsoluteFilename());
            // Fully qualified so that no additional import is required.
            java.io.InputStream in = originalFile.openStream();
            try {
                HWPFDocument doc = new HWPFDocument(in);
                WordExtractor extractor = new WordExtractor(doc);
                story = extractor.getText();
            } finally {
                in.close();
            }
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, ex.getMessage());
            LOG.log(Level.FINEST, "", ex);
        }
    }
    return story;
}
Example 16
Project: dlibrary-master  File: PDFPackager.java View source code
/**
 * Reads the Info dictionary of a PDF and adds its entries to the item as Dublin Core
 * values, then updates the item.
 *
 * @param context  not used by this method
 * @param item     item receiving the metadata
 * @param metadata stream containing the PDF
 * @throws MetadataValidationException (via the declared {@code CrosswalkException}) if
 *         the PDF has an encryption dictionary or no title
 * @throws IOException if parsing or closing the document fails
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cosDocument = null;
    try {
        PDFParser pdfParser = new PDFParser(metadata);
        pdfParser.parse();
        cosDocument = pdfParser.getDocument();

        // PDFBox breaks on encrypted documents, so refuse them outright.
        if (cosDocument.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }

        /*
         * PDF to DC "crosswalk". It is done here rather than in a crosswalk plugin
         * because (a) it is not useful anywhere else and (b) the source data is not
         * XML, so it does not fit the plugin interface.
         *
         * Mapping of PDF Info dictionary entries to DC:
         *   Title        -> title.null
         *   Author       -> contributor.author
         *   CreationDate -> date.created
         *   ModDate      -> date.created
         *   Creator      -> description.provenance (application that created orig)
         *   Producer     -> description.provenance (convertor to pdf)
         *   Subject      -> description.abstract
         *   Keywords     -> subject.other
         * Dates are java.util.Calendar values.
         */
        PDDocumentInformation info = new PDDocument(cosDocument).getDocumentInformation();

        // An item must have a title.
        String title = info.getTitle();
        if (title == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + title + "\"");
        }
        item.addDC("title", null, "en", title);

        String author = info.getAuthor();
        if (author != null) {
            item.addDC("contributor", "author", null, author);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + author + "\"");
            }
        }

        String creator = info.getCreator();
        if (creator != null) {
            item.addDC("description", "provenance", "en", "Application that created the original document: " + creator);
        }

        String producer = info.getProducer();
        if (producer != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + producer);
        }

        String subject = info.getSubject();
        if (subject != null) {
            item.addDC("description", "abstract", null, subject);
        }

        String keywords = info.getKeywords();
        if (keywords != null) {
            item.addDC("subject", "other", null, keywords);
        }

        // Either CreationDate or, failing that, ModDate becomes "date.created";
        // the DC has no place for a "last modified" date.
        Calendar created = info.getCreationDate();
        if (created == null) {
            created = info.getModificationDate();
        }
        if (created != null) {
            item.addDC("date", "created", null, (new DCDate(created.getTime())).toString());
        }

        item.update();
    } finally {
        if (cosDocument != null) {
            cosDocument.close();
        }
    }
}
Example 17
Project: DSpace-master  File: PDFPackager.java View source code
/**
 * Reads the Info dictionary of a PDF and stores its entries on the item as Dublin
 * Core metadata through {@code itemService}, then updates the item.
 *
 * @param context  context passed to every {@code itemService} call
 * @param item     item receiving the metadata
 * @param metadata stream containing the PDF
 * @throws MetadataValidationException (via the declared {@code CrosswalkException}) if
 *         the PDF has an encryption dictionary or no title
 * @throws IOException if parsing fails or a resource cannot be closed
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cos = null;
    // Kept in a variable so it can be closed below; closing the COSDocument alone
    // does not appear to release this wrapper (assumption based on PDFBox 2.x - verify).
    RandomAccessBufferedFileInputStream source = null;
    try {
        ScratchFile scratchFile = null;
        try {
            // use up to 80% of JVM free memory
            long useRAM = Runtime.getRuntime().freeMemory() * 80 / 100;
            // then fallback to temp file (unlimited size)
            scratchFile = new ScratchFile(MemoryUsageSetting.setupMixed(useRAM));
        } catch (IOException ioe) {
            log.warn("Error initializing scratch file: " + ioe.getMessage());
        }
        source = new RandomAccessBufferedFileInputStream(metadata);
        PDFParser parser = new PDFParser(source, scratchFile);
        parser.parse();
        cos = parser.getDocument();
        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (cos.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }
        /* PDF to DC "crosswalk":
             *
             * NOTE: This is not in a crosswalk plugin because (a) it isn't
             * useful anywhere else, and more importantly, (b) the source
             * data is not XML so it doesn't fit the plugin's interface.
             *
             * pattern of crosswalk -- PDF dict entries to DC:
             *   Title -> title.null
             *   Author -> contributor.author
             *   CreationDate -> date.created
             *   ModDate -> date.created
             *   Creator -> description.provenance (application that created orig)
             *   Producer -> description.provenance (convertor to pdf)
             *   Subject -> description.abstract
             *   Keywords -> subject.other
             *    date is java.util.Calendar
             */
        PDDocument pd = new PDDocument(cos);
        PDDocumentInformation docinfo = pd.getDocumentInformation();
        String title = docinfo.getTitle();
        // sanity check: item must have a title.
        if (title == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + title + "\"");
        }
        itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "title", null, "en", title);
        String value = docinfo.getAuthor();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "contributor", "author", null, value);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + value + "\"");
            }
        }
        value = docinfo.getCreator();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "description", "provenance", "en", "Application that created the original document: " + value);
        }
        value = docinfo.getProducer();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "description", "provenance", "en", "Original document converted to PDF by: " + value);
        }
        value = docinfo.getSubject();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "description", "abstract", null, value);
        }
        value = docinfo.getKeywords();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "subject", "other", null, value);
        }
        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar calValue = docinfo.getCreationDate();
        if (calValue == null) {
            calValue = docinfo.getModificationDate();
        }
        if (calValue != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "date", "created", null, (new DCDate(calValue.getTime())).toString());
        }
        itemService.update(context, item);
    } finally {
        // Nested so that the input wrapper is closed even if closing the
        // COSDocument throws.
        try {
            if (cos != null) {
                cos.close();
            }
        } finally {
            if (source != null) {
                source.close();
            }
        }
    }
}
Example 18
Project: DSpace-SVN-Deprecated-master  File: PDFPackager.java View source code
/**
 * Transfers the entries of a PDF's Info dictionary to the given item as Dublin Core
 * values and updates the item afterwards.
 *
 * @param context  not used by this method
 * @param item     target of the Dublin Core values
 * @param metadata stream from which the PDF is parsed
 * @throws MetadataValidationException (via the declared {@code CrosswalkException}) when
 *         the PDF carries an encryption dictionary or lacks a title
 * @throws IOException when the PDF cannot be parsed or the document cannot be closed
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument parsedPdf = null;
    try {
        PDFParser metadataParser = new PDFParser(metadata);
        metadataParser.parse();
        parsedPdf = metadataParser.getDocument();

        // Encrypted documents are rejected because PDFBox breaks on them.
        boolean encrypted = parsedPdf.getEncryptionDictionary() != null;
        if (encrypted) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }

        /*
         * The PDF-to-DC crosswalk lives here and not in a crosswalk plugin: it is
         * of no use elsewhere and, more importantly, the source data is not XML
         * and so does not fit the plugin's interface.
         *
         * PDF dict entry -> DC field:
         *   Title        -> title.null
         *   Author       -> contributor.author
         *   CreationDate -> date.created
         *   ModDate      -> date.created
         *   Creator      -> description.provenance (application that created orig)
         *   Producer     -> description.provenance (convertor to pdf)
         *   Subject      -> description.abstract
         *   Keywords     -> subject.other
         * (dates are java.util.Calendar)
         */
        PDDocument pdfDocument = new PDDocument(parsedPdf);
        PDDocumentInformation infoDict = pdfDocument.getDocumentInformation();

        // A title is mandatory.
        String pdfTitle = infoDict.getTitle();
        if (pdfTitle == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + pdfTitle + "\"");
        }
        item.addDC("title", null, "en", pdfTitle);

        String pdfAuthor = infoDict.getAuthor();
        if (pdfAuthor != null) {
            item.addDC("contributor", "author", null, pdfAuthor);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + pdfAuthor + "\"");
            }
        }

        String creatingApplication = infoDict.getCreator();
        if (creatingApplication != null) {
            item.addDC("description", "provenance", "en", "Application that created the original document: " + creatingApplication);
        }

        String pdfProducer = infoDict.getProducer();
        if (pdfProducer != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + pdfProducer);
        }

        String pdfSubject = infoDict.getSubject();
        if (pdfSubject != null) {
            item.addDC("description", "abstract", null, pdfSubject);
        }

        String pdfKeywords = infoDict.getKeywords();
        if (pdfKeywords != null) {
            item.addDC("subject", "other", null, pdfKeywords);
        }

        // "date.created" comes from CreationDate, with ModDate as the fallback;
        // the DC offers no field for "last modified".
        Calendar dateCreated = infoDict.getCreationDate();
        if (dateCreated == null) {
            dateCreated = infoDict.getModificationDate();
        }
        if (dateCreated != null) {
            item.addDC("date", "created", null, (new DCDate(dateCreated.getTime())).toString());
        }

        item.update();
    } finally {
        if (parsedPdf != null) {
            parsedPdf.close();
        }
    }
}
Example 19
Project: gsearch-master  File: TransformerToText.java View source code
/**
 * Extracts the text of an in-memory PDF and blanks out low control characters.
 *
 * The work is done in discrete steps (wrap the bytes in a stream, create the
 * parser, parse, fetch the COS document, wrap it in a PDDocument, create the
 * text stripper, strip the text). Any exception raised inside a step is
 * rethrown as a GenericSearchException whose message names that step and
 * which is constructed with the original exception. Whatever resources exist
 * at that point are handed to the close helpers first; after text stripping
 * they are closed in a finally block regardless of outcome.
 *
 * @param doc the raw bytes of the PDF
 * @return the stripped text, with every character below 32 other than tab,
 *         line feed and carriage return replaced by a space
 * @throws GenericSearchException if any of the steps above throws
 */
private StringBuffer getTextFromPDF(byte[] doc) throws GenericSearchException {
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF");
    }

    // Step 1: wrap the bytes in a stream.
    ByteArrayInputStream input = null;
    try {
        input = new ByteArrayInputStream(doc);
    } catch (Exception e) {
        // input is still null here; the helper is called with null.
        closeBAIS(input);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF new ByteArrayInputStream: ", e);
        }
        throw new GenericSearchException("getTextFromPDF new ByteArrayInputStream: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF new ByteArrayInputStream");
    }

    // Step 2: create the parser over the stream.
    PDFParser pdfParser;
    try {
        pdfParser = new PDFParser(input);
    } catch (Exception e) {
        closeBAIS(input);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF new PDFParser: ", e);
        }
        throw new GenericSearchException("getTextFromPDF new PDFParser: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF new PDFParser");
    }

    // Step 3: parse.
    try {
        pdfParser.parse();
    } catch (Exception e) {
        closeBAIS(input);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF parser.parse: ", e);
        }
        throw new GenericSearchException("getTextFromPDF parser.parse: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF parser.parse");
    }

    // Step 4: fetch the low-level document.
    COSDocument cosDocument = null;
    try {
        cosDocument = pdfParser.getDocument();
    } catch (Exception e) {
        closeBAIS(input);
        closeCOSDocument(cosDocument);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF parser.getDocument: ", e);
        }
        throw new GenericSearchException("getTextFromPDF parser.getDocument: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF parser.getDocument");
    }

    // Step 5: wrap it in the high-level document.
    PDDocument pdDocument = null;
    try {
        pdDocument = new PDDocument(cosDocument);
    } catch (Exception e) {
        closeBAIS(input);
        closeCOSDocument(cosDocument);
        closePDDocument(pdDocument);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF new PDDocument: ", e);
        }
        throw new GenericSearchException("getTextFromPDF new PDDocument: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF new PDDocument isEncrypted=" + pdDocument.isEncrypted() + " getNumberOfPages=" + pdDocument.getNumberOfPages());
    }

    // Step 6: create the text stripper.
    PDFTextStripper textStripper;
    try {
        textStripper = new PDFTextStripper();
    } catch (Exception e) {
        closeBAIS(input);
        closeCOSDocument(cosDocument);
        closePDDocument(pdDocument);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF new PDFTextStripper: ", e);
        }
        throw new GenericSearchException("getTextFromPDF new PDFTextStripper: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF new PDFTextStripper getStartPage=" + textStripper.getStartPage() + " getEndPage=" + textStripper.getEndPage());
    }

    // Step 7: strip the text; resources are released here on success and on failure.
    String extracted;
    try {
        extracted = textStripper.getText(pdDocument);
    } catch (Exception e) {
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF stripper.getText: ", e);
        }
        throw new GenericSearchException("getTextFromPDF stripper.getText: ", e);
    } finally {
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF stripper.getText finally");
        }
        closeBAIS(input);
        closeCOSDocument(cosDocument);
        closePDDocument(pdDocument);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF stripper.getText");
    }

    // Put a space in place of characters not allowed in the indexing stylesheet:
    // everything below 32 except tab (9), line feed (10) and carriage return (13).
    StringBuffer sanitized = new StringBuffer(extracted);
    for (int index = 0; index < sanitized.length(); index++) {
        char current = sanitized.charAt(index);
        boolean allowed = current >= 32 || current == 9 || current == 10 || current == 13;
        if (allowed) {
            continue;
        }
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF index=" + index + " char=" + current + " set to 32");
        }
        sanitized.setCharAt(index, ' ');
    }
    return sanitized;
}
Example 20
Project: vtechworks-master  File: PDFPackager.java View source code
/**
 * Parses a PDF from the given stream and copies the entries of its Info
 * dictionary onto the item as Dublin Core values, then calls item.update().
 *
 * Mapping applied (PDF Info entry to DC element.qualifier):
 *   Title        to title (no qualifier, language "en")
 *   Author       to contributor.author
 *   Creator      to description.provenance (application that created the original)
 *   Producer     to description.provenance (converter to PDF)
 *   Subject      to description.abstract
 *   Keywords     to subject.other
 *   CreationDate to date.created, falling back to ModDate when absent
 *
 * This lives here rather than in a crosswalk plugin because the source data
 * is not XML and so does not fit the plugin interface.
 *
 * @param context not used by this method
 * @param item the item that receives the DC values
 * @param metadata stream holding the PDF; it is not closed here
 * @throws MetadataValidationException if the PDF has an encryption dictionary
 *         or has no Title entry
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cosDocument = null;
    try {
        PDFParser pdfParser = new PDFParser(metadata);
        pdfParser.parse();
        cosDocument = pdfParser.getDocument();

        // Sanity check: PDFBox breaks on encrypted documents, so give up early.
        if (null != cosDocument.getEncryptionDictionary()) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }

        PDDocument pdDocument = new PDDocument(cosDocument);
        PDDocumentInformation info = pdDocument.getDocumentInformation();

        // Sanity check: a title is mandatory.
        final String title = info.getTitle();
        if (null == title) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + title + "\"");
        }
        item.addDC("title", null, "en", title);

        final String author = info.getAuthor();
        if (null != author) {
            item.addDC("contributor", "author", null, author);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + author + "\"");
            }
        }

        final String creator = info.getCreator();
        if (null != creator) {
            item.addDC("description", "provenance", "en", "Application that created the original document: " + creator);
        }

        final String producer = info.getProducer();
        if (null != producer) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + producer);
        }

        final String subject = info.getSubject();
        if (null != subject) {
            item.addDC("description", "abstract", null, subject);
        }

        final String keywords = info.getKeywords();
        if (null != keywords) {
            item.addDC("subject", "other", null, keywords);
        }

        // DC has no slot for "last modified", so a single date.created is
        // recorded: the creation date if present, otherwise the modification date.
        Calendar creationDate = info.getCreationDate();
        Calendar effectiveDate = (creationDate != null) ? creationDate : info.getModificationDate();
        if (null != effectiveDate) {
            DCDate dcDate = new DCDate(effectiveDate.getTime());
            item.addDC("date", "created", null, dcDate.toString());
        }

        item.update();
    } finally {
        // Release the parsed document whether or not the crosswalk succeeded.
        if (null != cosDocument) {
            cosDocument.close();
        }
    }
}
Example 21
Project: DocBleach-master  File: PdfBleach.java View source code
/**
 * Tries to parse the source as a PDF using the supplied password.
 *
 * rewind(source) is called after the parse attempt, whether it succeeded,
 * hit an invalid password, or failed with another exception.
 *
 * @param inFile scratch file handed to the parser
 * @param source the PDF data to read
 * @param password the password to try
 * @return the parsed document, or null when the parser reports the password
 *         as invalid
 * @throws IOException if parsing fails for any reason other than an invalid
 *         password
 */
@SuppressFBWarnings(value = "EXS_EXCEPTION_SOFTENING_RETURN_FALSE", justification = "This method is an helper to check the password")
private PDDocument testPassword(ScratchFile inFile, RandomAccessRead source, String password) throws IOException {
    PDDocument opened = null;
    PDFParser pdfParser = new PDFParser(source, password, inFile);
    try {
        pdfParser.parse();
        opened = pdfParser.getPDDocument();
    } catch (InvalidPasswordException invalidPassword) {
        // A wrong password is an expected outcome here: report it and leave the result null.
        LOGGER.error("The tested password is invalid");
    } finally {
        rewind(source);
    }
    return opened;
}
Example 22
Project: sisob-academic-data-extractor-master  File: EmailExtractor.java View source code
/**
     *
     * @param input_file
     * @param data_dir
     * @param output_file
     * @param norepeat_output_file
     * @param notfound_output_file
     * @param notfound_norepeat_output_file
     * @param filters
     * @param error_sw
     */
public static void extract_emails(File input_file, File data_dir, File output_file, File norepeat_output_file, File notfound_output_file, File notfound_norepeat_output_file, List<String> filters, StringWriter error_sw) {
    CSVReader reader = null;
    try {
        reader = new CSVReader(new FileReader(input_file), CSV_SEPARATOR);
    } catch (FileNotFoundException ex) {
        Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString());
    }
    int idStaffIdentifier = -1;
    int idName = -1;
    int idFirstName = -1;
    int idLastName = -1;
    int idInitials = -1;
    int idUnitOfAssessment_Description = -1;
    int idInstitutionName = -1;
    int idWebAddress = -1;
    int idResearchGroupDescription = -1;
    int idResearcherWebAddress = -1;
    int idResearcherWebAddressType = -1;
    int idResearcherWebAddressExt = -1;
    int idScoreUrl = -1;
    String filter_literal = "(";
    for (String filter : filters) {
        filter_literal += filter + ",";
    }
    filter_literal += ")";
    String[] nextLine;
    try {
        if ((nextLine = reader.readNext()) != null) {
            //Locate indexes                        
            for (int i = 0; i < nextLine.length; i++) {
                String column_name = nextLine[i];
                if (column_name.equals(FileFormatConversor.CSV_COL_ID))
                    idStaffIdentifier = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_NAME))
                    idName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_FIRSTNAME))
                    idFirstName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_LASTNAME))
                    idLastName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_INITIALS))
                    idInitials = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_SUBJECT))
                    idUnitOfAssessment_Description = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_NAME))
                    idInstitutionName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_URL))
                    idWebAddress = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL))
                    idResearcherWebAddress = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE))
                    idResearcherWebAddressType = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT))
                    idResearcherWebAddressExt = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_URL))
                    idScoreUrl = i;
            }
        }
    } catch (Exception ex) {
        String error_msg = "Error reading headers of " + input_file.getName();
        Logger.getRootLogger().error(error_msg + " - " + ex.toString());
        if (error_sw != null)
            error_sw.append(error_msg + "\r\n");
        return;
    }
    if (idResearcherWebAddress != -1 && idStaffIdentifier != -1 && idLastName != -1 && idInitials != -1) {
        //if(!test_only_output)
        {
            try {
                String header = "";
                header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR;
                if (idFirstName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR;
                if (idName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_EMAIL + "\"" + CSV_SEPARATOR;
                if (idInstitutionName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR;
                if (idWebAddress != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR;
                if (idResearcherWebAddressExt != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR;
                if (idResearcherWebAddressType != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR;
                if (idScoreUrl != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_SCORE_URL + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_SCORE_EMAIL + "\"";
                header += "\r\n";
                FileUtils.write(output_file, header, "UTF-8", false);
                header = "";
                header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR;
                if (idFirstName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR;
                if (idName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR;
                if (idInstitutionName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR;
                if (idWebAddress != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR;
                if (idResearcherWebAddressExt != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR;
                if (idResearcherWebAddressType != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR;
                if (idScoreUrl != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_SCORE_URL + "\"";
                header += "\r\n";
                FileUtils.write(notfound_output_file, header, "UTF-8", false);
            } catch (IOException ex) {
                Logger.getLogger("root").error(ex.toString());
                error_sw.append("Error creating output files\r\n");
            }
        }
        try {
            //if(!test_only_output)
            {
                Pattern p1 = Pattern.compile("([a-zA-Z0-9#._-]+)+");
                while ((nextLine = reader.readNext()) != null) {
                    nextLine[idLastName] = nextLine[idLastName].replaceAll("[^a-zA-Z]", " ").toLowerCase();
                    nextLine[idInitials] = nextLine[idInitials].replaceAll("[^a-zA-Z]", " ").toLowerCase();
                    if (idFirstName != -1)
                        nextLine[idFirstName] = nextLine[idFirstName].replaceAll("[^a-zA-Z]", " ").toLowerCase();
                    if (idName != -1)
                        nextLine[idName] = nextLine[idName].replaceAll("[^a-zA-Z]", " ").toLowerCase();
                    String content = "";
                    String researcher_page_url = nextLine[idResearcherWebAddress];
                    Logger.getLogger("root").info("Go with " + researcher_page_url);
                    if (p1.matcher(researcher_page_url).matches()) {
                        File f = new File(data_dir, researcher_page_url);
                        if (researcher_page_url.endsWith(".doc") || researcher_page_url.endsWith(".docx")) {
                            Logger.getLogger("root").error("The document " + researcher_page_url + " could not loaded");
                            error_sw.append("The document " + researcher_page_url + " could not loaded");
                        } else if (researcher_page_url.endsWith(".pdf")) {
                            PDFParser parser = null;
                            PDFTextStripper pdfStripper = null;
                            PDDocument pdDoc = null;
                            COSDocument cosDoc = null;
                            try {
                                parser = new PDFParser(new FileInputStream(f));
                            } catch (IOException e) {
                                Logger.getLogger("root").error(e.toString());
                                error_sw.append("Unable to open PDF called " + researcher_page_url);
                            }
                            if (parser != null) {
                                try {
                                    parser.parse();
                                    cosDoc = parser.getDocument();
                                    pdfStripper = new PDFTextStripper();
                                    pdDoc = new PDDocument(cosDoc);
                                    pdfStripper.setStartPage(1);
                                    pdfStripper.setEndPage(2);
                                    content = pdfStripper.getText(pdDoc);
                                } catch (Exception e) {
                                    Logger.getLogger("root").error(e.toString());
                                    error_sw.append("An exception occured in parsing the PDF Document.");
                                } finally {
                                    try {
                                        if (cosDoc != null)
                                            cosDoc.close();
                                        if (pdDoc != null)
                                            pdDoc.close();
                                    } catch (Exception e) {
                                        Logger.getLogger("root").error(e.toString());
                                    }
                                }
                            }
                        }
                    } else {
                        try {
                            Logger.getRootLogger().info("Reading " + researcher_page_url);
                            File temp;
                            temp = File.createTempFile("temp-file-name", ".tmp");
                            URL fetched_url = Downloader.fetchURL(researcher_page_url);
                            FileUtils.copyURLToFile(fetched_url, temp);
                            long sizeInBytes = temp.length();
                            long sizeInMb = sizeInBytes / (1024 * 1024);
                            if (sizeInMb > 100) {
                                content = "";
                            } else {
                                content = FileUtils.readFileToString(temp);
                                temp.delete();
                            }
                        } catch (Exception ex) {
                            Logger.getLogger("root").error("" + researcher_page_url + " could not loaded", ex);
                            error_sw.append("" + researcher_page_url + " could not loaded");
                            content = "";
                        } catch (java.lang.OutOfMemoryError ex2) {
                            Logger.getLogger("root").error(researcher_page_url + " could not loaded (Jsoup OutOfMemoryError)", ex2);
                            error_sw.append("" + researcher_page_url + " could not loaded");
                            content = "";
                        }
                    }
                    if (!content.equals("")) {
                        //final String RE_MAIL = "([\\w\\-]([\\.\\w])+[\\w]+@([\\w\\-]+\\.)+[A-Za-z]{2,4})";
                        final String RE_MAIL = "([\\w\\-]([\\.\\w]){1,16}[\\w]{1,16}@([\\w\\-]{1,16}\\.){1,16}[A-Za-z]{2,4})";
                        Pattern p = Pattern.compile(RE_MAIL);
                        Matcher m = p.matcher(content);
                        List<String> emails = new ArrayList<String>();
                        while (m.find()) {
                            String email = m.group(1);
                            if (!emails.contains(email)) {
                                // Apply filter
                                boolean pass = true;
                                if (filters.size() > 0) {
                                    pass = false;
                                    for (String filter : filters) {
                                        String filter2 = filter.replace("*", ".*?");
                                        Pattern pattern = Pattern.compile(filter2);
                                        if (pattern.matcher(email).matches()) {
                                            pass = true;
                                            break;
                                        } else {
                                        }
                                    }
                                }
                                if (pass) {
                                    Logger.getRootLogger().info(researcher_page_url + " => " + email + " PASS FILTER! " + filter_literal);
                                    emails.add(email);
                                } else {
                                    Logger.getRootLogger().info(researcher_page_url + " => " + email + " REFUSE BY FILTER! " + filter_literal);
                                }
                            }
                        }
                        if (emails.size() < MAX_MAIL_PER_PAGE) {
                            for (String email : emails) {
                                String score_email = "";
                                String lastname = nextLine[idLastName];
                                if (lastname.length() > 5)
                                    lastname = lastname.substring(0, 6);
                                if (email.toLowerCase().contains(lastname)) {
                                    score_email = "A";
                                } else {
                                    int temp_id = idFirstName;
                                    if (temp_id == -1)
                                        temp_id = idInitials;
                                    if (!nextLine[idInitials].trim().equals("")) {
                                        String firstname = nextLine[temp_id].split(" ")[0];
                                        if (firstname.length() > 5)
                                            firstname = firstname.substring(0, 5);
                                        if (firstname.length() > 1) {
                                            if (email.toLowerCase().contains(firstname)) {
                                                score_email = "A";
                                            }
                                        }
                                    }
                                    if (score_email.equals("")) {
                                        String initials = "";
                                        String[] arr = nextLine[temp_id].split(" ");
                                        for (int i = 0; i < arr.length; i++) {
                                            if (arr[i].length() > 0)
                                                initials += arr[i].charAt(0);
                                        }
                                        initials += nextLine[idLastName].charAt(0);
                                        if (email.toLowerCase().contains(initials)) {
                                            score_email = "B";
                                        } else {
                                            score_email = "Z";
                                        }
                                    }
                                }
                                String result = "";
                                result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR;
                                result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR;
                                result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR;
                                if (idFirstName != -1)
                                    result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR;
                                if (idName != -1)
                                    result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR;
                                result += "\"" + email + "\"" + CSV_SEPARATOR;
                                if (idInstitutionName != -1)
                                    result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR;
                                if (idWebAddress != -1)
                                    result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR;
                                result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR;
                                if (idResearcherWebAddressExt != -1)
                                    result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR;
                                if (idResearcherWebAddressType != -1)
                                    result += "\"" + nextLine[idResearcherWebAddressType] + "\"" + CSV_SEPARATOR;
                                if (idScoreUrl != -1)
                                    result += "\"" + nextLine[idScoreUrl] + "\"" + CSV_SEPARATOR;
                                result += "\"" + score_email + "\"";
                                result += "\r\n";
                                try {
                                    FileUtils.write(output_file, result, "UTF-8", true);
                                } catch (IOException ex) {
                                    Logger.getLogger("root").error(ex.toString());
                                }
                            }
                        } else {
                            content = "";
                        }
                        if (emails.size() == 0)
                            content = "";
                    }
                    if (content == "") {
                        String result = "";
                        result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR;
                        result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR;
                        result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR;
                        if (idFirstName != -1)
                            result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR;
                        if (idName != -1)
                            result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR;
                        if (idInstitutionName != -1)
                            result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR;
                        if (idWebAddress != -1)
                            result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR;
                        result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR;
                        if (idResearcherWebAddressExt != -1)
                            result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR;
                        if (idResearcherWebAddressType != -1)
                            result += "\"" + nextLine[idResearcherWebAddressType] + "\"" + CSV_SEPARATOR;
                        if (idScoreUrl != -1)
                            result += "\"" + nextLine[idScoreUrl] + "\"";
                        result += "\r\n";
                        try {
                            FileUtils.write(notfound_output_file, result, "UTF-8", true);
                        } catch (IOException ex) {
                            Logger.getLogger("root").error(ex.toString());
                        }
                    }
                }
                reader.close();
            }
            Logger.getLogger("root").info("Applying deduplication algoritm - Counting duplications");
            boolean finish = false;
            String alternate_filename_1 = "file1";
            String alternate_filename_2 = "file2";
            File alternate_file_s = new File(output_file.getParentFile(), alternate_filename_1);
            File alternate_file_d = new File(output_file.getParentFile(), alternate_filename_2);
            FileUtils.copyFile(output_file, alternate_file_s);
            //FileUtils.write(output_file_wor_notfound, "", "UTF-8", false);
            FileUtils.write(norepeat_output_file, "", "UTF-8", false);
            while (!finish) {
                reader = null;
                try {
                    reader = new CSVReader(new FileReader(alternate_file_s), CSV_SEPARATOR);
                } catch (FileNotFoundException ex) {
                    Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString());
                }
                HashMap<String, Integer> count_dictionary = new HashMap<String, Integer>();
                int idEmail = 3;
                if (idFirstName != -1)
                    idEmail++;
                if (idName != -1)
                    idEmail++;
                try {
                    FileUtils.write(alternate_file_d, "", "UTF-8", false);
                } catch (IOException ex) {
                    Logger.getLogger("root").error(ex.toString());
                }
                finish = true;
                while ((nextLine = reader.readNext()) != null) {
                    Integer count = 1;
                    if (count_dictionary.containsKey(nextLine[idEmail].toString()))
                        count = count_dictionary.get(nextLine[idEmail].toString());
                    else {
                        if (count_dictionary.size() < max_in_mem) {
                            count_dictionary.put(nextLine[idEmail].toString(), count + 1);
                        } else {
                            try {
                                for (int i = 0; i < nextLine.length; i++) nextLine[i] = "\"" + nextLine[i] + "\"";
                                FileUtils.write(alternate_file_d, StringUtil.join(Arrays.asList(nextLine), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true);
                                finish = false;
                            } catch (IOException ex) {
                                Logger.getLogger("root").error(ex.toString());
                            }
                        }
                    }
                }
                reader.close();
                Logger.getLogger("root").info("Applying deduplication algoritm - Removing duplications");
                reader = null;
                try {
                    reader = new CSVReader(new FileReader(alternate_file_s), CSV_SEPARATOR);
                } catch (FileNotFoundException ex) {
                    Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString());
                }
                String previous_id = "%previous%";
                String previous_email = "%previous_email%";
                List<String[]> cache = new ArrayList<String[]>();
                while ((nextLine = reader.readNext()) != null) {
                    String id = nextLine[idStaffIdentifier].toString();
                    if (previous_id.equals(id)) {
                        cache.add(nextLine);
                        previous_id = id;
                    } else {
                        //Process
                        String[] winner_line = null;
                        String max_score = "Z";
                        for (String[] act_line : cache) {
                            String act_score = "Z";
                            try {
                                act_score = act_line[act_line.length - 1];
                            } catch (Exception ex) {
                            }
                            String email = act_line[idEmail].toString();
                            if (count_dictionary.containsKey(email) && count_dictionary.get(email) > 0) {
                                if (max_score.compareTo(act_score) > 0 && !act_score.equals("")) {
                                    winner_line = act_line;
                                    max_score = act_score;
                                }
                                count_dictionary.put(email, 0);
                            }
                        }
                        if (winner_line != null) {
                            try {
                                for (int i = 0; i < winner_line.length; i++) winner_line[i] = "\"" + winner_line[i] + "\"";
                                FileUtils.write(norepeat_output_file, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true);
                            } catch (IOException ex) {
                                Logger.getLogger("root").error(ex.toString());
                            }
                        } else {
                        //                            try {
                        //                                FileUtils.write(output_file_wor_notfound, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true);
                        //                            } catch (IOException ex) {
                        //                                Logger.getLogger("root").error(ex.toString());
                        //                            }
                        }
                        cache.clear();
                        cache.add(nextLine);
                        previous_id = id;
                    }
                }
                //Process
                if (cache.size() > 0) {
                    String[] winner_line = null;
                    String max_score = "Z";
                    for (String[] act_line : cache) {
                        String act_score = "Z";
                        try {
                            act_score = (act_line[act_line.length - 1]);
                        } catch (Exception ex) {
                        }
                        String email = act_line[idEmail];
                        if (count_dictionary.containsKey(email) && count_dictionary.get(email) > 0) {
                            if (max_score.compareTo(act_score) > 0 && !act_score.equals("")) {
                                winner_line = act_line;
                                max_score = act_score;
                            }
                            count_dictionary.put(email, 0);
                        }
                    }
                    if (winner_line != null) {
                        try {
                            for (int i = 0; i < winner_line.length; i++) winner_line[i] = "\"" + winner_line[i] + "\"";
                            FileUtils.write(norepeat_output_file, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true);
                        } catch (IOException ex) {
                            Logger.getLogger("root").error(ex.toString());
                        }
                    } else {
                    //                        try {
                    //                            FileUtils.write(output_file_wor_notfound, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true);
                    //                        } catch (IOException ex) {
                    //                            Logger.getLogger("root").error(ex.toString());
                    //                        }
                    }
                }
                reader.close();
                //
                if (!finish) {
                    FileUtils.copyFile(alternate_file_d, alternate_file_s);
                    alternate_file_s = new File(output_file.getParentFile(), alternate_filename_1);
                    alternate_file_d = new File(output_file.getParentFile(), alternate_filename_2);
                }
            }
            FileUtils.forceDelete(alternate_file_s);
            FileUtils.forceDelete(alternate_file_d);
            Logger.getLogger("root").info("Applying deduplication algoritm - Finish");
        } catch (Exception ex) {
            String error_msg = "Error extracting emails from extractor " + input_file.getName();
            Logger.getRootLogger().error(error_msg + " - " + ex.toString());
            if (error_sw != null)
                error_sw.append(error_msg + "\r\n");
            return;
        }
    }
}
Example 23
Project: corona_src-master  File: DocumentSpliter.java View source code
/**
 * Reads the given file and splits its text into records that are appended to {@code output}.
 *
 * <p>Files whose path ends with {@code .pdf} are parsed with PDFBox and the extracted text is
 * processed; any other file is read directly as Shift_JIS text. When {@code allDefinitions} is
 * not empty every non-blank line is checked against each definition and split; otherwise all
 * lines are concatenated (without separators) into a single record.
 *
 * @param input
 *            the input file
 * @return {@code true} when the file was processed, {@code false} when it could not be opened
 *         or read (an error dialog is shown in that case)
 * @throws IllegalArgumentException
 *             if {@code input} is {@code null}
 */
public boolean split(File input) {
    if (input == null) {
        //$NON-NLS-1$
        throw new IllegalArgumentException("input file must not null");
    }
    /* InputStream that is fed to the text structure analysis below */
    final InputStream is;
    if (input.getPath().endsWith(".pdf")) {
        //$NON-NLS-1$
        /*
         * Extract the text from the PDF file.
         */
        FileInputStream pdfStream = null;
        PDDocument pdf = null;
        try {
            pdfStream = new FileInputStream(input.getPath());
            PDFParser pdfParser = new PDFParser(pdfStream);
            // Parse the document
            pdfParser.parse();
            pdf = pdfParser.getPDDocument();
            PDFTextStripper stripper = new PDFTextStripper();
            String spdf2txt = stripper.getText(pdf);
            // Encode with the same charset the reader below decodes with; the platform default
            // used previously garbled the text whenever it differed from 'encode'.
            is = new ByteArrayInputStream(spdf2txt.getBytes(encode));
        } catch (FileNotFoundException e) {
            openErrorDialog(Messages.ErrorTitle_FailedReadFile, Messages.bind(Messages.ErrorMessage_FileNotFound, input.getPath()));
            e.printStackTrace();
            return false;
        } catch (IOException e) {
            openErrorDialog(Messages.ErrorTitle_FailedReadFile, Messages.bind(Messages.ErrorMessage_FailedReadFile, input.getPath()));
            e.printStackTrace();
            return false;
        } finally {
            // The text has already been copied into memory, so the document can be released.
            if (pdf != null) {
                try {
                    pdf.close();
                } catch (IOException ignored) {
                    // best-effort cleanup; nothing useful can be done here
                }
            }
            if (pdfStream != null) {
                try {
                    pdfStream.close();
                } catch (IOException ignored) {
                    // best-effort cleanup; nothing useful can be done here
                }
            }
        }
    } else {
        /*
         * Anything else (*.txt and the like).
         */
        setEncode(Encoding.Shift_JIS.toString());
        try {
            is = new FileInputStream(input.getPath());
        } catch (FileNotFoundException e) {
            openErrorDialog(Messages.ErrorTitle_FailedReadFile, Messages.bind(Messages.ErrorMessage_FileNotFound, input.getPath()));
            e.printStackTrace();
            return false;
        }
    }
    // NOTE(review): the original comment says full-width spaces are stripped as well; confirm
    // that the first character inside the class below is the intended one.
    //$NON-NLS-1$
    final String Regex_HeadSpace = "^[ \\s]+";
    //$NON-NLS-1$
    final String Regex_TailSpace = "[ \\s]+$";
    BufferedReader br = null;
    try {
        // Evaluate the text one line at a time
        br = new BufferedReader(new InputStreamReader(is, encode));
        String line;
        StringBuilder buff = new StringBuilder(100);
        divPointList = new ArrayList<Integer>();
        // Keys are ordered from largest to smallest
        deletePointMap = new TreeMap<Integer, Integer>(new Comparator<Integer>() {

            @Override
            public int compare(Integer i1, Integer i2) {
                return i2.compareTo(i1);
            }
        });
        if (allDefinitions.size() > 0) {
            while ((line = br.readLine()) != null) {
                /*
                 * Strip leading and trailing whitespace (full-width, half-width, tab).
                 * Also replace ' because it causes an error when registering into the DB.
                 */
                //$NON-NLS-1$//$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$
                line = line.replaceAll(Regex_HeadSpace, "").replaceAll(Regex_TailSpace, "").replace("'", "\"").replace("\\", "\\\\");
                divPointList.clear();
                deletePointMap.clear();
                if (line.length() > 0) {
                    for (CoronaDocumentDefinition definition : allDefinitions) {
                        if (definition.getPosition() == CoronaDocumentDefinition.PHRASE) {
                            // Check the beginning of the sentence
                            checkPhrase(line, buff, definition);
                        } else if (definition.getPosition() == CoronaDocumentDefinition.WHOLE) {
                            // Check the whole line
                            checkWhole(line, buff, definition);
                        }
                    }
                    //$NON-NLS-1$
                    buff.append(line).append("\n");
                    // Split the text
                    divisionRecord(buff);
                    divideWriting(buff);
                } else {
                    // Blank line: treat it as a paragraph boundary
                    if (buff.length() > 0) {
                        divideWriting(buff);
                        output.add(buff.toString());
                        buff.setLength(0);
                    }
                }
            }
            // Flush whatever is left after the last line
            if (buff.length() > 0) {
                divideWriting(buff);
                output.add(buff.toString());
            }
        } else {
            // No definitions: all lines are joined into one record
            while ((line = br.readLine()) != null) {
                //$NON-NLS-1$//$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$
                line = line.replaceAll(Regex_HeadSpace, "").replaceAll(Regex_TailSpace, "").replaceAll("'", "\"").replace("\\", "\\\\");
                buff.append(line);
            }
            if (buff.length() > 0) {
                divideWriting(buff);
                output.add(buff.toString());
            }
        }
    } catch (IOException e) {
        openErrorDialog(Messages.ErrorTitle_FailedReadFile, Messages.bind(Messages.ErrorMessage_FailedReadFile, input.getPath()));
        e.printStackTrace();
        return false;
    } finally {
        try {
            if (br != null) {
                // Closing the reader also closes the wrapped stream
                br.close();
            } else {
                // The reader was never created (e.g. unsupported encoding): close the raw stream
                is.close();
            }
        } catch (IOException ignored) {
            // best-effort cleanup; nothing useful can be done here
        }
    }
    return true;
}
Example 24
Project: pdfbox-master  File: PDDocument.java View source code
/**
 * Loads and parses a PDF from a file.
 *
 * <p>If an {@link IOException} occurs while the scratch file is created or the document is
 * parsed, the scratch file (when it was created) and the file input are closed quietly before
 * the exception is rethrown. On success neither is closed by this method.
 *
 * @param file file to be loaded
 * @param password password to be used for decryption
 * @param keyStore key store to be used for decryption when using public key security
 * @param alias alias to be used for decryption when using public key security
 * @param memUsageSetting defines how memory is used for buffering PDF streams
 *
 * @return loaded document
 *
 * @throws IOException in case of a file reading or parsing error
 */
public static PDDocument load(File file, String password, InputStream keyStore, String alias, MemoryUsageSetting memUsageSetting) throws IOException {
    RandomAccessBufferedFileInputStream source = new RandomAccessBufferedFileInputStream(file);
    // Stays null until the scratch file has been created successfully
    ScratchFile scratch = null;
    try {
        scratch = new ScratchFile(memUsageSetting);
        PDFParser pdfParser = new PDFParser(source, password, keyStore, alias, scratch);
        pdfParser.parse();
        return pdfParser.getPDDocument();
    } catch (IOException failure) {
        // Release in reverse order of acquisition: scratch file first, then the input
        if (scratch != null) {
            IOUtils.closeQuietly(scratch);
        }
        IOUtils.closeQuietly(source);
        throw failure;
    }
}
Example 25
Project: brigen-base-master  File: PDFBoxDelegaterImpl.java View source code
/**
 * Verifies that the {@code PDFParser} class can be loaded by name.
 *
 * <p>NOTE(review): the class literal below already requires {@code PDFParser} to be resolvable,
 * so a missing class may surface as a {@code NoClassDefFoundError} before
 * {@code Class.forName} is reached. Confirm whether that is intended.
 *
 * @throws RuntimeException wrapping the {@link ClassNotFoundException} when the lookup fails
 */
private static void check() {
    final String parserClassName = PDFParser.class.getName();
    try {
        Class.forName(parserClassName);
    } catch (ClassNotFoundException notFound) {
        throw new RuntimeException(notFound);
    }
}