Java Examples for org.apache.pdfbox.pdfparser.PDFParser
The following Java examples will help you understand how to use org.apache.pdfbox.pdfparser.PDFParser. These source code samples are taken from different open source projects.
Example 1
| Project: java-wkhtmltopdf-wrapper-master File: PdfTest.java View source code |
/**
 * Renders an HTML string containing a non-ASCII character to PDF, parses the result back and checks
 * that the extracted text still contains that character sequence.
 *
 * @throws Exception if the PDF cannot be produced or parsed
 */
@Test
public void testPdfFromStringTo() throws Exception {
    // GIVEN a html template containing special characters that java stores in utf-16 internally
    Pdf pdf = new Pdf();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Müller</h1></html>", PageType.htmlAsString);
    // WHEN
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // THEN that is a valid PDF (otherwise an IOException occurs)
    parser.parse();
    PDDocument document = new PDDocument(parser.getDocument());
    try {
        String pdfText = new PDFTextStripper().getText(document);
        Assert.assertThat("document should contain the creditorName", pdfText, containsString("Müller"));
    } finally {
        // Release the parsed document even when the assertion above fails.
        document.close();
    }
}
Example 2
| Project: wkhtmltopdf-master File: PdfTest.java View source code |
/**
 * Renders an HTML string containing a non-ASCII character to PDF, parses the result back and checks
 * that the extracted text still contains that character sequence.
 *
 * @throws Exception if the PDF cannot be produced or parsed
 */
@Test
public void testPdfFromStringTo() throws Exception {
    // GIVEN a html template containing special characters that java stores in utf-16 internally
    Pdf pdf = new Pdf();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Müller</h1></html>", PageType.htmlAsString);
    // WHEN
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // THEN that is a valid PDF (otherwise an IOException occurs)
    parser.parse();
    PDDocument document = new PDDocument(parser.getDocument());
    try {
        String pdfText = new PDFTextStripper().getText(document);
        Assert.assertThat("document should contain the creditorName", pdfText, containsString("Müller"));
    } finally {
        // Release the parsed document even when the assertion above fails.
        document.close();
    }
}
Example 3
| Project: Europeana-Cloud-master File: PdfBoxExtractor.java View source code |
/**
 * Extracts the text of a PDF and stores its document-information entries in {@code extractedMetadata}.
 *
 * @param is the PDF content; {@code null} is tolerated
 * @return the extracted text, or {@code null} when no stream was supplied or reading the PDF failed
 */
@Override
public String extractText(InputStream is) {
    if (is == null) {
        LOGGER.warn("No data for extraction.");
        return null;
    }
    String parsedText = null;
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    try {
        PDFParser parser = new PDFParser(is);
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        PDDocumentInformation info = pdDoc.getDocumentInformation();
        Set<String> mdKeys = info.getMetadataKeys();
        extractedMetadata = new HashMap<>();
        for (String key : mdKeys) {
            String value = (String) info.getPropertyStringValue(key);
            extractedMetadata.put(key, value);
        }
        //possible NULL pointer if document is encrypted
        parsedText = pdfStripper.getText(pdDoc);
    } catch (IOException ex) {
        LOGGER.warn("Can not extract text from pdf because: " + ex.getMessage());
    } finally {
        // Close each handle on its own so a failure on the first does not skip the second,
        // and report close failures instead of dropping them.
        if (cosDoc != null) {
            try {
                cosDoc.close();
            } catch (IOException ex) {
                LOGGER.warn("Can not close the COS document because: " + ex.getMessage());
            }
        }
        if (pdDoc != null) {
            try {
                pdDoc.close();
            } catch (IOException ex) {
                LOGGER.warn("Can not close the PD document because: " + ex.getMessage());
            }
        }
    }
    return parsedText;
}
Example 4
| Project: extension-aws-master File: PdfParser.java View source code |
/**
 * Parses a PDF stream into a {@code Parse} result holding its text, page count, title and the
 * rounded width/height of the first page. Failures are logged and whatever was collected up to
 * that point is returned.
 *
 * @param inContent the PDF content
 * @return the result object; never {@code null}
 */
public Parse parse(InputStream inContent) {
    Parse results = new Parse();
    PDDocument pdf = null;
    try {
        PDFParser parser = new PDFParser(inContent);
        parser.parse();
        pdf = parser.getPDDocument();
        if (pdf.isEncrypted()) {
            DocumentEncryption decryptor = new DocumentEncryption(pdf);
            // Just try using the default password and move on
            decryptor.decryptDocument("");
        }
        // collect text
        PDFTextStripper stripper = new PDFTextStripper();
        //TODO: Write this out to a temp file that will be indexed separately
        String text;
        try {
            text = stripper.getText(pdf);
        } catch (Throwable e) {
            // Text extraction is best effort: fall back to empty text and keep collecting metadata.
            log.error("Could not parse", e);
            text = "";
        }
        results.setText(scrubChars(text));
        results.setPages(pdf.getNumberOfPages());
        // collect title
        PDDocumentInformation info = pdf.getDocumentInformation();
        results.setTitle(info.getTitle());
        if (pdf.getNumberOfPages() > 0) {
            PDPage page = (PDPage) pdf.getDocumentCatalog().getAllPages().get(0);
            PDRectangle mediaBox = page.getMediaBox();
            if (mediaBox == null) {
                mediaBox = page.getArtBox();
            }
            if (mediaBox != null) {
                results.put("width", String.valueOf(Math.round(mediaBox.getWidth())));
                results.put("height", String.valueOf(Math.round(mediaBox.getHeight())));
            }
        }
    } catch (CryptographyException e) {
        // Pass the exception itself so the stack trace is kept in the log.
        log.error("Error decrypting document.", e);
    } catch (InvalidPasswordException e) {
        log.error("Can't decrypt document - invalid password.", e);
    } catch (Exception e) {
        log.error("Can't be handled as pdf document.", e);
    } finally {
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException e) {
                log.error("Could not close pdf document", e);
            }
        }
    }
    return results;
}
Example 5
| Project: knowledge_vault-master File: PdfTextExtractor.java View source code |
//-------------------------------------------------------< TextExtractor >
/**
 * {@inheritDoc}
 *
 * Returns the PDF's text layer. When OCR is forced through {@code Config.SYSTEM_PDF_FORCE_OCR} or the
 * text layer is at most one character long, every image found in the page resources is written
 * to a temporary file and run through {@code CuneiformTextExtractor} instead. Any failure yields an
 * empty reader. The supplied stream is always closed.
 */
@SuppressWarnings("rawtypes")
public Reader extractText(InputStream stream, String type, String encoding) throws IOException {
    try {
        PDFParser parser = new PDFParser(new BufferedInputStream(stream));
        try {
            parser.parse();
            PDDocument document = parser.getPDDocument();
            CharArrayWriter writer = new CharArrayWriter();
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
            stripper.writeText(document, writer);
            String st = writer.toString().trim();
            log.debug("TextStripped: '{}'", st);
            if (!Config.SYSTEM_PDF_FORCE_OCR && st.length() > 1) {
                return new CharArrayReader(writer.toCharArray());
            }
            log.warn("PDF does not contains text layer");
            // Extract images from PDF
            List pages = document.getDocumentCatalog().getAllPages();
            StringBuilder sb = new StringBuilder();
            for (Iterator itPg = pages.iterator(); itPg.hasNext(); ) {
                PDPage page = (PDPage) itPg.next();
                PDResources resources = page.getResources();
                // A page without a resource dictionary has no images to OCR.
                Map images = (resources != null) ? resources.getImages() : null;
                if (images == null) {
                    continue;
                }
                for (Iterator itImg = images.keySet().iterator(); itImg.hasNext(); ) {
                    String key = (String) itImg.next();
                    PDXObjectImage image = (PDXObjectImage) images.get(key);
                    // File.createTempFile rejects prefixes shorter than three characters,
                    // and image keys can be that short, so a fixed prefix is prepended.
                    File pdfImg = File.createTempFile("pdfimg-" + key, "." + image.getSuffix());
                    try {
                        log.debug("Writing image: {}", pdfImg.getPath());
                        image.write2file(pdfImg);
                        String txt = new CuneiformTextExtractor().doOcr(pdfImg);
                        sb.append(txt).append(" ");
                        log.debug("OCR Extracted: {}", txt);
                    } finally {
                        // Remove the temporary image even when writing or OCR fails.
                        FileUtils.deleteQuietly(pdfImg);
                    }
                }
            }
            return new StringReader(sb.toString());
        } finally {
            try {
                PDDocument doc = parser.getPDDocument();
                if (doc != null) {
                    doc.close();
                }
            } catch (IOException e) {
                // Closing is best effort; record the failure instead of dropping it.
                log.debug("Failed to close PDF document", e);
            }
        }
    } catch (Exception e) {
        log.warn("Failed to extract PDF text content", e);
        return new StringReader("");
    } finally {
        stream.close();
    }
}
Example 6
| Project: nuxeo-versions-difference-master File: TestPdfBoxN.java View source code |
/**
 * Opens and parses the given PDF file, populating the {@code file}, {@code parser}, {@code cosDoc},
 * {@code pdfStripper} and {@code pdDoc} fields.
 *
 * @param FileName path of the PDF file to load
 * @return {@code true} when the file was parsed, {@code false} when it is missing or unreadable
 */
private boolean setMain(String FileName) throws Exception {
    file = new File(FileName);
    if (!file.isFile()) {
        // Report the path that was actually requested rather than a fixed file name.
        System.err.println("File " + FileName + " does not exist.");
        return false;
    }
    FileInputStream input = null;
    try {
        input = new FileInputStream(file);
        parser = new PDFParser(input);
    } catch (IOException e) {
        System.err.println("Unable to open PDF Parser. " + e.getMessage());
        if (input != null) {
            try {
                input.close();
            } catch (IOException ignored) {
                // Nothing more can be done; the open failure was already reported.
            }
        }
        return false;
    }
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
    } catch (Exception e) {
        // Say why parsing failed instead of returning false silently.
        System.err.println("Unable to parse PDF. " + e.getMessage());
        try {
            input.close();
        } catch (IOException ignored) {
            // Nothing more can be done; the parse failure was already reported.
        }
        return false;
    }
    return true;
}
Example 7
| Project: PDF-to-unusual-HTML-master File: Overlay.java View source code |
/**
 * Parses the PDF file at the given path.
 *
 * @param filename path of the PDF file
 * @return the parsed document
 * @throws IOException if the file cannot be opened, parsed or closed
 */
private static PDDocument getDocument(String filename) throws IOException {
    final FileInputStream source = new FileInputStream(filename);
    try {
        final PDFParser pdfParser = new PDFParser(source);
        pdfParser.parse();
        return pdfParser.getPDDocument();
    } finally {
        // The file stream is closed whether or not parsing succeeded.
        source.close();
    }
}
Example 8
| Project: sakai-cle-master File: PDFContentDigester.java View source code |
/**
 * Extracts the text of a PDF content resource for indexing.
 *
 * @param contentResource the resource to read; must not be {@code null}
 * @return the cleaned text, or {@code null} when the parser yields no document
 * @throws RuntimeException if the resource is {@code null} or its content cannot be read
 */
public String getContent(ContentResource contentResource) {
    if (contentResource == null) {
        throw new RuntimeException("Null contentResource passed to getContent");
    }
    InputStream in = null;
    PDDocument document = null;
    try {
        in = contentResource.streamContent();
        PDFParser pdfParser = new PDFParser(new BufferedInputStream(in));
        pdfParser.parse();
        document = pdfParser.getPDDocument();
        if (document == null) {
            return null;
        }
        PDFTextStripper textStripper = new PDFTextStripper();
        textStripper.setLineSeparator("\n");
        CharArrayWriter buffer = new CharArrayWriter();
        textStripper.writeText(document, buffer);
        return SearchUtils.appendCleanString(buffer.toCharArray(), null).toString();
    } catch (ServerOverloadException e) {
        // Fall back to toString() when the exception carries no message.
        String detail = e.getMessage();
        detail = (detail == null) ? e.toString() : detail;
        throw new RuntimeException("Failed to get content for indexing: cause: ServerOverloadException: " + detail, e);
    } catch (IOException e) {
        String detail = e.getMessage();
        detail = (detail == null) ? e.toString() : detail;
        throw new RuntimeException("Failed to get content for indexing: cause: IOException: " + detail, e);
    } finally {
        // Document first, then the underlying stream; close failures are only logged.
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                log.debug(e);
            }
        }
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                log.debug(e);
            }
        }
    }
}
Example 9
| Project: with-aes-master File: Overlay.java View source code |
/**
 * Parses the PDF file at the given path.
 *
 * @param filename path of the PDF file
 * @return the parsed document
 * @throws IOException if the file cannot be opened, parsed or closed
 */
private static PDDocument getDocument(String filename) throws IOException {
    final FileInputStream source = new FileInputStream(filename);
    try {
        final PDFParser pdfParser = new PDFParser(source);
        pdfParser.parse();
        return pdfParser.getPDDocument();
    } finally {
        // The file stream is closed whether or not parsing succeeded.
        source.close();
    }
}
Example 10
| Project: leech-master File: LeechConfig.java View source code |
/**
 * Builds the composite parser and its detector, then switches off a fixed set of PDFBox loggers.
 */
protected void init() {
    // The default parser from the TikaConfig comes first. The Leech datasource crawler parsers
    // follow; later entries are prioritised, so they can override e.g. the original HTML parser.
    LinkedList<Parser> parserChain = new LinkedList<Parser>();
    parserChain.add(super.getParser());
    parserChain.add(new DirectoryCrawlerParser());
    parserChain.add(new HtmlCrawlerParser());
    parserChain.add(new ImapCrawlerParser());
    m_parser = new CompositeParser(this.getMediaTypeRegistry(), parserChain);
    m_detector = new LeechDefaultDetector(m_parser);
    // The loggers go into a field because the setting is probably only valid for as long as a
    // live reference to these objects exists.
    String[] pdfBoxLoggerNames = {
        "org.apache.pdfbox.util.PDFStreamEngine",
        "org.apache.pdfbox.encoding.Encoding",
        "org.apache.pdfbox.pdfparser.BaseParser",
        "org.apache.pdfbox.pdmodel.font.PDSimpleFont",
        "org.apache.pdfbox.pdfparser.XrefTrailerResolver",
        "org.apache.pdfbox.filter.FlateFilter",
        "org.apache.pdfbox.pdfparser.PDFParser",
        "org.apache.pdfbox.util.operator.SetTextFont",
        "org.apache.pdfbox.*"
    };
    for (String loggerName : pdfBoxLoggerNames) {
        m_llPdfBoxLogger.add(Logger.getLogger(loggerName));
    }
    for (Logger pdfBoxLogger : m_llPdfBoxLogger) {
        pdfBoxLogger.setLevel(Level.OFF);
    }
}
Example 11
| Project: MEditor-master File: GetOcrFromPdfHandler.java View source code |
/**
 * Extracts the text of the PDF file at the given path.
 *
 * @param fileName path of the PDF file
 * @return the extracted text
 * @throws ActionException if the file does not exist or cannot be opened or parsed
 */
private String pdftoText(String fileName) throws ActionException {
    File pdfFile = new File(fileName);
    if (!pdfFile.isFile()) {
        LOGGER.error("The file: " + fileName + " does not exist.");
        throw new ActionException("Unable to parse the pdf file.");
    }
    RandomAccessBufferedFileInputStream source = null;
    COSDocument cosDoc = null;
    PDDocument pdDoc = null;
    try {
        PDFParser parser;
        try {
            source = new RandomAccessBufferedFileInputStream(new FileInputStream(pdfFile));
            parser = new PDFParser(source);
        } catch (Exception e) {
            // Log through the logger (with stack trace) instead of printStackTrace().
            LOGGER.error("Unable to open PDF Parser.: " + e, e);
            throw new ActionException("Unable to parse the pdf file.");
        }
        try {
            parser.parse();
            cosDoc = parser.getDocument();
            PDFTextStripper pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            return pdfStripper.getText(pdDoc);
        } catch (Exception e) {
            LOGGER.error("An exception occured in parsing the PDF Document.", e);
            throw new ActionException("Unable to parse the pdf file. " + e);
        }
    } finally {
        // Each handle is closed independently so one failing close does not skip the others.
        if (cosDoc != null) {
            try {
                cosDoc.close();
            } catch (Exception e) {
                LOGGER.error("Unable to close the COS document.", e);
            }
        }
        if (pdDoc != null) {
            try {
                pdDoc.close();
            } catch (Exception e) {
                LOGGER.error("Unable to close the PD document.", e);
            }
        }
        // The random-access source is not owned by a PDDocument built from a COSDocument,
        // so it has to be released here.
        if (source != null) {
            try {
                source.close();
            } catch (Exception e) {
                LOGGER.error("Unable to close the PDF source.", e);
            }
        }
    }
}
Example 12
| Project: seng310-ebookme-master File: PdfExtractor.java View source code |
/**
 * Parses the PDF in {@code stream} and hands the document to {@code processDocument}, which fills
 * {@code result}.
 *
 * @param stream   the PDF content
 * @param charset  not used by this method
 * @param mimeType not used by this method
 * @param result   map receiving the extracted data
 * @throws ExtractorException if the PDF cannot be parsed, processed or closed
 */
public void extract(InputStream stream, Charset charset, String mimeType, Map result) throws ExtractorException {
    // setup a PDDocument
    PDDocument document = null;
    boolean succeeded = false;
    try {
        try {
            PDFParser parser = new PDFParser(stream);
            parser.parse();
            document = parser.getPDDocument();
        } catch (IOException e) {
            throw new ExtractorException(e);
        }
        // decrypt and extract info from this document
        processDocument(document, result);
        succeeded = true;
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                // Throwing from finally would replace an exception already in flight, so the
                // close failure is only surfaced when everything before it succeeded.
                if (succeeded) {
                    throw new ExtractorException(e);
                }
            }
        }
    }
}
Example 13
| Project: cider-master File: pdfIdiom.java View source code |
/**
 * Parses a PDF data source into a model: author, subject, title and keywords from the document
 * information become properties of the resource, followed by the extracted text.
 * Encrypted documents are opened with an empty password and rejected when content extraction
 * is not permitted.
 *
 * @param source the PDF data source
 * @return the populated model
 * @throws ParserException if the PDF cannot be read, decrypted or converted to text
 */
@Override
public Model parse(DataSource source) throws ParserException {
    // create an empty Model
    Model model = ModelFactory.createDefaultModel();
    Resource resource = source.hasURI() ? model.createResource(source.getURI().toNormalform(true, true)) : model.createResource();
    // open pdf document
    final PDDocument theDocument;
    try {
        final PDFParser parser = new PDFParser(source.getStream());
        parser.parse();
        theDocument = parser.getPDDocument();
    } catch (IOException e) {
        log.error(e.getMessage(), e);
        throw new ParserException(e.getMessage(), source.getURI());
    }
    // From here on the document is open; the finally block releases it on every exit path.
    try {
        if (theDocument.isEncrypted()) {
            try {
                theDocument.openProtection(new StandardDecryptionMaterial(""));
            } catch (BadSecurityHandlerException e) {
                throw new ParserException("PDF Encrypted (BadSecurityHandlerException): " + e.getMessage(), source.getURI(), e);
            } catch (IOException e) {
                throw new ParserException("PDF Encrypted (IOException): " + e.getMessage(), source.getURI(), e);
            } catch (CryptographyException e) {
                throw new ParserException("PDF Encrypted (CryptographyException): " + e.getMessage(), source.getURI(), e);
            }
            final AccessPermission perm = theDocument.getCurrentAccessPermission();
            if (perm == null || !perm.canExtractContent()) {
                throw new ParserException("PDF cannot be decrypted", source.getURI());
            }
        }
        // get metadata
        final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
        String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
        if (theDocInfo != null) {
            docTitle = theDocInfo.getTitle();
            docSubject = theDocInfo.getSubject();
            docAuthor = theDocInfo.getAuthor();
            docKeywordStr = theDocInfo.getKeywords();
        }
        if (docAuthor != null && docAuthor.length() > 0) {
            resource.addProperty(VCARD.FN, docAuthor);
            resource.addProperty(DC.creator, docAuthor);
        }
        if (docSubject != null && docSubject.length() > 0) {
            resource.addProperty(DC.subject, docSubject);
        }
        if (docTitle != null && docTitle.length() > 0) {
            resource.addProperty(DC.title, docTitle);
        }
        if (docKeywordStr != null && docKeywordStr.length() > 0) {
            String[] docKeywords = docKeywordStr.split(" |,");
            resource.addProperty(DC.coverage, concat(docKeywords));
        }
        // get the content
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        Writer writer;
        try {
            writer = new OutputStreamWriter(baos, "UTF-8");
        } catch (UnsupportedEncodingException e1) {
            writer = new OutputStreamWriter(baos);
        }
        try {
            final PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText(theDocument, writer);
            // Push buffered characters into baos before it is read below.
            writer.flush();
        } catch (IOException e) {
            throw new ParserException("PDF content reader", source.getURI(), e);
        } finally {
            try {
                writer.close();
            } catch (final Exception ignored) {
                // The writer wraps an in-memory buffer; a close failure carries no information.
            }
        }
        String content;
        try {
            content = new String(baos.toByteArray(), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            content = new String(baos.toByteArray());
        }
        if (content.length() > 0) {
            resource.addProperty(CIDER.data_content_text, content);
        }
        return model;
    } finally {
        try {
            theDocument.close();
        } catch (IOException e) {
            // A failed close must not replace the result or an exception already in flight.
            log.error("Could not close PDF document", e);
        }
    }
}
Example 14
| Project: streamflow-core-master File: Underlay.java View source code |
/**
 * Parses the PDF file at the given path.
 *
 * @param filename path of the PDF file
 * @return the parsed document
 * @throws IOException if the file cannot be opened, parsed or closed
 */
private static PDDocument getDocument(String filename) throws IOException {
    final FileInputStream source = new FileInputStream(filename);
    try {
        final PDFParser pdfParser = new PDFParser(source);
        pdfParser.parse();
        return pdfParser.getPDDocument();
    } finally {
        // The file stream is closed whether or not parsing succeeded.
        source.close();
    }
}
Example 15
| Project: converge-1.x-master File: MetaDataService.java View source code |
/**
 * {@inheritDoc }
 *
 * Returns the text of a PDF or Word rendition, or an empty string when the content type is
 * unknown, unsupported, or the file cannot be read.
 */
@Override
public String extractContent(MediaItemRendition mir) {
    String contentType = mir.getContentType();
    String story = "";
    if (contentType == null) {
        LOG.log(Level.WARNING, "Content type is null");
        return story;
    }
    if (contentType.equals("application/pdf")) {
        // Extract text in PDF
        try {
            URL originalFile = new URL(mir.getAbsoluteFilename());
            PDDocument doc = null;
            try {
                // Read PDF
                PDFParser parser = new PDFParser(originalFile.openStream());
                parser.parse();
                // Keep the document in 'doc' so the finally block really closes it.
                doc = new PDDocument(parser.getDocument());
                PDFTextStripper stripper = new PDFTextStripper();
                story = stripper.getText(doc);
            } catch (IOException ex) {
                LOG.log(Level.SEVERE, ex.getMessage());
                LOG.log(Level.FINEST, "", ex);
            } finally {
                if (doc != null) {
                    try {
                        doc.close();
                    } catch (IOException ex) {
                        LOG.log(Level.SEVERE, ex.getMessage());
                        LOG.log(Level.FINEST, "", ex);
                    }
                }
            }
        } catch (MalformedURLException ex) {
            LOG.log(Level.WARNING, ex.getMessage());
            LOG.log(Level.FINEST, "", ex);
        }
    } else if (contentType.equals("application/msword") || contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
        // NOTE(review): the second content type is the OOXML (.docx) type but is read with the
        // binary .doc reader; confirm HWPFDocument can handle such files.
        java.io.InputStream wordStream = null;
        try {
            URL originalFile = new URL(mir.getAbsoluteFilename());
            wordStream = originalFile.openStream();
            HWPFDocument doc = new HWPFDocument(wordStream);
            WordExtractor extractor = new WordExtractor(doc);
            story = extractor.getText();
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, ex.getMessage());
            LOG.log(Level.FINEST, "", ex);
        } finally {
            if (wordStream != null) {
                try {
                    wordStream.close();
                } catch (IOException ex) {
                    LOG.log(Level.FINEST, "", ex);
                }
            }
        }
    }
    return story;
}
Example 16
| Project: dlibrary-master File: PDFPackager.java View source code |
/**
 * Copies entries of the PDF Info dictionary onto the item as Dublin Core values and updates the item.
 *
 * Mapping of PDF dictionary entries to DC:
 *   Title        -> title.null
 *   Author       -> contributor.author
 *   CreationDate -> date.created
 *   ModDate      -> date.created
 *   Creator      -> description.provenance (application that created orig)
 *   Producer     -> description.provenance (convertor to pdf)
 *   Subject      -> description.abstract
 *   Keywords     -> subject.other
 * Dates are java.util.Calendar values.
 *
 * This is not a crosswalk plugin because it isn't useful anywhere else and, more importantly,
 * the source data is not XML so it doesn't fit the plugin's interface.
 *
 * @throws MetadataValidationException if the PDF is encrypted or has no Title entry
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cosDocument = null;
    try {
        PDFParser pdfParser = new PDFParser(metadata);
        pdfParser.parse();
        cosDocument = pdfParser.getDocument();
        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (null != cosDocument.getEncryptionDictionary()) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }
        PDDocumentInformation info = new PDDocument(cosDocument).getDocumentInformation();

        // sanity check: item must have a title.
        final String docTitle = info.getTitle();
        if (docTitle == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + docTitle + "\"");
        }
        item.addDC("title", null, "en", docTitle);

        final String author = info.getAuthor();
        if (author != null) {
            item.addDC("contributor", "author", null, author);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + author + "\"");
            }
        }

        final String creator = info.getCreator();
        if (creator != null) {
            item.addDC("description", "provenance", "en", "Application that created the original document: " + creator);
        }

        final String producer = info.getProducer();
        if (producer != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + producer);
        }

        final String subject = info.getSubject();
        if (subject != null) {
            item.addDC("description", "abstract", null, subject);
        }

        final String keywords = info.getKeywords();
        if (keywords != null) {
            item.addDC("subject", "other", null, keywords);
        }

        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar created = info.getCreationDate();
        if (created == null) {
            created = info.getModificationDate();
        }
        if (created != null) {
            item.addDC("date", "created", null, (new DCDate(created.getTime())).toString());
        }
        item.update();
    } finally {
        if (cosDocument != null) {
            cosDocument.close();
        }
    }
}
Example 17
| Project: DSpace-master File: PDFPackager.java View source code |
/**
 * Copies entries of the PDF Info dictionary onto the item as Dublin Core metadata and updates the item.
 *
 * Mapping of PDF dictionary entries to DC:
 *   Title        -> title.null
 *   Author       -> contributor.author
 *   CreationDate -> date.created
 *   ModDate      -> date.created
 *   Creator      -> description.provenance (application that created orig)
 *   Producer     -> description.provenance (convertor to pdf)
 *   Subject      -> description.abstract
 *   Keywords     -> subject.other
 * Dates are java.util.Calendar values.
 *
 * This is not a crosswalk plugin because it isn't useful anywhere else and, more importantly,
 * the source data is not XML so it doesn't fit the plugin's interface.
 *
 * @throws MetadataValidationException if the PDF is encrypted or has no Title entry
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cos = null;
    RandomAccessBufferedFileInputStream source = null;
    try {
        ScratchFile scratchFile = null;
        try {
            // use up to 80% of JVM free memory
            long useRAM = Runtime.getRuntime().freeMemory() * 80 / 100;
            // then fallback to temp file (unlimited size)
            scratchFile = new ScratchFile(MemoryUsageSetting.setupMixed(useRAM));
        } catch (IOException ioe) {
            log.warn("Error initializing scratch file: " + ioe.getMessage());
        }
        source = new RandomAccessBufferedFileInputStream(metadata);
        PDFParser parser = new PDFParser(source, scratchFile);
        parser.parse();
        cos = parser.getDocument();
        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (cos.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }
        PDDocument pd = new PDDocument(cos);
        PDDocumentInformation docinfo = pd.getDocumentInformation();
        String title = docinfo.getTitle();
        // sanity check: item must have a title.
        if (title == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + title + "\"");
        }
        itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "title", null, "en", title);
        String value = docinfo.getAuthor();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "contributor", "author", null, value);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + value + "\"");
            }
        }
        value = docinfo.getCreator();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "description", "provenance", "en", "Application that created the original document: " + value);
        }
        value = docinfo.getProducer();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "description", "provenance", "en", "Original document converted to PDF by: " + value);
        }
        value = docinfo.getSubject();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "description", "abstract", null, value);
        }
        value = docinfo.getKeywords();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "subject", "other", null, value);
        }
        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar calValue = docinfo.getCreationDate();
        if (calValue == null) {
            calValue = docinfo.getModificationDate();
        }
        if (calValue != null) {
            itemService.addMetadata(context, item, MetadataSchema.DC_SCHEMA, "date", "created", null, (new DCDate(calValue.getTime())).toString());
        }
        itemService.update(context, item);
    } finally {
        try {
            if (cos != null) {
                cos.close();
            }
        } finally {
            // A PDDocument built from a COSDocument does not own the random-access source,
            // so it is released here, even if closing the COS document failed.
            if (source != null) {
                source.close();
            }
        }
    }
}
Example 18
| Project: DSpace-SVN-Deprecated-master File: PDFPackager.java View source code |
/**
 * Copies entries of the PDF Info dictionary onto the item as Dublin Core values and updates the item.
 *
 * Mapping of PDF dictionary entries to DC:
 *   Title        -> title.null
 *   Author       -> contributor.author
 *   CreationDate -> date.created
 *   ModDate      -> date.created
 *   Creator      -> description.provenance (application that created orig)
 *   Producer     -> description.provenance (convertor to pdf)
 *   Subject      -> description.abstract
 *   Keywords     -> subject.other
 * Dates are java.util.Calendar values.
 *
 * This is not a crosswalk plugin because it isn't useful anywhere else and, more importantly,
 * the source data is not XML so it doesn't fit the plugin's interface.
 *
 * @throws MetadataValidationException if the PDF is encrypted or has no Title entry
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cosDocument = null;
    try {
        PDFParser pdfParser = new PDFParser(metadata);
        pdfParser.parse();
        cosDocument = pdfParser.getDocument();
        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (null != cosDocument.getEncryptionDictionary()) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }
        PDDocumentInformation info = new PDDocument(cosDocument).getDocumentInformation();

        // sanity check: item must have a title.
        final String docTitle = info.getTitle();
        if (docTitle == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + docTitle + "\"");
        }
        item.addDC("title", null, "en", docTitle);

        final String author = info.getAuthor();
        if (author != null) {
            item.addDC("contributor", "author", null, author);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + author + "\"");
            }
        }

        final String creator = info.getCreator();
        if (creator != null) {
            item.addDC("description", "provenance", "en", "Application that created the original document: " + creator);
        }

        final String producer = info.getProducer();
        if (producer != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + producer);
        }

        final String subject = info.getSubject();
        if (subject != null) {
            item.addDC("description", "abstract", null, subject);
        }

        final String keywords = info.getKeywords();
        if (keywords != null) {
            item.addDC("subject", "other", null, keywords);
        }

        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar created = info.getCreationDate();
        if (created == null) {
            created = info.getModificationDate();
        }
        if (created != null) {
            item.addDC("date", "created", null, (new DCDate(created.getTime())).toString());
        }
        item.update();
    } finally {
        if (cosDocument != null) {
            cosDocument.close();
        }
    }
}
Example 19
| Project: gsearch-master File: TransformerToText.java View source code |
/**
 * Extracts the text of a PDF held in memory, logging each stage at debug level, and replaces
 * control characters other than tab, line feed and carriage return with spaces.
 *
 * @param doc the PDF bytes
 * @return the extracted, cleaned text
 * @throws GenericSearchException if any stage of opening, parsing or text extraction fails
 */
private StringBuffer getTextFromPDF(byte[] doc) throws GenericSearchException {
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF");
    }

    // Stage 1: wrap the bytes in a stream.
    ByteArrayInputStream input = null;
    try {
        input = new ByteArrayInputStream(doc);
    } catch (Exception e) {
        closeBAIS(input);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF new ByteArrayInputStream: ", e);
        }
        throw new GenericSearchException("getTextFromPDF new ByteArrayInputStream: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF new ByteArrayInputStream");
    }

    // Stage 2: create the parser.
    PDFParser pdfParser;
    try {
        pdfParser = new PDFParser(input);
    } catch (Exception e) {
        closeBAIS(input);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF new PDFParser: ", e);
        }
        throw new GenericSearchException("getTextFromPDF new PDFParser: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF new PDFParser");
    }

    // Stage 3: parse.
    try {
        pdfParser.parse();
    } catch (Exception e) {
        closeBAIS(input);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF parser.parse: ", e);
        }
        throw new GenericSearchException("getTextFromPDF parser.parse: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF parser.parse");
    }

    // Stage 4: obtain the low-level document.
    COSDocument cos = null;
    try {
        cos = pdfParser.getDocument();
    } catch (Exception e) {
        closeBAIS(input);
        closeCOSDocument(cos);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF parser.getDocument: ", e);
        }
        throw new GenericSearchException("getTextFromPDF parser.getDocument: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF parser.getDocument");
    }

    // Stage 5: wrap it in a PDDocument.
    PDDocument pd = null;
    try {
        pd = new PDDocument(cos);
    } catch (Exception e) {
        closeBAIS(input);
        closeCOSDocument(cos);
        closePDDocument(pd);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF new PDDocument: ", e);
        }
        throw new GenericSearchException("getTextFromPDF new PDDocument: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF new PDDocument isEncrypted=" + pd.isEncrypted() + " getNumberOfPages=" + pd.getNumberOfPages());
    }

    // Stage 6: create the text stripper.
    PDFTextStripper textStripper;
    try {
        textStripper = new PDFTextStripper();
    } catch (Exception e) {
        closeBAIS(input);
        closeCOSDocument(cos);
        closePDDocument(pd);
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF new PDFTextStripper: ", e);
        }
        throw new GenericSearchException("getTextFromPDF new PDFTextStripper: ", e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF new PDFTextStripper getStartPage=" + textStripper.getStartPage() + " getEndPage=" + textStripper.getEndPage());
    }

    // Stage 7: extract the text; all three handles are released afterwards in every case.
    String extracted = "";
    try {
        extracted = textStripper.getText(pd);
    } catch (Exception e) {
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF stripper.getText: ", e);
        }
        throw new GenericSearchException("getTextFromPDF stripper.getText: ", e);
    } finally {
        if (logger.isDebugEnabled()) {
            logger.debug("getTextFromPDF stripper.getText finally");
        }
        closeBAIS(input);
        closeCOSDocument(cos);
        closePDDocument(pd);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("getTextFromPDF stripper.getText");
    }

    // put space instead of characters not allowed in the indexing stylesheet
    StringBuffer cleaned = new StringBuffer(extracted);
    for (int pos = 0; pos < cleaned.length(); pos++) {
        char c = cleaned.charAt(pos);
        boolean allowedControl = (c == 9 || c == 10 || c == 13);
        if (c < 32 && !allowedControl) {
            if (logger.isDebugEnabled()) {
                logger.debug("getTextFromPDF index=" + pos + " char=" + c + " set to 32");
            }
            cleaned.setCharAt(pos, ' ');
        }
    }
    return cleaned;
}
Example 20
| Project: vtechworks-master File: PDFPackager.java View source code |
/**
 * Copies entries of the PDF Info dictionary onto the item as Dublin Core values and updates the item.
 *
 * Mapping of PDF dictionary entries to DC:
 *   Title        -> title.null
 *   Author       -> contributor.author
 *   CreationDate -> date.created
 *   ModDate      -> date.created
 *   Creator      -> description.provenance (application that created orig)
 *   Producer     -> description.provenance (convertor to pdf)
 *   Subject      -> description.abstract
 *   Keywords     -> subject.other
 * Dates are java.util.Calendar values.
 *
 * This is not a crosswalk plugin because it isn't useful anywhere else and, more importantly,
 * the source data is not XML so it doesn't fit the plugin's interface.
 *
 * @throws MetadataValidationException if the PDF is encrypted or has no Title entry
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cosDocument = null;
    try {
        PDFParser pdfParser = new PDFParser(metadata);
        pdfParser.parse();
        cosDocument = pdfParser.getDocument();
        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (null != cosDocument.getEncryptionDictionary()) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }
        PDDocumentInformation info = new PDDocument(cosDocument).getDocumentInformation();

        // sanity check: item must have a title.
        final String docTitle = info.getTitle();
        if (docTitle == null) {
            throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + docTitle + "\"");
        }
        item.addDC("title", null, "en", docTitle);

        final String author = info.getAuthor();
        if (author != null) {
            item.addDC("contributor", "author", null, author);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + author + "\"");
            }
        }

        final String creator = info.getCreator();
        if (creator != null) {
            item.addDC("description", "provenance", "en", "Application that created the original document: " + creator);
        }

        final String producer = info.getProducer();
        if (producer != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + producer);
        }

        final String subject = info.getSubject();
        if (subject != null) {
            item.addDC("description", "abstract", null, subject);
        }

        final String keywords = info.getKeywords();
        if (keywords != null) {
            item.addDC("subject", "other", null, keywords);
        }

        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar created = info.getCreationDate();
        if (created == null) {
            created = info.getModificationDate();
        }
        if (created != null) {
            item.addDC("date", "created", null, (new DCDate(created.getTime())).toString());
        }
        item.update();
    } finally {
        if (cosDocument != null) {
            cosDocument.close();
        }
    }
}
Example 21
| Project: DocBleach-master File: PdfBleach.java View source code |
/**
 * Tries to open the PDF with the given password.
 *
 * @param inFile scratch file handed to the parser
 * @param source PDF data; it is rewound before this method returns or throws
 * @param password password to try
 * @return the parsed document, or {@code null} when the password is rejected
 * @throws IOException if parsing fails for a reason other than an invalid password
 */
@SuppressFBWarnings(value = "EXS_EXCEPTION_SOFTENING_RETURN_FALSE", justification = "This method is an helper to check the password")
private PDDocument testPassword(ScratchFile inFile, RandomAccessRead source, String password) throws IOException {
    PDFParser pdfParser = new PDFParser(source, password, inFile);
    PDDocument opened = null;
    try {
        pdfParser.parse();
        opened = pdfParser.getPDDocument();
    } catch (InvalidPasswordException e) {
        LOGGER.error("The tested password is invalid");
    } finally {
        // Always reset the source so the next attempt starts from the beginning.
        rewind(source);
    }
    return opened;
}Example 22
| Project: sisob-academic-data-extractor-master File: EmailExtractor.java View source code |
/**
 * Extracts e-mail addresses from the page of every researcher listed in a CSV file.
 *
 * <p>The first row of {@code input_file} is the header used to locate the columns. The
 * researcher page URL, staff identifier, last name and initials columns are mandatory;
 * when any is missing the method returns without writing anything. For every other row
 * the page is loaded (a PDF inside {@code data_dir} when the value is a bare file name,
 * otherwise a download), e-mail addresses are matched, filtered and scored:
 * {@code "A"} when the address contains the (truncated) last or first name, {@code "B"}
 * when it contains the initials, {@code "Z"} otherwise. Finally the found rows are
 * de-duplicated into {@code norepeat_output_file}.
 *
 * @param input_file CSV file to read
 * @param data_dir directory holding locally stored documents
 * @param output_file receives one row per accepted e-mail (overwritten)
 * @param norepeat_output_file receives the de-duplicated rows (overwritten)
 * @param notfound_output_file receives the rows for which no usable e-mail was found (overwritten)
 * @param notfound_norepeat_output_file not used by this method
 * @param filters patterns an address must fully match; {@code *} is expanded to {@code .*?};
 *        {@code null} or empty accepts every address
 * @param error_sw collects user-facing error messages; may be {@code null}
 */
public static void extract_emails(File input_file, File data_dir, File output_file, File norepeat_output_file, File notfound_output_file, File notfound_norepeat_output_file, List<String> filters, StringWriter error_sw) {
    CSVReader reader;
    try {
        reader = new CSVReader(new FileReader(input_file), CSV_SEPARATOR);
    } catch (FileNotFoundException ex) {
        Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString());
        // Stop here instead of running into a NullPointerException on the missing reader.
        appendError(error_sw, "Error reading headers of " + input_file.getName() + "\r\n");
        return;
    }
    int idStaffIdentifier = -1;
    int idName = -1;
    int idFirstName = -1;
    int idLastName = -1;
    int idInitials = -1;
    // Located for completeness; no later step reads it.
    int idUnitOfAssessment_Description = -1;
    int idInstitutionName = -1;
    int idWebAddress = -1;
    int idResearcherWebAddress = -1;
    int idResearcherWebAddressType = -1;
    int idResearcherWebAddressExt = -1;
    int idScoreUrl = -1;

    final List<String> activeFilters = (filters != null) ? filters : new ArrayList<String>();
    StringBuilder filterText = new StringBuilder("(");
    for (String filter : activeFilters) {
        filterText.append(filter).append(",");
    }
    filterText.append(")");
    final String filter_literal = filterText.toString();

    String[] nextLine;
    try {
        if ((nextLine = reader.readNext()) != null) {
            // Locate indexes
            for (int i = 0; i < nextLine.length; i++) {
                String column_name = nextLine[i];
                if (column_name.equals(FileFormatConversor.CSV_COL_ID)) {
                    idStaffIdentifier = i;
                } else if (column_name.equals(FileFormatConversor.CSV_COL_NAME)) {
                    idName = i;
                } else if (column_name.equals(FileFormatConversor.CSV_COL_FIRSTNAME)) {
                    idFirstName = i;
                } else if (column_name.equals(FileFormatConversor.CSV_COL_LASTNAME)) {
                    idLastName = i;
                } else if (column_name.equals(FileFormatConversor.CSV_COL_INITIALS)) {
                    idInitials = i;
                } else if (column_name.equals(FileFormatConversor.CSV_COL_SUBJECT)) {
                    idUnitOfAssessment_Description = i;
                } else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_NAME)) {
                    idInstitutionName = i;
                } else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_URL)) {
                    idWebAddress = i;
                } else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL)) {
                    idResearcherWebAddress = i;
                } else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE)) {
                    idResearcherWebAddressType = i;
                } else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT)) {
                    idResearcherWebAddressExt = i;
                } else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_URL)) {
                    idScoreUrl = i;
                }
            }
        }
    } catch (Exception ex) {
        String error_msg = "Error reading headers of " + input_file.getName();
        Logger.getRootLogger().error(error_msg + " - " + ex.toString());
        appendError(error_sw, error_msg + "\r\n");
        closeReaderQuietly(reader);
        return;
    }

    // Without the mandatory columns there is nothing to extract.
    if (idResearcherWebAddress == -1 || idStaffIdentifier == -1 || idLastName == -1 || idInitials == -1) {
        closeReaderQuietly(reader);
        return;
    }

    try {
        // Header of the "found" file.
        StringBuilder header = new StringBuilder();
        header.append(quoted(FileFormatConversor.CSV_COL_ID)).append(CSV_SEPARATOR);
        header.append(quoted(FileFormatConversor.CSV_COL_LASTNAME)).append(CSV_SEPARATOR);
        header.append(quoted(FileFormatConversor.CSV_COL_INITIALS)).append(CSV_SEPARATOR);
        if (idFirstName != -1) {
            // This column holds the first name, so label it as such (was labelled as initials).
            header.append(quoted(FileFormatConversor.CSV_COL_FIRSTNAME)).append(CSV_SEPARATOR);
        }
        if (idName != -1) {
            header.append(quoted(FileFormatConversor.CSV_COL_NAME)).append(CSV_SEPARATOR);
        }
        header.append(quoted(FileFormatConversor.CSV_COL_EMAIL)).append(CSV_SEPARATOR);
        if (idInstitutionName != -1) {
            header.append(quoted(FileFormatConversor.CSV_COL_INSTITUTION_NAME)).append(CSV_SEPARATOR);
        }
        if (idWebAddress != -1) {
            header.append(quoted(FileFormatConversor.CSV_COL_INSTITUTION_URL)).append(CSV_SEPARATOR);
        }
        header.append(quoted(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL)).append(CSV_SEPARATOR);
        if (idResearcherWebAddressExt != -1) {
            header.append(quoted(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT)).append(CSV_SEPARATOR);
        }
        if (idResearcherWebAddressType != -1) {
            header.append(quoted(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE)).append(CSV_SEPARATOR);
        }
        if (idScoreUrl != -1) {
            header.append(quoted(FileFormatConversor.CSV_COL_SCORE_URL)).append(CSV_SEPARATOR);
        }
        header.append(quoted(FileFormatConversor.CSV_COL_SCORE_EMAIL));
        header.append("\r\n");
        FileUtils.write(output_file, header.toString(), "UTF-8", false);

        // Header of the "not found" file: same layout without e-mail and e-mail score.
        header = new StringBuilder();
        header.append(quoted(FileFormatConversor.CSV_COL_ID)).append(CSV_SEPARATOR);
        header.append(quoted(FileFormatConversor.CSV_COL_LASTNAME)).append(CSV_SEPARATOR);
        header.append(quoted(FileFormatConversor.CSV_COL_INITIALS)).append(CSV_SEPARATOR);
        if (idFirstName != -1) {
            header.append(quoted(FileFormatConversor.CSV_COL_FIRSTNAME)).append(CSV_SEPARATOR);
        }
        if (idName != -1) {
            header.append(quoted(FileFormatConversor.CSV_COL_NAME)).append(CSV_SEPARATOR);
        }
        if (idInstitutionName != -1) {
            header.append(quoted(FileFormatConversor.CSV_COL_INSTITUTION_NAME)).append(CSV_SEPARATOR);
        }
        if (idWebAddress != -1) {
            header.append(quoted(FileFormatConversor.CSV_COL_INSTITUTION_URL)).append(CSV_SEPARATOR);
        }
        header.append(quoted(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL)).append(CSV_SEPARATOR);
        if (idResearcherWebAddressExt != -1) {
            header.append(quoted(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT)).append(CSV_SEPARATOR);
        }
        if (idResearcherWebAddressType != -1) {
            header.append(quoted(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE)).append(CSV_SEPARATOR);
        }
        if (idScoreUrl != -1) {
            header.append(quoted(FileFormatConversor.CSV_COL_SCORE_URL));
        }
        header.append("\r\n");
        FileUtils.write(notfound_output_file, header.toString(), "UTF-8", false);
    } catch (IOException ex) {
        Logger.getLogger("root").error(ex.toString());
        appendError(error_sw, "Error creating output files\r\n");
    }

    try {
        // A bare file name contains none of the characters a URL needs (':' or '/').
        // Single quantifier on purpose: the nested form "(...+)+" backtracks exponentially.
        final Pattern localDocumentName = Pattern.compile("[a-zA-Z0-9#._-]+");
        //final String RE_MAIL = "([\\w\\-]([\\.\\w])+[\\w]+@([\\w\\-]+\\.)+[A-Za-z]{2,4})";
        final String RE_MAIL = "([\\w\\-]([\\.\\w]){1,16}[\\w]{1,16}@([\\w\\-]{1,16}\\.){1,16}[A-Za-z]{2,4})";
        final Pattern mailPattern = Pattern.compile(RE_MAIL);
        // Compile the filters once instead of once per candidate address.
        final List<Pattern> filterPatterns = new ArrayList<Pattern>();
        for (String filter : activeFilters) {
            filterPatterns.add(Pattern.compile(filter.replace("*", ".*?")));
        }

        while ((nextLine = reader.readNext()) != null) {
            nextLine[idLastName] = nextLine[idLastName].replaceAll("[^a-zA-Z]", " ").toLowerCase();
            nextLine[idInitials] = nextLine[idInitials].replaceAll("[^a-zA-Z]", " ").toLowerCase();
            if (idFirstName != -1) {
                nextLine[idFirstName] = nextLine[idFirstName].replaceAll("[^a-zA-Z]", " ").toLowerCase();
            }
            if (idName != -1) {
                nextLine[idName] = nextLine[idName].replaceAll("[^a-zA-Z]", " ").toLowerCase();
            }

            String researcher_page_url = nextLine[idResearcherWebAddress];
            Logger.getLogger("root").info("Go with " + researcher_page_url);
            String content;
            if (localDocumentName.matcher(researcher_page_url).matches()) {
                content = readLocalDocument(data_dir, researcher_page_url, error_sw);
            } else {
                content = downloadPage(researcher_page_url, error_sw);
            }

            if (!content.equals("")) {
                Matcher m = mailPattern.matcher(content);
                List<String> emails = new ArrayList<String>();
                while (m.find()) {
                    String email = m.group(1);
                    if (!emails.contains(email)) {
                        // Apply filter
                        boolean pass = filterPatterns.isEmpty();
                        for (Pattern pattern : filterPatterns) {
                            if (pattern.matcher(email).matches()) {
                                pass = true;
                                break;
                            }
                        }
                        if (pass) {
                            Logger.getRootLogger().info(researcher_page_url + " => " + email + " PASS FILTER! " + filter_literal);
                            emails.add(email);
                        } else {
                            Logger.getRootLogger().info(researcher_page_url + " => " + email + " REFUSE BY FILTER! " + filter_literal);
                        }
                    }
                }

                if (emails.size() < MAX_MAIL_PER_PAGE) {
                    // The first-name column is preferred; the initials column is the fallback.
                    int givenNameColumn = (idFirstName != -1) ? idFirstName : idInitials;
                    for (String email : emails) {
                        String score_email = scoreEmail(email, nextLine[idLastName], nextLine[idInitials], nextLine[givenNameColumn]);
                        StringBuilder result = new StringBuilder();
                        result.append(quoted(nextLine[idStaffIdentifier])).append(CSV_SEPARATOR);
                        result.append(quoted(nextLine[idLastName])).append(CSV_SEPARATOR);
                        result.append(quoted(nextLine[idInitials])).append(CSV_SEPARATOR);
                        if (idFirstName != -1) {
                            result.append(quoted(nextLine[idFirstName])).append(CSV_SEPARATOR);
                        }
                        if (idName != -1) {
                            result.append(quoted(nextLine[idName])).append(CSV_SEPARATOR);
                        }
                        result.append(quoted(email)).append(CSV_SEPARATOR);
                        if (idInstitutionName != -1) {
                            result.append(quoted(nextLine[idInstitutionName])).append(CSV_SEPARATOR);
                        }
                        if (idWebAddress != -1) {
                            result.append(quoted(nextLine[idWebAddress])).append(CSV_SEPARATOR);
                        }
                        result.append(quoted(nextLine[idResearcherWebAddress])).append(CSV_SEPARATOR);
                        if (idResearcherWebAddressExt != -1) {
                            result.append(quoted(nextLine[idResearcherWebAddressExt])).append(CSV_SEPARATOR);
                        }
                        if (idResearcherWebAddressType != -1) {
                            result.append(quoted(nextLine[idResearcherWebAddressType])).append(CSV_SEPARATOR);
                        }
                        if (idScoreUrl != -1) {
                            result.append(quoted(nextLine[idScoreUrl])).append(CSV_SEPARATOR);
                        }
                        result.append(quoted(score_email));
                        result.append("\r\n");
                        try {
                            FileUtils.write(output_file, result.toString(), "UTF-8", true);
                        } catch (IOException ex) {
                            Logger.getLogger("root").error(ex.toString());
                        }
                    }
                } else {
                    // Too many addresses on one page: treat the page as "not found".
                    content = "";
                }
                if (emails.size() == 0) {
                    content = "";
                }
            }

            // Value comparison on purpose: a page may yield an empty string that is not the
            // interned "" literal, and such a researcher must still be reported as not found.
            if (content.isEmpty()) {
                StringBuilder result = new StringBuilder();
                result.append(quoted(nextLine[idStaffIdentifier])).append(CSV_SEPARATOR);
                result.append(quoted(nextLine[idLastName])).append(CSV_SEPARATOR);
                result.append(quoted(nextLine[idInitials])).append(CSV_SEPARATOR);
                if (idFirstName != -1) {
                    result.append(quoted(nextLine[idFirstName])).append(CSV_SEPARATOR);
                }
                if (idName != -1) {
                    result.append(quoted(nextLine[idName])).append(CSV_SEPARATOR);
                }
                if (idInstitutionName != -1) {
                    result.append(quoted(nextLine[idInstitutionName])).append(CSV_SEPARATOR);
                }
                if (idWebAddress != -1) {
                    result.append(quoted(nextLine[idWebAddress])).append(CSV_SEPARATOR);
                }
                result.append(quoted(nextLine[idResearcherWebAddress])).append(CSV_SEPARATOR);
                if (idResearcherWebAddressExt != -1) {
                    result.append(quoted(nextLine[idResearcherWebAddressExt])).append(CSV_SEPARATOR);
                }
                if (idResearcherWebAddressType != -1) {
                    result.append(quoted(nextLine[idResearcherWebAddressType])).append(CSV_SEPARATOR);
                }
                if (idScoreUrl != -1) {
                    result.append(quoted(nextLine[idScoreUrl]));
                }
                result.append("\r\n");
                try {
                    FileUtils.write(notfound_output_file, result.toString(), "UTF-8", true);
                } catch (IOException ex) {
                    Logger.getLogger("root").error(ex.toString());
                }
            }
        }
        reader.close();

        Logger.getLogger("root").info("Applying deduplication algoritm - Counting duplications");
        boolean finish = false;
        String alternate_filename_1 = "file1";
        String alternate_filename_2 = "file2";
        File alternate_file_s = new File(output_file.getParentFile(), alternate_filename_1);
        File alternate_file_d = new File(output_file.getParentFile(), alternate_filename_2);
        FileUtils.copyFile(output_file, alternate_file_s);
        FileUtils.write(norepeat_output_file, "", "UTF-8", false);

        // Position of the e-mail column in the "found" file layout written above.
        int idEmail = 3;
        if (idFirstName != -1) {
            idEmail++;
        }
        if (idName != -1) {
            idEmail++;
        }

        // Each pass handles at most max_in_mem distinct addresses; rows that do not fit are
        // spilled to alternate_file_d and become the input of the next pass.
        while (!finish) {
            // A missing file propagates to the outer catch instead of surfacing as an NPE.
            reader = new CSVReader(new FileReader(alternate_file_s), CSV_SEPARATOR);
            HashMap<String, Integer> count_dictionary = new HashMap<String, Integer>();
            try {
                FileUtils.write(alternate_file_d, "", "UTF-8", false);
            } catch (IOException ex) {
                Logger.getLogger("root").error(ex.toString());
            }
            finish = true;
            while ((nextLine = reader.readNext()) != null) {
                String email = nextLine[idEmail];
                if (!count_dictionary.containsKey(email)) {
                    if (count_dictionary.size() < max_in_mem) {
                        // Any positive value marks the address as "seen, not yet emitted".
                        count_dictionary.put(email, 2);
                    } else {
                        try {
                            for (int i = 0; i < nextLine.length; i++) {
                                nextLine[i] = "\"" + nextLine[i] + "\"";
                            }
                            FileUtils.write(alternate_file_d, StringUtil.join(Arrays.asList(nextLine), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true);
                            finish = false;
                        } catch (IOException ex) {
                            Logger.getLogger("root").error(ex.toString());
                        }
                    }
                }
            }
            reader.close();

            Logger.getLogger("root").info("Applying deduplication algoritm - Removing duplications");
            reader = new CSVReader(new FileReader(alternate_file_s), CSV_SEPARATOR);
            String previous_id = "%previous%";
            List<String[]> cache = new ArrayList<String[]>();
            // Consecutive rows with the same staff identifier form one group.
            while ((nextLine = reader.readNext()) != null) {
                String id = nextLine[idStaffIdentifier];
                if (!previous_id.equals(id)) {
                    writeBestScoredLine(cache, idEmail, count_dictionary, norepeat_output_file);
                    cache.clear();
                }
                cache.add(nextLine);
                previous_id = id;
            }
            // Last group.
            writeBestScoredLine(cache, idEmail, count_dictionary, norepeat_output_file);
            reader.close();

            if (!finish) {
                FileUtils.copyFile(alternate_file_d, alternate_file_s);
            }
        }
        FileUtils.forceDelete(alternate_file_s);
        FileUtils.forceDelete(alternate_file_d);
        Logger.getLogger("root").info("Applying deduplication algoritm - Finish");
    } catch (Exception ex) {
        String error_msg = "Error extracting emails from extractor " + input_file.getName();
        Logger.getRootLogger().error(error_msg + " - " + ex.toString());
        appendError(error_sw, error_msg + "\r\n");
        return;
    } finally {
        // Closing twice is harmless and guarantees the reader is released on failure paths.
        closeReaderQuietly(reader);
    }
}

/** Appends {@code message} to {@code error_sw} unless the caller passed no writer. */
private static void appendError(StringWriter error_sw, String message) {
    if (error_sw != null) {
        error_sw.append(message);
    }
}

/** Wraps a value in double quotes, exactly as the CSV rows are written. */
private static String quoted(Object value) {
    return "\"" + value + "\"";
}

/** Closes the reader, logging instead of propagating a failure. */
private static void closeReaderQuietly(CSVReader reader) {
    if (reader == null) {
        return;
    }
    try {
        reader.close();
    } catch (IOException ex) {
        Logger.getLogger("root").error(ex.toString());
    }
}

/**
 * Returns the text of the first two pages of a PDF stored in {@code data_dir}, or an empty
 * string for Word documents (reported as an error), other file types and unreadable PDFs.
 */
private static String readLocalDocument(File data_dir, String documentName, StringWriter error_sw) {
    if (documentName.endsWith(".doc") || documentName.endsWith(".docx")) {
        Logger.getLogger("root").error("The document " + documentName + " could not loaded");
        appendError(error_sw, "The document " + documentName + " could not loaded");
        return "";
    }
    if (!documentName.endsWith(".pdf")) {
        return "";
    }

    FileInputStream pdfStream = null;
    PDFParser parser = null;
    try {
        pdfStream = new FileInputStream(new File(data_dir, documentName));
        parser = new PDFParser(pdfStream);
    } catch (IOException e) {
        Logger.getLogger("root").error(e.toString());
        appendError(error_sw, "Unable to open PDF called " + documentName);
    }

    String content = "";
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    try {
        if (parser != null) {
            parser.parse();
            cosDoc = parser.getDocument();
            PDFTextStripper pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(2);
            content = pdfStripper.getText(pdDoc);
        }
    } catch (Exception e) {
        Logger.getLogger("root").error(e.toString());
        appendError(error_sw, "An exception occured in parsing the PDF Document.");
    } finally {
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            Logger.getLogger("root").error(e.toString());
        }
        if (pdfStream != null) {
            try {
                pdfStream.close();
            } catch (IOException e) {
                Logger.getLogger("root").error(e.toString());
            }
        }
    }
    return content;
}

/**
 * Downloads a page into a temporary file and returns its text, or an empty string when the
 * download fails or is larger than 100 MB. The temporary file is always removed.
 */
private static String downloadPage(String pageUrl, StringWriter error_sw) {
    File temp = null;
    try {
        Logger.getRootLogger().info("Reading " + pageUrl);
        temp = File.createTempFile("temp-file-name", ".tmp");
        URL fetched_url = Downloader.fetchURL(pageUrl);
        FileUtils.copyURLToFile(fetched_url, temp);
        long sizeInMb = temp.length() / (1024 * 1024);
        if (sizeInMb > 100) {
            return "";
        }
        return FileUtils.readFileToString(temp);
    } catch (Exception ex) {
        Logger.getLogger("root").error("" + pageUrl + " could not loaded", ex);
        appendError(error_sw, "" + pageUrl + " could not loaded");
        return "";
    } catch (java.lang.OutOfMemoryError ex2) {
        Logger.getLogger("root").error(pageUrl + " could not loaded (Jsoup OutOfMemoryError)", ex2);
        appendError(error_sw, "" + pageUrl + " could not loaded");
        return "";
    } finally {
        if (temp != null) {
            // Best effort: a leftover temp file is not worth failing the extraction for.
            temp.delete();
        }
    }
}

/**
 * Scores how well an address matches a researcher: {@code "A"} when it contains the last
 * name (first six characters) or the first given name (first five characters, at least
 * two), {@code "B"} when it contains the initials followed by the last-name initial,
 * {@code "Z"} otherwise.
 */
private static String scoreEmail(String email, String lastNameField, String initialsField, String givenNameField) {
    String lowerEmail = email.toLowerCase();
    String lastname = lastNameField;
    if (lastname.length() > 5) {
        lastname = lastname.substring(0, 6);
    }
    if (lowerEmail.contains(lastname)) {
        return "A";
    }
    String[] givenNames = givenNameField.split(" ");
    if (!initialsField.trim().equals("")) {
        // split() yields an empty array for a blank cell, so guard the first element.
        String firstname = (givenNames.length > 0) ? givenNames[0] : "";
        if (firstname.length() > 5) {
            firstname = firstname.substring(0, 5);
        }
        if (firstname.length() > 1 && lowerEmail.contains(firstname)) {
            return "A";
        }
    }
    StringBuilder initials = new StringBuilder();
    for (String givenName : givenNames) {
        if (givenName.length() > 0) {
            initials.append(givenName.charAt(0));
        }
    }
    // The last name is never empty here: an empty one is contained in every address above.
    initials.append(lastNameField.charAt(0));
    return lowerEmail.contains(initials.toString()) ? "B" : "Z";
}

/**
 * Writes the best-scored row of one researcher's group to {@code norepeat_output_file}.
 * Only rows whose address is still marked as not emitted compete, and every competing
 * address is marked as emitted. A row must score strictly better than {@code "Z"} to win.
 */
private static void writeBestScoredLine(List<String[]> cache, int idEmail, HashMap<String, Integer> count_dictionary, File norepeat_output_file) {
    String[] winner_line = null;
    String max_score = "Z";
    for (String[] act_line : cache) {
        String act_score = (act_line.length > 0) ? act_line[act_line.length - 1] : "Z";
        String email = act_line[idEmail];
        if (count_dictionary.containsKey(email) && count_dictionary.get(email) > 0) {
            if (max_score.compareTo(act_score) > 0 && !act_score.equals("")) {
                winner_line = act_line;
                max_score = act_score;
            }
            count_dictionary.put(email, 0);
        }
    }
    if (winner_line == null) {
        return;
    }
    try {
        for (int i = 0; i < winner_line.length; i++) {
            winner_line[i] = "\"" + winner_line[i] + "\"";
        }
        FileUtils.write(norepeat_output_file, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true);
    } catch (IOException ex) {
        Logger.getLogger("root").error(ex.toString());
    }
}Example 23
| Project: corona_src-master File: DocumentSpliter.java View source code |
/**
 * Splits the content of a file into records.
 *
 * <p>For a {@code .pdf} file the text is first extracted with PDFBox; any other file is
 * read directly after the encoding has been set to Shift_JIS. Each line is trimmed and
 * escaped, checked against the document definitions (when there are any), and the
 * accumulated text is added to {@code output}.
 *
 * @param input input file
 * @return {@code true} when the file was processed, {@code false} when it could not be
 *         found or read (an error dialog is shown in that case)
 * @throws IllegalArgumentException if {@code input} is {@code null}
 */
public boolean split(File input) {
    if (input == null) {
        throw new IllegalArgumentException("input file must not null"); //$NON-NLS-1$
    }
    /* InputStream that is fed to the text structure analysis */
    final InputStream is;
    if (input.getPath().endsWith(".pdf")) { //$NON-NLS-1$
        /*
         * Extract the text from the PDF file
         */
        FileInputStream pdfStream = null;
        PDDocument pdf = null;
        try {
            pdfStream = new FileInputStream(input.getPath());
            PDFParser pdfParser = new PDFParser(pdfStream);
            // Parse
            pdfParser.parse();
            pdf = pdfParser.getPDDocument();
            PDFTextStripper stripper = new PDFTextStripper();
            String spdf2txt = stripper.getText(pdf);
            // Encode with the charset the reader below decodes with; the platform default
            // would garble the text whenever the two differ.
            is = new ByteArrayInputStream(spdf2txt.getBytes(encode));
        } catch (FileNotFoundException e) {
            openErrorDialog(Messages.ErrorTitle_FailedReadFile, Messages.bind(Messages.ErrorMessage_FileNotFound, input.getPath()));
            e.printStackTrace();
            return false;
        } catch (IOException e) {
            openErrorDialog(Messages.ErrorTitle_FailedReadFile, Messages.bind(Messages.ErrorMessage_FailedReadFile, input.getPath()));
            e.printStackTrace();
            return false;
        } finally {
            // The text is already extracted, so the document can be released here.
            if (pdf != null) {
                try {
                    pdf.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
            if (pdfStream != null) {
                try {
                    pdfStream.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
        }
    } else {
        /*
         * Anything else (*.txt and the like)
         */
        setEncode(Encoding.Shift_JIS.toString());
        try {
            is = new FileInputStream(input.getPath());
        } catch (FileNotFoundException e) {
            openErrorDialog(Messages.ErrorTitle_FailedReadFile, Messages.bind(Messages.ErrorMessage_FileNotFound, input.getPath()));
            e.printStackTrace();
            return false;
        }
    }
    final String Regex_HeadSpace = "^[ \\s]+"; //$NON-NLS-1$
    final String Regex_TailSpace = "[ \\s]+$"; //$NON-NLS-1$
    BufferedReader br = null;
    try {
        // Evaluate line by line
        br = new BufferedReader(new InputStreamReader(is, encode));
        String line;
        StringBuilder buff = new StringBuilder(100);
        divPointList = new ArrayList<Integer>();
        // Keys are kept in descending order.
        deletePointMap = new TreeMap<Integer, Integer>(new Comparator<Integer>() {
            @Override
            public int compare(Integer i1, Integer i2) {
                return i2.compareTo(i1);
            }
        });
        if (allDefinitions.size() > 0) {
            while ((line = br.readLine()) != null) {
                /*
                 * Remove whitespace at the start and end of the line.
                 * Replace ' because it causes an error when registering to the DB.
                 */
                line = line.replaceAll(Regex_HeadSpace, "").replaceAll(Regex_TailSpace, "").replace("'", "\"").replace("\\", "\\\\"); //$NON-NLS-1$//$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$
                divPointList.clear();
                deletePointMap.clear();
                if (line.length() > 0) {
                    for (CoronaDocumentDefinition definition : allDefinitions) {
                        if (definition.getPosition() == CoronaDocumentDefinition.PHRASE) {
                            // Phrase check
                            checkPhrase(line, buff, definition);
                        } else if (definition.getPosition() == CoronaDocumentDefinition.WHOLE) {
                            // Whole-line check
                            checkWhole(line, buff, definition);
                        }
                    }
                    buff.append(line).append("\n"); //$NON-NLS-1$
                    // Split the text
                    divisionRecord(buff);
                    divideWriting(buff);
                } else {
                    // Paragraph check: an empty line flushes what has been collected
                    if (buff.length() > 0) {
                        divideWriting(buff);
                        output.add(buff.toString());
                        buff.setLength(0);
                    }
                }
            }
            if (buff.length() > 0) {
                divideWriting(buff);
                output.add(buff.toString());
            }
        } else {
            // No definitions: concatenate all cleaned lines into a single record.
            while ((line = br.readLine()) != null) {
                line = line.replaceAll(Regex_HeadSpace, "").replaceAll(Regex_TailSpace, "").replace("'", "\"").replace("\\", "\\\\"); //$NON-NLS-1$//$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$
                buff.append(line);
            }
            if (buff.length() > 0) {
                divideWriting(buff);
                output.add(buff.toString());
            }
        }
    } catch (IOException e) {
        openErrorDialog(Messages.ErrorTitle_FailedReadFile, Messages.bind(Messages.ErrorMessage_FailedReadFile, input.getPath()));
        e.printStackTrace();
        return false;
    } finally {
        try {
            if (br != null) {
                br.close();
            } else {
                // The reader was never created (e.g. unsupported encoding): close the raw stream.
                is.close();
            }
        } catch (IOException ignored) {
            // nothing useful can be done if closing fails
        }
    }
    return true;
}Example 24
| Project: pdfbox-master File: PDDocument.java View source code |
/**
 * Loads and parses a PDF from a file.
 *
 * <p>On an {@link IOException} the scratch file (when it was created) and the file
 * stream are closed quietly before the exception is rethrown.
 *
 * @param file file to be loaded
 * @param password password to be used for decryption
 * @param keyStore key store to be used for decryption when using public key security
 * @param alias alias to be used for decryption when using public key security
 * @param memUsageSetting defines how memory is used for buffering PDF streams
 *
 * @return loaded document
 *
 * @throws IOException in case of a file reading or parsing error
 */
public static PDDocument load(File file, String password, InputStream keyStore, String alias, MemoryUsageSetting memUsageSetting) throws IOException {
    RandomAccessBufferedFileInputStream fileStream = new RandomAccessBufferedFileInputStream(file);
    ScratchFile scratch = null;
    try {
        scratch = new ScratchFile(memUsageSetting);
        PDFParser pdfParser = new PDFParser(fileStream, password, keyStore, alias, scratch);
        pdfParser.parse();
        return pdfParser.getPDDocument();
    } catch (IOException failure) {
        // Release in creation-reverse order: scratch file first, then the underlying file.
        if (scratch != null) {
            IOUtils.closeQuietly(scratch);
        }
        IOUtils.closeQuietly(fileStream);
        throw failure;
    }
}Example 25
| Project: brigen-base-master File: PDFBoxDelegaterImpl.java View source code |
/**
 * Verifies that the PDFBox parser class can be loaded.
 *
 * @throws RuntimeException wrapping the {@link ClassNotFoundException} when it cannot
 */
private static void check() {
    final String parserClassName = PDFParser.class.getName();
    try {
        Class.forName(parserClassName);
    } catch (ClassNotFoundException notFound) {
        throw new RuntimeException(notFound);
    }
}