/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2010-2015 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.parser;
import java.awt.Dimension;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.comparator.LastModifiedFileComparator;
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.util.PDFMergerUtility;
import com.jaeksoft.searchlib.ClientCatalog;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.ClassPropertyEnum;
import com.jaeksoft.searchlib.analysis.LanguageEnum;
import com.jaeksoft.searchlib.ocr.HocrDocument;
import com.jaeksoft.searchlib.ocr.HocrPdf;
import com.jaeksoft.searchlib.ocr.HocrPdf.HocrPage;
import com.jaeksoft.searchlib.ocr.OcrManager;
import com.jaeksoft.searchlib.streamlimiter.StreamLimiter;
import com.jaeksoft.searchlib.util.ExecuteUtils.ExecutionException;
import com.jaeksoft.searchlib.util.GhostScript;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.ImageUtils;
import com.jaeksoft.searchlib.util.PdfCrack;
import com.jaeksoft.searchlib.util.StringUtils;
import com.jaeksoft.searchlib.util.ThreadUtils;
import com.jaeksoft.searchlib.util.pdfbox.PDFBoxUtils;
import com.jaeksoft.searchlib.util.pdfbox.PDFBoxUtils.TolerantPDFTextStripper;
public class PdfParser extends Parser {
/**
 * Default MIME type list for this parser. Not read anywhere in this class;
 * presumably consumed by the parser registry -- TODO confirm.
 */
public static final String[] DEFAULT_MIMETYPES = { "application/pdf" };
/**
 * Default file extension list for this parser. Not read anywhere in this
 * class; presumably consumed by the parser registry -- TODO confirm.
 */
public static final String[] DEFAULT_EXTENSIONS = { "pdf" };
/**
 * Semaphore created with one permit per available processor. It is acquired
 * around the Ghostscript page-image generation in ocrImageGhostcript, which
 * bounds the number of concurrent image renderings.
 */
public static final Semaphore gsSemaphore = new Semaphore(Runtime.getRuntime().availableProcessors());
/**
 * Parser fields declared by this parser; handed to the superclass
 * constructor.
 */
private static ParserFieldEnum[] fl = { ParserFieldEnum.parser_name, ParserFieldEnum.title, ParserFieldEnum.author,
ParserFieldEnum.subject, ParserFieldEnum.content, ParserFieldEnum.producer, ParserFieldEnum.keywords,
ParserFieldEnum.creation_date, ParserFieldEnum.modification_date, ParserFieldEnum.language,
ParserFieldEnum.number_of_pages, ParserFieldEnum.ocr_content, ParserFieldEnum.image_ocr_boxes,
ParserFieldEnum.pdfcrack_password };
/**
 * Creates the parser, passing the static field list of this class to the
 * superclass constructor.
 */
public PdfParser() {
super(fl);
}
/**
 * Declares the configurable properties of this parser after the ones
 * declared by the superclass: the size limit (default "0"), the Ghostscript
 * binary path and the PDFCrack command line (both empty by default).
 *
 * NOTE(review): the meaning of the trailing numeric arguments (20 / 50 and
 * 1) is defined by addProperty, which is not visible here -- presumably
 * editor dimensions; confirm.
 *
 * @throws SearchLibException
 *             propagated from the superclass or from addProperty
 */
@Override
public void initProperties() throws SearchLibException {
super.initProperties();
addProperty(ClassPropertyEnum.SIZE_LIMIT, "0", null, 20, 1);
// An empty Ghostscript path makes parseContent use PDFBox for text extraction
addProperty(ClassPropertyEnum.GHOSTSCRIPT_BINARYPATH, "", null, 50, 1);
// An empty PDFCrack command line disables the password search in decrypt
addProperty(ClassPropertyEnum.PDFCRACK_COMMANDLINE, "", null, 50, 1);
}
/**
 * Reads the creation date from the PDF information dictionary.
 *
 * @param pdfInfo
 *            the document information to read from
 * @return the creation date, or null when PDFBox failed to read it (the
 *         failure is logged as a warning)
 */
private Calendar getCreationDate(PDDocumentInformation pdfInfo) {
    Calendar creationDate = null;
    try {
        creationDate = pdfInfo.getCreationDate();
    } catch (IOException readFailure) {
        // An unreadable date must not abort the whole parsing
        Logging.warn(readFailure);
    }
    return creationDate;
}
/**
 * Reads the modification date from the PDF information dictionary.
 *
 * @param pdfInfo
 *            the document information to read from
 * @return the modification date, or null when PDFBox failed to read it (the
 *         failure is logged as a warning)
 */
private Calendar getModificationDate(PDDocumentInformation pdfInfo) {
    try {
        // Read the modification date itself, not the creation date
        return pdfInfo.getModificationDate();
    } catch (IOException e) {
        Logging.warn(e);
        return null;
    }
}
/**
 * Converts a calendar value to text using {@link Date#toString()}.
 *
 * @param cal
 *            the calendar to convert, may be null
 * @return the textual form of the date, or null when no date is available
 */
private String getDate(Calendar cal) {
    if (cal != null) {
        final Date instant = cal.getTime();
        if (instant != null)
            return instant.toString();
    }
    return null;
}
/**
 * Copies the document level metadata of the PDF into the parser result:
 * title, subject, author, producer, keywords, creation and modification
 * dates (when available), number of pages and the catalog language.
 *
 * @param result
 *            the parser result receiving the fields
 * @param pdf
 *            the loaded PDF document
 * @throws IOException
 *             declared for the PDFBox accessors
 */
private void extractMetaData(ParserResultItem result, PDDocument pdf) throws IOException {
    final PDDocumentInformation docInfo = pdf.getDocumentInformation();
    if (docInfo != null) {
        result.addField(ParserFieldEnum.title, docInfo.getTitle());
        result.addField(ParserFieldEnum.subject, docInfo.getSubject());
        result.addField(ParserFieldEnum.author, docInfo.getAuthor());
        result.addField(ParserFieldEnum.producer, docInfo.getProducer());
        result.addField(ParserFieldEnum.keywords, docInfo.getKeywords());
        // Dates are only added when they could be read and converted
        final String created = getDate(getCreationDate(docInfo));
        if (created != null)
            result.addField(ParserFieldEnum.creation_date, created);
        final String modified = getDate(getModificationDate(docInfo));
        if (modified != null)
            result.addField(ParserFieldEnum.modification_date, modified);
    }
    result.addField(ParserFieldEnum.number_of_pages, pdf.getNumberOfPages());
    final PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
    if (docCatalog != null)
        result.addField(ParserFieldEnum.language, docCatalog.getLanguage());
}
/**
 * Normalizes one line of text (consecutive spaces collapsed to a single
 * space, then trimmed) and adds it to the content field when it is not
 * empty.
 *
 * @param result
 *            the parser result receiving the line
 * @param line
 *            the raw line, may be null
 * @return the number of characters added, 0 when nothing was added
 */
private int addLine(ParserResultItem result, String line) {
    if (line != null) {
        final String normalized = StringUtils.replaceConsecutiveSpaces(line, " ").trim();
        if (!normalized.isEmpty()) {
            result.addField(ParserFieldEnum.content, normalized);
            return normalized.length();
        }
    }
    return 0;
}
/**
 * Extracts the text content with PDFBox and adds it, line by line, to the
 * content field.
 *
 * @param result
 *            the parser result receiving the lines
 * @param pdf
 *            the loaded PDF document
 * @return the total number of characters added to the content field
 * @throws IOException
 *             if the text stripper fails
 */
private int extractTextContent(ParserResultItem result, PDDocument pdf) throws IOException {
    final String extracted = new TolerantPDFTextStripper().getText(pdf);
    if (StringUtils.isEmpty(extracted))
        return 0;
    int total = 0;
    final String[] textLines = StringUtils.splitLines(extracted);
    for (int i = 0; i < textLines.length; i++)
        total += addLine(result, textLines[i]);
    return total;
}
/**
 * Extracts the text content with Ghostscript: the text is written to a
 * temporary file which is then read line by line into the content field.
 * The temporary file is always deleted afterwards.
 *
 * @param result
 *            the parser result receiving the lines
 * @param context
 *            the parsing context providing the Ghostscript wrapper, the PDF
 *            file and its password
 * @return the total number of characters added to the content field
 * @throws IOException
 *             if the temporary file cannot be created or read, or if
 *             Ghostscript fails (its output is logged before rethrowing)
 * @throws InterruptedException
 *             if the thread is interrupted while Ghostscript is running
 */
private int extractTextContent(ParserResultItem result, PdfOcrContext context)
        throws IOException, InterruptedException {
    File textFile = null;
    BufferedReader bufferedReader = null;
    FileReader fileReader = null;
    try {
        // createTempFile appends the suffix verbatim, so it must carry its
        // own dot to produce a real ".txt" extension
        textFile = File.createTempFile("oss_pdfparser", ".txt");
        context.ghostScript.extractText(context.pdfPassword, context.pdfFile, textFile);
        // NOTE(review): FileReader decodes with the platform default
        // charset; confirm it matches the encoding Ghostscript writes
        fileReader = new FileReader(textFile);
        bufferedReader = new BufferedReader(fileReader);
        int characterCount = 0;
        String line;
        while ((line = bufferedReader.readLine()) != null)
            characterCount += addLine(result, line);
        return characterCount;
    } catch (ExecutionException e) {
        Logging.warn("Ghostscript returned: " + e.getReturnedText());
        throw e;
    } finally {
        IOUtils.close(bufferedReader, fileReader);
        // Same cleanup idiom as the doOcr methods
        if (textFile != null)
            FileUtils.deleteQuietly(textFile);
    }
}
/**
 * Opens the protection of an encrypted PDF. The empty password is tried
 * first; when it is rejected and a PDFCrack command line is configured,
 * PDFCrack is asked for the password and a second attempt is made.
 *
 * @param pdf
 *            the encrypted document
 * @param pdfFile
 *            the file handed to PDFCrack
 * @return the password which opened the document (possibly empty)
 * @throws BadSecurityHandlerException
 *             propagated from PDFBox
 * @throws IOException
 *             if PDFCrack did not find any password, or propagated from
 *             PDFBox
 * @throws CryptographyException
 *             if the empty password is rejected and PDFCrack is not
 *             configured, or if the cracked password is rejected
 */
private String decrypt(PDDocument pdf, File pdfFile)
        throws BadSecurityHandlerException, IOException, CryptographyException {
    try {
        // First attempt: the empty password
        pdf.openProtection(new StandardDecryptionMaterial(StringUtils.EMPTY));
        return StringUtils.EMPTY;
    } catch (CryptographyException emptyPasswordRejected) {
        // Second attempt: ask PDFCrack, if it is configured
        final String crackCommand = getStringProperty(ClassPropertyEnum.PDFCRACK_COMMANDLINE);
        if (StringUtils.isEmpty(crackCommand))
            throw emptyPasswordRejected;
        final String cracked = PdfCrack.findPassword(crackCommand, pdfFile);
        if (cracked == null)
            throw new IOException("Encrypted PDF.");
        pdf.openProtection(new StandardDecryptionMaterial(cracked));
        return cracked;
    }
}
/**
 * Parses a PDF: loads it with PDFBox, decrypts it when needed, extracts the
 * metadata and the text (with Ghostscript when a binary path is configured,
 * with PDFBox otherwise), falls back to OCR when no text was found, and
 * finally runs the language detection on the content field.
 *
 * When the PDFBox decryption fails the document is closed and dropped:
 * metadata, PDFBox text extraction and OCR are then skipped, but the
 * Ghostscript text extraction is still attempted.
 *
 * @param streamLimiter
 *            gives access to the PDF file
 * @param lang
 *            the language passed to the OCR
 * @throws IOException
 *             on any parsing failure; SearchLibException,
 *             InterruptedException and ExecutionException are wrapped with
 *             the file name in the message
 */
@Override
protected void parseContent(StreamLimiter streamLimiter, final LanguageEnum lang) throws IOException {
    PdfOcrContext context = new PdfOcrContext();
    context.lang = lang;
    String fileName = null;
    try {
        String ghostScriptBinaryPath = getStringProperty(ClassPropertyEnum.GHOSTSCRIPT_BINARYPATH);
        context.ghostScript = StringUtils.isEmpty(ghostScriptBinaryPath) ? null
                : new GhostScript(ghostScriptBinaryPath);
        // Resolve the file once and derive the name from it
        context.pdfFile = streamLimiter.getFile();
        fileName = context.pdfFile.getName();
        context.pdf = PDDocument.load(context.pdfFile, null);
        try {
            if (context.pdf.isEncrypted())
                context.pdfPassword = decrypt(context.pdf, context.pdfFile);
        } catch (Exception e) {
            // Best effort: without a usable PDFBox document only the
            // Ghostscript text extraction can still run
            Logging.warn("PDFBox decryption failed " + fileName);
            IOUtils.closeQuietly(context.pdf);
            context.pdf = null;
        }
        ParserResultItem result = getNewParserResultItem();
        result.addField(ParserFieldEnum.pdfcrack_password, context.pdfPassword);
        if (context.pdf != null)
            extractMetaData(result, context.pdf);
        int charCount = 0;
        if (context.ghostScript == null) {
            if (context.pdf != null)
                charCount = extractTextContent(result, context.pdf);
        } else
            charCount = extractTextContent(result, context);
        // No text at all: the pages are probably scanned images, try OCR
        if (charCount == 0 && context.pdf != null)
            extractImagesForOCR(result, context);
        result.langDetection(10000, ParserFieldEnum.content);
    } catch (SearchLibException e) {
        throw new IOException("Failed on " + fileName, e);
    } catch (InterruptedException e) {
        // The interruption is converted to an IOException, so restore the
        // interrupt status for the callers which check it
        Thread.currentThread().interrupt();
        throw new IOException("Failed on " + fileName, e);
    } catch (java.util.concurrent.ExecutionException e) {
        throw new IOException("Failed on " + fileName, e);
    } finally {
        if (context.pdf != null)
            context.pdf.close();
    }
}
/**
 * Runs the OCR on an in-memory image and loads the resulting hOCR output.
 * The temporary hOCR file is always deleted afterwards.
 *
 * @param ocr
 *            the OCR manager doing the recognition
 * @param lang
 *            the language passed to the OCR
 * @param image
 *            the image to recognize
 * @return the hOCR document, or null when the OCR produced an empty file
 * @throws IOException
 *             if the temporary file cannot be created, or propagated from
 *             the OCR
 * @throws InterruptedException
 *             propagated from the OCR
 * @throws SearchLibException
 *             propagated from the OCR
 */
private HocrDocument doOcr(OcrManager ocr, LanguageEnum lang, BufferedImage image)
        throws IOException, InterruptedException, SearchLibException {
    final File hocrOutput = File.createTempFile("ossocr", "." + ocr.getHocrFileExtension());
    try {
        ocr.ocerizeImage(image, hocrOutput, lang, true);
        return hocrOutput.length() == 0 ? null : new HocrDocument(hocrOutput);
    } finally {
        FileUtils.deleteQuietly(hocrOutput);
    }
}
/**
 * Runs the OCR on an image file and loads the resulting hOCR output. The
 * temporary hOCR file is always deleted afterwards.
 *
 * @param ocr
 *            the OCR manager doing the recognition
 * @param lang
 *            the language passed to the OCR
 * @param imageFile
 *            the image file to recognize
 * @return the hOCR document, or null when the OCR produced an empty file
 * @throws IOException
 *             if the temporary file cannot be created, or propagated from
 *             the OCR
 * @throws InterruptedException
 *             propagated from the OCR
 * @throws SearchLibException
 *             propagated from the OCR
 */
private HocrDocument doOcr(OcrManager ocr, LanguageEnum lang, File imageFile)
        throws IOException, InterruptedException, SearchLibException {
    final File hocrOutput = File.createTempFile("ossocr", "." + ocr.getHocrFileExtension());
    try {
        ocr.ocerize(imageFile, hocrOutput, lang, true);
        return hocrOutput.length() == 0 ? null : new HocrDocument(hocrOutput);
    } finally {
        FileUtils.deleteQuietly(hocrOutput);
    }
}
/**
 * Renders one page to a temporary PNG with Ghostscript (resolution argument
 * 300), runs the OCR on it and stores the result in the hOCR page of the
 * context. The rendering itself is guarded by gsSemaphore. The temporary
 * image is always deleted afterwards.
 *
 * @param context
 *            the parsing context (Ghostscript wrapper, PDF file, password,
 *            OCR manager, language and hOCR container)
 * @param page
 *            the 1-based page number; the hOCR page is created at index
 *            page - 1
 * @throws IOException
 *             if the temporary file cannot be created, or propagated from
 *             Ghostscript or the OCR
 * @throws InterruptedException
 *             if interrupted while waiting for the semaphore, or propagated
 *             from Ghostscript or the OCR
 * @throws SearchLibException
 *             propagated from the OCR
 */
private void ocrImageGhostcript(PdfOcrContext context, int page)
        throws IOException, InterruptedException, SearchLibException {
    final File pageImage = File.createTempFile("oss_pdfparser", ".png");
    try {
        gsSemaphore.acquire();
        try {
            context.ghostScript.generateImage(context.pdfPassword, page, context.pdfFile, 300, pageImage);
        } finally {
            gsSemaphore.release();
        }
        final Dimension size = ImageUtils.getDimensions(pageImage);
        final HocrPage ocrPage = context.hocrPdf.createPage(page - 1, size.width, size.height);
        ocrPage.addImage(doOcr(context.ocr, context.lang, pageImage));
    } finally {
        if (pageImage.exists())
            pageImage.delete();
    }
}
/**
 * Mutable holder for the state shared by the parsing steps of one PDF. All
 * fields start as null and are filled in by the enclosing parser.
 */
public class PdfOcrContext {
    /** The PDFBox document; reset to null by parseContent when decryption fails. */
    private PDDocument pdf;
    /** The OCR manager, set by extractImagesForOCR. */
    private OcrManager ocr;
    /** The language passed to the OCR. */
    private LanguageEnum lang;
    /** The Ghostscript wrapper, null when no binary path is configured. */
    private GhostScript ghostScript;
    /** The PDF file being parsed. */
    private File pdfFile;
    /** The password returned by decrypt, null when the PDF is not encrypted. */
    private String pdfPassword;
    /** The container collecting the hOCR pages, set by extractImagesForOCR. */
    private HocrPdf hocrPdf;
}
/**
 * Task running the OCR on a single PDF page. Pages for which
 * PDFBoxUtils.countCheckImage returns 0 are skipped. Otherwise the page is
 * rendered either by PDFBox (when no Ghostscript is configured) or by
 * Ghostscript, and the OCR result is stored in the hOCR container of the
 * context.
 *
 * NOTE(review): instances run on a thread pool and share the same context;
 * the thread-safety of HocrPdf.createPage is not visible here -- confirm.
 */
public class ImageOcrCallable implements Callable<Boolean> {
    private final PdfOcrContext context;
    private final PDPage page;
    private final int currentPage;
    private final AtomicInteger emptyPageImages;

    /**
     * @param context
     *            the shared parsing context
     * @param page
     *            the page to process
     * @param currentPage
     *            the 1-based number of the page
     * @param emptyPageImages
     *            shared counter, incremented when the PDFBox rendering of
     *            the page fails the ImageUtils.checkIfManyColors test
     */
    public ImageOcrCallable(PdfOcrContext context, PDPage page, int currentPage, AtomicInteger emptyPageImages) {
        this.context = context;
        this.page = page;
        this.currentPage = currentPage;
        this.emptyPageImages = emptyPageImages;
    }

    /**
     * @return false when the page was skipped by the image check, true
     *         otherwise
     */
    @Override
    public Boolean call() throws IOException, InterruptedException, SearchLibException {
        if (PDFBoxUtils.countCheckImage(page) == 0)
            return Boolean.FALSE;
        if (context.ghostScript != null) {
            // Ghostscript is configured: let it render the page
            ocrImageGhostcript(context, currentPage);
            return Boolean.TRUE;
        }
        final BufferedImage rendered = page.convertToImage(BufferedImage.TYPE_INT_BGR, 300);
        if (!ImageUtils.checkIfManyColors(rendered)) {
            // Counted as an empty page image, no OCR is attempted
            emptyPageImages.incrementAndGet();
            return Boolean.TRUE;
        }
        final HocrPage ocrPage = context.hocrPdf.createPage(currentPage - 1, rendered.getWidth(),
                rendered.getHeight());
        ocrPage.addImage(doOcr(context.ocr, context.lang, rendered));
        return Boolean.TRUE;
    }
}
/**
 * Runs the OCR on every page of the PDF, one task per page on the thread
 * pool of the configuration, and stores the result in the image_ocr_boxes
 * and/or ocr_content fields, depending on which ones are mapped.
 *
 * Nothing is done when the OCR manager is missing or disabled, or when
 * neither of the two fields is mapped.
 *
 * @param result
 *            the parser result receiving the OCR fields
 * @param context
 *            the parsing context; its ocr and hocrPdf members are set here
 * @throws SearchLibException
 *             if every page was counted as an empty page image, or
 *             propagated from the OCR
 * @throws IOException
 *             propagated from the page tasks
 * @throws InterruptedException
 *             if interrupted while waiting for the page tasks
 * @throws java.util.concurrent.ExecutionException
 *             if one of the page tasks failed
 */
private void extractImagesForOCR(ParserResultItem result, PdfOcrContext context)
        throws SearchLibException, IOException, InterruptedException, java.util.concurrent.ExecutionException {
    context.ocr = ClientCatalog.getOcrManager();
    if (context.ocr == null || context.ocr.isDisabled())
        return;
    if (!(getFieldMap().isMapped(ParserFieldEnum.ocr_content)
            || getFieldMap().isMapped(ParserFieldEnum.image_ocr_boxes)))
        return;
    context.hocrPdf = new HocrPdf();
    final List<?> pdfPages = context.pdf.getDocumentCatalog().getAllPages();
    final AtomicInteger blankPageCounter = new AtomicInteger(0);
    final ExecutorService pool = config.getThreadPool();
    final List<Future<Boolean>> pending = new ArrayList<Future<Boolean>>();
    int pageNumber = 0;
    // One OCR task per page, numbered from 1
    for (Iterator<?> pageIterator = pdfPages.iterator(); pageIterator.hasNext();) {
        final PDPage pdfPage = (PDPage) pageIterator.next();
        pageNumber++;
        pending.add(pool.submit(new ImageOcrCallable(context, pdfPage, pageNumber, blankPageCounter)));
    }
    ThreadUtils.<Boolean> done(pending);
    if (pageNumber > 0 && blankPageCounter.get() == pageNumber)
        throw new SearchLibException("All pages are blank " + pageNumber);
    if (getFieldMap().isMapped(ParserFieldEnum.image_ocr_boxes))
        context.hocrPdf.putHocrToParserField(result, ParserFieldEnum.image_ocr_boxes);
    if (getFieldMap().isMapped(ParserFieldEnum.ocr_content))
        context.hocrPdf.putTextToParserField(result, ParserFieldEnum.ocr_content);
}
/**
 * Merges the PDF files of a directory into one PDF. The files are taken in
 * the order given by LastModifiedFileComparator, and only the ones with a
 * "pdf" extension (case insensitive) are merged. An existing destination
 * file is deleted first.
 *
 * @param fileDir
 *            the directory containing the PDF files to merge
 * @param destFile
 *            the merged PDF to write
 * @throws SearchLibException
 *             if the directory cannot be listed, or if the merge fails
 */
@Override
public void mergeFiles(File fileDir, File destFile) throws SearchLibException {
    // listFiles returns null when the path is not a directory or on an I/O
    // error; report it explicitly instead of failing with a
    // NullPointerException
    File[] listed = fileDir.listFiles();
    if (listed == null)
        throw new SearchLibException("Unable to list the files of " + fileDir.getAbsolutePath());
    PDFMergerUtility pdfMerger = new PDFMergerUtility();
    File[] files = new LastModifiedFileComparator().sort(listed);
    for (File file : files) {
        String ext = FilenameUtils.getExtension(file.getName());
        if (!"pdf".equalsIgnoreCase(ext))
            continue;
        pdfMerger.addSource(file);
    }
    if (destFile.exists())
        destFile.delete();
    pdfMerger.setDestinationFileName(destFile.getAbsolutePath());
    try {
        pdfMerger.mergeDocuments();
    } catch (COSVisitorException e) {
        throw new SearchLibException(e);
    } catch (IOException e) {
        throw new SearchLibException(e);
    }
}
}