/*
* Created on 03 dic 2016
* Copyright 2015 by Andrea Vacondio (andrea.vacondio@gmail.com).
* This file is part of Sejda.
*
* Sejda is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Sejda is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with Sejda. If not, see <http://www.gnu.org/licenses/>.
*/
package org.sejda.impl.sambox.ocr.component;
import static java.util.Objects.nonNull;
import static java.util.Optional.ofNullable;
import static org.sejda.util.RequireUtils.requireNotNullArg;
import java.io.Closeable;
import java.io.IOException;
import java.io.Writer;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.sejda.sambox.contentstream.PDFStreamEngine;
import org.sejda.sambox.contentstream.operator.MissingOperandException;
import org.sejda.sambox.contentstream.operator.Operator;
import org.sejda.sambox.contentstream.operator.OperatorProcessor;
import org.sejda.sambox.cos.COSBase;
import org.sejda.sambox.cos.COSDictionary;
import org.sejda.sambox.cos.COSName;
import org.sejda.sambox.cos.COSStream;
import org.sejda.sambox.pdmodel.MissingResourceException;
import org.sejda.sambox.pdmodel.PDPage;
import org.sejda.sambox.pdmodel.common.PDStream;
import org.sejda.sambox.pdmodel.graphics.PDXObject;
import org.sejda.sambox.pdmodel.graphics.form.PDFormXObject;
import org.sejda.sambox.pdmodel.graphics.form.PDTransparencyGroup;
import org.sejda.sambox.pdmodel.graphics.image.PDImageXObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Stream engine doing OCR on images of a PDPage
*
* @author Andrea Vacondio
*/
public class OcrTextExtractor extends PDFStreamEngine implements Consumer<PDPage>, Closeable {
private static final Logger LOG = LoggerFactory.getLogger(OcrTextExtractor.class);
private Writer writer;
private OCR ocrEngine;
public OcrTextExtractor(Writer writer, OCR ocrEngine) {
requireNotNullArg(writer, "Cannot write text on a null writer");
requireNotNullArg(ocrEngine, "OCR engine cannot be null");
addOperator(new DoOCR());
this.writer = writer;
this.ocrEngine = ocrEngine;
// so it uses the env variable TESSDATA_PREFIX
this.ocrEngine.setDatapath(null);
}
public void setLanguage(Set<Locale> languages) {
if (nonNull(languages) && !languages.isEmpty()) {
ocrEngine.setLanguage(languages.stream().map(Locale::getISO3Language).collect(Collectors.joining("+")));
} else {
// default to eng
ocrEngine.setLanguage("eng");
}
}
private class DoOCR extends OperatorProcessor {
@Override
public void process(Operator operator, List<COSBase> operands) throws IOException {
if (operands.isEmpty()) {
throw new MissingOperandException(operator, operands);
}
COSBase operand = operands.get(0);
if (operand instanceof COSName) {
COSName name = (COSName) operand;
COSBase existing = ofNullable(
getContext().getResources().getCOSObject().getDictionaryObject(COSName.XOBJECT,
COSDictionary.class))
.map(d -> d.getDictionaryObject(name))
.orElseThrow(() -> new MissingResourceException("Missing XObject: " + name.getName()));
if (existing instanceof COSStream) {
COSStream stream = (COSStream) existing;
String subtype = stream.getNameAsString(COSName.SUBTYPE);
if (COSName.IMAGE.getName().equals(subtype)) {
LOG.trace("Performing OCR on {}", name);
PDXObject xobject = PDXObject.createXObject(stream.getCOSObject(), getContext().getResources());
try {
OcrTextExtractor.this.writer
.write(ocrEngine.ocrTextFrom(((PDImageXObject) xobject).getImage()));
} catch (IOException e) {
LOG.warn("Unable to OCR image", e);
}
xobject.getCOSObject().unDecode();
} else if (COSName.FORM.getName().equals(subtype)) {
PDXObject xobject = PDXObject.createXObject(existing.getCOSObject(),
getContext().getResources());
if (xobject instanceof PDTransparencyGroup) {
getContext().showTransparencyGroup((PDTransparencyGroup) xobject);
} else if (xobject instanceof PDFormXObject) {
getContext().showForm((PDFormXObject) xobject);
}
}
}
}
}
@Override
public String getName() {
return "Do";
}
}
/**
* process the page
*
* @throws UnsatisfiedLinkError
* in case the OCR engine is not found
*/
@Override
public void accept(PDPage page) {
try {
if (page.hasContents()) {
processPage(page);
unload(page);
} else {
LOG.debug("Skipping page with no content");
}
} catch (IOException e) {
LOG.error("An error occurred doing OCR on page, skipping and continuing with next.", e);
}
}
private void unload(PDPage page) {
Iterator<PDStream> iter = page.getContentStreams();
while (iter.hasNext()) {
iter.next().getCOSObject().unDecode();
}
}
@Override
public void close() {
IOUtils.closeQuietly(this.writer);
}
}