package de.uni_goettingen.sub.commons.ocr.tesseract;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Locale;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import de.uni_goettingen.sub.commons.ocr.api.AbstractImage;
import de.uni_goettingen.sub.commons.ocr.api.AbstractOutput;
import de.uni_goettingen.sub.commons.ocr.api.AbstractProcess;
import de.uni_goettingen.sub.commons.ocr.api.OcrFormat;
import de.uni_goettingen.sub.commons.ocr.api.OcrImage;
import de.uni_goettingen.sub.commons.ocr.api.OcrOutput;
import de.uni_goettingen.sub.commons.ocr.api.OcrProcess;
import de.uni_goettingen.sub.commons.ocr.api.OcrTextType;
import de.unigoettingen.sub.commons.ocr.util.BeanProvider;
import de.unigoettingen.sub.commons.ocr.util.FileAccess;
import de.unigoettingen.sub.commons.ocr.util.merge.Merger;
import de.unigoettingen.sub.commons.ocr.util.merge.MergerProvider;
/**
 * Represents an OCR job with several images.
 * <p>
 * For every configured output format, tesseract is run once per image. The
 * per-image results are then merged into the single output file of that
 * format and the intermediate files are deleted.
 */
public class TesseractProcess extends AbstractProcess implements
		OcrProcess {

	private static final long serialVersionUID = 4819408808755150623L;

	/** The logger. */
	protected static Logger logger = LoggerFactory
			.getLogger(TesseractProcess.class);

	/**
	 * The temp files which are generated for each run of tesseract. Are merged
	 * into one file at the end. The list is emptied again once the files have
	 * been deleted.
	 */
	private List<File> tempFiles = new ArrayList<File>();

	/**
	 * Languages of the images, mapped to strings which tesseract understands.
	 * Tesseract can only use one for each image.
	 */
	private Map<String, String> languages = new HashMap<String, String>();

	/**
	 * The extensions that are generated by tesseract. txt for text results,
	 * html for hocr results
	 */
	private Map<OcrFormat, String> extensions = new HashMap<OcrFormat, String>();

	/** Mappings of the interface formats to tesseract-specific ones */
	private Map<OcrFormat, String> formats = new HashMap<OcrFormat, String>();

	private MergerProvider mergerProvider = new MergerProvider();
	private BeanProvider beanProvider = new BeanProvider();
	private FileAccess fileAccess;

	{
		languages.put("de", "deu");
		languages.put("en", "eng");

		extensions.put(OcrFormat.TXT, "txt");
		extensions.put(OcrFormat.HOCR, "html");

		formats.put(OcrFormat.TXT, "");
		formats.put(OcrFormat.HOCR, "hocr");
	}

	// for unit tests
	void setMergerProvider(MergerProvider newProvider) {
		mergerProvider = newProvider;
	}

	// for unit tests
	void setBeanProvider(BeanProvider newBeanProvider) {
		beanProvider = newBeanProvider;
	}

	/**
	 * Factory method for the tesseract wrapper; package-private so that it can
	 * be overridden.
	 *
	 * @param image
	 *            the image file to be processed
	 * @param output
	 *            the output file passed to the wrapper
	 * @return a new wrapper instance for the given files
	 */
	Tesseract createTesseract(File image, File output) {
		return new Tesseract(image, output);
	}

	/**
	 * Adds an image to this process.
	 *
	 * @param localUri
	 *            URI of the image, must use the "file" scheme
	 * @throws IllegalArgumentException
	 *             if the URI has no scheme or a scheme other than "file"
	 */
	@Override
	public void addImage(URI localUri) {
		// constant-first comparison: getScheme() is null for relative URIs
		if (!"file".equals(localUri.getScheme())) {
			throw new IllegalArgumentException("Only local files can be processed. Path was: " + localUri);
		}
		OcrImage image = new AbstractImage() {};
		image.setLocalUri(localUri);
		ocrImages.add(image);
	}

	/**
	 * Adds an output in the given format to this process.
	 *
	 * @param format
	 *            the requested format, must be one of the formats mapped in
	 *            this class
	 * @throws IllegalArgumentException
	 *             if there is no mapping for the format
	 */
	@Override
	public void addOutput(OcrFormat format) {
		if (!formats.containsKey(format)) {
			throw new IllegalArgumentException("Format is not supported by tesseract: " + format);
		}
		OcrOutput output = new AbstractOutput() {};
		output.setLocalUri(constructLocalUri(format));
		output.setFormat(format);
		ocrOutputs.add(output);
	}

	/**
	 * Manages the input images and output files, then starts tesseract once for
	 * each image.
	 * <p>
	 * Does nothing but log a warning if there are no images or no outputs. An
	 * {@link IOException} while processing one format is logged and the next
	 * format is processed.
	 */
	public void start() {
		if (ocrOutputs.isEmpty() || ocrImages.isEmpty()) {
			logger.warn("There are no images or no defined outputs in the process.");
			return;
		}

		fileAccess = beanProvider.getFileAccess();

		for (OcrOutput output : ocrOutputs) {
			// eg TXT
			OcrFormat format = output.getFormat();
			// must be fresh for every format, otherwise the results of the
			// previous formats would be merged into this one as well
			List<InputStream> inputsToMerge = new ArrayList<InputStream>();
			OutputStream mergedOutput = null;

			try {
				try {
					// to have a different file name for each OCRed text
					int i = 1;
					for (OcrImage image : ocrImages) {
						File imageFile = new File(image.getLocalUri());
						File tempOutput = new File(output.getLocalUri().getPath() + i);
						i++;

						executeTesseract(imageFile, format, tempOutput);

						// eg html for HOCR files, is automatically added by
						// tesseract
						String actualExtension = extensions.get(format);
						File actualOutput = new File(tempOutput.getAbsolutePath() + "."
								+ actualExtension);
						tempFiles.add(actualOutput);

						inputsToMerge.add(fileAccess.inputStreamForFile(actualOutput));
					}

					File localOutput = new File(output.getLocalUri().getPath());
					mergedOutput = fileAccess.outputStreamForFile(localOutput);

					Merger merger = mergerProvider.createMerger(format);
					merger.mergeBuffered(inputsToMerge, mergedOutput);
				} finally {
					// release the streams before the temp files get deleted
					closeQuietly(inputsToMerge, mergedOutput);
				}

				deleteTempFiles();
			} catch (IOException e) {
				logger.error("Could not finish the process for format '" + format + "'.", e);
			}
		}
	}

	/**
	 * Deletes all files registered as temp files and empties the list, so that
	 * no file is deleted a second time for the next format.
	 *
	 * @throws IOException
	 *             if deleting fails with an I/O error
	 */
	private void deleteTempFiles() throws IOException {
		for (File file : tempFiles) {
			logger.info("Deleting file {}", file.getAbsolutePath());
			fileAccess.deleteFile(file);
		}
		tempFiles.clear();
	}

	/**
	 * Closes the given streams. Errors while closing are only logged, they
	 * must not hide the actual result of the process.
	 *
	 * @param inputs
	 *            the input streams, may contain null entries
	 * @param output
	 *            the output stream, may be null
	 */
	private void closeQuietly(List<InputStream> inputs, OutputStream output) {
		for (InputStream input : inputs) {
			if (input == null) {
				continue;
			}
			try {
				input.close();
			} catch (IOException e) {
				logger.warn("Could not close input stream.", e);
			}
		}
		if (output != null) {
			try {
				output.close();
			} catch (IOException e) {
				logger.warn("Could not close output stream.", e);
			}
		}
	}

	/**
	 * Configures and runs tesseract for one image.
	 *
	 * @param image
	 *            the image file to be processed
	 * @param format
	 *            the requested output format
	 * @param output
	 *            the output file handed to tesseract, its parent directory is
	 *            created if it does not exist
	 * @throws IOException
	 *             declared for I/O failures of the called helpers
	 */
	private void executeTesseract(File image, OcrFormat format, File output) throws IOException {
		// null if the path has no parent component, then there is nothing to create
		File parentDir = output.getParentFile();
		if (parentDir != null && !fileAccess.fileExists(parentDir)) {
			fileAccess.makeDirs(parentDir);
		}

		Tesseract tesseract = createTesseract(image, output);
		tesseract.setFormat(formats.get(format));

		if (langs.isEmpty()) {
			logger.warn("No language defined! Setting to German as default.");
			langs.add(Locale.GERMAN);
		}
		// tesseract only takes one language
		Locale locale = new ArrayList<Locale>(langs).get(0);
		String tesseractLanguage = languages.get(locale.getLanguage());
		if (tesseractLanguage == null) {
			logger.warn("There is no tesseract language mapping for '{}'.", locale.getLanguage());
		}
		tesseract.setLanguage(tesseractLanguage);

		if (getTextType() == OcrTextType.GOTHIC) {
			tesseract.setGothic(true);
		}

		tesseract.execute();
	}
}