package de.uni_goettingen.sub.commons.ocr.tesseract;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.uni_goettingen.sub.commons.ocr.api.AbstractImage;
import de.uni_goettingen.sub.commons.ocr.api.AbstractOutput;
import de.uni_goettingen.sub.commons.ocr.api.AbstractProcess;
import de.uni_goettingen.sub.commons.ocr.api.OcrFormat;
import de.uni_goettingen.sub.commons.ocr.api.OcrImage;
import de.uni_goettingen.sub.commons.ocr.api.OcrOutput;
import de.uni_goettingen.sub.commons.ocr.api.OcrProcess;
import de.uni_goettingen.sub.commons.ocr.api.OcrTextType;
import de.unigoettingen.sub.commons.ocr.util.BeanProvider;
import de.unigoettingen.sub.commons.ocr.util.FileAccess;
import de.unigoettingen.sub.commons.ocr.util.merge.Merger;
import de.unigoettingen.sub.commons.ocr.util.merge.MergerProvider;

/**
 * Represents an OCR job with several images. Tesseract is started once per
 * image and requested output format; the per-image results are then merged
 * into one file per output format.
 */
public class TesseractProcess extends AbstractProcess implements OcrProcess {

	private static final long serialVersionUID = 4819408808755150623L;

	/** The logger. */
	protected static Logger logger = LoggerFactory
			.getLogger(TesseractProcess.class);

	/**
	 * The temp files which are generated for each run of tesseract. Are merged
	 * into one file at the end. The list is reset for every output format.
	 */
	private List<File> tempFiles = new ArrayList<File>();

	/**
	 * Languages of the images, mapped to strings which tesseract understands.
	 * Tesseract can only use one for each image.
	 */
	private Map<String, String> languages = new HashMap<String, String>();

	/**
	 * The extensions that are generated by tesseract. txt for text results,
	 * html for hocr results
	 */
	private Map<OcrFormat, String> extensions = new HashMap<OcrFormat, String>();

	/** Mappings of the interface formats to tesseract-specific ones */
	private Map<OcrFormat, String> formats = new HashMap<OcrFormat, String>();

	private MergerProvider mergerProvider = new MergerProvider();
	private BeanProvider beanProvider = new BeanProvider();
	private FileAccess fileAccess;

	{
		languages.put("de", "deu");
		languages.put("en", "eng");

		extensions.put(OcrFormat.TXT, "txt");
		extensions.put(OcrFormat.HOCR, "html");

		formats.put(OcrFormat.TXT, "");
		formats.put(OcrFormat.HOCR, "hocr");
	}

	// for unit tests
	void setMergerProvider(MergerProvider newProvider) {
		mergerProvider = newProvider;
	}

	// for unit tests
	void setBeanProvider(BeanProvider newBeanProvider) {
		beanProvider = newBeanProvider;
	}

	// overridable factory so that tests can substitute the tesseract wrapper
	Tesseract createTesseract(File image, File output) {
		return new Tesseract(image, output);
	}

	/**
	 * Registers a local image file for this process.
	 *
	 * @param localUri
	 *            URI of the image; must use the "file" scheme
	 * @throws IllegalArgumentException
	 *             if the URI has no scheme or a scheme other than "file"
	 */
	@Override
	public void addImage(URI localUri) {
		// constant-first comparison: getScheme() is null for relative URIs
		if (!"file".equals(localUri.getScheme())) {
			throw new IllegalArgumentException("Only local files can be processed. Path was: " + localUri);
		}
		OcrImage image = new AbstractImage() {};
		image.setLocalUri(localUri);
		ocrImages.add(image);
	}

	/**
	 * Registers an output format for this process.
	 *
	 * @param format
	 *            the requested format; must be one that has a tesseract
	 *            mapping in this class
	 * @throws IllegalArgumentException
	 *             if the format has no tesseract mapping
	 */
	@Override
	public void addOutput(OcrFormat format) {
		if (!formats.containsKey(format)) {
			throw new IllegalArgumentException("Format is not supported by tesseract: " + format);
		}
		OcrOutput output = new AbstractOutput() {};
		output.setLocalUri(constructLocalUri(format));
		output.setFormat(format);
		ocrOutputs.add(output);
	}

	/**
	 * Manages the input images and output files, then starts tesseract once for
	 * each image. Does nothing but log a warning if no images or no outputs
	 * were added. A failure for one output format is logged and does not
	 * prevent the remaining formats from being processed.
	 */
	public void start() {
		if (ocrOutputs.isEmpty() || ocrImages.isEmpty()) {
			logger.warn("There are no images or no defined outputs in the process.");
			return;
		}
		fileAccess = beanProvider.getFileAccess();

		for (OcrOutput output : ocrOutputs) {
			// eg TXT
			OcrFormat format = output.getFormat();
			try {
				processOutput(output, format);
			} catch (IOException e) {
				logger.error("Could not finish the process for format '" + format + "'.", e);
			}
		}
	}

	/**
	 * Runs tesseract on every image for one output format and merges the
	 * per-image results into the output's target file.
	 */
	private void processOutput(OcrOutput output, OcrFormat format) throws IOException {
		// Fresh collections per format: results of a previous format must not
		// be merged into (or deleted again for) this one.
		List<InputStream> inputsToMerge = new ArrayList<InputStream>();
		tempFiles.clear();
		OutputStream mergedOutput = null;
		try {
			String outputPath = output.getLocalUri().getPath();
			// to have a different file name for each OCRed text
			int imageNumber = 1;
			for (OcrImage image : ocrImages) {
				File imageFile = new File(image.getLocalUri());
				File tempOutput = new File(outputPath + imageNumber);
				imageNumber++;

				executeTesseract(imageFile, format, tempOutput);

				// eg html for HOCR files, is automatically added by
				// tesseract
				String actualExtension = extensions.get(format);
				File actualOutput = new File(tempOutput.getAbsolutePath() + "." + actualExtension);
				tempFiles.add(actualOutput);
				inputsToMerge.add(fileAccess.inputStreamForFile(actualOutput));
			}

			mergedOutput = fileAccess.outputStreamForFile(new File(outputPath));
			Merger merger = mergerProvider.createMerger(format);
			merger.mergeBuffered(inputsToMerge, mergedOutput);
		} finally {
			// Release the streams before deleting, and clean up the temp files
			// even when tesseract or the merge failed.
			for (InputStream input : inputsToMerge) {
				closeQuietly(input);
			}
			closeQuietly(mergedOutput);
			for (File file : tempFiles) {
				logger.info("Deleting file " + file.getAbsolutePath());
				fileAccess.deleteFile(file);
			}
		}
	}

	/**
	 * Closes the stream if it is not null; a failure to close is only logged.
	 */
	private void closeQuietly(Closeable stream) {
		if (stream == null) {
			return;
		}
		try {
			stream.close();
		} catch (IOException e) {
			logger.warn("Could not close stream.", e);
		}
	}

	/**
	 * Configures and runs tesseract for a single image.
	 *
	 * @param image
	 *            the image file to recognize
	 * @param format
	 *            the requested output format
	 * @param output
	 *            the output file without extension
	 * @throws IOException
	 *             declared for callers; which of the calls made here raises it
	 *             is not visible in this file
	 */
	private void executeTesseract(File image, OcrFormat format, File output) throws IOException {
		// getParentFile() is null when the path has no parent component
		File parentDir = output.getParentFile();
		if (parentDir != null && !fileAccess.fileExists(parentDir)) {
			fileAccess.makeDirs(parentDir);
		}

		Tesseract tesseract = createTesseract(image, output);
		tesseract.setFormat(formats.get(format));

		if (langs.isEmpty()) {
			logger.warn("No language defined! Setting to German as default.");
			langs.add(Locale.GERMAN);
		}
		// tesseract only takes one language
		Locale locale = new ArrayList<Locale>(langs).get(0);
		String tesseractLanguage = languages.get(locale.getLanguage());
		if (tesseractLanguage == null) {
			// NOTE(review): the value is still passed on unchanged; how
			// Tesseract handles a null language is not visible here.
			logger.warn("No tesseract mapping for language '" + locale.getLanguage() + "'.");
		}
		tesseract.setLanguage(tesseractLanguage);

		if (getTextType() == OcrTextType.GOTHIC) {
			tesseract.setGothic(true);
		}

		tesseract.execute();
	}
}