package de.uni_goettingen.sub.commons.ocr.tesseract;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Wrapper for the tesseract command-line tool.
*
* @author dennis
*
*
*/
public class Tesseract {

    protected static Logger logger = LoggerFactory.getLogger(Tesseract.class);

    /**
     * The command for the shell.
     */
    private static final String TESSERACT_COMMAND = "tesseract";

    /**
     * If this is empty, then tesseract will produce a text file.
     * Another option is currently 'hocr'.
     */
    private String format = "";

    /** The image file handed to tesseract as its first argument. */
    private final File inputImage;

    /** The output base path handed to tesseract as its second argument. */
    private final File outputBase;

    /**
     * The corresponding language package must be installed on the system.
     */
    private String language = "deu";

    /** When set, the suffix "-frak" is appended to the language code. */
    private boolean isGothic = false;

    /**
     * Creates a wrapper for one tesseract invocation.
     *
     * @param inputImage
     *            the image to run tesseract on
     * @param outputBase
     *            the output base path passed to tesseract
     */
    public Tesseract(File inputImage, File outputBase) {
        this.inputImage = inputImage;
        this.outputBase = outputBase;
    }

    /**
     * tesseract v3 only supports txt and hocr.
     *
     * @param format
     *            the format argument appended to the command line; when
     *            {@code null} or empty, no format argument is passed at all
     */
    public void setFormat(String format) {
        this.format = format;
    }

    /**
     * The corresponding language package must be installed on the system.
     *
     * @param language
     *            the language code passed to tesseract via {@code -l}
     */
    public void setLanguage(String language) {
        this.language = language;
    }

    /**
     * @param isGothic
     *            if {@code true}, "-frak" is appended to the language code
     */
    public void setGothic(boolean isGothic) {
        this.isGothic = isGothic;
    }

    /**
     * Starts tesseract on the command line using the parameter fields and
     * blocks until the process has terminated.
     * <p>
     * Stdout and stderr of the process are logged at debug level. Failures
     * (process could not be started, I/O error, interruption, non-zero exit
     * code) are logged as errors; no exception is propagated to the caller.
     */
    public void execute() {
        List<String> command = buildCommand();
        logger.debug("Executing command: {}", join(command));

        Process proc = null;
        try {
            proc = new ProcessBuilder(command).start();

            // Drain stderr concurrently: reading stdout to EOF first can
            // deadlock once the child fills the stderr pipe buffer.
            StreamCollector stderrCollector = new StreamCollector(proc.getErrorStream());
            stderrCollector.start();

            logger.debug("Tesseract stdout: " + IOUtils.toString(proc.getInputStream()));

            stderrCollector.join();
            int exitCode = proc.waitFor();

            String tessError = stderrCollector.getContent();
            logger.debug("Tesseract stderr: " + tessError);
            if (exitCode != 0) {
                logger.error("Tesseract exited with code " + exitCode + ": " + tessError);
            }
        } catch (IOException e) {
            logger.error("Error executing Tesseract.", e);
        } catch (InterruptedException e) {
            // Preserve the interrupt status for callers further up the stack.
            Thread.currentThread().interrupt();
            logger.error("Interrupted while waiting for Tesseract.", e);
        } finally {
            if (proc != null) {
                // Release the pipe file descriptors; destroy() is a no-op for
                // a process that already exited and kills one that is still
                // running (e.g. after an interrupt).
                IOUtils.closeQuietly(proc.getOutputStream());
                IOUtils.closeQuietly(proc.getInputStream());
                IOUtils.closeQuietly(proc.getErrorStream());
                proc.destroy();
            }
        }
    }

    /**
     * Assembles the argument list: command, input image, output base,
     * {@code -l <language>[-frak]} and, only if set, the format.
     */
    private List<String> buildCommand() {
        List<String> command = new ArrayList<String>();
        command.add(TESSERACT_COMMAND);
        command.add(inputImage.getAbsolutePath());
        command.add(outputBase.getAbsolutePath());
        command.add("-l");
        command.add(isGothic ? language + "-frak" : language);
        // An empty string would reach tesseract as a real (empty) argument,
        // and null is rejected by ProcessBuilder, so skip both.
        if (format != null && format.length() > 0) {
            command.add(format);
        }
        return command;
    }

    /** Joins the command parts with single spaces for logging. */
    private static String join(List<String> parts) {
        StringBuilder joined = new StringBuilder();
        for (String part : parts) {
            if (joined.length() > 0) {
                joined.append(' ');
            }
            joined.append(part);
        }
        return joined.toString();
    }

    /** Reads a stream to its end on a separate thread and keeps the text. */
    private static final class StreamCollector extends Thread {

        private final InputStream stream;
        private volatile String content = "";
        private volatile IOException failure;

        StreamCollector(InputStream stream) {
            super("tesseract-stderr-collector");
            this.stream = stream;
            setDaemon(true);
        }

        @Override
        public void run() {
            try {
                content = IOUtils.toString(stream);
            } catch (IOException e) {
                failure = e;
            }
        }

        /**
         * @return the collected text; call only after {@link #join()}
         * @throws IOException
         *             if reading the stream failed
         */
        String getContent() throws IOException {
            if (failure != null) {
                throw failure;
            }
            return content;
        }
    }
}