package mj.ocraptor.extraction.image_processing;
import java.awt.image.BufferedImage;
import java.io.File;
import mj.ocraptor.MainController;
import mj.ocraptor.MainController.Status;
import mj.ocraptor.configuration.Config;
import mj.ocraptor.configuration.properties.ConfigBool;
import mj.ocraptor.configuration.properties.ConfigInteger;
import mj.ocraptor.console.Platform;
import mj.ocraptor.console.Platform.Os;
import mj.ocraptor.events.EventManager;
import mj.ocraptor.file_handler.executer.CommandExecutor;
import mj.ocraptor.file_handler.executer.handler_impl.SimpleOutput;
import mj.ocraptor.tools.St;
import net.sourceforge.tess4j.Tesseract1;
import net.sourceforge.tess4j.TesseractException;
import org.apache.commons.io.FileUtils;
public class ImageTextExtractorTess4j extends ImageTextExtractor {
private final org.slf4j.Logger LOG = org.slf4j.LoggerFactory.getLogger(getClass());
private static final int WIDTH_THRESHOLD_TO_RESIZE = 1500;
private static final int HEIGHT_THRESHOLD_TO_RESIZE = 1500;
private static final int ONE_SECOND_IN_MS = 1000;
private static final int CHECK_TIMEOUT_INTERVAL = 10;
private String javaPath = "";
/**
*
*/
public ImageTextExtractorTess4j() {
super();
init();
}
/**
*
*
*/
private void init() {
// try {
// ImageTextExtractorTess4j.copyNativeFiles();
// } catch (Exception e) {
// String message = "Could not copy tesseracts native files. Shutting down...";
// LOG.error(message, e);
// JOptionPane.showMessageDialog(null, message + "\n\n"
// + StringTools.trimToLengthIndicatorRight(ExceptionUtils.getStackTrace(e), 500));
// // TODO: logging
// System.exit(1);
// }
if (this.cfg.useBuildInJRE()) {
final String basePath = Config.getBinsFolder() + File.separator + "portable-java"
+ File.separator;
Os os = Platform.getSystem();
if (os == Os.LINUX) {
javaPath = basePath + "lin-x86-64/bin/";
} else if (os == Os.OSX) {
javaPath = basePath + "osx-x86-64/bin/";
} else if (os == Os.WINDOWS) {
javaPath = basePath + "win-x86-64\\bin\\";
}
}
}
private static final int PSM_MODE = 1;
private static final String TESSERACT_FOLDER = "res/tess";
/**
*
*
* @param file
* @param originalPath
* @param language
* @return
*/
private String ocr(File file, String originalPath, String language) {
// TODO:
Tesseract1 instance = new Tesseract1();
instance.setPageSegMode(PSM_MODE);
instance.setDatapath(TESSERACT_FOLDER);
instance.setLanguage(language);
try {
String result = instance.doOCR(file);
return result;
} catch (TesseractException e) {
e.printStackTrace();
}
return null;
}
private static final String TESS4J_WRAPPER_MAIN_CLASS = "CommandLineInterpreter";
/**
*
*
* @param file
* @param originalPath
* @param language
* @return
*
*/
private String ocrExternal(File file, String originalPath, String language) {
String command = javaPath + "java -Dfile.encoding=UTF-8 -Xmx256m -cp \""
+ Config.getTess4jWrapperBinPath() + (Platform.getSystem() == Os.WINDOWS ? ";" : ":")
+ Config.getLibraryFolderPath() + "/*\" " + TESS4J_WRAPPER_MAIN_CLASS + " \""
+ file.getAbsolutePath() + "\" \"" + language + "\"";
// LOG.info(command);
String errOutput = null, stdOutput = null;
SimpleOutput eventHandler = new SimpleOutput();
CommandExecutor bashExecuter = new CommandExecutor(Platform.getSystem(), eventHandler);
bashExecuter.setCommand(command);
Thread executorThread = new Thread(bashExecuter);
MainController controller = MainController.inst();
if (command != null && !command.trim().isEmpty()) {
executorThread.start();
final long startTime = System.currentTimeMillis();
boolean pause = false;
while (!Thread.currentThread().isInterrupted() && executorThread.isAlive()) {
long duration = System.currentTimeMillis() - startTime;
if (!pause) {
pause = (controller.getStatus() == Status.PAUSED);
}
// monitoring, if timeout occures --> kill process
try {
Thread.sleep(CHECK_TIMEOUT_INTERVAL);
final int timeout = this.cfg.getProp(ConfigInteger.PROCESSING_TIMEOUT_IN_SECONDS)
* ONE_SECOND_IN_MS;
if (duration > timeout || (controller.getStatus() == Status.STOPPED) || pause) {
bashExecuter.killProcess();
if (duration > timeout) {
EventManager.instance().failedToProcessFile(
"Timeout processing image with OCR-Engine.", originalPath);
}
break;
}
} catch (InterruptedException e) {
e.printStackTrace();
}
}
// if application is paused, kill all current processes
// and restart them on resume
if (pause) {
while (!Thread.currentThread().isInterrupted()
&& (controller.getStatus() != Status.STOPPED)) {
try {
Thread.sleep(100);
} catch (InterruptedException e) {
e.printStackTrace();
}
if (controller.getStatus() != Status.PAUSED) {
return this.ocr(file, originalPath, language);
}
}
}
}
errOutput = eventHandler.getErrOut();
stdOutput = eventHandler.getStdOut();
if (bashExecuter.isCommandStillRunning()) {
executorThread.interrupt();
}
// @formatter:off
String[] acceptedMessages = new String[] {
"using default language params",
"weak margin",
"test blob"
};
// @formatter:on
if (errOutput != null && !errOutput.trim().isEmpty()) {
for (String acceptedMessage : acceptedMessages) {
if (errOutput.toLowerCase().contains(acceptedMessage)) {
return stdOutput;
}
}
LOG.error(errOutput);
}
// System.out.println(stdOutput);
// System.out.println(errOutput);
return stdOutput;
}
/**
*
*
* @param image
* @param originalPath
* @param language
* @return
*/
private String ocr(BufferedImage image, String originalPath, String language) {
// TODO:
return null;
}
/**
*
*
* @return
*/
private String extractText(BufferedImage image, File file, String originalPath,
String... languages) {
// ------------------------------------------------ //
// --
// ------------------------------------------------ //
String parsedText = null;
try {
String languageStrings = "";
for (String lang : languages) {
languageStrings += lang + "+";
}
languageStrings = St.removeLastCharacters(languageStrings, 1);
if (image != null) {
parsedText = ocr(image, originalPath, languageStrings);
}
if (file != null) {
parsedText = ocr(file, originalPath, languageStrings);
}
} catch (Exception e) {
e.printStackTrace();
}
return parsedText;
}
/**
*
*
* @return
*/
@Override
public String extractText(BufferedImage image, String originalPath, String... languages) {
if (image != null && cfg.getProp(ConfigBool.ENABLE_IMAGE_OCR)) {
BufferedImage processedImage = null;
if (this.lastGivenBufferedImage != null && this.lastProcessedBufferedImage != null
&& this.lastGivenBufferedImage == image) {
processedImage = this.lastProcessedBufferedImage;
} else {
processedImage = preProcessImage(image);
this.lastGivenBufferedImage = image;
this.lastProcessedBufferedImage = processedImage;
}
File tempImageFile = this.preprocessor.bufferedImageToFile(processedImage);
if (tempImageFile != null) {
return extractText(null, tempImageFile, originalPath, languages);
}
}
return null;
}
/**
*
*
* @return
*/
@Override
public String extractText(File file, String originalPath, String... languages) {
if (file != null && cfg.getProp(ConfigBool.ENABLE_IMAGE_OCR)) {
BufferedImage processedImage = null;
if (this.lastGivenImageFile != null && this.lastProcessedBufferedImage != null
&& this.lastGivenImageFile == file) {
processedImage = this.lastProcessedBufferedImage;
} else {
BufferedImage image = getPreprocessor().fileToBufferedImage(file);
if (image != null) {
processedImage = preProcessImage(image);
this.lastGivenImageFile = file;
this.lastProcessedBufferedImage = processedImage;
}
}
File tempImageFile = this.preprocessor.bufferedImageToFile(processedImage);
return extractText(null, tempImageFile, originalPath, languages);
}
return null;
}
/**
*
*
* @param image
* @return
*/
private BufferedImage preProcessImage(BufferedImage image) {
BufferedImage processedImage = null;
if (image.getWidth() < WIDTH_THRESHOLD_TO_RESIZE
|| image.getHeight() < HEIGHT_THRESHOLD_TO_RESIZE) {
processedImage = getPreprocessor().preprocessForOCR(image, true);
} else {
processedImage = getPreprocessor().preprocessForOCR(image, false);
}
return processedImage;
}
// ------------------------------------------------ //
// --
// ------------------------------------------------ //
/**
*
*
* @return
*/
public static void copyNativeFiles() throws Exception {
if (!Config.devMode()) {
// try {
final String nativeLibPath = Config.getTess4jNativeLibrariesFolderPath();
if (nativeLibPath != null) {
final File path = new File(nativeLibPath);
if (path.exists() && path.isDirectory()) {
final File[] libs = path.listFiles();
File basePath = Config.getJarFileFolder();
basePath = new File("");
if (basePath != null) {
for (File lib : libs) {
final File newLibFile = new File(basePath.getParent(), lib.getName());
if (!newLibFile.exists()) {
FileUtils.copyFile(lib, newLibFile);
newLibFile.deleteOnExit();
Runtime.getRuntime().addShutdownHook(new Thread() {
@Override
public void run() {
if (newLibFile.exists() && newLibFile.canWrite()) {
newLibFile.delete();
}
}
});
}
}
}
}
}
// } catch (Exception e) {
// String message =
// "Could not copy tesseracts native files. Shutting down...";
// LOG.error(message, e);
// JOptionPane.showMessageDialog(null, message + "\n\n"
// +
// StringTools.trimToLengthIndicatorRight(ExceptionUtils.getStackTrace(e),
// 500));
// // TODO: logging
// System.exit(1);
// }
}
}
}