package de.unigoettingen.sub.commons.ocrComponents.cli;
/*
© 2010, SUB Goettingen. All rights reserved.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.URISyntaxException;
import java.util.Properties;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import de.unigoettingen.sub.ocr.controller.OcrEngineStarter;
import de.unigoettingen.sub.ocr.controller.OcrParameters;
import de.unigoettingen.sub.ocr.controller.Validator;
/**
 * Command-line entry point for starting an OCR run.
 *
 * <p>Parses the command-line arguments, copies them into an {@link OcrParameters}
 * object, has them checked by a {@link Validator} and, if the validator answers
 * {@code "OK"}, hands them to an {@link OcrEngineStarter}. All user-facing messages
 * are written to a {@link PrintStream} that defaults to {@code System.out}.
 */
public class Main {

    /** Destination of all user-facing messages; replaceable for unit tests. */
    private PrintStream out = System.out;

    /** Definitions of the supported command-line options, filled by {@link #initOptions}. */
    private Options options = new Options();

    /** Result of parsing the arguments; stays unset when parsing failed. */
    private CommandLine parsedOptions;

    /** Set as soon as processing must stop early (help requested or illegal input). */
    private boolean terminated = false;

    private Validator paramValidator = new Validator();
    private OcrEngineStarter engineStarter = new OcrEngineStarter();

    // for unit tests
    void redirectSystemOutputTo(PrintStream stream) {
        out = stream;
    }

    // for unit tests
    void setValidator(Validator newValidator) {
        paramValidator = newValidator;
    }

    // for unit tests
    void setOcrEngineStarter(OcrEngineStarter newStarter) {
        engineStarter = newStarter;
    }

    /**
     * Program entry point; delegates to {@link #execute(String[])} on a fresh instance.
     *
     * @param args the raw command-line arguments
     */
    public static void main(String[] args) throws URISyntaxException, UnsupportedEncodingException {
        new Main().execute(args);
    }

    /**
     * Runs the complete flow: parse the arguments, build the parameters, validate
     * them and start the OCR engine. Returns early, after printing a message,
     * when the arguments cannot be parsed, help was requested, required options
     * are missing or the {@code -props} value is malformed.
     *
     * @param args the raw command-line arguments
     * @throws UnsupportedEncodingException if the encoding used for the help output is not supported
     */
    void execute(String[] args) throws UnsupportedEncodingException {
        initOptions(args);
        if (terminated) {
            return;
        }
        OcrParameters params = transformOptions();
        if (terminated) {
            return;
        }
        String validationMessage = paramValidator.validateParameters(params);
        if ("OK".equals(validationMessage)) {
            out.println("Starting OCR...");
            engineStarter.startOcrWithParams(params);
            out.println("Finished OCR.");
        } else {
            out.println("Illegal options: " + validationMessage);
        }
    }

    /**
     * Registers all supported options and parses {@code args} against them.
     * On a parse error a hint is printed and {@link #terminated} is set.
     *
     * @param args the raw command-line arguments
     */
    private void initOptions(String[] args) {
        options.addOption("help", false, "Print help");
        options.addOption("indir", true, "Input directory - required");
        options.addOption("informats", true, "File extensions, e.g. tif,jpg (default: all images)");
        options.addOption("texttype", true, "E.g. normal or gothic - required");
        options.addOption("langs", true, "Languages, e.g. de,en,fr - required");
        options.addOption("outdir", true, "Output directory - required");
        options.addOption("outformats", true, "Output formats, e.g. pdf,xml - required");
        options.addOption("prio", true, "Priority: -2, -1, 0, 1, or 2. default is 0");
        options.addOption("engine", true, "OCR engine, e.g. abbyy, abbyy-multiuser, ocrsdk, tesseract (default is abbyy)");
        options.addOption("props", true, "Further properties, comma-separated. E.g. -props lock.overwrite=true,user=hans,books.split=true");
        CommandLineParser parser = new GnuParser();
        try {
            parsedOptions = parser.parse(options, args);
        } catch (ParseException e) {
            out.println("Illegal arguments. Use -help.");
            terminated = true;
        }
    }

    /**
     * Writes the usage text for all registered options to {@link #out}.
     *
     * @throws UnsupportedEncodingException if the named encoding is not supported
     */
    private void printHelp() throws UnsupportedEncodingException {
        OutputStreamWriter osw = new OutputStreamWriter(out, "UTF8");
        PrintWriter pw = new PrintWriter(osw);
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(pw, HelpFormatter.DEFAULT_WIDTH, "java -jar ocr.jar <options>", "", options,
                HelpFormatter.DEFAULT_LEFT_PAD, HelpFormatter.DEFAULT_DESC_PAD, "");
        // Flush instead of close: closing the writer would also close the wrapped
        // stream, which is System.out unless it has been redirected.
        pw.flush();
    }

    /**
     * Copies the parsed command-line options into a new {@link OcrParameters} object.
     *
     * <p>If help was requested, the help text is printed; if a required option is
     * missing or the {@code -props} value is malformed, a message is printed. In all
     * of these cases {@link #terminated} is set and the returned object must not be used.
     *
     * @return the parameters built from the command line
     * @throws UnsupportedEncodingException if the encoding used for the help output is not supported
     */
    private OcrParameters transformOptions() throws UnsupportedEncodingException {
        OcrParameters params = new OcrParameters();
        if (parsedOptions.hasOption("help")) {
            terminated = true;
            printHelp();
            return params;
        }
        if (!requiredOptionsArePresent()) {
            out.println("Required options are missing. Use -help.");
            terminated = true;
            return params;
        }
        params.inputFolder = parsedOptions.getOptionValue("indir");
        params.inputTextType = parsedOptions.getOptionValue("texttype");
        params.inputLanguages = parsedOptions.getOptionValue("langs").split(",");
        params.outputFolder = parsedOptions.getOptionValue("outdir");
        params.outputFormats = parsedOptions.getOptionValue("outformats").split(",");

        if (parsedOptions.hasOption("informats")) {
            params.inputFormats = parsedOptions.getOptionValue("informats").split(",");
        }
        if (parsedOptions.hasOption("prio")) {
            params.priority = parsedOptions.getOptionValue("prio");
        }
        if (parsedOptions.hasOption("engine")) {
            params.ocrEngine = parsedOptions.getOptionValue("engine");
        }
        if (parsedOptions.hasOption("props")) {
            try {
                params.props = convertExtraProperties(parsedOptions.getOptionValue("props"));
            } catch (IllegalArgumentException e) {
                // Report malformed input like every other user error instead of crashing.
                out.println("Illegal value for -props: " + e.getMessage() + ". Use -help.");
                terminated = true;
            }
        }
        return params;
    }

    /**
     * Checks that every option marked as required in the help text was given.
     *
     * @return {@code true} only if indir, texttype, langs, outdir and outformats are all present
     */
    private boolean requiredOptionsArePresent() {
        boolean allPresent = true;
        allPresent &= parsedOptions.hasOption("indir");
        allPresent &= parsedOptions.hasOption("texttype");
        allPresent &= parsedOptions.hasOption("langs");
        allPresent &= parsedOptions.hasOption("outdir");
        allPresent &= parsedOptions.hasOption("outformats");
        return allPresent;
    }

    /**
     * Converts a comma-separated list of {@code key=value} pairs into {@link Properties}.
     *
     * <p>Each entry is split at its first {@code '='}, so values may themselves
     * contain {@code '='} and may be empty. Empty entries (e.g. from a doubled
     * comma) are skipped.
     *
     * @param extras the raw option value, e.g. {@code opt1=a,opt2=b}
     * @return the parsed properties
     * @throws IllegalArgumentException if an entry has no {@code '='} or an empty key
     */
    private Properties convertExtraProperties(String extras) {
        Properties extraProperties = new Properties();
        for (String extraProp : extras.split(",")) { // opt1=a,opt2=b
            if (extraProp.isEmpty()) {
                continue;
            }
            int separatorIndex = extraProp.indexOf('=');
            if (separatorIndex <= 0) {
                throw new IllegalArgumentException("expected key=value but got '" + extraProp + "'");
            }
            String key = extraProp.substring(0, separatorIndex);
            String value = extraProp.substring(separatorIndex + 1);
            extraProperties.setProperty(key, value);
        }
        return extraProperties;
    }
}