/**
* Licensed to The Apereo Foundation under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
*
* The Apereo Foundation licenses this file to you under the Educational
* Community License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of the License
* at:
*
* http://opensource.org/licenses/ecl2.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
*/
package org.opencastproject.textextractor.tesseract;
import org.opencastproject.textextractor.api.TextExtractor;
import org.opencastproject.textextractor.api.TextExtractorException;
import org.opencastproject.textextractor.api.TextFrame;
import org.opencastproject.util.ProcessRunner;
import com.entwinemedia.fn.Pred;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.cm.ManagedService;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Dictionary;
/**
 * Command line wrapper around the <code>tesseract</code> OCR binary.
 * <p>
 * The binary and any additional command line options can be set programmatically, through the bundle context
 * properties read in {@link #activate(ComponentContext)}, or through the configuration passed to
 * {@link #updated(Dictionary)}.
 */
public class TesseractTextExtractor implements TextExtractor, ManagedService {

  /** The logging facility */
  private static final Logger logger = LoggerFactory.getLogger(TesseractTextExtractor.class);

  /** Default name of the tesseract binary */
  public static final String TESSERACT_BINARY_DEFAULT = "tesseract";

  /** Configuration property that defines the path to the tesseract binary */
  public static final String TESSERACT_BINARY_CONFIG_KEY =
      "org.opencastproject.textanalyzer.tesseract.path";

  /**
   * Configuration property that defines additional tesseract options like the
   * language or the pagesegmode to use. This is just appended to the command
   * line when tesseract is called.
   */
  public static final String TESSERACT_OPTS_CONFIG_KEY =
      "org.opencastproject.textanalyzer.tesseract.options";

  /** Handler for the process' standard output: every line is logged at debug level. */
  private static final Pred<String> fnLogDebug = new Pred<String>() {
    @Override public Boolean apply(String s) {
      logger.debug(s);
      return true;
    }
  };

  /**
   * Handler for the process' error output: lines are logged as warnings, except for those starting with
   * <code>Page</code> or with the <code>Tesseract Open Source OCR Engine</code> banner.
   */
  private static final Pred<String> fnLogStderr = new Pred<String>() {
    @Override public Boolean apply(String line) {
      final String trimmed = line.trim();
      if (!trimmed.startsWith("Page") && !trimmed.startsWith("Tesseract Open Source OCR Engine")) {
        logger.warn(line);
      }
      return true;
    }
  };

  /** Binary of the tesseract command */
  protected String binary = null;

  /** Additional options for the tesseract command */
  protected String addOptions = "";

  /**
   * Creates a new tesseract command wrapper that will be using the default binary.
   */
  public TesseractTextExtractor() {
    this(TESSERACT_BINARY_DEFAULT);
  }

  /**
   * Creates a new tesseract command wrapper that will be using the given binary.
   *
   * @param binary
   *          the tesseract binary
   */
  public TesseractTextExtractor(String binary) {
    this.binary = binary;
  }

  /**
   * Returns the path to the <code>tesseract</code> binary.
   *
   * @return path to the binary
   */
  public String getBinary() {
    return binary;
  }

  /**
   * Sets additional options for tesseract calls.
   *
   * @param addOptions
   *          the options that are appended to the tesseract command line
   */
  public void setAdditionalOptions(String addOptions) {
    this.addOptions = addOptions;
  }

  /**
   * Returns the additional options for tesseract.
   *
   * @return additional options
   */
  public String getAdditionalOptions() {
    return addOptions;
  }

  /**
   * Sets the path to the <code>tesseract</code> binary.
   *
   * @param binary
   *          the path to the binary
   */
  public void setBinary(String binary) {
    this.binary = binary;
  }

  /**
   * {@inheritDoc}
   * <p>
   * Runs the configured binary on the image, reads the resulting <code>.txt</code> file located next to the
   * image and deletes that file again before returning.
   *
   * @throws IllegalStateException
   *           if no binary has been set
   * @throws IllegalArgumentException
   *           if <code>image</code> is <code>null</code>
   * @throws TextExtractorException
   *           if the process exits with a non-zero code or running it or reading its output fails
   * @see org.opencastproject.textextractor.api.TextExtractor#extract(java.io.File)
   */
  @Override
  public TextFrame extract(File image) throws TextExtractorException {
    if (binary == null) {
      throw new IllegalStateException("Binary is not set");
    }
    if (image == null) {
      throw new IllegalArgumentException("Image must not be null");
    }

    InputStream is = null;
    File outputFile = null;
    final File outputFileBase = new File(image.getParentFile(), FilenameUtils.getBaseName(image.getName()));

    // Run tesseract
    final String opts = getAnalysisOptions(image, outputFileBase);
    logger.info("Running Tesseract: {} {}", binary, opts);
    try {
      final int exitCode = ProcessRunner.run(ProcessRunner.mk(binary, opts), fnLogDebug, fnLogStderr);

      // From here on the process has run and may have written (partial) output. Register the file before
      // checking the exit code so that the finally block removes it on the failure path as well.
      outputFile = new File(outputFileBase.getAbsolutePath() + ".txt");

      if (exitCode != 0) {
        throw new TextExtractorException("Text analyzer " + binary + " exited with code " + exitCode);
      }

      // Read the tesseract output file
      is = new FileInputStream(outputFile);
      return TesseractTextFrame.parse(is);
    } catch (IOException e) {
      throw new TextExtractorException("Error running text extractor " + binary, e);
    } finally {
      IOUtils.closeQuietly(is);
      FileUtils.deleteQuietly(outputFile);
    }
  }

  /**
   * Builds the argument string for the <code>tesseract</code> call: the absolute path of the image, the
   * absolute path of the output file base and the additional options, separated by single spaces.
   * <p>
   * NOTE(review): the result is one whitespace separated string. How <code>ProcessRunner.mk</code> tokenizes it
   * is not visible here, so paths containing whitespace may not survive. Confirm against ProcessRunner.
   *
   * @param image
   *          the image file
   * @param outputFile
   *          the output file base, i.e. the output file without its extension
   * @return the options to run analysis on the image
   */
  protected String getAnalysisOptions(File image, File outputFile) {
    // Guard against setAdditionalOptions(null), which would otherwise add the literal text "null"
    final String extraOptions = this.addOptions == null ? "" : this.addOptions;
    StringBuilder options = new StringBuilder();
    options.append(image.getAbsolutePath());
    options.append(" ");
    options.append(outputFile.getAbsolutePath());
    options.append(" ");
    options.append(extraOptions);
    return options.toString();
  }

  /**
   * Applies an updated service configuration. Keys that are missing from the configuration leave the
   * corresponding current value untouched.
   *
   * @param properties
   *          the new configuration, or <code>null</code> if there is none, in which case the current settings
   *          are kept
   * @throws ConfigurationException
   *           declared by the interface, not thrown by this implementation
   */
  @Override
  public void updated(Dictionary properties) throws ConfigurationException {
    if (properties == null) {
      // The configuration admin passes null when no configuration exists (or it has been deleted)
      logger.debug("No configuration available for Tesseract, keeping current settings");
      return;
    }

    String path = (String) properties.get(TESSERACT_BINARY_CONFIG_KEY);
    if (path != null) {
      logger.info("Setting Tesseract path to {}", path);
      this.binary = path;
    }

    /* Set additional options for tesseract (i.e. language to use) */
    String addopts = (String) properties.get(TESSERACT_OPTS_CONFIG_KEY);
    if (addopts != null) {
      logger.info("Setting additional options for Tesseract to '{}'", addopts);
      this.addOptions = addopts;
    }
  }

  /**
   * Activation callback that reads the binary path and the additional options from the bundle context
   * properties. A missing path leaves the current binary untouched, missing options reset the additional
   * options to the empty string.
   *
   * @param cc
   *          the component context
   */
  public void activate(ComponentContext cc) {
    // Configure the tesseract binary
    String path = cc.getBundleContext().getProperty(TESSERACT_BINARY_CONFIG_KEY);
    if (path == null) {
      logger.debug("DEFAULT {}: {}", TESSERACT_BINARY_CONFIG_KEY, TESSERACT_BINARY_DEFAULT);
    } else {
      setBinary(path);
      logger.info("Setting Tesseract path to binary from config: {}", path);
    }

    /* Set additional options for tesseract (i.e. language to use) */
    String addopts = cc.getBundleContext().getProperty(TESSERACT_OPTS_CONFIG_KEY);
    if (addopts != null) {
      logger.info("Setting additional options for Tesseract to '{}'", addopts);
      this.addOptions = addopts;
    } else {
      logger.info("No additional options for Tesseract");
      this.addOptions = "";
    }
  }

}