/**
* OpenKM, Open Document Management System (http://www.openkm.com)
* Copyright (c) 2006-2011 Paco Avila & Josep Llort
*
* No bytes were intentionally harmed during the development of this application.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package com.openkm.extractor;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import org.apache.commons.io.IOUtils;
import org.apache.jackrabbit.extractor.AbstractTextExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.openkm.core.Config;
import com.openkm.util.DocumentUtils;
import com.openkm.util.ExecutionUtils;
import com.openkm.util.FileUtils;
import com.openkm.util.TemplateUtils;
/**
* Text extractor for image documents.
* Uses the Tesseract OCR engine (http://code.google.com/p/tesseract-ocr/).
*/
public class Tesseract3TextExtractor extends AbstractTextExtractor {
    /**
     * Logger instance.
     */
    private static final Logger log = LoggerFactory.getLogger(Tesseract3TextExtractor.class);

    /**
     * Charset used to decode the OCR result file. Tesseract writes its text output as UTF-8,
     * so decoding with the platform default charset would corrupt non-ASCII characters.
     */
    private static final String OCR_OUTPUT_ENCODING = "UTF-8";

    /**
     * Creates a new <code>TextExtractor</code> instance registered for the supported image
     * MIME types.
     */
    public Tesseract3TextExtractor() {
        // "image/jpg" is kept for backward compatibility; "image/jpeg" is the registered
        // MIME type for JPEG images.
        super(new String[] { "image/tiff", "image/gif", "image/jpg", "image/jpeg", "image/png" });
    }

    //-------------------------------------------------------< TextExtractor >

    /**
     * Extracts text from an image by running the external OCR command configured in
     * <code>Config.SYSTEM_OCR</code>, optionally passing the result through the spell checker
     * when <code>Config.SYSTEM_OPENOFFICE_DICTIONARY</code> is set.
     * <p>
     * The image is copied to a temporary file, the OCR command is invoked as
     * <code>SYSTEM_OCR &lt;fileIn&gt; &lt;fileOut&gt;</code> and the recognized text is read
     * from <code>&lt;fileOut&gt;.txt</code>. All temporary files are removed and the given
     * stream is closed before this method returns, on every code path.
     *
     * @param stream   image content; always closed by this method
     * @param type     MIME type of the image, used to create the temporary input file
     * @param encoding not used by this extractor
     * @return a reader over the recognized text, or an empty reader when no OCR command is
     *         configured or when the extraction fails for any reason
     * @throws IOException declared by the interface; failures are logged and result in an
     *         empty reader instead
     */
    public Reader extractText(InputStream stream, String type, String encoding) throws IOException {
        File tmpFileIn = null;
        File tmpFileOut = null;
        File tmpFileTxt = null;
        String cmd = null;

        try {
            if (Config.SYSTEM_OCR.equals("")) {
                log.warn("Undefined OCR application");
                return new StringReader("");
            }

            // Create temp files
            tmpFileIn = FileUtils.createTempFileFromMime(type);
            tmpFileOut = File.createTempFile("okm", "");
            // The result is read from the output base name plus ".txt"
            tmpFileTxt = new File(tmpFileOut.getPath() + ".txt");
            copyToFile(stream, tmpFileIn);

            // Performs OCR
            HashMap<String, Object> hm = new HashMap<String, Object>();
            hm.put("fileIn", tmpFileIn.getPath());
            hm.put("fileOut", tmpFileOut.getPath());
            String tpl = Config.SYSTEM_OCR + " ${fileIn} ${fileOut}";
            cmd = TemplateUtils.replace("SYSTEM_OCR", tpl, hm);
            ExecutionUtils.runCmd(cmd);

            // Read result
            String text = readOcrOutput(tmpFileTxt);

            // Spellchecker
            if (!Config.SYSTEM_OPENOFFICE_DICTIONARY.equals("")) {
                text = DocumentUtils.spellChecker(text);
            }

            log.info("TEXT: {}", text);
            return new StringReader(text);
        } catch (SecurityException e) {
            log.warn("Security exception executing command: " + cmd, e);
            return new StringReader("");
        } catch (IOException e) {
            log.warn("IO exception executing command: " + cmd, e);
            return new StringReader("");
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can react to the interruption
            Thread.currentThread().interrupt();
            log.warn("Interrupted exception executing command: " + cmd, e);
            return new StringReader("");
        } catch (Exception e) {
            log.warn("Failed to extract OCR text", e);
            return new StringReader("");
        } finally {
            IOUtils.closeQuietly(stream);
            // Any of these may still be null if a failure happened before it was created
            deleteTempFile(tmpFileIn);
            deleteTempFile(tmpFileOut);
            deleteTempFile(tmpFileTxt);
        }
    }

    /**
     * Copies the whole stream into the given file, closing the file even when the copy fails.
     */
    private static void copyToFile(InputStream in, File target) throws IOException {
        FileOutputStream fos = new FileOutputStream(target);

        try {
            IOUtils.copy(in, fos);
            // Explicit close so that a failure while flushing is reported, not swallowed
            fos.close();
        } finally {
            IOUtils.closeQuietly(fos);
        }
    }

    /**
     * Reads the OCR result file as text, always releasing the file handle.
     */
    private static String readOcrOutput(File file) throws IOException {
        FileInputStream fis = new FileInputStream(file);

        try {
            return IOUtils.toString(fis, OCR_OUTPUT_ENCODING);
        } finally {
            IOUtils.closeQuietly(fis);
        }
    }

    /**
     * Deletes a temporary file, ignoring <code>null</code> references.
     */
    private static void deleteTempFile(File file) {
        if (file != null) {
            FileUtils.deleteQuietly(file);
        }
    }
}