/*
* Copyright 2011 Robert Theis
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.sfsu.cs.orange.ocr;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.xeustechnologies.jtar.TarEntry;
import org.xeustechnologies.jtar.TarInputStream;
import android.app.ProgressDialog;
import android.content.Context;
import android.os.AsyncTask;
import android.util.Log;
import com.googlecode.tesseract.android.TessBaseAPI;
/**
* Installs the language data required for OCR, and initializes the OCR engine using a background
* thread.
*/
final class OcrInitAsyncTask extends AsyncTask<String, String, Boolean> {
/** Tag used for every log message written by this task. */
private static final String TAG = OcrInitAsyncTask.class.getSimpleName();

/**
 * Suffixes of required data files for Cube. Each suffix is appended to the language code to
 * form the name of a file inside the "tessdata" directory.
 */
private static final String[] CUBE_DATA_FILES = {
  ".cube.bigrams",
  ".cube.fold",
  ".cube.lm",
  ".cube.nn",
  ".cube.params",
  //".cube.size", // This file is not available for Hindi
  ".cube.word-freq",
  ".tesseract_cube.nn",
  ".traineddata"
};

/** The calling activity; receives the UI callbacks made by this task. */
private final CaptureActivity activity;

/** Base context of the calling activity; used to open application assets. */
private final Context context;

/** API to the OCR engine that is initialized once the data files are in place. */
private final TessBaseAPI baseApi;

/** Dialog box with thermometer progress indicator. */
private final ProgressDialog dialog;

/** Dialog box with indeterminate progress indicator. */
private final ProgressDialog indeterminateDialog;

/** ISO 639-2 OCR language code, for example "eng". */
private final String languageCode;

/**
 * Name shown in progress messages, for example "English". Deliberately not final: it is
 * reassigned while the orientation and script detection data is being installed.
 */
private String languageName;

/** Whether to use Tesseract, Cube, or both. */
private final int ocrEngineMode;
/**
 * Creates a task that installs OCR language data when necessary and then initializes
 * Tesseract, all off the UI thread.
 *
 * @param activity
 *          The calling activity; its base context is also retained
 * @param baseApi
 *          API to the OCR engine
 * @param dialog
 *          Dialog box with thermometer progress indicator
 * @param indeterminateDialog
 *          Dialog box with indeterminate progress indicator
 * @param languageCode
 *          ISO 639-2 OCR language code
 * @param languageName
 *          Name of the OCR language, for example, "English"
 * @param ocrEngineMode
 *          Whether to use Tesseract, Cube, or both
 */
OcrInitAsyncTask(CaptureActivity activity, TessBaseAPI baseApi, ProgressDialog dialog,
    ProgressDialog indeterminateDialog, String languageCode, String languageName,
    int ocrEngineMode) {
  // OCR engine and language configuration
  this.baseApi = baseApi;
  this.ocrEngineMode = ocrEngineMode;
  this.languageCode = languageCode;
  this.languageName = languageName;

  // UI collaborators
  this.dialog = dialog;
  this.indeterminateDialog = indeterminateDialog;
  this.activity = activity;
  this.context = activity.getBaseContext();
}
/**
 * Configures and shows the determinate progress dialog, then turns off the activity's button
 * visibility, before the background work starts.
 */
@Override
protected void onPreExecute() {
  super.onPreExecute();

  final ProgressDialog progress = dialog;

  // Text shown until the first progress update arrives
  progress.setTitle("Please wait");
  progress.setMessage("Checking for data installation...");

  // Horizontal, determinate bar that cannot be cancelled
  progress.setIndeterminate(false);
  progress.setProgressStyle(ProgressDialog.STYLE_HORIZONTAL);
  progress.setCancelable(false);
  progress.show();

  activity.setButtonVisibility(false);
}
/**
 * In background thread, perform required setup, and request initialization of
 * the OCR engine.
 *
 * <p>Steps, in order: pick the data file name for the language, make sure the "tessdata"
 * directory exists, clean up after an interrupted install, install the language data (from
 * application assets, falling back to a download), install the orientation and script
 * detection (OSD) data the same way, dismiss the progress dialog, and initialize the engine.
 *
 * @param params
 *          [0] Pathname for the directory for storing language data files to the SD card
 * @return True only if the language data and the OSD data are installed and the OCR engine
 *         initialized successfully
 */
@Override
protected Boolean doInBackground(String... params) {
  if (params == null || params.length == 0 || params[0] == null) {
    Log.e(TAG, "No storage directory was supplied for the language data");
    return false;
  }

  // Check whether we need Cube data or Tesseract data.
  // Example Cube data filename: "tesseract-ocr-3.01.eng.tar"
  // Example Tesseract data filename: "eng.traineddata"
  String destinationFilenameBase = languageCode + ".traineddata";
  boolean isCubeSupported = false;
  for (String s : CaptureActivity.CUBE_SUPPORTED_LANGUAGES) {
    if (s.equals(languageCode)) {
      isCubeSupported = true;
      destinationFilenameBase = "tesseract-ocr-3.01." + languageCode + ".tar";
      break;
    }
  }

  // Hack for Thai, which is a Tesseract-only language but packaged as a tar.
  if (languageCode.equals("tha")) {
    destinationFilenameBase = "tesseract-ocr-3.01.tha.tar";
  }

  // Check for, and create if necessary, folder to hold model data
  String destinationDirBase = params[0]; // The storage directory, minus the
                                         // "tessdata" subdirectory
  File tessdataDir = new File(destinationDirBase + File.separator + "tessdata");
  if (!tessdataDir.exists() && !tessdataDir.mkdirs()) {
    Log.e(TAG, "Couldn't make directory " + tessdataDir);
    return false;
  }

  // Create a reference to the file to save the download in
  File downloadFile = new File(tessdataDir, destinationFilenameBase);

  // Check if an incomplete download is present. The downloader streams into
  // "<name>.gz.download", so that name must be checked; "<name>.download" is still checked as
  // well. If either is there, delete it and any (possibly half-unzipped) Tesseract and Cube
  // data files that may be there.
  File[] incompleteMarkers = {
    new File(tessdataDir, destinationFilenameBase + ".download"),
    new File(tessdataDir, destinationFilenameBase + ".gz.download")
  };
  File tesseractTestFile = new File(tessdataDir, languageCode + ".traineddata");
  boolean foundIncomplete = false;
  for (File incomplete : incompleteMarkers) {
    if (incomplete.exists()) {
      foundIncomplete = true;
      incomplete.delete();
    }
  }
  if (foundIncomplete) {
    if (tesseractTestFile.exists()) {
      tesseractTestFile.delete();
    }
    deleteCubeDataFiles(tessdataDir);
  }

  // Check whether all Cube data files have already been installed
  boolean isAllCubeDataInstalled = false;
  if (isCubeSupported) {
    boolean isAFileMissing = false;
    for (String s : CUBE_DATA_FILES) {
      File dataFile = new File(tessdataDir, languageCode + s);
      if (!dataFile.exists()) {
        isAFileMissing = true;
        break;
      }
    }
    isAllCubeDataInstalled = !isAFileMissing;
  }

  // If language data files are not present, install them
  boolean installSuccess = false;
  if (!tesseractTestFile.exists()
      || (isCubeSupported && !isAllCubeDataInstalled)) {
    Log.d(TAG, "Language data for " + languageCode + " not found in " + tessdataDir.toString());
    deleteCubeDataFiles(tessdataDir);

    // Check assets for language data to install. If not present, download from Internet
    try {
      Log.d(TAG, "Checking for language data (" + destinationFilenameBase
          + ".zip) in application assets...");
      // Check for a file like "eng.traineddata.zip" or "tesseract-ocr-3.01.eng.tar.zip"
      installSuccess = installFromAssets(destinationFilenameBase + ".zip", tessdataDir,
          downloadFile);
    } catch (IOException e) {
      Log.e(TAG, "IOException", e);
    } catch (Exception e) {
      // Deliberately broad: any failure here just means we fall back to downloading.
      Log.e(TAG, "Got exception", e);
    }

    if (!installSuccess) {
      // File was not packaged in assets, so download it
      Log.d(TAG, "Downloading " + destinationFilenameBase + ".gz...");
      try {
        installSuccess = downloadFile(destinationFilenameBase, downloadFile);
        if (!installSuccess) {
          Log.e(TAG, "Download failed");
          return false;
        }
      } catch (IOException e) {
        Log.e(TAG, "IOException received in doInBackground. Is a network connection available?",
            e);
        return false;
      }
    }

    // If we have a tar file at this point because we downloaded v3.01+ data, untar it
    if (destinationFilenameBase.endsWith(".tar")) {
      try {
        untar(downloadFile, tessdataDir);
        installSuccess = true;
      } catch (IOException e) {
        Log.e(TAG, "Untar failed", e);
        return false;
      }
    }
  } else {
    Log.d(TAG, "Language data for " + languageCode + " already installed in "
        + tessdataDir.toString());
    installSuccess = true;
  }

  // If OSD data file is not present, download it
  File osdFile = new File(tessdataDir, CaptureActivity.OSD_FILENAME_BASE);
  boolean osdInstallSuccess = false;
  if (!osdFile.exists()) {
    // Check assets for language data to install. If not present, download from Internet.
    // From here on, progress messages name the OSD data instead of the OCR language.
    languageName = "orientation and script detection";
    try {
      // Check for, and delete, partially-downloaded OSD files
      String[] badFiles = { CaptureActivity.OSD_FILENAME + ".gz.download",
          CaptureActivity.OSD_FILENAME + ".gz", CaptureActivity.OSD_FILENAME };
      for (String filename : badFiles) {
        File file = new File(tessdataDir, filename);
        if (file.exists()) {
          file.delete();
        }
      }

      Log.d(TAG, "Checking for OSD data (" + CaptureActivity.OSD_FILENAME_BASE
          + ".zip) in application assets...");
      // Check for "osd.traineddata.zip"
      osdInstallSuccess = installFromAssets(CaptureActivity.OSD_FILENAME_BASE + ".zip",
          tessdataDir, new File(CaptureActivity.OSD_FILENAME));
    } catch (IOException e) {
      Log.e(TAG, "IOException", e);
    } catch (Exception e) {
      // Deliberately broad: any failure here just means we fall back to downloading.
      Log.e(TAG, "Got exception", e);
    }

    if (!osdInstallSuccess) {
      // File was not packaged in assets, so download it
      Log.d(TAG, "Downloading " + CaptureActivity.OSD_FILENAME + ".gz...");
      try {
        osdInstallSuccess = downloadFile(CaptureActivity.OSD_FILENAME, new File(tessdataDir,
            CaptureActivity.OSD_FILENAME));
        if (!osdInstallSuccess) {
          Log.e(TAG, "Download failed");
          return false;
        }
      } catch (IOException e) {
        Log.e(TAG, "IOException received in doInBackground. Is a network connection available?",
            e);
        return false;
      }
    }

    // Untar the OSD tar file.
    // NOTE(review): this also runs after a successful install from assets, so it presumably
    // expects the asset zip to contain the tar file rather than the unpacked data - confirm.
    try {
      untar(new File(tessdataDir, CaptureActivity.OSD_FILENAME), tessdataDir);
    } catch (IOException e) {
      Log.e(TAG, "Untar failed", e);
      return false;
    }
  } else {
    Log.d(TAG, "OSD file already present in " + tessdataDir.toString());
    osdInstallSuccess = true;
  }

  // Dismiss the progress dialog box, revealing the indeterminate dialog box behind it.
  // NOTE(review): this touches a dialog from the background thread - confirm that is intended.
  try {
    dialog.dismiss();
  } catch (IllegalArgumentException e) {
    // Catch "View not attached to window manager" error, and continue
  }

  // Initialize the OCR engine
  if (baseApi.init(destinationDirBase + File.separator, languageCode, ocrEngineMode)) {
    return installSuccess && osdInstallSuccess;
  }
  return false;
}
/**
 * Delete any existing data files for Cube that are present in the given directory, plus the
 * language's "tesseract-ocr-3.01.&lt;language&gt;.tar" archive. Files may be partially
 * uncompressed files left over from a failed install, or pre-v3.01 traineddata files.
 *
 * @param tessdataDir
 *          Directory to delete the files from
 */
private void deleteCubeDataFiles(File tessdataDir) {
  for (String suffix : CUBE_DATA_FILES) {
    deleteIfPresent(new File(tessdataDir, languageCode + suffix));
  }
  // The archive name does not depend on the suffix, so it is removed once, outside the loop.
  deleteIfPresent(new File(tessdataDir, "tesseract-ocr-3.01." + languageCode + ".tar"));
}

/**
 * Deletes the given file if it exists, logging the attempt and any failure.
 *
 * @param file
 *          File to remove
 */
private static void deleteIfPresent(File file) {
  if (!file.exists()) {
    return;
  }
  Log.d(TAG, "Deleting existing file " + file.toString());
  if (!file.delete()) {
    Log.w(TAG, "Could not delete " + file.toString());
  }
}
/**
 * Download a file from the site specified by DOWNLOAD_BASE, and gunzip to the given destination.
 *
 * @param sourceFilenameBase
 *          Name of file to download, minus the required ".gz" extension
 * @param destinationFile
 *          Name of file to save the unzipped data to, including path
 * @return True if download and unzip are successful
 * @throws IOException
 *           If the transfer fails
 * @throws IllegalArgumentException
 *           If DOWNLOAD_BASE plus the file name does not form a valid URL; the underlying
 *           MalformedURLException is attached as the cause
 */
private boolean downloadFile(String sourceFilenameBase, File destinationFile)
    throws IOException {
  final URL url;
  try {
    url = new URL(CaptureActivity.DOWNLOAD_BASE + sourceFilenameBase + ".gz");
  } catch (MalformedURLException e) {
    // Keep the cause so the offending URL text shows up in the stack trace.
    throw new IllegalArgumentException("Bad URL string.", e);
  }
  return downloadGzippedFileHttp(url, destinationFile);
}
/**
 * Download a gzipped file using an HttpURLConnection, and gunzip it to the given destination.
 *
 * <p>The response body is first streamed to "&lt;destinationFile&gt;.gz.download" and then
 * uncompressed into destinationFile. The connection and both streams are released on every
 * exit path.
 *
 * @param url
 *          URL to download from
 * @param destinationFile
 *          File to save the download as, including path
 * @return True if response received, destinationFile opened, and unzip
 *         successful
 * @throws IOException
 *           If connecting or streaming the response fails
 */
private boolean downloadGzippedFileHttp(URL url, File destinationFile)
    throws IOException {
  // Send an HTTP GET request for the file
  Log.d(TAG, "Sending GET request to " + url + "...");
  publishProgress("Downloading data for " + languageName + "...", "0");

  final File tempFile = new File(destinationFile.toString() + ".gz.download");
  final HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
  try {
    urlConnection.setAllowUserInteraction(false);
    urlConnection.setInstanceFollowRedirects(true);
    urlConnection.setRequestMethod("GET");
    urlConnection.connect();

    final int responseCode = urlConnection.getResponseCode();
    if (responseCode != HttpURLConnection.HTTP_OK) {
      Log.e(TAG, "Did not get HTTP_OK response.");
      Log.e(TAG, "Response code: " + responseCode);
      // The response message may be null, so concatenate it rather than dereference it.
      Log.e(TAG, "Response message: " + urlConnection.getResponseMessage());
      return false;
    }

    // -1 when the server does not report a length; progress is then not published.
    final int fileSize = urlConnection.getContentLength();
    final InputStream inputStream = urlConnection.getInputStream();
    try {
      // Stream the file contents to a local file temporarily
      Log.d(TAG, "Streaming download to " + tempFile.toString() + "...");
      final FileOutputStream fileOutputStream;
      try {
        fileOutputStream = new FileOutputStream(tempFile);
      } catch (FileNotFoundException e) {
        Log.e(TAG, "Exception received when opening FileOutputStream.", e);
        return false;
      }
      try {
        final int BUFFER = 8192;
        final byte[] buffer = new byte[BUFFER];
        int downloaded = 0;
        int percentCompleteLast = 0;
        int bufferLength;
        while ((bufferLength = inputStream.read(buffer, 0, BUFFER)) > 0) {
          fileOutputStream.write(buffer, 0, bufferLength);
          downloaded += bufferLength;
          if (fileSize > 0) {
            int percentComplete = (int) ((downloaded / (float) fileSize) * 100);
            if (percentComplete > percentCompleteLast) {
              publishProgress(
                  "Downloading data for " + languageName + "...",
                  Integer.toString(percentComplete));
              percentCompleteLast = percentComplete;
            }
          }
        }
      } finally {
        fileOutputStream.close();
      }
    } finally {
      inputStream.close();
    }
  } finally {
    urlConnection.disconnect();
  }

  // Uncompress the downloaded temporary file into place, and remove the temporary file
  try {
    Log.d(TAG, "Unzipping...");
    gunzip(tempFile, destinationFile);
    return true;
  } catch (FileNotFoundException e) {
    Log.e(TAG, "File not available for unzipping.", e);
  } catch (IOException e) {
    Log.e(TAG, "Problem unzipping file.", e);
  }
  return false;
}
/**
 * Unzips the given Gzipped file to the given destination, and deletes the
 * gzipped file once it has been uncompressed successfully.
 *
 * <p>Progress is published from 0 to 100 percent, or only to 50 percent when the file name
 * ends in ".tar.gz.download".
 *
 * @param zippedFile
 *          The gzipped file to be uncompressed
 * @param outFilePath
 *          File to unzip to, including path
 * @throws FileNotFoundException
 *           If either file cannot be opened
 * @throws IOException
 *           If reading or writing fails
 */
private void gunzip(File zippedFile, File outFilePath)
    throws FileNotFoundException, IOException {
  final int uncompressedFileSize = getGzipSizeUncompressed(zippedFile);
  final int progressMin = 0;
  // If the file is a tar file, just show progress to 50%
  final int progressMax =
      zippedFile.toString().endsWith(".tar.gz.download") ? 50 : 100 - progressMin;
  publishProgress("Uncompressing data for " + languageName + "...",
      Integer.toString(progressMin));

  final int BUFFER = 8192;
  final byte[] data = new byte[BUFFER];
  int percentCompleteLast = 0;
  int unzippedBytes = 0;

  // "in" always refers to the outermost open stream, so one close() releases the whole chain
  // even if wrapping the raw file stream fails.
  InputStream in = new FileInputStream(zippedFile);
  BufferedOutputStream out = null;
  try {
    in = new GZIPInputStream(new BufferedInputStream(in));
    out = new BufferedOutputStream(new FileOutputStream(outFilePath));
    int len;
    while ((len = in.read(data, 0, BUFFER)) > 0) {
      out.write(data, 0, len);
      unzippedBytes += len;
      // Skip progress when the recorded size is unusable (zero or negative).
      if (uncompressedFileSize > 0) {
        int percentComplete =
            (int) ((unzippedBytes / (float) uncompressedFileSize) * progressMax) + progressMin;
        if (percentComplete > percentCompleteLast) {
          publishProgress("Uncompressing data for " + languageName
              + "...", Integer.toString(percentComplete));
          percentCompleteLast = percentComplete;
        }
      }
    }
    out.flush();
  } finally {
    try {
      in.close();
    } finally {
      if (out != null) {
        out.close();
      }
    }
  }

  if (zippedFile.exists()) {
    zippedFile.delete();
  }
}
/**
 * Returns the uncompressed size for a Gzipped file, read from the last four bytes of the file
 * and interpreted as a little-endian 32-bit integer.
 *
 * @param zipFile
 *          Gzipped file to get the size for
 * @return Size when uncompressed, in bytes
 * @throws IOException
 *           If the file cannot be read or is shorter than four bytes
 */
private int getGzipSizeUncompressed(File zipFile) throws IOException {
  RandomAccessFile raf = new RandomAccessFile(zipFile, "r");
  try {
    if (raf.length() < 4) {
      throw new IOException("File too short to hold a gzip size trailer: " + zipFile);
    }
    raf.seek(raf.length() - 4);
    // readInt() assembles the bytes big-endian; the trailer is read least significant byte
    // first, so swap the byte order.
    return Integer.reverseBytes(raf.readInt());
  } finally {
    raf.close();
  }
}
/**
 * Untar the contents of a tar file into the given directory, ignoring the
 * relative pathname in the tar file, and delete the tar file.
 *
 * <p>Directory entries are skipped, and every file entry is written directly into
 * destinationDir under its base name. Progress is published from 50 to 100 percent.
 *
 * Uses jtar: http://code.google.com/p/jtar/
 *
 * @param tarFile
 *          The tar file to be untarred
 * @param destinationDir
 *          The directory to untar into
 * @throws IOException
 *           If the archive cannot be read or a file cannot be written
 */
private void untar(File tarFile, File destinationDir) throws IOException {
  Log.d(TAG, "Untarring...");
  final int uncompressedSize = getTarSizeUncompressed(tarFile);
  final int progressMin = 50;
  final int progressMax = 100 - progressMin;
  int percentCompleteLast = 0;
  int unzippedBytes = 0;
  publishProgress("Uncompressing data for " + languageName + "...",
      Integer.toString(progressMin));

  // Extract all the files
  final int BUFFER = 8192;
  final byte[] data = new byte[BUFFER];
  TarInputStream tarInputStream = new TarInputStream(new BufferedInputStream(
      new FileInputStream(tarFile)));
  try {
    TarEntry entry;
    while ((entry = tarInputStream.getNextEntry()) != null) {
      if (entry.isDirectory()) {
        continue; // Everything is flattened into destinationDir, so there is nothing to create
      }
      // Keep only the part after the last '/'; lastIndexOf gives -1 for names without one,
      // which makes "+ 1" select the whole name.
      String pathName = entry.getName();
      String fileName = pathName.substring(pathName.lastIndexOf('/') + 1);
      if (fileName.length() == 0) {
        continue;
      }
      Log.d(TAG, "Writing " + fileName + "...");
      BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(
          new FileOutputStream(new File(destinationDir, fileName)));
      try {
        int len;
        while ((len = tarInputStream.read(data, 0, BUFFER)) != -1) {
          bufferedOutputStream.write(data, 0, len);
          unzippedBytes += len;
          if (uncompressedSize > 0) {
            int percentComplete =
                (int) ((unzippedBytes / (float) uncompressedSize) * progressMax) + progressMin;
            if (percentComplete > percentCompleteLast) {
              publishProgress("Uncompressing data for " + languageName + "...",
                  Integer.toString(percentComplete));
              percentCompleteLast = percentComplete;
            }
          }
        }
        bufferedOutputStream.flush();
      } finally {
        bufferedOutputStream.close();
      }
    }
  } finally {
    tarInputStream.close();
  }

  if (tarFile.exists()) {
    tarFile.delete();
  }
}
/**
 * Return the uncompressed size for a Tar file, computed as the sum of the sizes of all
 * non-directory entries.
 *
 * @param tarFile
 *          The Tarred file
 * @return Size when uncompressed, in bytes
 * @throws IOException
 *           If the archive cannot be opened or read
 */
private int getTarSizeUncompressed(File tarFile) throws IOException {
  int size = 0;
  TarInputStream tis = new TarInputStream(new BufferedInputStream(
      new FileInputStream(tarFile)));
  try {
    TarEntry entry;
    while ((entry = tis.getNextEntry()) != null) {
      if (!entry.isDirectory()) {
        size += entry.getSize();
      }
    }
  } finally {
    // Release the file handle whether or not the scan succeeded.
    tis.close();
  }
  return size;
}
/**
 * Install a file from application assets to device external storage.
 *
 * @param sourceFilename
 *          File in assets to install; only names ending in ".zip" are supported
 * @param modelRoot
 *          Directory on SD card to install the file to
 * @param destinationFile
 *          File name for destination, excluding path
 * @return True if installZipFromAssets returns true; false if the file is not packaged in the
 *         application assets
 * @throws IOException
 *           If unpacking the asset fails
 * @throws IllegalArgumentException
 *           If sourceFilename has no extension or an extension other than ".zip"
 */
private boolean installFromAssets(String sourceFilename, File modelRoot,
    File destinationFile) throws IOException {
  // A name without any '.' has an empty extension and is rejected like any other
  // unsupported extension.
  final int dotIndex = sourceFilename.lastIndexOf('.');
  final String extension = dotIndex < 0 ? "" : sourceFilename.substring(dotIndex);
  if (!extension.equals(".zip")) {
    throw new IllegalArgumentException("Extension " + extension
        + " is unsupported.");
  }
  try {
    return installZipFromAssets(sourceFilename, modelRoot, destinationFile);
  } catch (FileNotFoundException e) {
    Log.d(TAG, "Language not packaged in application assets.");
    return false;
  }
}
/**
 * Unzip the given Zip file, located in application assets, into the given
 * destination directory. Each entry is written under the name recorded in the archive.
 *
 * @param sourceFilename
 *          Name of the file in assets
 * @param destinationDir
 *          Directory to save the unzipped entries in
 * @param destinationFile
 *          Not used: the entry names stored in the archive decide the output file names
 * @return True once every entry has been unpacked
 * @throws IOException
 *           If reading the archive or writing an entry fails
 * @throws FileNotFoundException
 *           If the asset or an output file cannot be opened
 */
private boolean installZipFromAssets(String sourceFilename,
    File destinationDir, File destinationFile) throws IOException,
    FileNotFoundException {
  // Attempt to open the zip archive
  publishProgress("Uncompressing data for " + languageName + "...", "0");
  ZipInputStream inputStream = new ZipInputStream(context.getAssets().open(sourceFilename));
  try {
    final int BUFFER = 8192;
    final byte[] data = new byte[BUFFER];

    // Loop through all the files and folders in the zip archive (but there should just be one)
    for (ZipEntry entry = inputStream.getNextEntry(); entry != null; entry = inputStream
        .getNextEntry()) {
      File entryFile = new File(destinationDir, entry.getName());
      if (entry.isDirectory()) {
        entryFile.mkdirs();
      } else {
        // Note getSize() returns -1 when the zipfile does not have the size set
        final long entrySize = entry.getSize();
        BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(
            new FileOutputStream(entryFile), BUFFER);
        try {
          long unzippedSize = 0;
          int percentCompleteLast = 0;
          int count;
          while ((count = inputStream.read(data, 0, BUFFER)) != -1) {
            bufferedOutputStream.write(data, 0, count);
            unzippedSize += count;
            if (entrySize > 0) {
              // Multiply before dividing: integer division of the raw byte counts would stay
              // at 0 until the entry is complete.
              int percentComplete = (int) ((unzippedSize * 100) / entrySize);
              if (percentComplete > percentCompleteLast) {
                publishProgress("Uncompressing data for " + languageName + "...",
                    Integer.toString(percentComplete));
                percentCompleteLast = percentComplete;
              }
            }
          }
        } finally {
          bufferedOutputStream.close();
        }
      }
      inputStream.closeEntry();
    }
  } finally {
    inputStream.close();
  }
  return true;
}
/**
 * Update the dialog box with the latest incremental progress, and make sure it is showing.
 *
 * @param message
 *          [0] Text to be displayed; [1] Numeric value for the progress, as a decimal string
 */
@Override
protected void onProgressUpdate(String... message) {
  super.onProgressUpdate(message);

  // Parse first, so a malformed value fails before the dialog is touched.
  final int progressValue = Integer.parseInt(message[1]);
  final String text = message[0];

  dialog.setMessage(text);
  dialog.setProgress(progressValue);
  dialog.show();
}
/**
 * Dismisses the indeterminate dialog and reports the outcome to the activity: on success OCR
 * is resumed and the language name shown, otherwise an error message is shown.
 *
 * @param result
 *          Value returned by doInBackground
 */
@Override
protected void onPostExecute(Boolean result) {
  super.onPostExecute(result);

  try {
    indeterminateDialog.dismiss();
  } catch (IllegalArgumentException e) {
    // Catch "View not attached to window manager" error, and continue
  }

  if (!result) {
    activity.showErrorMessage("Error", "Network is unreachable - cannot download language data. "
        + "Please enable network access and restart this app.");
    return;
  }

  // Restart recognition
  activity.resumeOCR();
  activity.showLanguageName();
}
}