/* * Copyright (C) 2011 Google Inc. * Copyright (C) 2011 Robert Theis * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.googlecode.tesseract.android; import android.graphics.Bitmap; import android.graphics.Rect; import android.util.Log; import com.googlecode.leptonica.android.Pixa; import com.googlecode.leptonica.android.Pix; import com.googlecode.leptonica.android.ReadFile; import java.io.File; /** * Java interface for the Tesseract OCR engine. Does not implement all available * JNI methods, but does implement enough to be useful. Comments are adapted * from original Tesseract source. * * @author alanv@google.com (Alan Viverette) */ public class TessBaseAPI { /** * Used by the native implementation of the class. */ private int mNativeData; static { System.loadLibrary("lept"); System.loadLibrary("tess"); nativeClassInit(); } public static final class PageSegMode { /** Orientation and script detection only. */ public static final int PSM_OSD_ONLY = 0; /** Automatic page segmentation with orientation and script detection. (OSD) */ public static final int PSM_AUTO_OSD = 1; /** Fully automatic page segmentation, but no OSD, or OCR. */ public static final int PSM_AUTO_ONLY = 2; /** Fully automatic page segmentation, but no OSD. */ public static final int PSM_AUTO = 3; /** Assume a single column of text of variable sizes. */ public static final int PSM_SINGLE_COLUMN = 4; /** Assume a single uniform block of vertically aligned text. */ public static final int PSM_SINGLE_BLOCK_VERT_TEXT = 5; /** Assume a single uniform block of text. (Default.) */ public static final int PSM_SINGLE_BLOCK = 6; /** Treat the image as a single text line. */ public static final int PSM_SINGLE_LINE = 7; /** Treat the image as a single word. */ public static final int PSM_SINGLE_WORD = 8; /** Treat the image as a single word in a circle. */ public static final int PSM_CIRCLE_WORD = 9; /** Treat the image as a single character. */ public static final int PSM_SINGLE_CHAR = 10; /** Find as much text as possible in no particular order. */ public static final int PSM_SPARSE_TEXT = 11; /** Sparse text with orientation and script detection. */ public static final int PSM_SPARSE_TEXT_OSD = 12; /** Number of enum entries. */ public static final int PSM_COUNT = 13; } /** Whitelist of characters to recognize. */ public static final String VAR_CHAR_WHITELIST = "tessedit_char_whitelist"; /** Blacklist of characters to not recognize. */ public static final String VAR_CHAR_BLACKLIST = "tessedit_char_blacklist"; /** Run Tesseract only - fastest */ public static final int OEM_TESSERACT_ONLY = 0; /** Run Cube only - better accuracy, but slower */ public static final int OEM_CUBE_ONLY = 1; /** Run both and combine results - best accuracy */ public static final int OEM_TESSERACT_CUBE_COMBINED = 2; /** Default OCR engine mode. */ public static final int OEM_DEFAULT = 3; /** * Elements of the page hierarchy, used in {@link ResultIterator} to provide * functions that operate on each level without having to have 5x as many * functions. * <p> * NOTE: At present {@link #RIL_PARA} and {@link #RIL_BLOCK} are equivalent * as there is no paragraph internally yet. */ public static final class PageIteratorLevel { /** Block of text/image/separator line. */ public static final int RIL_BLOCK = 0; /** Paragraph within a block. */ public static final int RIL_PARA = 1; /** Line within a paragraph. */ public static final int RIL_TEXTLINE = 2; /** Word within a text line. */ public static final int RIL_WORD = 3; /** Symbol/character within a word. */ public static final int RIL_SYMBOL = 4; }; /** * Constructs an instance of TessBaseAPI. */ public TessBaseAPI() { nativeConstruct(); } /** * Called by the GC to clean up the native data that we set up when we * construct the object. * * Altered from original version to avoid a crash-causing bug in OCR Test application. */ @Override protected void finalize() throws Throwable { // TODO Find out why finalize() is getting called when we change languages, even though // we're still using the object. Is bypassing nativeFinalize() OK if we still call // baseApi.end() in the Activity's onDestroy()? try { Log.d("TessBaseAPI.java", "finalize(): NOT calling nativeFinalize() due to premature garbage collection"); //nativeFinalize(); } finally { Log.d("TessBaseAPI.java", "finalize(): calling super.finalize()"); super.finalize(); } } /** * Initializes the Tesseract engine with a specified language model. Returns * <code>true</code> on success. * <p> * Instances are now mostly thread-safe and totally independent, but some * global parameters remain. Basically it is safe to use multiple * TessBaseAPIs in different threads in parallel, UNLESS you use SetVariable * on some of the Params in classify and textord. If you do, then the effect * will be to change it for all your instances. * <p> * The datapath must be the name of the parent directory of tessdata and * must end in / . Any name after the last / will be stripped. The language * is (usually) an ISO 639-3 string or <code>null</code> will default to eng. * It is entirely safe (and eventually will be efficient too) to call Init * multiple times on the same instance to change language, or just to reset * the classifier. * <p> * The language may be a string of the form [~]<lang>[+[~]<lang>]* indicating * that multiple languages are to be loaded. Eg hin+eng will load Hindi and * English. Languages may specify internally that they want to be loaded * with one or more other languages, so the ~ sign is available to override * that. Eg if hin were set to load eng by default, then hin+~eng would force * loading only hin. The number of loaded languages is limited only by * memory, with the caveat that loading additional languages will impact * both speed and accuracy, as there is more work to do to decide on the * applicable language, and there is more chance of hallucinating incorrect * words. * <p> * <b>WARNING:</b> On changing languages, all Tesseract parameters are reset * back to their default values. (Which may vary between languages.) * <p> * If you have a rare need to set a Variable that controls initialization * for a second call to Init you should explicitly call End() and then use * SetVariable before Init. This is only a very rare use case, since there * are very few uses that require any parameters to be set before Init. * * @param datapath the parent directory of tessdata ending in a forward * slash * @param language (optional) an ISO 639-3 string representing the language(s) * @return <code>true</code> on success */ public boolean init(String datapath, String language) { if (datapath == null) throw new IllegalArgumentException("Data path must not be null!"); if (!datapath.endsWith(File.separator)) datapath += File.separator; File tessdata = new File(datapath + "tessdata"); if (!tessdata.exists() || !tessdata.isDirectory()) throw new IllegalArgumentException("Data path must contain subfolder tessdata!"); return nativeInit(datapath, language); } /** * Initializes the Tesseract engine with the specified language model(s). Returns * <code>true</code> on success. * * @param datapath the parent directory of tessdata ending in a forward * slash * @param language (optional) an ISO 639-3 string representing the language(s) * @param mode the OCR engine mode to be set * @return <code>true</code> on success */ public boolean init(String datapath, String language, int ocrEngineMode) { if (datapath == null) throw new IllegalArgumentException("Data path must not be null!"); if (!datapath.endsWith(File.separator)) datapath += File.separator; File tessdata = new File(datapath + "tessdata"); if (!tessdata.exists() || !tessdata.isDirectory()) throw new IllegalArgumentException("Data path must contain subfolder tessdata!"); return nativeInitOem(datapath, language, ocrEngineMode); } /** * Returns the languages string used in the last valid initialization. * If the last initialization specified "deu+hin" then that will be * returned. If hin loaded eng automatically as well, then that will * not be included in this list. To find the languages actually * loaded use GetLoadedLanguagesAsVector. * * @return the last-used language code */ public String getInitLanguagesAsString() { return nativeGetInitLanguagesAsString(); } /** * Frees up recognition results and any stored image data, without actually * freeing any recognition data that would be time-consuming to reload. * Afterwards, you must call SetImage or SetRectangle before doing any * Recognize or Get* operation. */ public void clear() { nativeClear(); } /** * Closes down tesseract and free up all memory. End() is equivalent to * destructing and reconstructing your TessBaseAPI. * <p> * Once End() has been used, none of the other API functions may be used * other than Init and anything declared above it in the class definition. */ public void end() { nativeEnd(); } /** * Set the value of an internal "variable" (of either old or new types). * Supply the name of the variable and the value as a string, just as you * would in a config file. * <p> * Example: <code>setVariable(VAR_TESSEDIT_CHAR_BLACKLIST, "xyz"); to ignore x, y and z. * setVariable(VAR_BLN_NUMERICMODE, "1"); to set numeric-only mode. * </code> * <p> * setVariable() may be used before open(), but settings will revert to * defaults on close(). * * @param var name of the variable * @param value value to set * @return false if the name lookup failed */ public boolean setVariable(String var, String value) { return nativeSetVariable(var, value); } /** * Sets the page segmentation mode. This controls how much processing the * OCR engine will perform before recognizing text. * * @param mode the page segmentation mode to set */ public void setPageSegMode(int mode) { nativeSetPageSegMode(mode); } /** * Sets debug mode. This controls how much information is displayed in the * log during recognition. * * @param enabled <code>true</code> to enable debugging mode */ public void setDebug(boolean enabled) { nativeSetDebug(enabled); } /** * Restricts recognition to a sub-rectangle of the image. Call after * SetImage. Each SetRectangle clears the recogntion results so multiple * rectangles can be recognized with the same image. * * @param rect the bounding rectangle */ public void setRectangle(Rect rect) { setRectangle(rect.left, rect.top, rect.width(), rect.height()); } /** * Restricts recognition to a sub-rectangle of the image. Call after * SetImage. Each SetRectangle clears the recogntion results so multiple * rectangles can be recognized with the same image. * * @param left the left bound * @param top the right bound * @param width the width of the bounding box * @param height the height of the bounding box */ public void setRectangle(int left, int top, int width, int height) { nativeSetRectangle(left, top, width, height); } /** * Provides an image for Tesseract to recognize. * * @param file absolute path to the image file */ public void setImage(File file) { Pix image = ReadFile.readFile(file); if (image == null) { throw new RuntimeException("Failed to read image file"); } nativeSetImagePix(image.getNativePix()); } /** * Provides an image for Tesseract to recognize. Does not copy the image * buffer. The source image must persist until after Recognize or * GetUTF8Chars is called. * * @param bmp bitmap representation of the image */ public void setImage(Bitmap bmp) { Pix image = ReadFile.readBitmap(bmp); if (image == null) { throw new RuntimeException("Failed to read bitmap"); } nativeSetImagePix(image.getNativePix()); } /** * Provides a Leptonica pix format image for Tesseract to recognize. Clones * the pix object. The source image may be destroyed immediately after * SetImage is called, but its contents may not be modified. * * @param image Leptonica pix representation of the image */ public void setImage(Pix image) { nativeSetImagePix(image.getNativePix()); } /** * Provides an image for Tesseract to recognize. Copies the image buffer. * The source image may be destroyed immediately after SetImage is called. * SetImage clears all recognition results, and sets the rectangle to the * full image, so it may be followed immediately by a GetUTF8Text, and it * will automatically perform recognition. * * @param imagedata byte representation of the image * @param width image width * @param height image height * @param bpp bytes per pixel * @param bpl bytes per line */ public void setImage(byte[] imagedata, int width, int height, int bpp, int bpl) { nativeSetImageBytes(imagedata, width, height, bpp, bpl); } /** * The recognized text is returned as a String which is coded as UTF8. * * @return the recognized text */ public String getUTF8Text() { // Trim because the text will have extra line breaks at the end String text = nativeGetUTF8Text(); return text.trim(); } /** * Returns the mean confidence of text recognition. * * @return the mean confidence */ public int meanConfidence() { return nativeMeanConfidence(); } /** * Returns all word confidences (between 0 and 100) in an array. The number * of confidences should correspond to the number of space-delimited words * in GetUTF8Text(). * * @return an array of word confidences (between 0 and 100) for each * space-delimited word returned by GetUTF8Text() */ public int[] wordConfidences() { int[] conf = nativeWordConfidences(); // We shouldn't return null confidences if (conf == null) conf = new int[0]; return conf; } /** * Returns the result of page layout analysis as a Pixa, in reading order. * * @return Pixa contaning page layout bounding boxes */ public Pixa getRegions() { return new Pixa(nativeGetRegions(), 0, 0); } /** * Returns the textlines as a Pixa. * * Block IDs are not returned. * * @return Pixa containing textlines */ public Pixa getTextlines() { return new Pixa(nativeGetTextlines(), 0, 0); } /** * Returns the strips as a Pixa. * * Block IDs are not returned. * * @return Pixa containing strips */ public Pixa getStrips() { return new Pixa(nativeGetStrips(), 0, 0); } /** * Returns the word bounding boxes as a Pixa, in reading order. * * @return Pixa containing word bounding boxes */ public Pixa getWords() { return new Pixa(nativeGetWords(), 0, 0); } public ResultIterator getResultIterator() { int nativeResultIterator = nativeGetResultIterator(); if (nativeResultIterator == 0) { return null; } return new ResultIterator(nativeResultIterator); } /** * Make a HTML-formatted string with hOCR markup from the internal data * structures. * * @param page is 0-based but will appear in the output as 1-based. * @return HTML-formatted string with hOCR markup */ public String getHOCRText(int page){ return nativeGetHOCRText(page); } /** * Set the name of the input file. Needed only for training and * reading a UNLV zone file. * * @param name input file name */ public void setInputName(String name){ nativeSetInputName(name); } /** * Set the name of the output files. * Needed only for debugging. * @param name output file name */ public void setOutputName(String name){ nativeSetOutputName(name); } /** * Read a "config" file containing a set of variable, value pairs. * Searches the standard places: <i>tessdata/configs, tessdata/tessconfigs</i>. * * @param filename the configuration filename, without path */ public void ReadConfigFile(String filename){ nativeReadConfigFile(filename); } /** * The recognized text is returned as coded in the same format as a UTF8 * box file used in training. * * @param page is a 0-based page index that will appear in the box file. */ public String getBoxText(int page){ return nativeGetBoxText(page); } // ****************** // * Native methods * // ****************** /** * Initializes static native data. Must be called on object load. */ private static native void nativeClassInit(); /** * Initializes native data. Must be called on object construction. */ private native void nativeConstruct(); /** * Finalizes native data. Must be called on object destruction. */ private native void nativeFinalize(); private native boolean nativeInit(String datapath, String language); private native boolean nativeInitOem(String datapath, String language, int mode); private native String nativeGetInitLanguagesAsString(); private native void nativeClear(); private native void nativeEnd(); private native void nativeSetImageBytes( byte[] imagedata, int width, int height, int bpp, int bpl); private native void nativeSetImagePix(int nativePix); private native void nativeSetRectangle(int left, int top, int width, int height); private native String nativeGetUTF8Text(); private native int nativeMeanConfidence(); private native int[] nativeWordConfidences(); private native boolean nativeSetVariable(String var, String value); private native void nativeSetDebug(boolean debug); private native void nativeSetPageSegMode(int mode); private native int nativeGetRegions(); private native int nativeGetTextlines(); private native int nativeGetStrips(); private native int nativeGetWords(); private native int nativeGetResultIterator(); private native String nativeGetBoxText(int page_number); private native String nativeGetHOCRText(int page_number); private native void nativeSetInputName(String name); private native void nativeSetOutputName(String name); private native void nativeReadConfigFile(String fileName); }