/*
* Autopsy Forensic Browser
*
* Copyright 2012 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.coreutils;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.logging.Level;
import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.autopsy.coreutils.StringExtract.StringExtractUnicodeTable.SCRIPT;
/**
* Language and encoding aware utility to extract strings from a stream of
* bytes. Currently supports UTF-16 LE, UTF-16 BE and UTF-8 encoded text in
* Latin, Cyrillic, Chinese, Arabic and other scripts.
*
* TODO: process control characters
*
* TODO: handle tie better (when number of chars in 2 results is equal)
*/
public class StringExtract {
//logger used by this class and by the nested StringExtractUnicodeTable
private static final Logger logger = Logger.getLogger(StringExtract.class.getName());
/**
* min. number of extracted chars to qualify as string
*/
public static final int MIN_CHARS_STRING = 4;
//script lookup table, obtained from the StringExtractUnicodeTable singleton in the constructor
private StringExtractUnicodeTable unicodeTable;
/**
* currently enabled scripts
*/
private List<SCRIPT> enabledScripts;
//whether extract() attempts UTF-8 decoding
private boolean enableUTF8;
//whether extract() attempts UTF-16 decoding (both byte orders)
private boolean enableUTF16;
//stored and reused results
//(scratch objects overwritten by every extractUTF16()/extractUTF8() call on this instance)
private final StringExtractResult resUTF16En1 = new StringExtractResult();
private final StringExtractResult resUTF16En2 = new StringExtractResult();
private final StringExtractResult resUTF8 = new StringExtractResult();
/**
 * Scripts this utility is able to extract. The subset that is actually
 * enabled can be changed per instance with setEnabledScripts() and
 * setEnabledScript().
 *
 * The list is wrapped as unmodifiable because the very same instance is
 * returned by getSupportedScripts() and installed as the default enabled set
 * of every new StringExtract; a bare Arrays.asList() view would allow any
 * caller to overwrite entries through List.set().
 */
private static final List<SCRIPT> SUPPORTED_SCRIPTS
        = Collections.unmodifiableList(Arrays.asList(
                SCRIPT.LATIN_1, SCRIPT.LATIN_2, SCRIPT.ARABIC, SCRIPT.CYRILLIC, SCRIPT.HAN,
                SCRIPT.HIRAGANA, SCRIPT.KATAKANA, SCRIPT.HANGUL,
                SCRIPT.ARMENIAN, SCRIPT.BENGALI, SCRIPT.KHMER, SCRIPT.ETHIOPIC,
                SCRIPT.GEORGIAN, SCRIPT.HEBREW, SCRIPT.LAO, SCRIPT.MONGOLIAN, SCRIPT.THAI, SCRIPT.TIBETAN));
//current total string buffer, reuse for performance
private final StringBuilder curString = new StringBuilder();
/**
* Initializes the StringExtract utility Sets enabled scripts to all
* supported ones
*/
public StringExtract() {
unicodeTable = StringExtractUnicodeTable.getInstance();
if (unicodeTable == null) {
throw new IllegalStateException(
NbBundle.getMessage(StringExtract.class, "StringExtract.illegalStateException.cannotInit.msg"));
}
setEnabledScripts(SUPPORTED_SCRIPTS);
enableUTF8 = true;
enableUTF16 = true;
}
/**
* @return true if extract() attempts UTF-8 decoding
*/
public boolean isEnableUTF8() {
return enableUTF8;
}
/**
* Turns UTF-8 decoding in extract() on or off
*
* @param enableUTF8 true to enable UTF-8 decoding
*/
public void setEnableUTF8(boolean enableUTF8) {
this.enableUTF8 = enableUTF8;
}
/**
* @return true if extract() attempts UTF-16 decoding
*/
public boolean isEnableUTF16() {
return enableUTF16;
}
/**
* Turns UTF-16 decoding in extract() on or off
*
* @param enableUTF16 true to enable UTF-16 decoding
*/
public void setEnableUTF16(boolean enableUTF16) {
this.enableUTF16 = enableUTF16;
}
/**
 * Sets the enabled scripts to the ones provided, replacing the previous
 * setting.
 *
 * The list is copied, so changes the caller makes to its own list afterwards
 * do not alter which scripts this instance extracts.
 *
 * @param scripts scripts to consider when extracting strings, must not be
 *                null
 *
 * @throws NullPointerException if scripts is null
 */
public final void setEnabledScripts(List<SCRIPT> scripts) {
    this.enabledScripts = new ArrayList<SCRIPT>(scripts);
}
/**
 * Enables exactly one script, discarding whatever was enabled before.
 *
 * @param script the only script to consider when extracting strings
 */
public final void setEnabledScript(SCRIPT script) {
    final List<SCRIPT> single = new ArrayList<SCRIPT>();
    single.add(script);
    this.enabledScripts = single;
}
/**
* Check if extraction of the script is supported by the utility, i.e. if it
* is contained in SUPPORTED_SCRIPTS
*
* @param script script to check if supported
*
* @return true if the utility supports the extraction of the script
*/
public static boolean isExtractionSupported(SCRIPT script) {
return SUPPORTED_SCRIPTS.contains(script);
}
/**
 * Tells whether this instance extracts the given script. LATIN_1 (basic
 * Latin) counts as enabled whenever LATIN_2 (extended Latin) is enabled,
 * even if LATIN_1 was not enabled explicitly.
 *
 * @param script script that was identified, to check if it is enabled
 *
 * @return true if extraction of the script is enabled
 */
public boolean isExtractionEnabled(SCRIPT script) {
    final boolean basicLatin = script.equals(SCRIPT.LATIN_1);
    if (!basicLatin) {
        return enabledScripts.contains(script);
    }
    //basic Latin rides along with extended Latin
    return enabledScripts.contains(SCRIPT.LATIN_1)
            || enabledScripts.contains(SCRIPT.LATIN_2);
}
/**
 * Tells whether Basic Latin/English is the one and only enabled script.
 *
 * @return true if the enabled script list holds exactly one entry and that
 *         entry is LATIN_1
 */
public boolean isExtractionLatinBasicOnly() {
    return enabledScripts.size() == 1
            && enabledScripts.get(0).equals(SCRIPT.LATIN_1);
}
/**
* Get the scripts the utility is able to extract
*
* @return the shared static list of supported scripts (the list itself, not
* a copy)
*/
public static List<SCRIPT> getSupportedScripts() {
return SUPPORTED_SCRIPTS;
}
/**
 * Runs the byte buffer through the string extractor. At every position the
 * enabled decoders (UTF-16 in both byte orders, UTF-8) are tried and the one
 * yielding the most characters wins; runs of at least MIN_CHARS_STRING
 * characters are appended to the result, each followed by a newline.
 *
 * The method reuses per-instance scratch objects (the string buffer and the
 * decoder results), so calls on the same instance must not overlap.
 *
 * @param buff   buffer holding the bytes to scan
 * @param len    exclusive end index of the valid data in buff; values larger
 *               than buff.length are clamped
 * @param offset index in buff at which scanning starts
 *
 * @return string extraction result: the concatenated text, its length in
 *         chars (newlines included), the number of bytes the extracted
 *         strings occupy, the offset of the first extracted string and the
 *         offset of the first byte after the last extracted string. If both
 *         encodings are disabled an empty result with null text is returned.
 */
public StringExtractResult extract(byte[] buff, int len, int offset) {
    if (!enableUTF16 && !enableUTF8) {
        return new StringExtractResult();
    }
    //the decoders never look past len, so scanning further is pointless
    final int end = Math.min(len, buff.length);
    int processedBytes = 0;
    int curOffset = offset;
    int curStringLen = 0;
    //offset of the first extracted string; explicit flag instead of comparing
    //against 'offset', which is a legitimate start position itself
    int startOffset = offset;
    boolean foundString = false;
    //first byte offset that hasn't been consumed by a winning extraction
    int firstUnprocessedOff = offset;
    //reset the reused buffer
    curString.setLength(0);

    while (curOffset < end) {
        //shortcut, skip processing empty bytes
        if (buff[curOffset] == 0 && curOffset + 1 < end && buff[curOffset + 1] == 0) {
            curOffset += 2;
            continue;
        }

        //extract using all enabled methods and see which one wins
        StringExtractResult resUTF16 = null;
        if (enableUTF16 && curOffset % 2 == 0) {
            extractUTF16(buff, end, curOffset, true, resUTF16En1);
            extractUTF16(buff, end, curOffset, false, resUTF16En2);
            resUTF16 = resUTF16En1.numChars > resUTF16En2.numChars ? resUTF16En1 : resUTF16En2;
        }
        //stays null when only UTF-16 is enabled and the position is odd
        StringExtractResult resWin = resUTF16;
        if (enableUTF8) {
            extractUTF8(buff, end, curOffset, resUTF8);
            //UTF-16 has to be strictly longer to beat UTF-8
            if (resUTF16 == null || resUTF16.numChars <= resUTF8.numChars) {
                resWin = resUTF8;
            }
        }

        if (resWin != null && resWin.numChars >= MIN_CHARS_STRING) {
            //record string; the decoders start decoding exactly at curOffset
            if (!foundString) {
                startOffset = curOffset;
                foundString = true;
            }
            curString.append(resWin.textString);
            curString.append("\n");
            //count every char once, plus the separating newline
            curStringLen += resWin.numChars + 1;
            //advance
            curOffset += resWin.numBytes;
            processedBytes += resWin.numBytes;
            firstUnprocessedOff = curOffset;
        } else if (enableUTF8 || curOffset % 2 != 0) {
            //byte-wise step; also re-aligns an odd position for UTF-16 only mode
            ++curOffset;
        } else {
            //UTF-16 only: step by one code unit
            curOffset += 2;
        }
    }

    //build up the final result
    final StringExtractResult res = new StringExtractResult();
    res.numBytes = processedBytes;
    res.numChars = curStringLen;
    res.offset = startOffset;
    res.textString = curString.toString();
    res.firstUnprocessedOff = firstUnprocessedOff; //end of the last winning result
    return res;
}
/**
 * Decodes consecutive 2-byte UTF-16 code units starting at offset until a
 * unit is met that has no script (NONE), belongs to a script that is not
 * enabled, or belongs to a different script than the one the run locked
 * into. COMMON characters are always accepted and never lock the script.
 *
 * @param buff       buffer to decode from
 * @param len        exclusive end index of the valid data in buff; clamped
 *                   to buff.length
 * @param offset     index of the first byte to decode
 * @param endianSwap true to read each pair as big endian (first byte is the
 *                   high byte), false for little endian
 * @param res        result object to reset and fill
 *
 * @return res, holding the decoded text, its char and byte counts and, if at
 *         least one char was accepted, offset as its start offset
 */
private StringExtractResult extractUTF16(byte[] buff, int len, int offset, boolean endianSwap, final StringExtractResult res) {
    res.reset();
    final int end = Math.min(len, buff.length);
    final StringBuilder tempString = new StringBuilder();
    SCRIPT currentScript = SCRIPT.NONE;
    int curOffset = offset;

    //while we have 2 byte chunks
    while (curOffset < end - 1) {
        //mask to unsigned: a sign-extended byte would corrupt the combined value
        final int first = buff[curOffset] & 0xFF;
        final int second = buff[curOffset + 1] & 0xFF;
        curOffset += 2;
        final char codeUnit = endianSwap
                ? (char) ((first << 8) | second)
                : (char) ((second << 8) | first);

        //lookup the code unit in the unicode table (a char always fits the table)
        final SCRIPT scriptFound = unicodeTable.getScript(codeUnit);
        if (scriptFound == SCRIPT.NONE) {
            break;
        }

        final boolean isGeneric = StringExtractUnicodeTable.isGeneric(scriptFound);
        if (!isGeneric) {
            if (!isExtractionEnabled(scriptFound)) {
                //script not enabled, bail out
                break;
            }
            if (currentScript == SCRIPT.NONE) {
                //first script specific char of the run, lock into its script
                currentScript = scriptFound;
            } else if (currentScript != scriptFound) {
                //different script than the one we locked on to, bail out
                break;
            }
        }

        if (res.numChars == 0) {
            //the run starts where decoding started
            res.offset = offset;
        }
        res.numBytes += 2;
        ++res.numChars;
        tempString.append(codeUnit);
    } //no more data

    res.textString = tempString.toString();
    return res;
}
/**
 * Decodes consecutive well-formed UTF-8 sequences starting at offset until
 * an ill-formed or truncated sequence is met, or a character that has no
 * script (NONE), belongs to a script that is not enabled, or belongs to a
 * different script than the one the run locked into. COMMON characters are
 * always accepted and never lock the script.
 *
 * Only 1 to 3 byte sequences can contribute characters: every 4-byte
 * sequence encodes a code point above 0xFFFF, which is outside the 16-bit
 * lookup table, so such a lead byte ends the run.
 *
 * @param buff   buffer to decode from
 * @param len    exclusive end index of the valid data in buff; clamped to
 *               buff.length
 * @param offset index of the first byte to decode
 * @param res    result object to reset and fill
 *
 * @return res, holding the decoded text, its char and byte counts and, if at
 *         least one char was accepted, offset as its start offset
 */
private StringExtractResult extractUTF8(byte[] buff, int len, int offset, final StringExtractResult res) {
    res.reset();
    final int end = Math.min(len, buff.length);
    final StringBuilder tempString = new StringBuilder();
    SCRIPT currentScript = SCRIPT.NONE;
    int curOffset = offset;

    //decode and extract a character
    while (curOffset < end) {
        // based on "valid UTF-8 byte sequences" in the Unicode 5.0 book
        final int lead = buff[curOffset] & 0xFF; //unsigned value of the lead byte
        final int chBytes; //num bytes consumed by current char (1 - 3)
        final int ch; //character being extracted

        if (lead <= 0x7F) {
            chBytes = 1;
            ch = lead;
        } else if (lead <= 0xC1 || lead > 0xEF) {
            //continuation byte or overlong lead (0x80-0xC1), or a lead byte
            //that can only yield a code point beyond the lookup table
            break;
        } else if (lead <= 0xDF) {
            if (end - curOffset < 2) {
                break;
            }
            final int b1 = buff[curOffset + 1] & 0xFF;
            if (b1 < 0x80 || b1 > 0xBF) {
                break;
            }
            chBytes = 2;
            ch = ((lead & 0x1f) << 6) + (b1 & 0x3f);
        } else {
            //0xE0 - 0xEF: three byte sequence
            if (end - curOffset < 3) {
                break;
            }
            final int b1 = buff[curOffset + 1] & 0xFF;
            final int b2 = buff[curOffset + 2] & 0xFF;
            //0xE0 excludes overlong forms, 0xED excludes surrogates
            final int b1Min = (lead == 0xE0) ? 0xA0 : 0x80;
            final int b1Max = (lead == 0xED) ? 0x9F : 0xBF;
            if (b1 < b1Min || b1 > b1Max || b2 < 0x80 || b2 > 0xBF) {
                break;
            }
            chBytes = 3;
            ch = ((lead & 0x0f) << 12) + ((b1 & 0x3f) << 6) + (b2 & 0x3f);
        }
        curOffset += chBytes;

        //lookup the char in the unicode table (ch is at most 0xFFFF here)
        final SCRIPT scriptFound = unicodeTable.getScript(ch);
        if (scriptFound == SCRIPT.NONE) {
            break;
        }

        final boolean isGeneric = StringExtractUnicodeTable.isGeneric(scriptFound);
        if (!isGeneric) {
            if (!isExtractionEnabled(scriptFound)) {
                //script not enabled, bail out
                break;
            }
            if (currentScript == SCRIPT.NONE) {
                //first script specific char of the run, lock into its script
                currentScript = scriptFound;
            } else if (currentScript != scriptFound) {
                //different script than the one we locked on to, bail out
                break;
            }
        }

        if (res.numChars == 0) {
            //the run starts where decoding started
            res.offset = offset;
        }
        res.numBytes += chBytes;
        ++res.numChars;
        tempString.append((char) ch);
    } //no more data

    res.textString = tempString.toString();
    return res;
}
/**
 * Extract UTF8/16 ASCII characters from byte buffer - only works for Latin,
 * but fast.
 *
 * Printable characters are all letters, numbers and punctuation in the range
 * 32-126, plus tab; newlines and control chars are not printable. A run of
 * MIN_CHARS_STRING or more printable characters is emitted as a string, and
 * strings are separated by a newline. A single zero byte between printable
 * characters does not end a run (this keeps ASCII text stored as 2-byte
 * chars together); two or more consecutive zero bytes do.
 *
 * @param readBuf the bytes to extract strings from
 * @param len     exclusive end index of the valid data in readBuf; clamped
 *                to readBuf.length
 * @param offset  index to start converting from
 *
 * @return the extracted strings separated by newlines; a run still open at
 *         the end of the data is included only if it has at least
 *         MIN_CHARS_STRING characters, and is not followed by a newline
 */
public static String extractASCII(byte[] readBuf, int len, int offset) {
    final int end = Math.min(len, readBuf.length);
    final StringBuilder result = new StringBuilder();
    final StringBuilder temp = new StringBuilder(); //current run of printable chars
    boolean singleConsecZero = false; //true while the previous byte was a lone zero

    for (int i = offset; i < end; i++) {
        final char curChar = (char) readBuf[i];
        //a zero directly after a lone zero is no longer "single"
        singleConsecZero = (curChar == 0 && !singleConsecZero);

        if (isPrintableAscii(curChar)) {
            temp.append(curChar);
        } else if (!singleConsecZero) {
            if (temp.length() >= MIN_CHARS_STRING) {
                // add to the result and also add the new line at the end
                result.append(temp);
                result.append('\n');
            }
            temp.setLength(0);
        }
    }
    //apply the same minimum length to the run left open at the end of the data
    if (temp.length() >= MIN_CHARS_STRING) {
        result.append(temp);
    }
    return result.toString();
}
/**
 * Tells whether a char counts as printable ASCII: the tab character or any
 * char from space (32) up to and including tilde (126).
 *
 * @param c char to test
 *
 * @return true if it's a printable char, or false otherwise
 */
public static boolean isPrintableAscii(char c) {
    if (c == '\t') {
        return true;
    }
    return c >= ' ' && c <= '~';
}
/**
* Representation of the string extraction result
*/
public class StringExtractResult implements Comparable<StringExtractResult> {
int offset; ///< offset in input buffer where the first string starts
int numBytes; ///< num bytes in input buffer consumed
int numChars; ///< number of encoded characters extracted in the textString
int firstUnprocessedOff; ///< first byte past the last byte used in extraction, offset+numBytes for a single result, but we keep track of it for multiple extractions
String textString; ///< the actual text string extracted, of numChars long
void reset() {
offset = 0;
numBytes = 0;
numChars = 0;
firstUnprocessedOff = 0;
textString = null;
}
public int getFirstUnprocessedOff() {
return firstUnprocessedOff;
}
public int getStartOffset() {
return offset;
}
public int getNumBytes() {
return numBytes;
}
public int getTextLength() {
return numChars;
}
public String getText() {
return textString;
}
@Override
public int compareTo(StringExtractResult o) {
//result with highest num of characters is less than (wins)
//TODO handle tie - pick language with smallest number of chars
return o.numChars - numChars;
}
}
/**
* Encapsulates the loaded unicode table and the different scripts, and
* provides utilities for table and script lookup. Manages loading of the
* unicode table. Used as a singleton to ensure minimal resource usage for
* the unicode table.
*/
public static class StringExtractUnicodeTable {
/**
* Implemented by SCRIPT to expose the languages associated with a script
*/
public interface LanguageInfo {
/**
* @return display string naming the language(s) written in the script
*/
String getLanguages();
}
/**
 * Scripts listed in the unicodeTable loaded.
 *
 * The declaration order must not be changed: the values stored in the
 * unicode table are used as indexes into SCRIPT.values().
 *
 * A constant declared without arguments uses its enum name both as display
 * name and as language; a constant with one argument uses that display name
 * for both.
 */
public static enum SCRIPT implements LanguageInfo {

    NONE,
    COMMON,
    LATIN_1("Latin - Basic", "English"), //NON-NLS
    GREEK("Greek"), //NON-NLS
    CYRILLIC("Cyrillic", "Russian, Bulgarian, Serbian, Moldovan"), //NON-NLS
    ARMENIAN("Armenian"), //NON-NLS
    HEBREW("Hebrew"), //NON-NLS
    ARABIC("Arabic"), //NON-NLS
    SYRIAC,
    THAANA,
    DEVANAGARI,
    BENGALI("Bengali"), //NON-NLS
    GURMUKHI,
    GUJARATI,
    ORIYA,
    TAMIL,
    TELUGU,
    KANNADA,
    MALAYALAM,
    SINHALA,
    THAI("Thai"), //NON-NLS
    LAO("Laotian"), //NON-NLS
    TIBETAN("Tibetian"), //NON-NLS
    MYANMAR,
    GEORGIAN("Georgian"), //NON-NLS
    HANGUL("Hangul", "Korean"), //NON-NLS
    ETHIOPIC("Ethiopic"), //NON-NLS
    CHEROKEE,
    CANADIAN_ABORIGINAL,
    OGHAM,
    RUNIC,
    KHMER("Khmer", "Cambodian"), //NON-NLS
    MONGOLIAN("Mongolian"), //NON-NLS
    HIRAGANA("Hiragana", "Japanese"), //NON-NLS
    KATAKANA("Katakana", "Japanese"), //NON-NLS
    BOPOMOFO,
    HAN("Han", "Chinese, Japanese, Korean"), //NON-NLS
    YI,
    OLD_ITALIC,
    GOTHIC,
    DESERET,
    INHERITED,
    TAGALOG,
    HANUNOO,
    BUHID,
    TAGBANWA,
    LIMBU,
    TAI_LE,
    LINEAR_B,
    UGARITIC,
    SHAVIAN,
    OSMANYA,
    CYPRIOT,
    BRAILLE,
    BUGINESE,
    COPTIC,
    NEW_TAI_LUE,
    GLAGOLITIC,
    TIFINAGH,
    SYLOTI_NAGRI,
    OLD_PERSIAN,
    KHAROSHTHI,
    BALINESE,
    CUNEIFORM,
    PHOENICIAN,
    PHAGS_PA,
    NKO,
    CONTROL,
    LATIN_2("Latin - Extended", "European"); //NON-NLS

    //display name, or null to fall back to the enum constant name
    private final String displayName;
    //language list, or null to fall back to toString()
    private final String languages;

    SCRIPT() {
        this(null, null);
    }

    SCRIPT(String displayName) {
        this(displayName, null);
    }

    SCRIPT(String displayName, String languages) {
        this.displayName = displayName;
        this.languages = languages;
    }

    @Override
    public String toString() {
        return displayName == null ? name() : displayName;
    }

    @Override
    public String getLanguages() {
        return languages == null ? toString() : languages;
    }
}
//cached SCRIPT.values(), indexed by the values stored in unicodeTable
private static final SCRIPT[] SCRIPT_VALUES = SCRIPT.values();
//resource, resolved relative to StringExtract.class, that holds the "UnicodeTable" property
private static final String PROPERTY_FILE = "StringExtract.properties"; //NON-NLS
/**
* table has an entry for every possible 2-byte value
*/
private static final int UNICODE_TABLE_SIZE = 65536;
/**
* unicode lookup table with 2 byte index and value of script
* (the value is used as an index into SCRIPT_VALUES)
*/
private static final char[] unicodeTable = new char[UNICODE_TABLE_SIZE];
private static StringExtractUnicodeTable instance = null; //the singleton instance
/**
 * Returns the singleton instance, creating and initializing it on first
 * use, or null if it could not be initialized. After a failed
 * initialization the next call tries again.
 *
 * The instance is only published once init() has returned true, so a
 * failure inside init() - including an unexpected runtime exception - can
 * never leave a half-initialized instance behind for later callers.
 *
 * @return the initialized instance, or null if initialization failed
 */
public static synchronized StringExtractUnicodeTable getInstance() {
    if (instance == null) {
        final StringExtractUnicodeTable candidate = new StringExtractUnicodeTable();
        if (candidate.init()) {
            instance = candidate;
        }
    }
    return instance;
}
/**
 * Looks up the script of a potential character.
 *
 * @param value 16-bit value of the character, used as index into the
 *              unicode table
 *
 * @return the script type corresponding to the value
 */
public SCRIPT getScript(int value) {
    return SCRIPT_VALUES[unicodeTable[value]];
}
/**
 * Tells whether the script is the generic/common one, whose chars are
 * shared between different scripts.
 *
 * @param script script to check
 *
 * @return true if the script is COMMON
 */
public static boolean isGeneric(SCRIPT script) {
    return SCRIPT.COMMON == script;
}
/**
* @return number of entries in the unicode lookup table
*/
public static int getUnicodeTableSize() {
return UNICODE_TABLE_SIZE;
}
/**
* Get the value of the script
*
* @param script the script to get value of
*
* @return the value corresponding to ordering in the SCRIPT enum (its
* ordinal)
*/
public static int getScriptValue(SCRIPT script) {
return script.ordinal();
}
/**
 * Resolves a script from the name of its enum constant.
 *
 * @param scriptStringVal exact name of a SCRIPT constant
 *
 * @return the matching script
 *
 * @throws IllegalArgumentException if no constant has that name
 */
public static SCRIPT scriptForString(String scriptStringVal) {
    return SCRIPT.valueOf(scriptStringVal);
}
/**
 * Initialization, loads the unicode table from the "UnicodeTable" property
 * of the PROPERTY_FILE resource. The property is a space separated list of
 * exactly UNICODE_TABLE_SIZE integers.
 *
 * Every failure (missing resource, read error, missing property, wrong
 * number of entries, non-numeric entry) is logged and reported through the
 * return value. The static table is only overwritten once the complete
 * list has been parsed successfully.
 *
 * @return true if initialized properly, false otherwise
 */
private boolean init() {
    final InputStream inputStream = StringExtract.class.getResourceAsStream(PROPERTY_FILE);
    if (inputStream == null) {
        logger.log(Level.WARNING, "Could not find " + PROPERTY_FILE); //NON-NLS
        return false;
    }

    final Properties properties = new Properties();
    try {
        try {
            properties.load(inputStream);
        } finally {
            //release the resource stream whether or not loading succeeded
            inputStream.close();
        }
    } catch (IOException ex) {
        logger.log(Level.WARNING, "Could not load " + PROPERTY_FILE, ex); //NON-NLS
        return false;
    }

    final String table = properties.getProperty("UnicodeTable");
    if (table == null) {
        logger.log(Level.WARNING, "Unicode table property missing in " + PROPERTY_FILE); //NON-NLS
        return false;
    }

    final StringTokenizer st = new StringTokenizer(table, " ");
    final int toks = st.countTokens();
    if (toks != UNICODE_TABLE_SIZE) {
        logger.log(Level.WARNING, "Unicode table corrupt, expecting: " + UNICODE_TABLE_SIZE + ", have: " + toks); //NON-NLS
        return false;
    }

    //parse into a scratch array first so a bad entry cannot leave the
    //shared static table partially overwritten
    final char[] parsed = new char[UNICODE_TABLE_SIZE];
    try {
        for (int tableIndex = 0; tableIndex < UNICODE_TABLE_SIZE; tableIndex++) {
            parsed[tableIndex] = (char) Integer.parseInt(st.nextToken());
        }
    } catch (NumberFormatException ex) {
        logger.log(Level.WARNING, "Unicode table corrupt, non-numeric entry", ex); //NON-NLS
        return false;
    }
    System.arraycopy(parsed, 0, unicodeTable, 0, UNICODE_TABLE_SIZE);

    logger.log(Level.INFO, "initialized, unicode table loaded"); //NON-NLS
    return true;
}
}
}