/**
*
* Copyright 2012-2013 The MITRE Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
* **************************************************************************
* NOTICE This software was produced for the U. S. Government under Contract No.
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
* Software and Noncommercial Computer Software Documentation Clause
* 252.227-7014 (JUN 1995)
*
* (c) 2012 The MITRE Corporation. All Rights Reserved.
* **************************************************************************
*/
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
//
// _____ ____ __ __
///\ __`\ /\ _`\ /\ \__ /\ \__
//\ \ \/\ \ _____ __ ___ \ \,\L\_\ __ __ _\ \ ,_\ __ ___ \ \ ,_\
// \ \ \ \ \ /\ '__`\ /'__`\ /' _ `\ \/_\__ \ /'__`\/\ \/'\\ \ \/ /'__`\ /' _ `\\ \ \/
// \ \ \_\ \\ \ \L\ \/\ __/ /\ \/\ \ /\ \L\ \ /\ __/\/> </ \ \ \_ /\ \L\.\_ /\ \/\ \\ \ \_
// \ \_____\\ \ ,__/\ \____\\ \_\ \_\ \ `\____\\ \____\/\_/\_\ \ \__\\ \__/.\_\\ \_\ \_\\ \__\
// \/_____/ \ \ \/ \/____/ \/_/\/_/ \/_____/ \/____/\//\/_/ \/__/ \/__/\/_/ \/_/\/_/ \/__/
// \ \_\
// \/_/
//
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
//
package org.opensextant.util;
import static org.apache.commons.lang3.StringUtils.isBlank;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.opensextant.data.Language;
import org.supercsv.cellprocessor.Optional;
import org.supercsv.cellprocessor.constraint.NotNull;
import org.supercsv.cellprocessor.ift.CellProcessor;
import org.supercsv.io.CsvListReader;
import org.supercsv.prefs.CsvPreference;
/**
*
* @author ubaldino
*/
public class TextUtils {
/** Matches any run of one or more whitespace characters. */
final static Pattern delws = Pattern.compile("\\s+");
// Match runs of empty lines:
// a \n followed by other optional whitespace (space, tab, carriage return),
// repeated three or more times.
// The first EOL could be on a non-empty line, but then followed by 2 empty
// lines.
// The intent is to reduce 3 or more EOL to 2, preserving paragraph breaks.
//
final static Pattern multi_eol = Pattern.compile("(\n[ \t\r]*){3,}");
// Matches two or more consecutive \n, each optionally followed by \r.
final static Pattern multi_eol2 = Pattern.compile("(\n\r?){2,}");
/**
 * Tests whether every letter in the text is either ASCII or belongs to one
 * of the Latin Unicode blocks (Latin-1 Supplement, Latin Extended A-D,
 * Latin Extended Additional). Characters that are not letters (digits,
 * punctuation, symbols, whitespace) are ignored regardless of script.
 *
 * @param data
 *            any textual data; must not be null
 * @return true if no non-Latin letter was found, false on the first one.
 */
public final static boolean isLatin(String data) {
    final int len = data.length();
    for (int i = 0; i < len; i++) {
        final char c = data.charAt(i);
        // ASCII and non-letters never disqualify the text.
        if (isASCII(c) || !Character.isLetter(c)) {
            continue;
        }
        final Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
        final boolean latinBlock = block == Character.UnicodeBlock.LATIN_1_SUPPLEMENT
                || block == Character.UnicodeBlock.LATIN_EXTENDED_A
                || block == Character.UnicodeBlock.LATIN_EXTENDED_B
                || block == Character.UnicodeBlock.LATIN_EXTENDED_C
                || block == Character.UnicodeBlock.LATIN_EXTENDED_D
                || block == Character.UnicodeBlock.LATIN_EXTENDED_ADDITIONAL;
        if (!latinBlock) {
            // A letter outside the Latin blocks settles the answer.
            return false;
        }
    }
    return true;
}
/**
 * ASCII replacement letters for accented Latin characters. Each position in
 * this string corresponds to the character at the same position in
 * ALPHAMAP_UNICODE, so both strings must be kept the same length and in the
 * same order. Reference:
 * http://www.rgagnon.com/javadetails/java-0456.html
 */
private static final String ALPHAMAP_PLAIN_ASCII = "AaEeIiOoUu" // grave
+ "AaEeIiOoUuYy" // acute
+ "AaEeIiOoUuYy" // circumflex
+ "AaOoNn" // tilde
+ "AaEeIiOoUuYy" // umlaut
+ "Aa" // ring
+ "Cc" // cedilla
+ "OoUu" // double acute
+ "Oo" // Scandinavian o
+ "AaEe" // A/E with macron
;
// Accented characters, position-aligned with ALPHAMAP_PLAIN_ASCII above.
private static final String ALPHAMAP_UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" // grave
+ "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" // acute
+ "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" // circumflex
+ "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" // tilde
+ "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" // umlaut
+ "\u00C5\u00E5" // ring
+ "\u00C7\u00E7" // cedilla
+ "\u0150\u0151\u0170\u0171" // double acute
+ "\u00D8\u00F8" // Scandinavian o Øø
+ "\u0100\u0101\u0112\u0113" // A/E with macron (A-bar, E-bar)
;
// Quote-like marks treated as diacritic "hash marks": double quote,
// apostrophe, grave accent, acute accent (U+00B4), and the left/right
// single quotation marks (U+2018, U+2019).
private static final String COMMON_DIACRITC_HASHMARKS = "\"'`\u00B4\u2018\u2019";
/**
 * Reports whether a string contains at least one character listed in the
 * accented-letter table or in the set of common quote-like hash marks
 * (double quote, apostrophe, grave/acute accent, curly single quotes).
 *
 * @param s
 *            text to scan; must not be null
 * @return true as soon as one such character is found, false otherwise.
 */
public final static boolean hasDiacritics(final String s) {
    for (char c : s.toCharArray()) {
        final boolean accented = ALPHAMAP_UNICODE.indexOf(c) >= 0;
        if (accented || COMMON_DIACRITC_HASHMARKS.indexOf(c) >= 0) {
            return true;
        }
    }
    return false;
}
/**
 * Replaces accented characters with their plain ASCII counterpart using the
 * position-aligned lookup tables of this class. Characters not present in
 * the table are copied unchanged, so the conversion is not exhaustive.
 * Reference: http://www.rgagnon.com/javadetails/java-0456.html
 *
 * @param s
 *            the string; may be null or empty
 * @return converted string; null for null input, the input itself when it
 *         is empty
 */
public final static String replaceDiacritics(final String s) {
    if (s == null) {
        return null;
    }
    if (s.isEmpty()) {
        return s;
    }
    // Work on a copy of the characters and overwrite the mapped ones in place.
    final char[] out = s.toCharArray();
    for (int i = 0; i < out.length; i++) {
        final int tableIndex = ALPHAMAP_UNICODE.indexOf(out[i]);
        if (tableIndex >= 0) {
            out[i] = ALPHAMAP_PLAIN_ASCII.charAt(tableIndex);
        }
    }
    return new String(out);
}
/**
 * Tests a single character against the ASCII range. The NUL character
 * (code 0) is deliberately reported as non-ASCII by this overload.
 *
 * @param c
 *            a character
 * @return true if c is in the range 0x01..0x7F
 */
public final static boolean isASCII(char c) {
    if (c == 0) {
        return false;
    }
    return c <= ASCII_END;
}
/** Highest ASCII code point (DEL). */
private final static int ASCII_END = 0x7F;
/**
 * Tests whether every byte of a buffer is an ASCII value. Java bytes are
 * signed, so any byte with the high bit set shows up as a negative number.
 *
 * @param data
 *            bytes to test; must not be null
 * @return true if no byte falls outside the ASCII range (also true for an
 *         empty array)
 */
public static boolean isASCII(byte[] data) {
    for (int i = 0; i < data.length; i++) {
        final byte value = data[i];
        if (value < 0 || value > ASCII_END) {
            return false;
        }
    }
    return true;
}
/**
 * Early-exit ASCII test for text: stops at the first character above the
 * ASCII range. Unlike the single-character overload, NUL (code 0) is
 * accepted here.
 *
 * @param t
 *            buffer of text; must not be null
 * @return true only if every char is at or below 0x7F.
 */
public static boolean isASCII(String t) {
    for (char current : t.toCharArray()) {
        if (current > ASCII_END) {
            return false;
        }
    }
    return true;
}
/**
 * Counts the bytes of a buffer that are ASCII values (0x00..0x7F).
 *
 * Java bytes are signed: every value with the high bit set is negative, and
 * every non-negative byte is at most 0x7F. A byte is therefore ASCII exactly
 * when it is non-negative, which matches the test used by isASCII(byte[]).
 * The earlier condition combined its two range checks with OR, which is
 * true for every possible byte, so the method simply returned the array
 * length.
 *
 * @param data
 *            bytes to count; must not be null
 * @return count of ASCII bytes
 */
public static int countASCIIChars(byte[] data) {
    int ascii = 0;
    for (byte b : data) {
        if (b >= 0) {
            ++ascii;
        }
    }
    return ascii;
}
/**
 * Collapses every run of three or more line endings (each optionally
 * followed by spaces, tabs or carriage returns) into a single paragraph
 * break (\n\n).
 *
 * @param t
 *            text; must not be null
 * @return A string with fewer line breaks;
 */
public static String reduce_line_breaks(String t) {
    return multi_eol.matcher(t).replaceAll("\n\n");
}
/**
 * Removes every run of whitespace from the text.
 *
 * @param t
 *            text; must not be null
 * @return String, without whitespace.
 */
public static String delete_whitespace(String t) {
    return delws.matcher(t).replaceAll("");
}
/**
 * Minimizes whitespace: each run of one or more whitespace characters is
 * replaced by a single space. Leading and trailing runs become one space
 * each; they are not trimmed.
 *
 * @param t
 *            text; must not be null
 * @return scrubbed string
 */
public static String squeeze_whitespace(String t) {
    return delws.matcher(t).replaceAll(" ");
}
/**
 * Replaces each line-ending character (\n or \r) with one SPACE. The
 * length of the text does not change; a \r\n pair becomes two spaces.
 *
 * @param t
 *            text; must not be null
 * @return scrubbed string
 */
public static String delete_eol(String t) {
    final char[] buffer = t.toCharArray();
    for (int i = 0; i < buffer.length; i++) {
        if (buffer[i] == '\n' || buffer[i] == '\r') {
            buffer[i] = ' ';
        }
    }
    return new String(buffer);
}
/** Newline. */
public final static char NL = '\n';
/** Carriage return. */
public final static char CR = '\r';
/** Space. */
public final static char SP = ' ';
/** Horizontal tab. */
public final static char TAB = '\t';
/** The DEL control character. */
public final static char DEL = 0x7F;
/**
 * Strips control characters from text. Every character below SPACE is
 * dropped except TAB and NL; the DEL character (^?) is dropped as well.
 * Note that carriage returns are among the removed characters. The result
 * may be shorter than the input.
 *
 * @param t
 *            text; may be null
 * @return scrubbed buffer, or null for null input
 */
public static String delete_controls(String t) {
    if (t == null) {
        return null;
    }
    final int len = t.length();
    final StringBuilder kept = new StringBuilder();
    for (int i = 0; i < len; i++) {
        final char ch = t.charAt(i);
        final boolean printableOrAllowed = ch >= ' ' || ch == TAB || ch == NL;
        if (printableOrAllowed && ch != DEL) {
            kept.append(ch);
        }
    }
    return kept.toString();
}
/**
 * Tells whether the text contains at least one digit.
 *
 * @param txt
 *            text to inspect; may be null
 * @return true if one or more digits are present
 */
public static boolean hasDigits(String txt) {
    return countDigits(txt) > 0;
}
/**
 * Camel-case alias for count_digits(String).
 *
 * @param txt
 *            text to count; may be null
 * @return count of digits
 */
public static int countDigits(String txt) {
    return count_digits(txt);
}
/**
 * Counts the characters of the text for which Character.isDigit is true.
 *
 * @param txt
 *            text to count; may be null
 * @return count of digits, 0 for null input
 */
public static int count_digits(String txt) {
    if (txt == null) {
        return 0;
    }
    int total = 0;
    final int len = txt.length();
    for (int i = 0; i < len; i++) {
        if (Character.isDigit(txt.charAt(i))) {
            total++;
        }
    }
    return total;
}
/**
 * Lenient numeric test. StringUtils in commons reports isNumeric("1.234")
 * as NOT numeric; here "1.234" is numeric.
 *
 * The value is accepted when it consists only of digits and the characters
 * {@code . - + e E}, and contains at least one digit. Requiring a digit
 * keeps the empty string and inputs such as "e", "-" or "+-." from being
 * reported as numbers. This is a character-class check, not a parse:
 * malformed combinations such as "1-2" or "1e" are still accepted.
 *
 * @param v
 *            val to parse; may be null
 * @return true if val looks like a number, false for null
 */
public final static boolean isNumeric(final String v) {
    if (v == null) {
        return false;
    }
    boolean sawDigit = false;
    for (char ch : v.toCharArray()) {
        if (Character.isDigit(ch)) {
            sawDigit = true;
            continue;
        }
        /*
         * Is the character in .-+Ee ?
         */
        if (ch == '.' || ch == '-' || ch == '+' || ch == 'e' || ch == 'E') {
            continue;
        }
        return false;
    }
    return sawDigit;
}
/**
 * Counts the characters of the text that Character.isWhitespace accepts.
 *
 * @param txt
 *            text; may be null
 * @return whitespace count, 0 for null input
 */
public static int count_ws(String txt) {
    if (txt == null) {
        return 0;
    }
    int blanks = 0;
    for (int i = 0, len = txt.length(); i < len; i++) {
        if (Character.isWhitespace(txt.charAt(i))) {
            blanks++;
        }
    }
    return blanks;
}
/**
 * Counts formatting characters. This helps determine whether a text span
 * is a phrase broken up by TAB or EOL characters. Any control character
 * below SPACE (code less than 0x20) is counted, e.g. HT, VT, CR, LF. The
 * ordinary space and the DEL character (0x7F) are not counted.
 *
 * @param txt
 *            input string; may be null
 * @return count of format chars, 0 for null input
 */
public static int countFormattingSpace(String txt) {
    if (txt == null) {
        return 0;
    }
    int controls = 0;
    for (int i = 0, len = txt.length(); i < len; i++) {
        if (txt.charAt(i) < ' ') {
            controls++;
        }
    }
    return controls;
}
/**
 * For measuring the upper-case-ness of short texts. Returns true if the
 * text has at least one upper case letter and no lower case letter.
 * Non-letters are allowed and ignored.
 *
 * @param dat
 *            text or data; may be null
 * @return true if text is Upper
 */
public static boolean isUpper(String dat) {
    return checkCase(dat, 2);
}
/**
 * Counterpart of isUpper: true if the text has at least one lower case
 * letter and no upper case letter.
 *
 * @param dat
 *            text or data; may be null
 * @return true if text is lower
 */
public static boolean isLower(String dat) {
    return checkCase(dat, 1);
}
/**
 * Detects whether the cased letters of a string are purely of one case.
 * The scan fails on the first letter of the opposite case and succeeds
 * only if at least one letter of the requested case was seen. Letters
 * without case (e.g. CJK) and non-letters are skipped, so mixed-script
 * text behaves as follows:
 *
 * <pre>
 *  text     LOWER  UPPER
 *  A b  ==> no     no
 *  A 寨 ==> no     yes
 *  a 寨 ==> yes    no
 * </pre>
 *
 * @param text
 *            text; may be null
 * @param textcase
 *            1 lower, 2 upper; any other value yields false
 * @return if case matches given textcase param
 */
public static boolean checkCase(String text, int textcase) {
    if (text == null) {
        return false;
    }
    final boolean wantLower = (textcase == 1);
    final boolean wantUpper = (textcase == 2);
    if (!wantLower && !wantUpper) {
        // Unknown mode: nothing can ever match.
        return false;
    }
    boolean matched = false;
    for (int i = 0, len = text.length(); i < len; i++) {
        final char c = text.charAt(i);
        if (!Character.isLetter(c)) {
            continue;
        }
        final boolean upper = Character.isUpperCase(c);
        final boolean lower = Character.isLowerCase(c);
        // A letter of the opposite case fails the check immediately.
        if (wantLower ? upper : lower) {
            return false;
        }
        if (wantLower ? lower : upper) {
            matched = true;
        }
    }
    return matched;
}
/**
 * Measures the character make-up of a text. The returned array holds, in
 * order: [0] letters, [1] upper case letters, [2] lower case letters,
 * [3] other non-letter, non-whitespace characters, [4] whitespace.
 * Letters that are neither upper nor lower case count only in slot 0.
 *
 * @param text
 *            text; may be null
 * @return int array with counts, or null for null input.
 */
public static int[] measureCase(String text) {
    if (text == null) {
        return null;
    }
    final int LETTERS = 0, UPPER = 1, LOWER = 2, OTHER = 3, SPACE = 4;
    final int[] counts = new int[5];
    for (int i = 0, len = text.length(); i < len; i++) {
        final char c = text.charAt(i);
        if (!Character.isLetter(c)) {
            // Non-letters are either whitespace or "other content".
            counts[Character.isWhitespace(c) ? SPACE : OTHER]++;
            continue;
        }
        counts[LETTERS]++;
        if (Character.isUpperCase(c)) {
            counts[UPPER]++;
        } else if (Character.isLowerCase(c)) {
            counts[LOWER]++;
        }
    }
    return counts;
}
/**
 * A threshold for deciding whether the character content of a document is
 * upper case enough that the entire document can be considered upper case.
 * The field is mutable so it can be overridden: these thresholds are
 * heuristics that are not expected to vary from document to document, but
 * they do vary from domain to domain.
 *
 * If your data sits close to this threshold, adapt it to your own data.
 * The values are meant to be very general and apply best to documents on
 * the order of 200 to 10,000 bytes. For very short texts (e.g., tweets in
 * English) the ratio is easily swayed by one or two characters.
 *
 * Note: isUpperCaseDocument(int[]) uses its own hard-coded, length-dependent
 * cut-offs and does not read this field.
 *
 * @deprecated this threshold is dependent on the length of the signal.
 */
public static double UPPER_CASE_THRESHOLD = 0.75;
/**
 * Threshold for considering a document lower case. Since many languages
 * capitalize the first letter of sentences and names, this threshold is
 * very high.
 * "IS THIS UPPER CASE?, I WILL USE eBAY TODAY"
 * "by the same convention, this is largely lower case; I will use eBay today."
 *
 * Note: isLowerCaseDocument(int[]) uses its own hard-coded cut-offs and does
 * not read this field.
 *
 * @deprecated this threshold is dependent on the length of the signal.
 */
public static double LOWER_CASE_THRESHOLD = 0.95;
/**
 * Heuristic that suggests a text is mainly upper case. Call
 * measureCase(text) first and pass its counts here. The share of upper
 * case letters among all letters (counts[1] / counts[0]) is compared with
 * a cut-off that grows with the amount of letter content:
 * more than 50% below 100 letters, more than 60% below 500 letters, and
 * more than 75% otherwise. Non-letter content is not part of the ratio.
 * With zero letters the ratio is not a number and the result is false.
 * This may not work well for languages that are not Latin-alphabet.
 *
 * @param counts
 *            word stats from measureCase()
 * @return true if counts represent text that exceeds the "UPPER CASE" threshold
 */
public static boolean isUpperCaseDocument(final int[] counts) {
    final int letters = counts[0];
    // Single-precision division, then comparison against a double cut-off.
    final float upperShare = counts[1] / (float) letters;
    final double cutoff;
    if (letters < 100) {
        cutoff = 0.50;
    } else if (letters < 500) {
        cutoff = 0.60;
    } else {
        // Imagine 1KB of text: 75% upper case means the document is largely uppercase.
        cutoff = 0.75;
    }
    return upperShare > cutoff;
}
/**
 * Heuristic that suggests a text is mainly lower case; the counterpart of
 * isUpperCaseDocument. The share of lower case letters among all letters
 * (counts[2] / counts[0]) must exceed 97% for texts with fewer than 100
 * letters and 98% otherwise. Non-letter content is not part of the ratio.
 * With zero letters the ratio is not a number and the result is false.
 *
 * @param counts
 *            word stats from measureCase()
 * @return true if counts represent text that exceeds the "lower case" threshold
 */
public static boolean isLowerCaseDocument(final int[] counts) {
    final int letters = counts[0];
    // Single-precision division, then comparison against a double cut-off.
    final float lowerShare = counts[2] / (float) letters;
    final double cutoff = (letters < 100) ? 0.97 : 0.98;
    return lowerShare > cutoff;
}
/**
 * Computes the text windows to the left and right of a match, clamped to
 * the bounds of the enclosing buffer.
 *
 * <pre>
 *    prepreprepre MATCH postpostpost
 *    ^           ^     ^            ^
 *    left1     left2  right1      right2
 * </pre>
 *
 * left1 is offset - width (not below 0); left2 is offset - 1 (not below
 * left1); right1 is offset + matchlen; right2 is right1 + width (not above
 * textsize); right1 is finally capped at right2.
 *
 * @param offset
 *            offset of match
 * @param matchlen
 *            length of match
 * @param textsize
 *            size of buffer containing match; used for boundary conditions
 * @param width
 *            width of window left and right of match
 * @return window offsets left of match, right of match: [ l1, l2, r1, r2 ]
 */
public static int[] get_text_window(int offset, int matchlen, int textsize, int width) {
    final int leftStart = Math.max(0, offset - width);
    final int leftEnd = Math.max(leftStart, offset - 1);
    final int matchEnd = offset + matchlen;
    final int rightEnd = Math.min(matchEnd + width, textsize);
    final int rightStart = Math.min(matchEnd, rightEnd);
    return new int[] { leftStart, leftEnd, rightStart, rightEnd };
}
/**
 * Computes one window centred on the offset: half of the width to the
 * left and half to the right (integer division), clamped to
 * [0, textsize].
 *
 * @param offset
 *            offset of match
 * @param textsize
 *            size of buffer containing match; used for boundary conditions
 * @param width
 *            total width of the window around the offset
 * @return window offsets of a text span containing match [ left, right ]
 */
public static int[] get_text_window(int offset, int textsize, int width) {
    final int reach = width / 2;
    final int start = Math.max(0, offset - reach);
    final int stop = Math.min(offset + reach, textsize);
    return new int[] { start, stop };
}
/**
 * Derives an identifier for a text: the MD5 hash of its UTF-8 bytes,
 * rendered as a hex string. A new MessageDigest is obtained on every call.
 *
 * @param text
 *            text or data; may be null
 * @return identifier for the text, an MD5 hash; null for null input
 * @throws NoSuchAlgorithmException
 *             if the MD5 algorithm is unavailable
 * @throws UnsupportedEncodingException
 *             if the UTF-8 encoding is unavailable
 */
public static String text_id(String text) throws NoSuchAlgorithmException, UnsupportedEncodingException {
    if (text == null) {
        return null;
    }
    final MessageDigest hasher = MessageDigest.getInstance("MD5");
    // An explicit encoding keeps the hash reproducible across machines;
    // the platform default charset must not leak into the result.
    final byte[] digest = hasher.digest(text.getBytes("UTF-8"));
    return md5_id(digest);
}
/**
 * Renders a digest as a lower case hexadecimal string, two characters per
 * byte (so a 16-byte MD5 digest yields 32 characters).
 *
 * @param md5digest
 *            byte array; must not be null
 * @return hex string for the data
 */
public static String md5_id(byte[] md5digest) {
    final char[] hexDigits = "0123456789abcdef".toCharArray();
    final char[] out = new char[md5digest.length * 2];
    int pos = 0;
    for (byte b : md5digest) {
        final int unsigned = b & 0xff;
        out[pos++] = hexDigits[unsigned >>> 4];
        out[pos++] = hexDigits[unsigned & 0x0f];
    }
    return new String(out);
}
/**
 * Splits a delimited string into a scrubbed list of values. Each value is
 * trimmed and empty values are dropped.
 *
 * a, b, c d e, f =&gt; [ "a", "b", "c d e", "f" ]
 *
 * @param s
 *            string to split; may be null
 * @param delim
 *            delimiter, no default. It is passed to String.split and is
 *            therefore interpreted as a regular expression.
 * @return list of split strings, which are also whitespace trimmed; null
 *         for null input
 */
public static List<String> string2list(String s, String delim) {
    if (s == null) {
        return null;
    }
    final String[] parts = s.split(delim);
    final List<String> cleaned = new ArrayList<String>(parts.length);
    for (int i = 0; i < parts.length; i++) {
        final String item = parts[i].trim();
        if (item.length() > 0) {
            cleaned.add(item);
        }
    }
    return cleaned;
}
/**
 * Replaces every character of S that appears in a given set with one
 * substitute string and returns the new string, S'.
 *
 * "-name-with.invalid characters;" // replace "-. ;" with "_"
 * "_name_with_invalid_characters_" //
 *
 * @param buf
 *            buffer; must not be null
 * @param replace
 *            string of characters to replace with the one substitute
 * @param substitution
 *            string to insert in place of chars
 * @return scrubbed text
 */
public static String fast_replace(String buf, String replace, String substitution) {
    final StringBuilder out = new StringBuilder();
    for (int i = 0, len = buf.length(); i < len; i++) {
        final char ch = buf.charAt(i);
        if (replace.indexOf(ch) < 0) {
            out.append(ch);
        } else {
            out.append(substitution);
        }
    }
    return out.toString();
}
/**
 * Removes from buf every character that occurs in the remove string.
 *
 * @param buf
 *            text; must not be null
 * @param remove
 *            set of characters to remove
 * @return scrubbed text
 */
public static String removeAny(String buf, String remove) {
    final StringBuilder kept = new StringBuilder();
    for (int i = 0, len = buf.length(); i < len; i++) {
        final char ch = buf.charAt(i);
        final boolean unwanted = remove.indexOf(ch) >= 0;
        if (!unwanted) {
            kept.append(ch);
        }
    }
    return kept.toString();
}
/**
 * Replaces any of the removal chars with the sub: a many-to-one
 * replacement. An alternative would be a regex-based String.replaceAll.
 *
 * @param buf
 *            text; must not be null
 * @param remove
 *            set of characters to replace
 * @param sub
 *            the replacement string
 * @return scrubbed text
 */
public static String replaceAny(String buf, String remove, String sub) {
    final StringBuilder out = new StringBuilder();
    for (int i = 0, len = buf.length(); i < len; i++) {
        final char ch = buf.charAt(i);
        if (remove.indexOf(ch) >= 0) {
            out.append(sub);
        } else {
            out.append(ch);
        }
    }
    return out.toString();
}
/**
 * A left trim that works with any set of characters: the leading run of
 * characters found in the remove string is dropped; everything from the
 * first other character onward is kept untouched.
 *
 * Example: "- a b c" with remove "-" gives " a b c".
 *
 * @param buf
 *            text; must not be null
 * @param remove
 *            set of characters to strip from the left
 * @return scrubbed text
 */
public static String removeAnyLeft(String buf, String remove) {
    final int len = buf.length();
    int start = 0;
    while (start < len && remove.indexOf(buf.charAt(start)) >= 0) {
        start++;
    }
    return buf.substring(start);
}
/**
 * Normalization: clean the ends, remove line-endings from the middle of an
 * entity. Leading and trailing characters that are neither letters nor
 * digits are dropped, then every run of whitespace in what remains is
 * squeezed to one space.
 *
 * <pre>
 *  Example:
 *    TEXT: **The Daily Newsletter of \n\rBarbara, So.**
 *    CLEAN: __The Daily Newsletter of __Barbara, So___
 *
 *  Where "__" represents omitted characters.
 * </pre>
 *
 * Earlier versions lost entities whose text was a single character: "a" and
 * "**a" returned null and "a**" returned an empty string, because the scan
 * stopped one position early without checking whether that position held
 * text. The scan now inspects every position.
 *
 * @param str
 *            text; may be null
 * @return scrubbed text; an empty string for blank input; null when the
 *         input is not blank but contains no letter or digit
 */
public static String normalizeTextEntity(String str) {
    if (isBlank(str)) {
        return "";
    }
    final int end = str.length() - 1;
    // Find the first letter or digit from the left.
    int s1 = 0;
    while (s1 <= end && !Character.isLetterOrDigit(str.charAt(s1))) {
        ++s1;
    }
    // No text found
    if (s1 > end) {
        return null;
    }
    // Find the last letter or digit; position s1 is known to be text, so
    // the scan cannot move past it.
    int s2 = end;
    while (s2 > s1 && !Character.isLetterOrDigit(str.charAt(s2))) {
        --s2;
    }
    if (s1 == 0 && s2 == end) {
        // No cleanup to do on the ends.
        return squeeze_whitespace(str);
    }
    // Some cleanup was done on ends of String. Now clear up whitespace.
    return squeeze_whitespace(str.substring(s1, s2 + 1));
}
/** Splits on any run of whitespace. */
private final static Pattern tokenizer = Pattern.compile("\\s+");
/**
 * Returns just the whitespace-delimited tokens of a text. The text is
 * trimmed first; an empty or all-whitespace text yields one empty token.
 *
 * @param str
 *            text; must not be null
 * @return tokens
 */
public static String[] tokens(String str) {
    final String trimmed = str.trim();
    return tokenizer.split(trimmed);
}
/**
 * Returns the tokens of the right-most part of a buffer. The buffer is
 * split at paragraph breaks (two or more consecutive newlines, each
 * optionally followed by a carriage return) and only the last part is
 * tokenized on whitespace.
 *
 * @param str
 *            text; must not be null
 * @return whitespace delimited tokens; null if the text is empty or
 *         splitting leaves no parts
 */
public static final String[] tokensRight(String str) {
    if (str.isEmpty()) {
        return null;
    }
    final String[] paragraphs = multi_eol2.split(str);
    if (paragraphs.length < 1) {
        return null;
    }
    final String rightmost = paragraphs[paragraphs.length - 1];
    return tokens(rightmost);
}
/**
 * Returns the tokens of the left-most part of a buffer; the mirror image
 * of tokensRight(). The buffer is split at paragraph breaks (two or more
 * consecutive newlines, each optionally followed by a carriage return) and
 * only the first part is tokenized on whitespace.
 *
 * @param str
 *            text; must not be null
 * @return whitespace delimited tokens; null if the text is empty or
 *         splitting leaves no parts
 */
public static final String[] tokensLeft(String str) {
    if (str.isEmpty()) {
        return null;
    }
    final String[] paragraphs = multi_eol2.split(str);
    if (paragraphs.length < 1) {
        return null;
    }
    final String leftmost = paragraphs[0];
    return tokens(leftmost);
}
/**
 * Filter for punctuation within a word: every period is removed. Text of
 * the form A.T.T. or U.S. becomes ATT and US. A text such as Mr.Pibbs
 * incorrectly becomes MrPibbs, which is acceptable for the purpose of
 * normalizing tokens. Tokenize appropriately before using this filter.
 *
 * @param word
 *            phrase with periods denoting some abbreviation; must not be
 *            null
 * @return scrubbed text
 */
public static String normalizeAbbreviation(String word) {
    final StringBuilder compact = new StringBuilder(word.length());
    for (int i = 0, len = word.length(); i < len; i++) {
        final char ch = word.charAt(i);
        if (ch != '.') {
            compact.append(ch);
        }
    }
    return compact.toString();
}
/**
 * Removes diacritics from a phrase (supports the Phoneticizer utility from
 * OpenSextant v1.x). The text is fully decomposed (NFD) and every
 * character in one of the Unicode "Mark" categories is discarded, which
 * leaves the base characters.
 *
 * @param word
 *            text; must not be null
 * @return scrubbed text
 */
public static String removeDiacritics(String word) {
    // Decompose first, so accents become separate combining characters.
    final String decomposed = Normalizer.normalize(word, Normalizer.Form.NFD);
    final StringBuilder base = new StringBuilder();
    for (int i = 0, len = decomposed.length(); i < len; i++) {
        final char c = decomposed.charAt(i);
        switch (Character.getType(c)) {
        case Character.NON_SPACING_MARK:
        case Character.COMBINING_SPACING_MARK:
        case Character.ENCLOSING_MARK:
            // A mark: drop it.
            break;
        default:
            base.append(c);
            break;
        }
    }
    return base.toString();
}
/**
 * Normalizes text to "Normalization Form Canonical Decomposition" (NFD).
 * REF:
 * http://stackoverflow.com/questions/3610013/file-listfiles-mangles-unicode-names-with-jdk-6-unicode-normalization-issues
 *
 * This supports proper file name retrieval from the file system, among
 * other things. Java can list Unicode file names, but the OS/FS may not be
 * able to find a file by the name Java provided if that name is in a
 * different normalized form.
 *
 * @param str
 *            text; must not be null
 * @return the NFD form of the text; the input itself when it is already
 *         normalized
 */
public static String normalizeUnicode(String str) {
    final Normalizer.Form nfd = Normalizer.Form.NFD;
    return Normalizer.isNormalized(str, nfd) ? str : Normalizer.normalize(str, nfd);
}
/**
 * Matches the trailing run of characters that are neither letters nor
 * decimal digits.
 */
final static Pattern CLEAN_WORD_RIGHT = Pattern.compile("[^\\p{L}\\p{Nd}]+$");
/**
 * Matches the leading run of characters that are neither letters nor
 * decimal digits.
 */
final static Pattern CLEAN_WORD_LEFT = Pattern.compile("^[^\\p{L}\\p{Nd}]+");
/**
 * Internal punctuation to drop, including the Unicode single quotes:
 *
 * <pre>
 *  char  hex   unicode name
 *  "     22    QUOTATION MARK
 *  '     27    APOSTROPHE
 *  .     2e    FULL STOP
 *  `     60    GRAVE ACCENT
 *  ´     b4    ACUTE ACCENT
 *  ‘     2018  LEFT SINGLE QUOTATION MARK
 *  ’     2019  RIGHT SINGLE QUOTATION MARK
 * </pre>
 */
final static Pattern CLEAN_WORD_PUNCT = Pattern.compile("[\"'.`\\u00B4\\u2018\\u2019]");
/**
 * Removes leading and trailing punctuation and some internal punctuation
 * from a phrase (Phoneticizer utility from OpenSextant v1.x). Internal
 * punctuation that joins two tokens, e.g. a hyphen, is left alone; it
 * should have caused a split into separate tokens at the tokenization
 * stage.
 *
 * @param word
 *            text; must not be null
 * @return scrubbed text, trimmed
 */
public static String removePunctuation(String word) {
    // Blank out the non-text runs on both ends; the final trim removes them.
    final String leftCleaned = CLEAN_WORD_LEFT.matcher(word).replaceAll(" ");
    final String endsCleaned = CLEAN_WORD_RIGHT.matcher(leftCleaned).replaceAll(" ");
    // Then delete quotes, periods and accent marks inside the phrase.
    final String stripped = CLEAN_WORD_PUNCT.matcher(endsCleaned).replaceAll("");
    return stripped.trim();
}
// Roughly alphabetic list of top-N languages, identified by two-letter
// ISO-639-1 ("ISO2") style language codes.
// NOTE(review): "zt" (Traditional Chinese) does not look like an assigned
// ISO 639-1 code; it appears to be a project-specific ID -- confirm.
//
public final static String arabicLang = "ar";
public final static String bahasaLang = "id";
public final static String chineseLang = "zh";
public final static String chineseTradLang = "zt";
public final static String englishLang = "en";
public final static String farsiLang = "fa";
public final static String frenchLang = "fr";
public final static String germanLang = "de";
public final static String italianLang = "it";
public final static String japaneseLang = "ja";
public final static String koreanLang = "ko";
public final static String portugueseLang = "pt";
public final static String russianLang = "ru";
public final static String spanishLang = "es";
public final static String turkishLang = "tr";
public final static String thaiLang = "th";
public final static String vietnameseLang = "vi";
public final static String romanianLang = "ro";
// Lookup table from language key to Language object. addLanguage() files
// each language under its code, its ISO-639-1 code and its name code.
private final static Map<String, Language> languageMapISO639 = new HashMap<String, Language>();
/*
 * Initialize some language metadata when the class is loaded. A failure is
 * printed to standard error and otherwise ignored, which leaves the
 * language map with whatever was loaded before the failure.
 */
static {
try {
// initLanguageData(); // Barely useful -- this pulls out lang
// Locales.
initLOCLanguageData(); // LOC language data is a list of all known
// languages w/ISO codes.
// initICULanguageData(); ICU did not seem to be the right solution.
} catch (Exception err) {
err.printStackTrace();
}
}
/**
 * Exposes the language lookup table. The map returned is the internal,
 * mutable map itself, not a copy, so callers can add languages to it.
 *
 * @return map of lang ID to language obj
 */
public static Map<String, Language> getLanguageMap() {
return languageMapISO639;
}
/**
 * Initializes language codes and metadata from the locales available to
 * the JVM. For every locale a Language is built from its ISO3 language
 * code, its language code and its display name, and registered without
 * overriding existing entries.
 *
 * <pre>
 * Based on:
 * http://stackoverflow.com/questions/674041/is-there-an-elegant-way
 * -to-convert-iso-639-2-3-letter-language-codes-to-java-lo
 *
 * Actual code mappings: en =&gt; eng eng =&gt; en
 *
 * cel =&gt; '' // Celtic; Avoid this.
 *
 * tr =&gt; tur tur =&gt; tr
 *
 * Names: tr =&gt; turkish tur =&gt; turkish turkish =&gt; tr // ISO2 only
 *
 * </pre>
 */
public static void initLanguageData() {
    for (Locale locale : Locale.getAvailableLocales()) {
        final String iso3 = locale.getISO3Language();
        final String iso2 = locale.getLanguage();
        final String displayName = locale.getDisplayLanguage();
        addLanguage(new Language(iso3, iso2, displayName));
    }
}
/**
 * Loads the Library of Congress data for language IDs. This is offered as a
 * tool to help downstream language ID and enrich metadata when tagging data
 * from particular countries.
 *
 * The classpath resource /ISO-639-2_utf-8.txt is read as UTF-8,
 * pipe-delimited rows of the form ISO3|XX|ISO2|NAME|NAME_FR. Rows without a
 * usable name, the header row and rows whose first column starts with "#"
 * are skipped. For every other row the first of the semicolon-separated
 * names is used. After loading, a few popular alternate codes and locale
 * IDs are registered.
 *
 * Reference: http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
 *
 * @throws java.io.IOException
 *             if resource file is not found or cannot be read
 */
public static void initLOCLanguageData() throws java.io.IOException {
    //
    // DATA FILE: http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    java.io.InputStream io = TextUtils.class.getResourceAsStream("/ISO-639-2_utf-8.txt");
    if (io == null) {
        // getResourceAsStream signals a missing resource with null; report
        // it as the documented IOException instead of a later NPE.
        throw new IOException("Resource /ISO-639-2_utf-8.txt not found on classpath");
    }
    CsvListReader langReader = null;
    try {
        java.io.Reader featIO = new InputStreamReader(io, "UTF-8");
        langReader = new CsvListReader(featIO, new CsvPreference.Builder('"', '|', "\n").build());
        CellProcessor[] cells = { new Optional(), new Optional(), new Optional(), new Optional(), new NotNull() };
        List<Object> lang = null;
        /*
         * ISO3,XX,ISO2,NAME,NAME_FR
         */
        while ((lang = langReader.read(cells)) != null) {
            String names = (String) lang.get(3);
            if (isBlank(names)) {
                continue;
            }
            if ("NAME".equals(names)) {
                continue;
            }
            List<String> namelist = TextUtils.string2list(names, ";");
            if (namelist.isEmpty()) {
                // Only delimiters and whitespace: no usable name.
                continue;
            }
            // The code columns are optional and may be read as null.
            String iso3 = (String) lang.get(0);
            if (iso3 != null && iso3.startsWith("#")) {
                continue;
            }
            String iso2 = (String) lang.get(2);
            Language l = new Language(iso3, iso2, namelist.get(0));
            addLanguage(l);
        }
    } finally {
        // Release the resource even when reading fails part-way.
        if (langReader != null) {
            langReader.close();
        } else {
            io.close();
        }
    }
    // Popular languages that go by other codes.
    // ISO languages as listed by LOC are listed with Bibliographic vs.
    // Terminological codes.
    // FRE vs. FRA are subtle difference for French, but important if you
    // cannot find French by lang ID.
    //
    // Fully override French and Trad Chinese:
    Language fr = new Language("fra", "fr", "French");
    addLanguage(fr, true);
    Language zhtw = new Language("zh-tw", "zt", "Chinese/Taiwain");
    addLanguage(zhtw, true);
    // Delicately insert more common names and codes as well as locales
    // here.
    Language zh = new Language("zho", "zh", "Chinese");
    languageMapISO639.put("zho", zh);
    Language zhcn = new Language("chi", "zh", "Chinese");
    languageMapISO639.put("zh-cn", zhcn);
    Language fas = new Language("per", "fa", "Farsi");
    languageMapISO639.put("farsi", fas);
    // Locales of English -- are still "English"
    Language en1 = new Language("eng", "en", "English");
    languageMapISO639.put("en-gb", en1);
    languageMapISO639.put("en-us", en1);
    languageMapISO639.put("en-au", en1);
}
/**
 * Registers a language without overriding entries that already exist.
 *
 * @param lg
 *            language object; null is ignored
 */
public static void addLanguage(Language lg) {
    addLanguage(lg, false);
}
/**
 * Extends the basic language dictionary. The language is filed under up to
 * three keys: its code, its ISO-639-1 code and its name code; keys that
 * are null are skipped. Entries under the two code keys are replaced only
 * when override is set. An entry under the name key is never replaced, so
 * the first language registered under a name keeps it.
 *
 * For example, fre = French(fre), fra = French(fra), and french =
 * French(fra); the last one, 'french', could have been French(fre) or
 * French(fra).
 *
 * Example, 'ger' and 'deu' are both valid ISO 3-alpha codes for German.
 * What to do?
 *
 * TODO: Create a language object that lists both language
 * biblio/terminology codes.
 *
 * @param lg
 *            language object; null is ignored
 * @param override
 *            if this value should overwrite an existing one.
 */
public static void addLanguage(Language lg, boolean override) {
    if (lg == null) {
        return;
    }
    final String code = lg.getCode();
    if (code != null && (override || !languageMapISO639.containsKey(code))) {
        languageMapISO639.put(code, lg);
    }
    final String iso2 = lg.getISO639_1_Code();
    if (iso2 != null && (override || !languageMapISO639.containsKey(iso2))) {
        languageMapISO639.put(iso2, lg);
    }
    final String nameKey = lg.getNameCode();
    if (nameKey != null && !languageMapISO639.containsKey(nameKey)) {
        languageMapISO639.put(nameKey, lg);
    }
}
/**
 * Given a language code (ISO2 being the least common denominator),
 * retrieves the language name. This is best effort: the lookup rules are
 * those of getLanguage(String), and null is returned when nothing is
 * found.
 *
 * @param code
 *            lang ID; may be null
 * @return name of language, or null if the code is null or unknown
 */
public static String getLanguageName(String code) {
    if (code == null) {
        return null;
    }
    final Language found = getLanguage(code);
    if (found == null) {
        return null;
    }
    return found.getName();
}
/**
 * Looks up a language by one of its keys (ISO2 or ISO3 code, name code or
 * a registered locale ID). The key is lower-cased before the lookup. If
 * nothing is found and the key contains an underscore, as in "en_US", the
 * part before the first underscore is tried as well.
 *
 * Lower-casing uses Locale.ROOT so that the result does not depend on the
 * default locale of the JVM; under a Turkish default locale, for example,
 * "IT" would otherwise not become "it". The prefix is taken with
 * substring rather than split, so a key such as "_" yields no match
 * instead of an array index error.
 *
 * @param code
 *            iso2 or iso3 code; may be null
 * @return the matching Language, or null if the code is null or unknown.
 */
public static Language getLanguage(String code) {
    if (code == null) {
        return null;
    }
    String lookup = code.toLowerCase(Locale.ROOT);
    Language l = languageMapISO639.get(lookup);
    if (l != null) {
        return l;
    }
    // Keep looking: try the language part of a "lang_REGION" style ID.
    final int underscore = lookup.indexOf('_');
    if (underscore >= 0) {
        lookup = lookup.substring(0, underscore);
        return languageMapISO639.get(lookup);
    }
    return null;
}
/**
 * Resolve a language identifier to the code of the matching Language.
 *
 * The identifier is resolved through {@link #getLanguage(String)}; the
 * result is that language's {@code getCode()} value.
 *
 * @param code
 *            language code to resolve
 * @return the code of the matching language, or null if the input is null
 *         or not recognized
 */
public static String getLanguageCode(String code) {
    if (code == null) {
        return null;
    }
    final Language match = getLanguage(code);
    return (match == null) ? null : match.getCode();
}
/**
 * Tests a language ID against the Romance language constants (Spanish,
 * Portuguese, Italian, French, Romanian).
 *
 * @param l
 *            language ID as compared against the *Lang constants; may be null
 * @return true if the ID equals one of the Romance language constants
 */
private static boolean _isRomanceLanguage(String l) {
    // Callers pass the ISO 639-1 code of a Language, which may be absent.
    if (l == null) {
        return false;
    }
    return (l.equals(spanishLang) || l.equals(portugueseLang) || l.equals(italianLang) || l.equals(frenchLang)
            || l.equals(romanianLang));
}
/**
 * European languages = Romance + GER + ENG. Extend definition as needed.
 *
 * The language is resolved with {@link #getLanguage(String)} and its
 * ISO 639-1 code is compared against the known language constants.
 *
 * @param l
 *            language ID
 * @return true if language is European in nature; false if the language
 *         is unknown or has no ISO 639-1 code
 */
public static boolean isEuroLanguage(String l) {
    Language lang = getLanguage(l);
    if (lang == null) {
        return false;
    }
    String id = lang.getISO639_1_Code();
    // A language without an ISO 639-1 code cannot match any constant.
    if (id == null) {
        return false;
    }
    return (_isRomanceLanguage(id) || id.equals(germanLang) || id.equals(englishLang));
}
/**
 * Romance languages = SPA + POR + ITA + FRA + ROM
 *
 * Extend definition as needed.
 *
 * @param l
 *            lang ID
 * @return true if language is a Romance language; false if the language
 *         is unknown or has no ISO 639-1 code
 */
public static boolean isRomanceLanguage(String l) {
    Language lang = getLanguage(l);
    if (lang == null) {
        return false;
    }
    String id = lang.getISO639_1_Code();
    // A language without an ISO 639-1 code cannot match any constant.
    if (id == null) {
        return false;
    }
    return _isRomanceLanguage(id);
}
/**
 * Utility method to check if lang ID is English. The ID is resolved with
 * {@link #getLanguage(String)}, so any key mapped to the English language
 * entry qualifies.
 *
 * @param x
 *            a langcode
 * @return whether langcode is english; false if the language is unknown or
 *         has no ISO 639-1 code
 */
public static boolean isEnglish(String x) {
    Language lang = getLanguage(x);
    if (lang == null) {
        return false;
    }
    String id = lang.getISO639_1_Code();
    // Guard against languages registered without an ISO 639-1 code.
    return (id != null && id.equals(englishLang));
}
/**
 * Utility method to check if lang ID is Chinese (Traditional or
 * Simplified).
 *
 * @param x
 *            a langcode
 * @return whether langcode is chinese; false if the language is unknown or
 *         has no ISO 639-1 code
 */
public static boolean isChinese(String x) {
    Language lang = getLanguage(x);
    if (lang == null) {
        return false;
    }
    String id = lang.getISO639_1_Code();
    // Guard against languages registered without an ISO 639-1 code.
    if (id == null) {
        return false;
    }
    return (id.equals(chineseLang) || id.equals(chineseTradLang));
}
/**
 * Tells whether a language ID denotes Chinese, Japanese or Korean.
 *
 * The ID is resolved with {@link #getLanguage(String)}; the decision is
 * made on the ISO 639-1 code of the resolved language.
 *
 * @param x
 *            a langcode
 * @return whether langcode is a CJK language; false when the language is
 *         unknown or its ISO 639-1 code is blank
 */
public static boolean isCJK(String x) {
    final Language resolved = getLanguage(x);
    if (resolved == null) {
        return false;
    }

    final String iso1 = resolved.getISO639_1_Code();
    if (isBlank(iso1)) {
        return false;
    }

    if (iso1.equals(koreanLang) || iso1.equals(japaneseLang)) {
        return true;
    }
    return iso1.equals(chineseLang) || iso1.equals(chineseTradLang);
}
/**
 * Returns a ratio of Chinese/Japanese/Korean characters: CJK chars / ALL
 * chars, where the count comes from {@link #countCJKChars(char[])} and the
 * total is the string length.
 *
 * TODO: needs testing; not sure if this is sustainable or comprehensive.
 *
 * @param buf
 *            the text to be measured
 * @return ratio of CJK characters to all characters; -1.0 if buf is null,
 *         0.0 if buf is empty
 */
public static double measureCJKText(String buf) {
    if (buf == null) {
        return -1.0;
    }
    // Empty text has no CJK content; avoid 0/0, which would yield NaN.
    if (buf.isEmpty()) {
        return 0.0;
    }
    int cjkCount = countCJKChars(buf.toCharArray());
    return ((double) cjkCount) / buf.length();
}
// Characters below this value are skipped by the CJK scans without a
// Unicode block lookup.
private static final int LATIN1_END = 0xFE;

/**
 * Counts the CJK characters in the given buffer. Inspiration:
 * http://stackoverflow.com/questions/1499804/how-can-i-detect-japanese-text-in-a-java-string
 *
 * Each char is classified by its Unicode block using
 * {@link #isCJK(Character.UnicodeBlock)}. The block test is inlined in the
 * loop rather than delegated per character to a char-level helper.
 *
 * @param chars
 *            char array for the text in question.
 * @return count of CJK characters
 */
public static int countCJKChars(char[] chars) {
    int total = 0;
    for (int i = 0; i < chars.length; ++i) {
        final char c = chars[i];
        // ASCII and Latin-1 are never CJK: only look up the block for the rest.
        if (c >= LATIN1_END && isCJK(Character.UnicodeBlock.of(c))) {
            total++;
        }
    }
    return total;
}
/**
 * A quick test for the presence of any CJK character. Scanning stops at
 * the first character whose Unicode block is CJK.
 *
 * @param buf
 *            text; null yields false
 * @return if buf has at least one CJK char.
 */
public static boolean hasCJKText(String buf) {
    if (buf == null) {
        return false;
    }

    final int size = buf.length();
    int pos = 0;
    while (pos < size) {
        final char c = buf.charAt(pos++);
        // ASCII and Latin-1 are skipped without a block lookup.
        if (c >= LATIN1_END && isCJK(Character.UnicodeBlock.of(c))) {
            return true;
        }
    }
    return false;
}
/**
 * Tests whether a Unicode block belongs to the Chinese/CJK, Japanese or
 * Korean groups, as defined by the per-script block tests in this class.
 *
 * @param blk
 *            a Java Unicode block
 * @return true if the block is accepted by the Chinese, Japanese or Korean
 *         block test
 */
public static boolean isCJK(Character.UnicodeBlock blk) {
    // Chinese/CJK group first, then the script-specific blocks.
    if (isChinese(blk)) {
        return true;
    }
    if (isJapanese(blk)) {
        return true;
    }
    return isKorean(blk);
}
/**
 * Tests whether a Unicode block is part of the Chinese/CJK group used by
 * this class: the CJK unified ideographs and extensions A-D, CJK
 * compatibility forms and ideographs, CJK radicals, symbols and enclosed
 * letters, Kangxi radicals, Yi, Bopomofo and Kanbun.
 *
 * @param blk
 *            a Java Unicode block
 * @return true if the block is one of the listed blocks
 */
public static boolean isChinese(Character.UnicodeBlock blk) {
    // Unified ideographs and their extensions.
    if (blk == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
            || blk == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
            || blk == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
            || blk == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C
            || blk == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D) {
        return true;
    }

    // Compatibility blocks, radicals, symbols and punctuation.
    if (blk == Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS
            || blk == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
            || blk == Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT
            || blk == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
            || blk == Character.UnicodeBlock.ENCLOSED_CJK_LETTERS_AND_MONTHS
            || blk == Character.UnicodeBlock.KANGXI_RADICALS) {
        return true;
    }

    // Yi syllables and radicals.
    if (blk == Character.UnicodeBlock.YI_SYLLABLES || blk == Character.UnicodeBlock.YI_RADICALS) {
        return true;
    }

    // Bopomofo and Kanbun annotation marks.
    return blk == Character.UnicodeBlock.BOPOMOFO
            || blk == Character.UnicodeBlock.BOPOMOFO_EXTENDED
            || blk == Character.UnicodeBlock.KANBUN;
}
/**
 * Detects whether a character block is uniquely Hangul.
 *
 * Text may also be Korean when it uses the CJK ideograph blocks at large;
 * this method does not cover that case. Callers must check separately
 * whether the text as a whole is CJK and Hangul.
 *
 * @param blk
 *            a Java Unicode block
 * @return true if char block is Hangul
 */
public static boolean isKorean(Character.UnicodeBlock blk) {
    // Hangul syllables.
    if (blk == Character.UnicodeBlock.HANGUL_SYLLABLES) {
        return true;
    }
    // Jamo and compatibility Jamo.
    if (blk == Character.UnicodeBlock.HANGUL_JAMO || blk == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO) {
        return true;
    }
    // Extended Jamo blocks.
    return blk == Character.UnicodeBlock.HANGUL_JAMO_EXTENDED_A
            || blk == Character.UnicodeBlock.HANGUL_JAMO_EXTENDED_B;
}
/**
 * Checks if a char block is uniquely Japanese, i.e. one of the kana
 * blocks. Ideographs shared with Chinese are covered by the Chinese/CJK
 * block test instead.
 *
 * @param blk
 *            a Java Unicode block
 * @return true if char block is Hiragana or Katakana (including the
 *         Katakana phonetic extensions)
 */
public static boolean isJapanese(Character.UnicodeBlock blk) {
    if (blk == Character.UnicodeBlock.HIRAGANA) {
        return true;
    }
    if (blk == Character.UnicodeBlock.KATAKANA) {
        return true;
    }
    return blk == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS;
}
/**
 * Compress the bytes of a Unicode string. The text is converted to UTF-8
 * bytes first, to avoid unicode or platform-dependent IO issues, and then
 * compressed by {@link #compress(String, String)}.
 *
 * @param buf
 *            text to compress; it is encoded as UTF-8
 * @return byte array holding the compressed result
 * @throws IOException
 *             on error with compression or text encoding
 */
public static byte[] compress(String buf) throws IOException {
    final String encoding = "UTF-8";
    return compress(buf, encoding);
}
/**
 * Encode text with the given character set and GZIP-compress the bytes.
 *
 * @param buf
 *            text
 * @param charset
 *            character set encoding for text
 * @return byte array for the compressed result
 * @throws IOException
 *             on error with compression or text encoding
 */
public static byte[] compress(String buf, String charset) throws IOException {
    // Encode before opening any stream, so an unsupported charset fails
    // without leaving a GZIP stream open.
    final byte[] raw = buf.getBytes(charset);
    final ByteArrayOutputStream out = new ByteArrayOutputStream();
    // try-with-resources closes the GZIP stream even if the write fails;
    // close() also writes the GZIP trailer, so it must finish before
    // the bytes are read back.
    try (GZIPOutputStream gz = new GZIPOutputStream(out)) {
        gz.write(raw);
    }
    return out.toByteArray();
}
/**
 * Decompress a gzipped buffer and decode the result as UTF-8 text, by way
 * of {@link #uncompress(byte[], String)}.
 *
 * @param gzData
 *            byte array containing gzipped buffer
 * @return buffer UTF-8 decoded string
 *
 * @throws IOException
 *             on error with decompression or text encoding
 */
public static String uncompress(byte[] gzData) throws IOException {
    final String encoding = "UTF-8";
    return uncompress(gzData, encoding);
}
// Size of the read buffer used while inflating GZIP data.
private final static int ONEKB = 1024;

/**
 * Decompress a gzipped buffer and decode the resulting bytes as text.
 *
 * @param gzData
 *            byte array containing gzipped buffer
 * @param charset
 *            character set decoding for text
 * @return buffer of uncompressed, decoded string
 * @throws IOException
 *             on error with decompression or text encoding
 */
public static String uncompress(byte[] gzData, String charset) throws IOException {
    final ByteArrayOutputStream out = new ByteArrayOutputStream();
    // try-with-resources closes the GZIP stream even when a read fails
    // part-way through corrupt data.
    try (GZIPInputStream gzipInputStream = new GZIPInputStream(new ByteArrayInputStream(gzData))) {
        final byte[] buf = new byte[ONEKB];
        int len;
        while ((len = gzipInputStream.read(buf)) > 0) {
            out.write(buf, 0, len);
        }
    }
    return new String(out.toByteArray(), charset);
}
/**
 * Unicode and social media -- We encounter all sorts of hangups when
 * processing modern unicode text: XML issues, JNI issues, escape utilities,
 * etc. All sorts of problems arise with emoticons, aka emoji, and other
 * symbols used in online media. So these utilities are offered to help
 * remove such things prior to data processing.
 *
 * Each pattern below matches a run of one or more consecutive characters
 * from a single named Unicode block. They are applied by removeEmoticons()
 * and removeSymbols().
 */
// UnicodeBlock.MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS;
private static final Pattern SCRUB_SYM = Pattern.compile("\\p{block=Miscellaneous Symbols And Pictographs}+");
// Transport and map symbols block.
private static final Pattern SCRUB_SYM2 = Pattern.compile("\\p{block=Transport and Map Symbols}+");
// Emoticons block; the only pattern used by removeEmoticons().
private static final Pattern SCRUB_EMOTICONS = Pattern.compile("\\p{block=Emoticons}+");
// Enclosed alphanumeric supplement block.
private static final Pattern SCRUB_ALPHASUP = Pattern.compile("\\p{block=Enclosed Alphanumeric Supplement}+");
// Game tile blocks: Mahjong and Domino.
private static final Pattern SCRUB_TILES1 = Pattern.compile("\\p{block=Mahjong Tiles}+");
private static final Pattern SCRUB_TILES2 = Pattern.compile("\\p{block=Domino Tiles}+");
// Miscellaneous symbols block (distinct from the pictographs block above).
private static final Pattern SCRUB_SYM_MISC = Pattern.compile("\\p{block=Miscellaneous Symbols}+");
// Playing cards block.
private static final Pattern SCRUB_PLAYCARDS = Pattern.compile("\\p{block=Playing Cards}+");
/**
 * Replace emoticons with something less nefarious -- these characters do
 * not play well with some I/O routines. Every run of consecutive
 * characters from the Emoticons Unicode block is replaced by the literal
 * marker <code>{icon}</code>.
 *
 * @param t
 *            text; may be null
 * @return scrubbed text, or null if the input was null
 */
public static String removeEmoticons(String t) {
    // Null-tolerant, like the other text tests in this class.
    if (t == null) {
        return null;
    }
    return SCRUB_EMOTICONS.matcher(t).replaceAll("{icon}");
}
/**
 * Replace symbology. Runs of characters from several symbol-oriented
 * Unicode blocks are each replaced by a short literal marker:
 * pictographs and miscellaneous symbols by <code>{sym}</code>, transport
 * and map symbols by <code>{sym2}</code>, enclosed alphanumeric supplement
 * by <code>{asup}</code>, Mahjong and Domino tiles by <code>{tile1}</code>
 * and <code>{tile2}</code>, and playing cards by <code>{card}</code>.
 *
 * @param t
 *            text; may be null
 * @return scrubbed text, or null if the input was null
 */
public static String removeSymbols(String t) {
    // Null-tolerant, like the other text tests in this class.
    if (t == null) {
        return null;
    }
    // The passes are applied one after another to the running result.
    String scrubbed = SCRUB_SYM.matcher(t).replaceAll("{sym}");
    scrubbed = SCRUB_SYM2.matcher(scrubbed).replaceAll("{sym2}");
    scrubbed = SCRUB_ALPHASUP.matcher(scrubbed).replaceAll("{asup}");
    scrubbed = SCRUB_TILES1.matcher(scrubbed).replaceAll("{tile1}");
    scrubbed = SCRUB_TILES2.matcher(scrubbed).replaceAll("{tile2}");
    scrubbed = SCRUB_SYM_MISC.matcher(scrubbed).replaceAll("{sym}");
    scrubbed = SCRUB_PLAYCARDS.matcher(scrubbed).replaceAll("{card}");
    return scrubbed;
}
/**
 * Count the number of non-text chars present, i.e. chars that are neither
 * a letter, nor a digit, nor whitespace.
 *
 * The count is per UTF-16 char, so a character encoded as a surrogate
 * pair contributes two.
 *
 * @param t
 *            text to inspect; may be null
 * @return number of chars that are not letters, digits or whitespace;
 *         0 for null or empty text
 */
public static int countNonText(final String t) {
    if (t == null) {
        return 0;
    }
    int nonText = 0;
    for (char c : t.toCharArray()) {
        // All three tests must be negated: a char counts only when it is
        // none of letter, digit or whitespace.
        if (!Character.isLetter(c) && !Character.isDigit(c) && !Character.isWhitespace(c)) {
            ++nonText;
        }
    }
    return nonText;
}
}