/* StringUtilRegular.java - realm of regular expressions.
*
* Copyright (c) 2005-2011 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under GNU Public License.
*/
package wikokit.base.wikipedia.util;
import wikokit.base.wikipedia.language.Encodings;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
//import java.util.regex.PatternSyntaxException;
/** Useful string functions based on regular expressions. */
public class StringUtilRegular {
//private static final String table_rus_default = "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя";
/** Default transliteration table; each row is {Latin transliteration, Russian letter}.
 * Upper-case Russian letters are listed first, then the lower-case ones.
 * The Latin strings "~" and "'" each occur twice (upper- and lower-case
 * hard sign and soft sign), so the Latin column does not identify a row uniquely.
 * encodeRussianToLatinitsa() looks rows up by the Russian column.
 */
private static final String[][] table_lat_ru_default =
{
{"A", "А"},
{"B","Б"},
{"V","В"},
{"G","Г"},
{"D","Д"},
{"E","Е"},
{"Yo","Ё"},
{"Zh","Ж"},
{"Z","З"},
{"I","И"}, //И"},
{"J","Й"},
{"K","К"},
{"L","Л"},
{"M","М"},
{"N","Н"},
{"O","О"},
{"P","П"},
{"R","Р"},
{"S","С"},
{"T","Т"},
{"U","У"},
{"F","Ф"},
{"X","Х"},
{"C","Ц"},
{"Ch","Ч"},
{"Sh","Ш"},
{"W","Щ"},
{"~","Ъ"},
{"Y","Ы"},
{"'","Ь"},
{"E'","Э"},
{"Yu","Ю"},
{"Ya","Я"},
{"a","а"},
{"b","б"},
{"v","в"},
{"g","г"},
{"d","д"},
{"e","е"},
{"yo","ё"},
{"zh","ж"},
{"z","з"},
{"i","и"},
{"j","й"},
{"k","к"},
{"l","л"},
{"m","м"},
{"n","н"},
{"o","о"},
{"p","п"},
{"r","р"},
{"s","с"},
{"t","т"},
{"u","у"},
{"f","ф"},
{"x","х"},
{"c","ц"},
{"ch","ч"},
{"sh","ш"},
{"w","щ"},
{"~","ъ"},
{"y","ы"},
{"'","ь"},
{"e'","э"},
{"yu","ю"},
{"ya","я"}
};
/** Creates an instance; nothing is initialized here
 * (every field and method visible in this file is static). */
public StringUtilRegular() {
}
/** Matches a whole string and captures (group 1) its content without the
 * leading and trailing non-word characters. Word characters are Unicode
 * letters, marks, digits and "_"; the ASCII-only class \W is deliberately
 * not used, because it treats e.g. Cyrillic letters as non-word characters.
 */
private static final Pattern ptrn_non_word_edges = Pattern.compile(
        "\\A[^\\p{L}\\p{M}\\p{N}_]*(.+?)[^\\p{L}\\p{M}\\p{N}_]*\\Z");

/** Strips leading and trailing non-word characters from every element of
 * the array "words"; the array is modified in place.
 * E.g. {"\nword1", "\t word-long2\r\n"} -> {"word1", "word-long2"},
 * {"«слово»"} -> {"слово"}.
 *
 * Elements that do not match (e.g. the empty string) and null elements are
 * left unchanged; a null array is ignored.
 *
 * @param words strings to be stripped, updated in place
 */
public static void stripNonWordLetters(String words[]) {
    if (null == words) {
        return;
    }
    for (int i = 0; i < words.length; i++) {
        if (null == words[i]) {
            continue;   // nothing to strip
        }
        Matcher m = ptrn_non_word_edges.matcher(words[i]);
        if (m.find()) {
            words[i] = m.group(1);
        }
    }
}
/** Captures (group 1) the first run of two or more non-whitespace characters
 * that is followed by a whitespace character. The pattern is not anchored,
 * so find() may skip leading whitespace. */
private final static Pattern ptrn_letters_till_space = Pattern.compile(
"(\\S\\S+)\\s");
/** Captures (group 1) a non-empty run of non-hyphen characters which follows
 * optional whitespace at the start of the input and is terminated
 * by a hyphen "-". */
private final static Pattern ptrn_letters_till_hyphen = Pattern.compile(
"^\\s*([^-]+?)-");
/** Captures (group 1) two or more non-whitespace characters (reluctantly),
 * anchored at the very start of the input and terminated by a pipe "|". */
private final static Pattern ptrn_letters_till_pipe = Pattern.compile(
"\\A(\\S\\S+?)\\|");
/** Shared empty string returned by methods of this class when nothing is found. */
private final static String NULL_STRING = new String();
/** Returns the first run of (at least two) non-whitespace characters which
 * is followed by a whitespace character.
 * E.g. "word1 " -> "word1", "\t word-long2\r\n" -> "word-long2"
 *
 * @param text source text
 * @return the found characters, or an empty string if there is no match
 */
public static String getLettersTillSpace(String text) {
    final Matcher till_space = ptrn_letters_till_space.matcher(text);
    return till_space.find() ? till_space.group(1) : NULL_STRING;
}
/** Gets first letters till space " ", hyphen "-" or pipe "|", whichever
 * gives the shortest string.
 * E.g. "word1 " -> "word1", "\t word-long2\r\n" -> "word"
 * (the text before the hyphen is shorter than "word-long2").
 * This function is used by WPOSRu.guessPOS().
 *
 * @param text source text
 * @return the shortest of the found prefixes; if two prefixes have the same
 *         length then the order of preference is space, hyphen, pipe;
 *         an empty string if none of the three patterns matches
 */
public static String getLettersTillSpaceHyphenOrPipe(String text) {
    // the order matters: on equal lengths the earlier pattern wins
    final Pattern[] delimiters = {
        ptrn_letters_till_space,
        ptrn_letters_till_hyphen,
        ptrn_letters_till_pipe
    };

    String shortest = null;
    for (Pattern delimiter : delimiters) {
        Matcher found = delimiter.matcher(text);
        if (!found.find()) {
            continue;
        }
        String candidate = found.group(1);
        if (null == shortest || candidate.length() < shortest.length()) {
            shortest = candidate;
        }
    }
    return (null != shortest) ? shortest : NULL_STRING;
}
/** Matches one character of the Unicode "separator" category (Z), i.e. the
 * different kinds of spaces; it is used instead of the narrower "\s" class.
 * @see <a href="http://stackoverflow.com/questions/1822772/java-regular-expression-to-match-all-whitespace-characters">Java regular expression to match all whitespace characters</a>
 * @see <a href="http://en.wikipedia.org/wiki/Space_%28punctuation%29">Space (punctuation)</a>
 */
private static final Pattern ptrn_whitespace = Pattern.compile("\\p{Z}");

/** Replaces every special space (e.g. a no-break space between name and
 * surname of a quote author) by the usual space " ".
 *
 * @param text source text
 * @return text where each separator character is replaced by " "
 */
public static String replaceComplexSpacesByTrivialSpaces(String text) {
    return ptrn_whitespace.matcher(text).replaceAll(" ");
}
/** Gets first letters (leading whitespace is skipped) till first hyphen "-".
 * E.g. "\t word-long2\r\n" -> "word", "word1 " -> "" (there is no hyphen).
 *
 * @param text source text
 * @return the characters before the first hyphen, or an empty string
 *         if the pattern does not match
 */
public static String getLettersTillHyphen(String text) {
    Matcher till_hyphen = ptrn_letters_till_hyphen.matcher(text);
    if (!till_hyphen.find()) {
        return NULL_STRING;
    }
    return till_hyphen.group(1);
}
/** Encodes the text to latinitsa, e.g.: женьшень -> zhen'shen' (Russian).
 *
 * Each character of the text which is equal to a Russian letter of the table
 * table_lat_ru_default is substituted by the corresponding Latin string,
 * all other characters are copied as is.
 *
 * @param text      source text
 * @param enc_from  first encoding argument of Encodings.FromTo(), which is
 *                  applied to every Russian letter of the table before the
 *                  comparison (presumably the source encoding - see Encodings)
 * @param enc_to    second encoding argument of Encodings.FromTo()
 *                  (presumably the target encoding - see Encodings)
 * @return transliterated text
 */
public static String encodeRussianToLatinitsa (String text, String enc_from, String enc_to) {

    // the Russian column of the table, converted once per call
    final int n_letters = table_lat_ru_default.length;
    String[] russian = new String [n_letters];
    for(int i=0; i<n_letters; i++) {
        russian [i] = Encodings.FromTo(table_lat_ru_default [i][1], enc_from, enc_to);
    }

    // StringBuilder instead of "result += ..." in order to avoid
    // the quadratic copying on long texts
    StringBuilder result = new StringBuilder(text.length());
    for(int i=0; i<text.length(); i++) {
        String c = text.substring(i, i+1);

        String substitute = c;                      // unknown characters are kept
        for(int j=0; j<n_letters; j++) {
            if(c.equals(russian [j])) {             // equals to Russian letter
                substitute = table_lat_ru_default [j][0];   // the same English letter(s)
                break;                              // the first row of the table wins
            }
        }
        result.append(substitute);
    }
    return result.toString();
}
// Wiktionary
/** Matches a header of the 2nd, 3rd, 4th or 5th level, i.e. two to five "="
 * symbols, the header text without "=", again two to five "=" symbols,
 * and the newline symbol at the end. */
private static final Pattern ptrn_2345_level = Pattern.compile(
        "={2,5}\\s*[^=]+\\s*={2,5}\\s*\\n");

/** Gets position of the first header (2nd - 5th level, e.g. "== Header ==")
 * which is found in the text at or after start_pos.
 *
 * @param start_pos position in the text where the search starts
 * @param text      source text
 * @return position of the header in the text (counted from the beginning
 *         of the text), or -1 if the header is absent or start_pos is
 *         out of the text bounds
 */
public static int getFirstHeaderPosition(int start_pos, String text) {
    if(start_pos < 0 || start_pos >= text.length()) {
        return -1;
    }

    // search in the tail of the text, the found offset is relative to start_pos
    String tail = (0 == start_pos) ? text : text.substring(start_pos);
    Matcher header = ptrn_2345_level.matcher(tail);

    return header.find() ? start_pos + header.start() : -1;
}
/** Matches an empty line or a line which consists only of whitespace
 * (multiline mode: "^" and "$" match at line boundaries). */
private static final Pattern ptrn_empty_line = Pattern.compile(
        "^\\s*$", Pattern.MULTILINE);

/** Gets position of the first empty (or whitespace-only) line which is
 * found in the text at or after start_pos.
 *
 * @param start_pos position in the text where the search starts
 * @param text      source text
 * @return position of the empty line in the text (counted from the
 *         beginning of the text), or -1 if there are no empty lines or
 *         start_pos is out of the text bounds
 */
public static int getFirstEmptyLinePosition(int start_pos, String text) {
    if(start_pos < 0 || start_pos >= text.length()) {
        return -1;
    }

    // search in the tail of the text, the found offset is relative to start_pos
    String tail = (0 == start_pos) ? text : text.substring(start_pos);
    Matcher empty_line = ptrn_empty_line.matcher(tail);

    return empty_line.find() ? start_pos + empty_line.start() : -1;
}
/** Gets the part of the text which starts at 'start_pos' and ends just
 * before the first header found at or after 'start_pos'; if there is no
 * header then the text till the end is returned.
 *
 * @param start_pos position in the text where the result starts
 * @param text      source text
 * @return the text fragment, or an empty string if start_pos is out of
 *         the text bounds
 */
public static String getTextTillFirstHeaderPosition(int start_pos, String text) {
    final boolean out_of_text = start_pos < 0 || start_pos >= text.length();
    if(out_of_text) {
        return NULL_STRING;
    }

    final int header_pos = getFirstHeaderPosition(start_pos, text);
    return (-1 == header_pos)
            ? text.substring(start_pos)                 // header is absent
            : text.substring(start_pos, header_pos);
}
/** Gets text from 'start_pos' position till the nearest of:
 * (1) the first header, (2) the first empty line,
 * or (3) till the end of text if both header and empty lines are absent.
 * In the cases (1) and (2) the last newline symbol of the result is chopped.
 *
 * @param start_pos position in the text where the result starts
 * @param text      source text
 * @return the text fragment, or an empty string if start_pos is out of
 *         the text bounds
 */
public static String getTextTillFirstHeaderOrEmptyLine(int start_pos, String text) {
    if(start_pos < 0 || start_pos >= text.length()) {
        return NULL_STRING;
    }

    final int header_pos     = getFirstHeaderPosition   (start_pos, text);
    final int empty_line_pos = getFirstEmptyLinePosition(start_pos, text);

    // end_pos = min(header_pos, empty_line_pos), where -1 means "absent"
    final int end_pos;
    if(-1 == header_pos) {
        end_pos = empty_line_pos;
    } else if(-1 == empty_line_pos) {
        end_pos = header_pos;
    } else {
        end_pos = Math.min(header_pos, empty_line_pos);
    }

    if(-1 == end_pos) {     // neither header nor empty line
        return text.substring(start_pos);
    }
    return substringAndchopLastNewline(text, start_pos, end_pos);
}
/** Gets text substring from 'start_pos' position (inclusive) till 'end_pos'
 * position (exclusive, as in String.substring) and chops the last symbol
 * of the substring if it is the newline \n symbol.
 *
 * @param text      source text
 * @param start_pos index of the first symbol of the result
 * @param end_pos   index after the last symbol of the result,
 *                  it may be equal to the length of the text
 * @return the substring without the trailing newline, or an empty string
 *         if start_pos < 0, start_pos >= end_pos or end_pos > text.length()
 */
public static String substringAndchopLastNewline(String text, int start_pos, int end_pos) {

    // end_pos is exclusive, so the length of the text is a valid value
    if(start_pos < 0 || start_pos >= end_pos || end_pos > text.length()) {
        return NULL_STRING;
    }

    // here end_pos >= 1, since 0 <= start_pos < end_pos
    if('\n' == text.charAt(end_pos-1)) {
        end_pos --;
    }
    return text.substring(start_pos, end_pos);
}
}