/**
* Copyright 2014, Emory University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.clir.clearnlp.util;
import java.util.ArrayList;
import java.util.List;
import edu.emory.clir.clearnlp.util.constant.CharConst;
import edu.emory.clir.clearnlp.util.constant.MetaConst;
import edu.emory.clir.clearnlp.util.constant.StringConst;
/**
* @since 3.0.0
* @author Jinho D. Choi ({@code jinho.choi@emory.edu})
*/
public class StringUtils
{
private StringUtils() {}
static public int getLCSLength(String[] x, String[] y)
{
final int M = x.length;
final int N = y.length;
int[][] counts = new int[M+1][N+1];
int i, j;
for (i=1; i<=M; i++)
for (j=1; j<=N; j++)
counts[i][j] = (x[i-1].equals(y[j-1])) ? counts[i-1][j-1] + 1 : Math.max(counts[i][j-1], counts[i-1][j]);
return counts[M][N];
}
/** @return the specific number of spaces. */
static public String spaces(int length)
{
StringBuilder build = new StringBuilder();
int i;
for (i=0; i<length; i++)
build.append(StringConst.SPACE);
return build.toString();
}
static public boolean startsWithAny(String str, String... suffixes)
{
for (String suffix : suffixes)
{
if (str.startsWith(suffix))
return true;
}
return false;
}
static public boolean endsWithAny(String str, String... suffixes)
{
for (String suffix : suffixes)
{
if (str.endsWith(suffix))
return true;
}
return false;
}
static public String trim(String s, int trimSize)
{
return s.substring(0, s.length()-trimSize);
}
// ====================================== Conversion ======================================
/**
* This method converts characters in [128, 256) correctly where {@link String#toUpperCase()} doesn't.
* About 2+ times faster than {@link String#toUpperCase()}.
*/
static public String toUpperCase(String s)
{
char[] array = s.toCharArray();
boolean b = CharUtils.toUpperCase(array);
return b ? new String(array) : s;
}
/**
* This method converts characters in [128, 256) correctly where {@link String#toUpperCase()} doesn't.
* About 2+ times faster than {@link String#toLowerCase()}.
*/
static public String toLowerCase(String s)
{
char[] array = s.toCharArray();
boolean b = CharUtils.toLowerCase(array);
return b ? new String(array) : s;
}
static public String[] toUpperCase(String[] source)
{
int i, size = source.length;
String[] target = new String[size];
for (i=0; i<size; i++)
target[i] = toUpperCase(source[i]);
return target;
}
static public String[] toLowerCase(String[] source)
{
int i, size = source.length;
String[] target = new String[size];
for (i=0; i<size; i++)
target[i] = toLowerCase(source[i]);
return target;
}
// ====================================== Simplify ======================================
/**
* @return a simplified form of the specific word-form.
* @see MetaUtils#containsHyperlink(String)
* @see #collapseDigits(String)
* @see #collapsePunctuation(String)
*/
static public String toSimplifiedForm(String s)
{
if (MetaUtils.endsWithFileExtension(s) || MetaUtils.containsHyperlink(s))
return MetaConst.HYPERLINK;
if (s.length() == 1)
{
char c = s.charAt(0);
if (CharUtils.isCurrency(c)) return StringConst.DOLLAR;
if (CharUtils.isSingleQuotationMark(c)) return StringConst.SINGLE_QUOTE;
if (CharUtils.isDoubleQuotationMark(c)) return StringConst.DOUBLE_QUOTE;
if (CharUtils.isListMark(c) || CharUtils.isHyphen(c)) return StringConst.HYPHEN;
}
s = collapseDigits(s);
s = collapsePunctuation(s);
return s;
}
static public String toLowerCaseSimplifiedForm(String s)
{
return toLowerCase(toSimplifiedForm(s));
}
// ====================================== Collapse ======================================
static public String collapseDigits(String s)
{
StringBuilder build = new StringBuilder();
char[] cs = s.toCharArray();
int i, j, size = cs.length;
char curr, prev = 0;
for (i=0; i<size; i++)
{
i = collapseDigitsAux(cs, i);
curr = cs[i];
if (curr == CharConst.PERCENT)
{
if (CharUtils.isDigit(prev))
continue;
}
else if (CharUtils.isPreDigitSymbol(curr) || curr == CharConst.COMMA || curr == CharConst.COLON || curr == CharConst.FW_SLASH || curr == CharConst.EQUAL)
{
if (i+1 < size && CharUtils.isDigit(cs[j = collapseDigitsAux(cs, i+1)]))
{
if (i == 0)
{
i = j;
curr = cs[i];
}
else if (CharUtils.isDigit(prev))
{
i = j;
continue;
}
}
}
if (CharUtils.isDigit(curr))
{
if (!CharUtils.isDigit(prev))
build.append(CharConst.ZERO);
}
else
build.append(curr);
prev = curr;
}
return build.toString();
}
static private int collapseDigitsAux(char[] cs, int index)
{
char curr = cs[index];
if (curr == CharConst.DOLLAR || curr == CharConst.POUND)
{
if (index+1 < cs.length && CharUtils.isDigit(cs[index+1]))
return index + 1;
}
return index;
}
static public String collapsePunctuation(String s)
{
StringBuilder build = new StringBuilder();
char[] cs = s.toCharArray();
int i, size = cs.length;
for (i=0; i<size; i++)
{
if (i > 1 && CharUtils.isPunctuation(cs[i]) && cs[i] == cs[i-1] && cs[i] == cs[i-2])
continue;
build.append(cs[i]);
}
return (build.length() < size) ? build.toString() : s;
}
static public List<String> stripPunctuation(List<String> tokens)
{
List<String> list = new ArrayList<>();
for (String token : tokens)
{
if (!containsPunctuationOnly(token))
list.add(token);
}
return list;
}
static public List<String> stripPunctuation(String[] tokens)
{
List<String> list = new ArrayList<>();
for (String token : tokens)
{
if (!containsPunctuationOnly(token))
list.add(token);
}
return list;
}
// ====================================== Boolean ======================================
/**
* @return {@code true} if the specific string includes only upper-case characters.
* @see CharUtils#isUpperCase(char).
*/
static public boolean containsUpperCaseOnly(String s)
{
char[] cs= s.toCharArray();
int i, size = cs.length;
for (i=0; i<size; i++)
{
if (!CharUtils.isUpperCase(cs[i]))
return false;
}
return true;
}
/**
* @return {@code true} if the specific string includes only lower-case characters.
* @see CharUtils#isLowerCase(char).
*/
static public boolean containsLowerCaseOnly(String s)
{
char[] cs= s.toCharArray();
int i, size = cs.length;
for (i=0; i<size; i++)
{
if (!CharUtils.isLowerCase(cs[i]))
return false;
}
return true;
}
/**
* @return {@code true} if the specific string contains any digit.
* @see CharUtils#isDigit(char).
*/
static public boolean containsDigit(String s)
{
char[] cs= s.toCharArray();
int i, size = cs.length;
for (i=0; i<size; i++)
{
if (CharUtils.isDigit(cs[i]))
return true;
}
return false;
}
public static boolean containsDigitOnly(String s)
{
return CharUtils.containsDigitOnly(s.toCharArray());
}
static public boolean containsPunctuation(String s)
{
char[] cs= s.toCharArray();
int i, size = cs.length;
for (i=0; i<size; i++)
{
if (CharUtils.isPunctuation(cs[i]))
return true;
}
return false;
}
public static boolean containsPunctuationOnly(String s)
{
return CharUtils.containsPunctuationOnly(s.toCharArray());
}
public static boolean containsPunctuationOrWhiteSpacesOnly(String s)
{
return CharUtils.containsPunctuationOrWhiteSpacesOnly(s.toCharArray());
}
public static boolean containsPunctuationOrDigitsOrWhiteSpacesOnly(String s)
{
return CharUtils.containsPunctuationOrDigitsOrWhiteSpacesOnly(s.toCharArray());
}
public static boolean isDouble(String s)
{
for (char c : s.toCharArray())
{
if (!Character.isDigit(c) && c != '.' && c != '-' && c != '+')
return false;
}
return true;
}
// ====================================== Getters ======================================
static public String[] getPrefixes(String form, int n)
{
int i, length = form.length() - 1;
if (length < n) n = length;
String[] prefixes = new String[n];
for (i=0; i<n; i++)
prefixes[i] = form.substring(0, i+1);
return prefixes;
}
static public String[] getSuffixes(String form, int n)
{
int i, length = form.length() - 1;
if (length < n) n = length;
String[] suffixes = new String[n];
for (i=0; i<n; i++)
suffixes[i] = form.substring(length-i);
return suffixes;
}
static public String getPrefix(String form, int n)
{
return (n < form.length()) ? toLowerCase(form.substring(0, n)) : null;
}
static public String getSuffix(String form, int n)
{
return (n < form.length()) ? toLowerCase(form.substring(form.length()-n)) : null;
}
static public String getShape(String form, int maxRepetitions)
{
StringBuilder build = new StringBuilder();
char curr, prev = CharConst.EMPTY;
char cs[] = form.toCharArray();
int i, len = cs.length;
int repetition = 0;
for (i=0; i<len; i++)
{
curr = cs[i];
if (CharUtils.isUpperCase(curr)) curr = 'A';
else if (CharUtils.isLowerCase(curr)) curr = 'a';
else if (CharUtils.isDigit(curr)) curr = '1';
else if (CharUtils.isPunctuation(curr)) curr = '.';
else curr = 'x';
if (curr == prev)
repetition++;
else
{
prev = curr;
repetition = 0;
}
if (repetition < maxRepetitions)
build.append(curr);
}
return build.toString();
}
}