package edu.stanford.nlp.semparse.open.ling;
import java.text.Normalizer;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import fig.basic.LogInfo;
/**
 * Static helpers for light-weight string processing: phrase shapes, token
 * collapsing and joining, simple tokenization, text normalization at several
 * levels of aggressiveness, and approximate head-word lookup.
 *
 * The head / last word helpers rely on {@code LingData.get(phrase)}, from which
 * they read {@code length}, {@code tokens}, {@code lemmaTokens}, {@code posTags}
 * and {@code nerTags}.
 */
public class LingUtils {

  // ============================================================
  // Precompiled patterns (compiled once instead of on every call)
  // ============================================================

  /** One or more ASCII digits. */
  private static final Pattern DIGITS = Pattern.compile("[0-9]+");
  /** One or more whitespace characters. */
  private static final Pattern WHITESPACE = Pattern.compile("\\s+");
  /** Combining diacritical marks (U+0300 - U+036F) left over after NFD decomposition. */
  private static final Pattern COMBINING_MARKS = Pattern.compile("[\\u0300-\\u036F]");
  /** Typographic single quotes, acute accent and backtick. */
  private static final Pattern APOSTROPHE_LIKE = Pattern.compile("[‘’´`]");
  /** Typographic double quotes and guillemets. */
  private static final Pattern DOUBLE_QUOTE_LIKE = Pattern.compile("[“”«»]");
  /** Bullet and dagger marks, which are simply dropped. */
  private static final Pattern BULLET_LIKE = Pattern.compile("[•†‡]");
  /** Hyphen and dash variants. */
  private static final Pattern DASH_LIKE = Pattern.compile("[‐‑–—]");
  /** Every BMP character from U+2E00 up (CJK and everything else in that range). */
  private static final Pattern HIGH_BMP = Pattern.compile("[\\u2E00-\\uFFFF]");
  /** Citation markers such as "[1]", "[nb 2]" or "[nb3]". */
  private static final Pattern CITATION = Pattern.compile("\\[(nb ?)?\\d+\\]");
  /** Asterisks at the very end of the string. */
  private static final Pattern TRAILING_ASTERISKS = Pattern.compile("\\*+$");
  /** Parenthesized year or year range, e.g. "(1969)" or "(1990 - 1995)"; also matches "()". */
  private static final Pattern PAREN_YEAR = Pattern.compile("\\(\\d* ?-? ?\\d*\\)");
  /** A string that is entirely wrapped in one pair of double quotes. */
  private static final Pattern OUTER_QUOTES = Pattern.compile("^\"(.*)\"$");
  /** A string made only of digits and dots, e.g. "3.14" or "314". */
  private static final Pattern DIGITS_AND_DOTS = Pattern.compile("^[0-9.]+$");
  /** A leading list number such as "1." */
  private static final Pattern LEADING_NUMBERING = Pattern.compile("^\\d+\\.");
  /** Any bracketed span, e.g. "[a]". */
  private static final Pattern BRACKETED = Pattern.compile("\\[[^\\]]*\\]");
  /** DEL and every non-ASCII BMP character. */
  private static final Pattern NON_ASCII = Pattern.compile("[\\u007F-\\uFFFF]");
  /** Everything from the first " - " to the end of the string. */
  private static final Pattern DASH_SUFFIX = Pattern.compile(" - .*$");
  /** A parenthesized span at the very end of the string. */
  private static final Pattern TRAILING_PAREN = Pattern.compile("\\([^)]*\\)$");
  /** Trailing " wiki" / " wikipedia". */
  private static final Pattern WIKI_SUFFIX = Pattern.compile(" wiki(pedia)?$");
  /** Leading "type of " / "name of ". */
  private static final Pattern TYPE_OR_NAME_OF = Pattern.compile("^(type|name) of ");
  /** Trailing "episode(s) season N". */
  private static final Pattern EPISODES_SEASON = Pattern.compile(" (episodes?) (season \\d+)$");

  // ============================================================
  // Shapes, collapsing, joining
  // ============================================================

  /**
   * Compute an abstraction ("shape") of a string.
   *
   * Each digit becomes '0', each lowercase letter 'a', every other letter 'A',
   * and each whitespace / space character ' '. All other characters are kept.
   * Runs of identical resulting characters are then collapsed into one, so
   * "Hello World 2014!" becomes "Aa Aa 0!".
   *
   * @param x the string to abstract (must not be null)
   * @return the shape of x
   */
  public static String computePhraseShape(String x) {
    StringBuilder shape = new StringBuilder();
    char previous = 0;
    for (int i = 0; i < x.length(); i++) {
      char c = x.charAt(i);
      if (Character.isDigit(c)) {
        c = '0';
      } else if (Character.isLetter(c)) {
        c = Character.isLowerCase(c) ? 'a' : 'A';
      } else if (Character.isWhitespace(c) || Character.isSpaceChar(c)) {
        c = ' ';
      }
      // Always keep the first character: the initial value of "previous" is only
      // a placeholder and must not swallow a leading NUL character.
      if (i == 0 || c != previous) {
        shape.append(c);
      }
      previous = c;
    }
    return shape.toString();
  }

  /**
   * Collapse consecutive duplicated tokens. The string is split on single spaces.
   *
   * @param x space-separated tokens (must not be null)
   * @return the remaining tokens joined by single spaces, trimmed
   */
  public static String collapse(String x) {
    return collapse(x.split(" "));
  }

  /**
   * Collapse consecutive duplicated tokens.
   *
   * @param x the tokens (must not be null)
   * @return the remaining tokens joined by single spaces, trimmed
   */
  public static String collapse(String[] x) {
    return collapse(Arrays.asList(x));
  }

  /**
   * Collapse consecutive duplicated tokens.
   *
   * A null token is appended as the text "null", like any StringBuilder append,
   * and no longer causes a NullPointerException on the following token.
   *
   * @param x the tokens (must not be null)
   * @return the remaining tokens joined by single spaces, trimmed
   */
  public static String collapse(List<String> x) {
    StringBuilder sb = new StringBuilder();
    String lastToken = "";
    for (String token : x) {
      // Objects.equals keeps the comparison safe when a token is null
      if (!Objects.equals(lastToken, token)) {
        sb.append(token).append(' ');
        lastToken = token;
      }
    }
    return sb.toString().trim();
  }

  /**
   * Join into string.
   *
   * @param x the tokens (must not be null)
   * @return the tokens separated by single spaces, trimmed
   */
  public static String join(String[] x) {
    return join(Arrays.asList(x));
  }

  /**
   * Join into string.
   *
   * @param x the tokens (must not be null)
   * @return the tokens separated by single spaces, trimmed
   */
  public static String join(List<String> x) {
    StringBuilder sb = new StringBuilder();
    for (String token : x) {
      sb.append(token).append(' ');
    }
    return sb.toString().trim();
  }

  // ============================================================
  // Tokenization
  // ============================================================

  /** A maximal run of ASCII letters and digits. */
  public static final Pattern ALPHANUMERIC = Pattern.compile("[A-Za-z0-9]+");

  /**
   * Collect the distinct ASCII alphanumeric tokens of a string.
   * Every run of digits is first replaced by a single "0", so "abc123" yields
   * "abc0". Case is preserved.
   *
   * @param string the text to tokenize (must not be null)
   * @return a new mutable set of tokens
   */
  public static Set<String> getBagOfWords(String string) {
    Set<String> answer = new HashSet<>();
    Matcher matcher = ALPHANUMERIC.matcher(DIGITS.matcher(string).replaceAll("0"));
    while (matcher.find()) {
      answer.add(matcher.group());
    }
    return answer;
  }

  /** A maximal run of lowercase ASCII letters, or a maximal run of ASCII digits. */
  public static final Pattern ALPHA_OR_NUMERIC = Pattern.compile("[a-z]+|[0-9]+");

  /**
   * Split a string into lowercase letter-only and digit-only tokens, in order
   * of appearance, so "Apollo11" yields ["apollo", "11"].
   *
   * @param string the text to tokenize (must not be null)
   * @return a new mutable list of tokens
   */
  public static List<String> getAlphaOrNumericTokens(String string) {
    List<String> answer = new ArrayList<>();
    // Locale.ROOT: the default-locale overload would map 'I' to a dotless i under
    // a Turkish locale, and that character is not matched by [a-z].
    Matcher matcher = ALPHA_OR_NUMERIC.matcher(string.toLowerCase(Locale.ROOT));
    while (matcher.find()) {
      answer.add(matcher.group());
    }
    return answer;
  }

  // ============================================================
  // Normalization
  // ============================================================

  /**
   * Replace every run of whitespace by a single space and trim both ends.
   *
   * @param x the text to normalize (must not be null)
   * @return the whitespace-normalized text
   */
  public static String whitespaceNormalize(String x) {
    return WHITESPACE.matcher(x).replaceAll(" ").trim();
  }

  /**
   * Simple normalization. (Include whitespace normalization)
   *
   * Steps, in order: strip diacritics; map typographic symbols to ASCII; drop
   * every character in U+2E00 - U+FFFF; remove citation markers and trailing
   * asterisks; remove parenthesized years; strip one pair of enclosing double
   * quotes; remove a leading "N." numbering unless the whole string consists of
   * digits and dots; normalize whitespace.
   *
   * @param string the text to normalize (must not be null)
   * @return the normalized text
   */
  public static String simpleNormalize(String string) {
    // Remove diacritics: decompose, then drop the combining marks
    string = Normalizer.normalize(string, Normalizer.Form.NFD);
    string = COMBINING_MARKS.matcher(string).replaceAll("");
    // Special symbols (plain literal replacements, no regex needed)
    string = string
        .replace("‚", ",")
        .replace("„", ",,")
        .replace("·", ".")
        .replace("…", "...")
        .replace("ˆ", "^")
        .replace("˜", "~")
        .replace("‹", "<")
        .replace("›", ">");
    string = APOSTROPHE_LIKE.matcher(string).replaceAll("'");
    string = DOUBLE_QUOTE_LIKE.matcher(string).replaceAll("\"");
    string = BULLET_LIKE.matcher(string).replaceAll("");
    string = DASH_LIKE.matcher(string).replaceAll("-");
    // Removes Han characters and everything else from U+2E00 to U+FFFF
    string = HIGH_BMP.matcher(string).replaceAll("");
    // Citation
    string = CITATION.matcher(string).replaceAll("");
    string = TRAILING_ASTERISKS.matcher(string).replaceAll("");
    // Year in parentheses
    string = PAREN_YEAR.matcher(string).replaceAll("");
    // Outside Quote
    string = OUTER_QUOTES.matcher(string).replaceAll("$1");
    // Numbering (a string that is itself a number, e.g. "3.14", is left alone)
    if (!DIGITS_AND_DOTS.matcher(string).matches()) {
      string = LEADING_NUMBERING.matcher(string).replaceAll("");
    }
    return whitespaceNormalize(string);
  }

  /**
   * More aggressive normalization. (Include simple and whitespace normalization)
   *
   * After simple normalization this removes every bracketed span, every
   * non-ASCII character, everything from the first " - " onwards, and a
   * parenthesized span at the end of the string.
   *
   * @param string the text to normalize (must not be null)
   * @return the normalized text
   */
  public static String aggressiveNormalize(String string) {
    string = simpleNormalize(string);
    // The trim before each step matters: the end-anchored patterns must see the
    // real end of the text, not whitespace left behind by the previous removal.
    string = BRACKETED.matcher(string.trim()).replaceAll("");
    string = NON_ASCII.matcher(string.trim()).replaceAll("");
    // Dashed / Parenthesized information
    string = DASH_SUFFIX.matcher(string.trim()).replaceAll("");
    string = TRAILING_PAREN.matcher(string.trim()).replaceAll("");
    return whitespaceNormalize(string);
  }

  /**
   * Normalize text depending on the level.
   * - <= 0 : no normalization
   * - 1 : strip whitespace
   * - 2 : simple
   * - >= 3 : aggressive
   *
   * @param string the text to normalize
   * @param level the normalization level
   * @return the normalized text; the input itself when level <= 0
   */
  public static String normalize(String string, int level) {
    if (level <= 0) {
      return string;
    }
    if (level == 1) {
      return whitespaceNormalize(string);
    }
    if (level == 2) {
      return simpleNormalize(string);
    }
    return aggressiveNormalize(string);
  }

  // ============================================================
  // Head word
  // ============================================================

  /**
   * Find the head word of the phrase (lemmatized).
   * The algorithm is approximate, so the answer may be incorrect
   * (especially when there are proper names with prepositions)
   *
   * Algorithm:
   * - If there is a preposition, wh-word, or "that", return the last noun preceding it.
   * - Otherwise, return the last non-number token.
   *
   * The phrase is first rewritten by a few hard-coded rules (see hackPhrase).
   * The index and the token are both taken from the rewritten phrase, so a
   * removed prefix such as "type of " cannot shift the index onto a wrong token.
   *
   * @param phrase the phrase to analyze
   * @param lemmatized true to return the lemma, false to return the surface token
   * @return the head word, or "" when the rewritten phrase has no tokens
   */
  public static String findHeadWord(String phrase, boolean lemmatized) {
    LingData lingData = LingData.get(hackPhrase(phrase));
    int index = headWordIndex(lingData);
    if (index < 0) {
      return "";
    }
    return lemmatized ? lingData.lemmaTokens.get(index) : lingData.tokens.get(index);
  }

  /** Find the lemmatized head word; same as {@code findHeadWord(phrase, true)}. */
  public static String findHeadWord(String phrase) {
    return findHeadWord(phrase, true);
  }

  /**
   * Find the token index of the head word.
   *
   * NOTE: the index refers to the tokens of the REWRITTEN phrase (see
   * hackPhrase), which can differ from the tokens of the given phrase.
   *
   * @param phrase the phrase to analyze
   * @return the head word index, or -1 when the rewritten phrase has no tokens
   */
  public static int findHeadWordIndex(String phrase) {
    return headWordIndex(LingData.get(hackPhrase(phrase)));
  }

  /**
   * Head-word search over an already analyzed phrase.
   * The tag names ("IN", "WP", "WDT", "TO", "CD", "N...") are presumably Penn
   * Treebank POS tags, and "O" / "DATE" NER labels -- confirm against LingData.
   */
  private static int headWordIndex(LingData lingData) {
    // First preposition / wh-word / "to"; position 0 is deliberately not considered
    int modifierIndex = -1;
    for (int i = 1; i < lingData.length; i++) {
      String posTag = lingData.posTags.get(i);
      if ("IN".equals(posTag) || "WP".equals(posTag) || "WDT".equals(posTag) || "TO".equals(posTag)) {
        modifierIndex = i;
        break;
      }
    }
    // Use the modifier only if the token before it carries the NER tag "O"
    if (modifierIndex > 0 && "O".equals(lingData.nerTags.get(modifierIndex - 1))) {
      // Last noun before the modifier, else the token right before it
      for (int i = modifierIndex - 1; i >= 0; i--) {
        if (lingData.posTags.get(i).charAt(0) == 'N') {
          return i;
        }
      }
      return modifierIndex - 1;
    }
    // Find the last non-digit (neither a CD token nor part of a DATE)
    for (int i = lingData.length - 1; i >= 0; i--) {
      if (!"CD".equals(lingData.posTags.get(i)) && !"DATE".equals(lingData.nerTags.get(i))) {
        return i;
      }
    }
    return lingData.length - 1;
  }

  /**
   * Hard-coded phrase rewrites applied before the head-word search:
   * drop a trailing " wiki" / " wikipedia", drop a leading "type of " /
   * "name of ", and turn "... episodes season N" into "... episodes of season N".
   */
  private static String hackPhrase(String phrase) {
    phrase = WIKI_SUFFIX.matcher(phrase).replaceAll("");
    phrase = TYPE_OR_NAME_OF.matcher(phrase).replaceAll("");
    phrase = EPISODES_SEASON.matcher(phrase).replaceAll(" $1 of $2");
    return phrase;
  }

  // ============================================================
  // Last word / last noun
  // ============================================================

  /**
   * Find the last word (lemmatized).
   *
   * @param phrase the phrase to analyze
   * @param lemmatized true to return the lemma, false to return the surface token
   * @return the last word, or "" when the phrase has no tokens
   */
  public static String findLastWord(String phrase, boolean lemmatized) {
    LingData lingData = LingData.get(phrase);
    int index = findLastWordIndex(phrase);
    if (index < 0) {
      return "";
    }
    return lemmatized ? lingData.lemmaTokens.get(index) : lingData.tokens.get(index);
  }

  /** Find the lemmatized last word; same as {@code findLastWord(phrase, true)}. */
  public static String findLastWord(String phrase) {
    return findLastWord(phrase, true);
  }

  /**
   * Find the index of the last token.
   *
   * @param phrase the phrase to analyze
   * @return the last token index, or -1 when the phrase has no tokens
   */
  public static int findLastWordIndex(String phrase) {
    return LingData.get(phrase).length - 1;
  }

  /**
   * Find the last noun (lemmatized).
   *
   * @param phrase the phrase to analyze
   * @param lemmatized true to return the lemma, false to return the surface token
   * @return the last noun, or "" when no token has a POS tag starting with 'N'
   */
  public static String findLastNoun(String phrase, boolean lemmatized) {
    LingData lingData = LingData.get(phrase);
    int index = findLastNounIndex(phrase);
    if (index < 0) {
      return "";
    }
    return lemmatized ? lingData.lemmaTokens.get(index) : lingData.tokens.get(index);
  }

  /** Find the lemmatized last noun; same as {@code findLastNoun(phrase, true)}. */
  public static String findLastNoun(String phrase) {
    return findLastNoun(phrase, true);
  }

  /**
   * Find the index of the last token whose POS tag starts with 'N'.
   *
   * @param phrase the phrase to analyze
   * @return the index of the last noun, or -1 when there is none
   */
  public static int findLastNounIndex(String phrase) {
    LingData lingData = LingData.get(phrase);
    for (int i = lingData.length - 1; i >= 0; i--) {
      if (lingData.posTags.get(i).charAt(0) == 'N') {
        return i;
      }
    }
    return -1;
  }

  /** Manual smoke test: logs the normalized form of a few sample strings. */
  public static void main(String[] args) {
    // Test
    LogInfo.log(simpleNormalize("This is a book†[a][1]"));
    LogInfo.log(aggressiveNormalize("This is a book†[a][1]"));
    LogInfo.log(simpleNormalize("Apollo 11 (1969) 「阿波罗」"));
    LogInfo.log(simpleNormalize("\"Apollo 11 (1969)\""));
    LogInfo.log(simpleNormalize("“Erdős café – ε’s delight”"));
    LogInfo.log(aggressiveNormalize("“Erdős café – ε’s delight”"));
    LogInfo.log(simpleNormalize("1. 3.14 is Pi"));
    LogInfo.log(simpleNormalize("3.14"));
    LogInfo.log(simpleNormalize("314"));
    //LogInfo.log(findHeadWord("the mentalist episodes season 2"));
  }
}