package edu.stanford.nlp.ie.machinereading.common;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.ArrayList;
import java.util.StringTokenizer;
/**
* Simple string tokenization
*/
public class SimpleTokenize {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(SimpleTokenize.class);
/** Basic string tokenization, skipping over white spaces */
public static ArrayList<String> tokenize(String line) {
ArrayList<String> tokens = new ArrayList<>();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreElements()) {
tokens.add(tokenizer.nextToken());
}
return tokens;
}
/** Basic string tokenization, skipping over white spaces */
public static ArrayList<String> tokenize(String line, String separators) {
ArrayList<String> tokens = new ArrayList<>();
StringTokenizer tokenizer = new StringTokenizer(line, separators);
while (tokenizer.hasMoreElements()) {
tokens.add(tokenizer.nextToken());
}
return tokens;
}
/**
* Finds the first non-whitespace character starting at start
*/
private static int findNonWhitespace(String s, int start) {
for (; start < s.length(); start++) {
if (Character.isWhitespace(s.charAt(start)) == false)
return start;
}
return -1;
}
private static int findWhitespace(String s, int start) {
for (; start < s.length(); start++) {
if (Character.isWhitespace(s.charAt(start)))
return start;
}
return -1;
}
/**
* Replaces all occurences of \" with "
*/
private static String normalizeQuotes(String str) {
StringBuffer buffer = new StringBuffer();
for (int i = 0; i < str.length(); i++) {
// do not include \ if followed by "
if (str.charAt(i) == '\\' && i < str.length() - 1 && str.charAt(i + 1) == '\"') {
continue;
} else {
buffer.append(str.charAt(i));
}
}
return buffer.toString();
}
/**
* String tokenization, considering everything within quotes as 1 token
* Regular quotes inside tokens MUST be preceded by \
*/
public static ArrayList<String> tokenizeWithQuotes(String line) {
ArrayList<String> tokens = new ArrayList<>();
int position = 0;
while ((position = findNonWhitespace(line, position)) != -1) {
int end = -1;
// found quoted token (not preceded by \)
if (line.charAt(position) == '\"' && (position == 0 || line.charAt(position - 1) != '\\')) {
// find the first quote not preceded by \
int current = position;
for (;;) {
// found end of string first
if ((end = line.indexOf('\"', current + 1)) == -1) {
end = line.length();
break;
} else { // found a quote
if (line.charAt(end - 1) != '\\') { // valid quote
end++;
break;
} else { // quote preceded by \
current = end;
}
}
}
// do not include the quotes in the token
tokens.add(normalizeQuotes(line.substring(position + 1, end - 1)));
}
// regular token
else {
if ((end = findWhitespace(line, position + 1)) == -1)
end = line.length();
tokens.add(new String(line.substring(position, end)));
}
position = end;
}
return tokens;
}
/**
* Constructs a valid quote-surrounded token All inside quotes are preceded by
* \
*/
public static String quotify(String str) {
StringBuffer buffer = new StringBuffer();
buffer.append('\"');
for (int i = 0; i < str.length(); i++) {
if (str.charAt(i) == '\"')
buffer.append('\\');
buffer.append(str.charAt(i));
}
buffer.append('\"');
return buffer.toString();
}
/** Implements a simple test */
public static void main(String[] argv) {
String in = "T \"Athens \\\"the beautiful\\\"\" \"Athens\" \"\" \"Greece\"";
log.info("Input: " + in);
log.info(tokenizeWithQuotes(in));
}
}