package ilarkesto.core.diff;
import java.util.LinkedList;
import java.util.List;
public class WordTokenizer implements DiffTokenizer {
@Override
public List<String> tokenize(String s) {
List<String> ret = new LinkedList<String>();
if (s == null) return ret;
boolean word = false;
StringBuilder token = null;
int len = s.length();
for (int i = 0; i < len; i++) {
char ch = s.charAt(i);
if (isWordChar(ch)) {
if (token == null) {
token = new StringBuilder();
token.append(ch);
} else {
if (!word) {
ret.add(token.toString());
token = new StringBuilder();
}
token.append(ch);
}
word = true;
} else {
if (token == null) {
token = new StringBuilder();
token.append(ch);
} else {
if (word) {
ret.add(token.toString());
token = new StringBuilder();
}
token.append(ch);
}
word = false;
}
}
if (token != null) ret.add(token.toString());
return ret;
}
static boolean isWordChar(char ch) {
return Character.isLetterOrDigit(ch);
}
@Override
public String concat(List<String> tokens) {
StringBuilder sb = new StringBuilder();
for (String token : tokens) {
sb.append(token);
}
return sb.toString();
}
}