package nicetext;
/**
 * Extracts word-level n-grams (single words, word pairs and word triples) from free text
 * and counts how often each one occurs.
 *
 * <p>Before splitting into words the text is lower-cased, every run of whitespace is
 * collapsed to a single space, and every character other than ASCII letters, ASCII digits
 * and spaces is deleted. Words shorter than two characters are never counted and never
 * take part in a longer n-gram.
 *
 * @author vikasing
 */
public class NGramExtracter {

    /** Any run of whitespace or Unicode separator characters (tabs, newlines, NBSP, ...). */
    private static final java.util.regex.Pattern WHITESPACE_RUN =
            java.util.regex.Pattern.compile("[\\s\\p{Z}]+");

    /** Any run of characters that are not ASCII letters, ASCII digits or spaces. */
    private static final java.util.regex.Pattern SPECIAL_CHARS =
            java.util.regex.Pattern.compile("[^a-zA-Z 0-9]+");

    /** Words shorter than this are ignored and break n-gram adjacency. */
    private static final int MIN_WORD_LENGTH = 2;

    /**
     * Counts the mono-, bi- and tri-grams found in {@code text}.
     *
     * <p>An n-gram is recorded only when all of its words are adjacent in the cleaned text
     * and each is at least two characters long; a shorter word in between prevents the
     * n-gram from being formed (it is not skipped over).
     *
     * @param text the raw text to analyse; {@code null} is treated like empty text
     * @return a new {@code NGrams} whose mono-, bi- and tri-gram maps hold the occurrence
     *         count of every n-gram found (empty maps if there are none)
     */
    public static NGrams extract(String text) {
        NGrams nGrams = new NGrams();
        if (text == null) {
            return nGrams;
        }

        // Locale.ROOT keeps lower-casing independent of the JVM default locale; e.g. under a
        // Turkish locale 'I' would become a dotless 'ı' and then be stripped as "special".
        String lowered = text.toLowerCase(java.util.Locale.ROOT);

        // Turn tabs/newlines into spaces BEFORE stripping special characters, otherwise they
        // would be deleted and the words on either side would be glued together.
        String spaced = WHITESPACE_RUN.matcher(lowered).replaceAll(" ");
        String[] words = removeSpecialChars(spaced).trim().split(" +");

        for (int i = 0; i < words.length; i++) {
            if (!isUsable(words[i])) {
                continue;
            }
            String monoGram = words[i];
            increment(nGrams.getMonoGrams(), monoGram);

            if (i + 1 >= words.length || !isUsable(words[i + 1])) {
                continue;
            }
            String biGram = monoGram + " " + words[i + 1];
            increment(nGrams.getBiGrams(), biGram);

            if (i + 2 >= words.length || !isUsable(words[i + 2])) {
                continue;
            }
            increment(nGrams.getTriGrams(), biGram + " " + words[i + 2]);
        }
        return nGrams;
    }

    /** Returns whether {@code word} is long enough to be counted and to take part in an n-gram. */
    private static boolean isUsable(String word) {
        return word.length() >= MIN_WORD_LENGTH;
    }

    /** Adds one to the count stored for {@code nGram}, starting from 1.0 when it is not yet present. */
    private static void increment(java.util.Map<String, Double> counts, String nGram) {
        Double current = counts.get(nGram);
        counts.put(nGram, current == null ? 1.0 : current + 1);
    }

    /**
     * Deletes every character that is not an ASCII letter, an ASCII digit or a space.
     *
     * @param text the text to clean
     * @return {@code text} with all other characters removed (not replaced by spaces)
     */
    private static String removeSpecialChars(String text) {
        return SPECIAL_CHARS.matcher(text).replaceAll("");
    }
}