package org.xbib.elasticsearch.common.decompound.patricia;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
/**
*
*/
public class Decompounder {
private CompactPatriciaTrie kompvvTree;
private CompactPatriciaTrie kompvhTree;
private CompactPatriciaTrie grfTree;
public Decompounder(InputStream kompvv, InputStream kompvh, InputStream gfred, double threshold)
throws IOException {
kompvvTree = new CompactPatriciaTrie();
kompvvTree.load(kompvv);
kompvvTree.setIgnoreCase(true);
kompvvTree.setThreshold(threshold);
kompvhTree = new CompactPatriciaTrie();
kompvhTree.load(kompvh);
kompvhTree.setIgnoreCase(true);
kompvhTree.setThreshold(threshold);
grfTree = new CompactPatriciaTrie();
grfTree.load(gfred);
grfTree.setIgnoreCase(true);
grfTree.setThreshold(threshold); // previous value = 0.46
}
public Decompounder(CompactPatriciaTrie kompvv, CompactPatriciaTrie kompvh, CompactPatriciaTrie gfred, double threshold) {
kompvvTree = kompvv;
kompvhTree = kompvh;
grfTree = gfred;
grfTree.setThreshold(threshold); // previous value = 0.46
}
private String reverse(String torev) {
StringBuilder ret = new StringBuilder();
for (int i = torev.length(); i > 0; i--) {
ret.append(torev.substring(i - 1, i));
}
return ret.toString();
}
public List<String> decompound(String string) {
String word = string;
word = reduceToBaseForm(word);
List<String> list = new ArrayList<>();
String classvv = kompvvTree.classify(word + "<");
String classvh = kompvhTree.classify(reverse(word) + "<");
StringBuilder numStrvv = new StringBuilder();
StringBuilder numStrvh = new StringBuilder();
StringBuilder suffixvv = new StringBuilder();
StringBuilder suffixvh = new StringBuilder();
String vvpart1 = "";
String vhpart1 = "";
String vvpart2 = "";
String vhpart2 = "";
int numvv = 0;
int numvh = 0;
boolean vhOk = true;
boolean vvOk = true;
if ("undecided".equals(classvv)) {
vvOk = false;
}
if ("undecided".equals(classvh)) {
vhOk = false;
}
if (vvOk) {
for (int i = 0; i < classvv.length(); i++) {
char c = classvv.charAt(i);
if ((c <= '9') && (c >= '0')) {
numStrvv.append(c);
} else {
suffixvv.append(c);
}
}
}
if (vhOk) {
for (int i = 0; i < classvh.length(); i++) {
char c = classvh.charAt(i);
if ((c <= '9') && (c >= '0')) {
numStrvh.append(c);
} else {
suffixvh.append(c);
}
}
}
if (vvOk) {
numvv = Integer.parseInt(numStrvv.toString());
}
if (vhOk) {
numvh = Integer.parseInt(numStrvh.toString());
}
if (vvOk && numvv >= word.length()) {
vvOk = false;
}
if (vhOk && numvh >= word.length()) {
vhOk = false;
}
if (vvOk) {
for (int i = 0; i < suffixvv.length(); i++) {
if (word.length() > (numvv + i)) {
if (suffixvv.charAt(i) != word.charAt(numvv + i)) {
vvOk = false;
}
} else {
vvOk = false;
}
}
}
if (vhOk) {
for (int i = 0; i < suffixvh.length(); i++) {
if (suffixvh.charAt(i) != word.charAt(numvh + 1 + i)) {
vvOk = false;
}
}
}
if (vvOk) {
vvpart1 = word.substring(0, numvv);
vvpart2 = word.substring(numvv + suffixvv.length(), word.length());
if (vvpart2.length() <= 3) {
vvOk = false;
}
}
if (vhOk) {
vhpart1 = word.substring(0, word.length() - numvh);
vhpart2 = word.substring(word.length() - (numvh + suffixvh.length()), word.length());
if (vhpart1.length() <= 3) {
vhOk = false;
}
}
if (vvOk && vhOk) {
if ((vvpart1.equals(vhpart1)) || ((vhpart1.length() - vvpart1.length()) < 3)) {
list.add(vvpart1);
if (vhpart2.length() < vvpart2.length()) {
list.add(vhpart2);
} else if (vhpart2.length() > vvpart2.length()) {
list.add(vvpart2);
}
} else {
list.add(vvpart1);
list.add(word.substring(vvpart1.length() + suffixvv.length(), word.length() - numvh));
list.add(vhpart2);
}
if (vvpart2.equals(vhpart2)) {
list.add(vvpart2);
}
} else if (vvOk && !vhOk) {
list.add(vvpart1);
list.add(vvpart2);
} else if (vhOk && !vvOk) {
list.add(vhpart1);
list.add(vhpart2);
} else {
list.add(word);
}
List<String> retvec2 = new ArrayList<>();
List<String> l;
if (list.size() > 1) {
for (String s : list) {
l = decompound(s);
retvec2.addAll(l);
}
} else {
retvec2 = list;
}
return retvec2;
}
public String reduceToBaseForm(String word) {
String result = word;
String baseForm = grfTree.classify(reverse(word));
if (!"undecided".equals(baseForm)) {
StringTokenizer st = new StringTokenizer(baseForm, ",");
baseForm = st.nextToken();
StringBuilder numStr = new StringBuilder();
StringBuilder suffix = new StringBuilder();
for (int i = 0; i < baseForm.length(); i++) {
char c = baseForm.charAt(i);
if ((c <= '9') && (c >= '0')) {
numStr.append(c);
} else {
suffix.append(c);
}
}
if (numStr.length() > 0) {
int cutpos = Integer.parseInt(numStr.toString());
if (cutpos > result.length()) {
cutpos = result.length();
}
result = result.substring(0, result.length() - cutpos) + suffix;
}
}
return result;
}
}