/*
* Copyright 2013
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter;
import static java.util.Arrays.asList;
import java.io.File;
import java.util.StringTokenizer;
import java.util.Vector;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary;
import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes;
import de.uni_leipzig.asv.utils.Pretree;
public class AsvToolboxSplitterAlgorithm
implements SplitterAlgorithm
{
private final Zerleger2 splitter;
private final Log logger;
public AsvToolboxSplitterAlgorithm(File kompVVicTree, File kompVHic, File grfExt)
throws ResourceInitializationException
{
logger = LogFactory.getLog(this.getClass());
splitter = new Zerleger2();
splitter.init(kompVVicTree.getAbsolutePath(), kompVHic.getAbsolutePath(),
grfExt.getAbsolutePath());
}
/**
 * Splits a word with {@link Zerleger2#kZerlegung(String)} and wraps the result in a
 * {@link DecompoundingTree}.
 * <p>
 * The parts are joined into one string in which every part that does not start with
 * {@code "("} is preceded by {@code "+"} (except at the very beginning). Parts starting with
 * {@code "("} are appended directly to the preceding part.
 *
 * @param aWord
 *            the word to split
 * @return a tree built from the joined split string
 * @throws IllegalStateException
 *             if the concatenated parts contain an empty parenthesised segment {@code "()"}
 */
@Override
public DecompoundingTree split(String aWord)
{
    logger.debug("SPLITTING WORD: " + aWord);

    final Vector<String> parts = splitter.kZerlegung(aWord);
    final String concatenated = StringUtils.join(parts, "");

    // Without the parentheses the parts should spell the input again. A mismatch is only
    // reported; processing continues with whatever the splitter produced.
    final String withoutParentheses = concatenated.replace("(", "").replace(")", "");
    if (!withoutParentheses.equals(aWord)) {
        logger.error("Failed while splitting " + aWord + " into " + parts);
    }

    // An empty parenthesised segment is treated as a hard failure.
    if (concatenated.contains("()")) {
        logger.error(aWord + " -> " + parts);
        throw new IllegalStateException("Failed while splitting " + aWord + " into " + parts, null);
    }

    final StringBuilder joinedSplit = new StringBuilder();
    for (String part : parts) {
        final boolean isFirst = joinedSplit.length() == 0;
        if (!isFirst && !part.startsWith("(")) {
            joinedSplit.append("+");
        }
        joinedSplit.append(part);
    }

    return new DecompoundingTree(joinedSplit.toString());
}
/**
 * No-op: the body is empty, so the given dictionary is ignored by this implementation.
 *
 * @param aDict
 *            ignored
 */
@Override
public void setDictionary(Dictionary aDict)
{
// Nothing to do
}
/**
 * No-op: the body is empty, so the given linking morphemes are ignored by this implementation.
 *
 * @param aMorphemes
 *            ignored
 */
@Override
public void setLinkingMorphemes(LinkingMorphemes aMorphemes)
{
// Nothing to do
}
/**
 * No-op: the body is empty, so the given depth is ignored by this implementation.
 *
 * @param aDepth
 *            ignored
 */
@Override
public void setMaximalTreeDepth(int aDepth)
{
// Nothing to do
}
public class Zerleger2
{
Pretree kompvvTree = new Pretree();
Pretree kompvhTree = new Pretree();
Pretree grfTree = new Pretree();
String anweisungGrf = new String();
String anweisungKomp = new String();
// boolean d = true; // debugging
/**
 * Returns the characters of {@code torev} in reverse order.
 * <p>
 * The reversal works on individual UTF-16 {@code char} values (surrogate pairs are not kept
 * together), which is exactly what the previous substring-based loop did. The result is
 * built in a {@link StringBuilder} so that the cost is linear in the input length instead of
 * quadratic.
 *
 * @param torev
 *            the string to reverse; must not be {@code null}
 * @return the reversed string; empty if {@code torev} is empty
 */
String reverse(String torev)
{
    final int length = torev.length();
    final StringBuilder reversed = new StringBuilder(length);
    for (int i = length - 1; i >= 0; i--) {
        reversed.append(torev.charAt(i));
    }
    return reversed.toString();
}
/**
 * Splits a word into parts and returns them in order.
 * <p>
 * Steps, as implemented below:
 * <ol>
 * <li>The input is reduced with {@link #grundFormReduktion(String)}.</li>
 * <li>The reduced word plus {@code "<"} is classified by {@code kompvvTree}; the reversed
 * reduced word plus {@code "<"} is classified by {@code kompvhTree}. A result of
 * {@code "undecided"} disables the respective side.</li>
 * <li>Each class string is separated into its digits (a number) and all remaining characters
 * (a suffix). Numbers that are not smaller than the word length disable the side.</li>
 * <li>Both sides propose a two-part cut; the proposals are combined into one, two or three
 * parts.</li>
 * <li>The parts are mapped back onto the original, unreduced input. Text of the original
 * that lies between parts is emitted in parentheses.</li>
 * <li>Every part that does not start with {@code "("} is split again recursively.</li>
 * </ol>
 * <p>
 * NOTE(review): {@code new Integer(...)} below throws {@link NumberFormatException} when a
 * class string contains no digit at all; whether the trees can return such a string is not
 * visible here.
 *
 * @param aAktwort
 *            the word to split
 * @return the parts of the word; a single element equal to {@code aAktwort} if exactly one
 *         part was found
 */
public Vector<String> kZerlegung(String aAktwort)
{
// All classification below works on the reduced form; aAktwort is only used again when the
// result is mapped back.
String aktwort = grundFormReduktion(aAktwort);
Vector<String> retvec = new Vector<String>();
String classvv = new String();
String classvh = new String();
String zahlStrvv = "", zahlStrvh = "", suffixvv = "", suffixvh = "", vvteil1 = "", vhteil1 = "", vvteil2 = "", vhteil2 = "";
// zervv / zervh are filled and updated below but never read when the result is built.
Vector<String> zervh = new Vector<String>();
Vector<String> zervv = new Vector<String>();
int zahlvv = 0, zahlvh = 0;
boolean vhOk, vvOk;
// "vv" works on the word as is, "vh" on the reversed word.
classvv = kompvvTree.classify(aktwort + "<");
classvh = kompvhTree.classify(reverse(aktwort) + "<");
zervv = new Vector<String>();
zervh = new Vector<String>();
zervv.addElement(aktwort);
zervh.addElement(aktwort);
vvOk = true;
vhOk = true;
if (classvv.equals("undecided")) {
vvOk = false;
}
if (classvh.equals("undecided")) {
vhOk = false;
}
// Separate each class string into its digits ('0'..'9', codes 48..57) and the rest.
if (vvOk) {
for (int i = 0; i < classvv.length(); i++) {
char c = classvv.charAt(i);
if ((c < 58) && (c > 47)) {
zahlStrvv += c;
}
else {
suffixvv += c;
}
} // end for i
}
if (vhOk) {
for (int i = 0; i < classvh.length(); i++) {
char c = classvh.charAt(i);
if ((c < 58) && (c > 47)) {
zahlStrvh += c;
}
else {
suffixvh += c;
}
} // end for i
}
if (vvOk) {
zahlvv = new Integer(zahlStrvv).intValue();
}
if (vhOk) {
zahlvh = new Integer(zahlStrvh).intValue();
}
// A cut position at or beyond the end of the word is rejected.
if (vvOk) {
if (zahlvv >= aktwort.length()) {
vvOk = false;
}
}
if (vhOk) {
if (zahlvh >= aktwort.length()) {
vhOk = false;
}
}
// VV: the suffix must occur in the word directly at the cut position.
if (vvOk) {
for (int i = 0; i < suffixvv.length(); i++) {
if (aktwort.length() > (zahlvv + i)) {
if (suffixvv.charAt(i) != aktwort.charAt(zahlvv + i)) {
vvOk = false;
}
}
else {
vvOk = false;
}
}
}
// NOTE(review): on a mismatch this VH check clears vvOk, not vhOk, and it has no bounds
// check, so charAt throws StringIndexOutOfBoundsException once zahlvh + 1 + i reaches the
// word length. This looks like a copy-paste slip from the VV check above, but the intended
// behaviour cannot be determined from this file - confirm before changing.
if (vhOk) {
for (int i = 0; i < suffixvh.length(); i++) {
if (suffixvh.charAt(i) != aktwort.charAt(zahlvh + 1 + i)) {
vvOk = false;
}
}
}
// Now perform the cuts.
if (vvOk) {
zervv.removeElement(aktwort);
// First part: the first zahlvv characters; second part: everything after the suffix.
vvteil1 = aktwort.substring(0, zahlvv);
vvteil2 = aktwort.substring(zahlvv + suffixvv.length(), aktwort.length());
zervv.addElement(vvteil1);
zervv.addElement(vvteil2);
// A second part of three characters or fewer invalidates the VV proposal.
if (vvteil2.length() <= 3) {
vvOk = false;
}
}
if (vhOk) {
zervh.removeElement(aktwort);
// First part: everything but the last zahlvh characters; second part: the last
// zahlvh + suffixvh.length() characters.
vhteil1 = aktwort.substring(0, aktwort.length() - zahlvh);
vhteil2 = aktwort.substring(aktwort.length() - (zahlvh + suffixvh.length()),
aktwort.length());
zervh.addElement(vhteil1);
zervh.addElement(vhteil2);
// A first part of three characters or fewer invalidates the VH proposal.
if (vhteil1.length() <= 3) {
vhOk = false;
}
}
if (vvOk && vhOk) { // both ok
if (vvteil1.equals(vhteil1)) {
// Same first part: take it and the shorter of the two second parts. Second parts of
// equal length are handled by the equality check after this if/else chain.
retvec.addElement(vvteil1);
if (vhteil2.length() < vvteil2.length()) {
retvec.addElement(vhteil2);
}
else if (vhteil2.length() > vvteil2.length()) {
retvec.addElement(vvteil2);
}
}
else if ((vhteil1.length() - vvteil1.length()) < 3) {
// The first parts differ, but the VH one is less than three characters longer:
// handled like the case above, using the VV first part.
retvec.addElement(vvteil1);
if (vhteil2.length() < vvteil2.length()) {
retvec.addElement(vhteil2);
}
else if (vhteil2.length() > vvteil2.length()) {
retvec.addElement(vvteil2);
}
}
// otherwise 3 parts
else {
retvec.addElement(vvteil1);
retvec.addElement(aktwort.substring(vvteil1.length() + suffixvv.length(),
aktwort.length() - zahlvh));
retvec.addElement(vhteil2);
}
if (vvteil2.equals(vhteil2)) {
retvec.addElement(vvteil2);
}
}
else if (vvOk && !vhOk) { // only vvOk
retvec.addElement(vvteil1);
retvec.addElement(vvteil2);
}
else if (vhOk && !vvOk) { // only vhOk
retvec.addElement(vhteil1);
retvec.addElement(vhteil2);
}
else { // no split found -> leave the word as it is
retvec.addElement(aktwort);
}
// Map the parts, which were cut from the reduced word, back onto the original input.
if (retvec.size() == 1) {
// If there was no split, return verbatim
retvec.clear();
retvec.add(aAktwort);
}
else if (retvec.size() == 2) {
String w1 = retvec.get(0);
String w2 = retvec.get(1);
retvec.clear();
if (!aAktwort.startsWith(w1)) {
// The first part is not a prefix of the original word, so it cannot be mapped back:
// report it and return the word unsplit.
logger.error("Unable to map split " + asList(w1, w2)
+ " back to original " + aAktwort + "... no splitting");
retvec.add(aAktwort);
}
else {
retvec.add(w1);
int restBegin = w1.length();
handleLastSplit(aAktwort, restBegin, w2, retvec);
}
}
else if (retvec.size() == 3) {
String w1 = retvec.get(0);
String w2 = retvec.get(1);
String w3 = retvec.get(2);
retvec.clear();
if (!aAktwort.startsWith(w1)) {
// The first part is not a prefix of the original word: report it and return the word
// unsplit.
logger.error("Unable to map split " + asList(w1, w2, w3)
+ " back to original " + aAktwort + "... no splitting");
retvec.add(aAktwort);
}
else {
retvec.add(w1);
int morphi = aAktwort.indexOf(w2, w1.length());
if (morphi == -1) {
// The second part does not occur in the original word after the first part: report it
// and return the word unsplit.
logger.error("Unable to map split " + asList(w1, w2, w3)
+ " back to original " + aAktwort + "... no splitting");
retvec.clear();
retvec.add(aAktwort);
}
else {
// Text between the first and the second part is emitted in parentheses.
if (morphi > w1.length()) {
retvec.add("(" + aAktwort.substring(w1.length(), morphi) + ")");
}
retvec.add(w2);
int restBegin = w2.length() + morphi;
handleLastSplit(aAktwort, restBegin, w3, retvec);
}
}
}
// Recursively split every part of a multi-part result; a single-part result is returned
// as is.
Vector<String> retvec2 = new Vector<String>();
if (retvec.size() > 1) {
for (String aktelement : retvec) {
if (aktelement.startsWith("(")) {
// This is a linking morpheme
retvec2.addElement(aktelement);
continue;
}
Vector<String> zwischen = kZerlegung(aktelement);
for (String string : zwischen) {
retvec2.addElement(string);
}
}
} // end if (more than one part)
else {
retvec2 = retvec;
}
return retvec2;
} // end kZerlegung
/**
 * Maps the last part of a split back onto the original word and appends the result to
 * {@code retvec}.
 * <p>
 * Starting at {@code aSplitBegin}, the remainder of {@code aAktwort} is compared with
 * {@code aSplit}, both directly and after {@link #grundFormReduktion(String)}. If nothing
 * matches, the start is shifted right by one character and the comparison is repeated, for
 * at most {@code aSplit.length() - 1} start positions. On the first match:
 * <ul>
 * <li>characters skipped by shifting are appended as one parenthesised element;</li>
 * <li>the matched text is appended;</li>
 * <li>if the remainder is longer than what was matched, the surplus is appended as a second
 * parenthesised element.</li>
 * </ul>
 * If no start position matches, the remainder of the original word from {@code aSplitBegin}
 * is appended unchanged.
 * <p>
 * NOTE(review): the two {@code tail.substring(...)} calls below assume the reduced form or
 * {@code aSplit} is not longer than the remainder; otherwise they throw
 * {@link StringIndexOutOfBoundsException} - confirm whether that can occur.
 *
 * @param aAktwort
 *            the original word
 * @param aSplitBegin
 *            offset in {@code aAktwort} at which the last part is expected to begin
 * @param aSplit
 *            the last part as produced from the reduced word
 * @param retvec
 *            result list that the mapped elements are appended to
 */
public void handleLastSplit(String aAktwort, int aSplitBegin, String aSplit,
        Vector<String> retvec)
{
    final int shiftLimit = aSplit.length() - 1;

    for (int shift = 0; shift < shiftLimit; shift++) {
        final int offset = aSplitBegin + shift;
        final String tail = aAktwort.substring(offset);
        final String tailBase = grundFormReduktion(tail);

        final boolean same = aSplit.equals(tailBase) || aSplit.equals(tail);
        final boolean splitStartsWithTail = aSplit.startsWith(tail);
        final boolean splitStartsWithBase = aSplit.startsWith(tailBase);
        final boolean tailStartsWithSplit = tail.startsWith(aSplit)
                || tailBase.startsWith(aSplit);

        final boolean anyMatch = same || splitStartsWithTail || splitStartsWithBase
                || tailStartsWithSplit;
        if (!anyMatch) {
            continue;
        }

        // Characters skipped before the match are emitted in parentheses.
        if (shift > 0) {
            retvec.add("(" + aAktwort.substring(aSplitBegin, offset) + ")");
        }

        if (same || splitStartsWithTail) {
            retvec.add(tail);
        }
        else if (splitStartsWithBase) {
            retvec.add(tailBase);
            retvec.add("(" + tail.substring(tailBase.length()) + ")");
        }
        else {
            // Only tailStartsWithSplit is left at this point.
            retvec.add(aSplit);
            retvec.add("(" + tail.substring(aSplit.length()) + ")");
        }
        return;
    }

    // Nothing matched: fall back to the untouched remainder of the original word.
    retvec.add(aAktwort.substring(aSplitBegin));
}
/**
 * Reduces a word according to the instruction that {@code grfTree} returns for the reversed
 * word.
 * <p>
 * If the tree answers {@code "undecided"}, the word is left unchanged. Otherwise only the
 * first comma-separated instruction is used. Its digits form the number of characters to cut
 * from the end of the word (capped at the word length) and all its other characters form the
 * suffix that is appended after cutting. Finally, if the result contains {@code ";"}, only
 * the text before the first {@code ";"} is returned.
 * <p>
 * As before, the raw instruction is also stored in the field {@code anweisungGrf}.
 *
 * @param wort
 *            the word to reduce; must not be {@code null}
 * @return the reduced word, or {@code wort} itself if splitting the result at {@code ";"}
 *         yields no element
 * @throws NumberFormatException
 *             if the selected instruction contains no digit
 * @throws java.util.NoSuchElementException
 *             if the instruction contains no token, i.e. is empty or consists only of commas
 */
public String grundFormReduktion(String wort)
{
    String retwort = wort;
    anweisungGrf = grfTree.classify(reverse(wort));

    if (!anweisungGrf.equals("undecided")) {
        // If several instructions are returned, only the first one is used.
        anweisungGrf = new StringTokenizer(anweisungGrf, ",").nextToken();

        // Separate the instruction into its digits and the remaining characters.
        final StringBuilder digits = new StringBuilder();
        final StringBuilder suffix = new StringBuilder();
        for (int i = 0; i < anweisungGrf.length(); i++) {
            final char c = anweisungGrf.charAt(i);
            if (c >= '0' && c <= '9') {
                digits.append(c);
            }
            else {
                suffix.append(c);
            }
        }

        // Never cut more characters than the word has.
        final int cutpos = Math.min(Integer.parseInt(digits.toString()), retwort.length());
        retwort = retwort.substring(0, retwort.length() - cutpos) + suffix;
    }

    // Keep only the first of several ';'-separated alternatives. split() yields an empty
    // array if the string consists solely of separators; fall back to the input then.
    final String[] alternatives = retwort.split(";");
    return alternatives.length > 0 ? alternatives[0] : wort;
}
/**
 * Loads the three trees from the given locations and configures them.
 * <p>
 * Each tree is loaded, switched to case-insensitive matching and given its threshold before
 * the next tree is touched. The thresholds are 0.51 for both compound trees and 0.46 for the
 * {@code grf} tree.
 *
 * @param kompvv
 *            location passed to {@code kompvvTree.load}
 * @param kompvh
 *            location passed to {@code kompvhTree.load}
 * @param gfred
 *            location passed to {@code grfTree.load}
 */
public void init(String kompvv, String kompvh, String gfred)
{
    loadAndConfigure(kompvvTree, kompvv, 0.51);
    loadAndConfigure(kompvhTree, kompvh, 0.51);
    loadAndConfigure(grfTree, gfred, 0.46);
}

/**
 * Logs the location, loads it into {@code tree}, then enables case-insensitive matching and
 * sets the threshold.
 */
private void loadAndConfigure(Pretree tree, String location, double threshold)
{
    logger.debug("Loading " + location + " ...");
    tree.load(location);
    // Ignore case: according to the original author's note the training data is lower case.
    tree.setIgnoreCase(true);
    tree.setThresh(threshold);
}
// Initialize with already constructed Pretree instances
/**
 * Replaces the three trees with the given instances and configures them.
 * <p>
 * Each tree is assigned, switched to case-insensitive matching and given its threshold
 * before the next tree is touched. The thresholds are 0.51 for both compound trees and 0.46
 * for the {@code grf} tree.
 *
 * @param kompvv
 *            tree stored in {@code kompvvTree}
 * @param kompvh
 *            tree stored in {@code kompvhTree}
 * @param gfred
 *            tree stored in {@code grfTree}
 */
public void init2(Pretree kompvv, Pretree kompvh, Pretree gfred)
{
    this.kompvvTree = kompvv;
    applyLookupSettings(this.kompvvTree, 0.51);

    this.kompvhTree = kompvh;
    applyLookupSettings(this.kompvhTree, 0.51);

    this.grfTree = gfred;
    applyLookupSettings(this.grfTree, 0.46);
}

/**
 * Enables case-insensitive matching on {@code tree} and sets its threshold.
 */
private void applyLookupSettings(Pretree tree, double threshold)
{
    // Ignore case: according to the original author's note the training data is lower case.
    tree.setIgnoreCase(true);
    tree.setThresh(threshold);
}
} // end class Zerleger
}