/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import de.abelssoft.wordtools.jwordsplitter.AbstractWordSplitter;
import de.abelssoft.wordtools.jwordsplitter.impl.GermanWordSplitter;
import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary;
import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes;
import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode;
/**
 * {@link SplitterAlgorithm} that delegates the actual splitting to the JWordSplitter library.
 * <p>
 * Two splitter instances are kept: one constructed with the "hide connecting characters" flag
 * set and one with the flag cleared. Both are created lazily on the first call to
 * {@link #split(String)} and discarded whenever a new dictionary is set. The linking part of
 * each element is derived by comparing the outputs of the two splitters.
 */
public class JWordSplitterAlgorithm
    implements SplitterAlgorithm
{
    /** Splitter constructed with the hide-connecting-characters flag set to {@code true}. */
    private AbstractWordSplitter splitterHiddenLinking;

    /** Splitter constructed with the hide-connecting-characters flag set to {@code false}. */
    private AbstractWordSplitter splitter;

    /** Optional dictionary; when {@code null} the splitters fall back to their own word list. */
    private Dictionary dict;

    /**
     * Splits the given word with both splitters and records the result in a new tree.
     *
     * @param aWord
     *            the word to split.
     * @return a new {@link DecompoundingTree} for {@code aWord}. When the splitter yields more
     *         than one part, one child holding the decompounded word is added to the root;
     *         otherwise no child is added by this method.
     * @throws IllegalStateException
     *             if the splitters cannot be created because of an {@link IOException}, if the
     *             two splitters return a different number of parts, or if a part with link does
     *             not start with its counterpart without link.
     */
    @Override
    public DecompoundingTree split(String aWord)
    {
        initSplittersIfNeeded();

        DecompoundingTree tree = new DecompoundingTree(aWord);

        // Same word, once with and once without the connecting characters.
        String[] partsWithLink = splitter.splitWord(aWord).toArray(new String[0]);
        String[] partsWithoutLink = splitterHiddenLinking.splitWord(aWord).toArray(new String[0]);

        if (partsWithLink.length != partsWithoutLink.length) {
            throw new IllegalStateException(
                    "Something is fishy - more must have happened than just hiding the links");
        }

        // A single part (or none) means there is nothing to attach to the tree.
        if (partsWithLink.length <= 1) {
            return tree;
        }

        // Exactly one child is appended to the root.
        String encoded = encodeParts(partsWithLink, partsWithoutLink);
        DecompoundedWord decompounded = DecompoundedWord.createFromString(encoded);
        tree.getRoot().addChild(new ValueNode<DecompoundedWord>(decompounded));
        return tree;
    }

    /**
     * Creates both splitters unless {@link #splitter} is already available.
     *
     * @throws IllegalStateException
     *             wrapping the {@link IOException} raised while constructing a splitter.
     */
    private void initSplittersIfNeeded()
    {
        if (splitter != null) {
            return;
        }

        try {
            splitterHiddenLinking = new InternalGermanWordSplitter(true);
            splitter = new InternalGermanWordSplitter(false);
        }
        catch (IOException e) {
            throw new IllegalStateException("Unable to access dictionary", e);
        }
    }

    /**
     * Builds the textual split representation from the two parallel part arrays. Every part is
     * written as its form without link, followed by the link in parentheses when the link is
     * not empty, followed by a {@code +}. Note that the {@code +} is written after every part,
     * including the last one.
     *
     * @param aWithLink
     *            the parts including their links.
     * @param aWithoutLink
     *            the parts without links; must have the same length as {@code aWithLink}.
     * @return the encoded split string.
     * @throws IllegalStateException
     *             if a part with link does not start with the corresponding part without link.
     */
    private static String encodeParts(String[] aWithLink, String[] aWithoutLink)
    {
        StringBuilder encoded = new StringBuilder();

        for (int pos = 0; pos < aWithLink.length; pos++) {
            String stem = aWithoutLink[pos];
            String linked = aWithLink[pos];

            if (!linked.startsWith(stem)) {
                throw new IllegalStateException(
                        "Something is fishy - links should be at the end");
            }

            encoded.append(stem);

            // Whatever follows the stem is the link.
            String suffix = linked.substring(stem.length());
            if (suffix.length() > 0) {
                encoded.append("(");
                encoded.append(suffix);
                encoded.append(")");
            }

            encoded.append("+");
        }

        return encoded.toString();
    }

    /**
     * Stores the dictionary and drops both splitters so that they are created again, using the
     * new dictionary, on the next call to {@link #split(String)}.
     *
     * @param aDict
     *            the dictionary to use; {@code null} makes the splitters use their own word
     *            list.
     */
    @Override
    public void setDictionary(Dictionary aDict)
    {
        splitterHiddenLinking = null;
        splitter = null;
        dict = aDict;
    }

    /**
     * Ignored; this algorithm does not use the linking morphemes.
     *
     * @param aMorphemes
     *            unused.
     */
    @Override
    public void setLinkingMorphemes(LinkingMorphemes aMorphemes)
    {
        // Intentionally empty: not needed for this algorithm.
    }

    /**
     * Ignored; this algorithm does not use a maximal tree depth.
     *
     * @param aDepth
     *            unused.
     */
    @Override
    public void setMaximalTreeDepth(int aDepth)
    {
        // Intentionally empty: not needed for this algorithm.
    }

    /**
     * {@link GermanWordSplitter} whose word list can be replaced by the dictionary of the
     * enclosing {@link JWordSplitterAlgorithm}. It is deliberately a non-static inner class
     * because it reads the enclosing instance's {@code dict} field.
     */
    private class InternalGermanWordSplitter
        extends GermanWordSplitter
    {
        /**
         * @param aHideConnectingCharacters
         *            passed through unchanged to the {@link GermanWordSplitter} constructor.
         * @throws IOException
         *             if the super constructor fails.
         */
        public InternalGermanWordSplitter(boolean aHideConnectingCharacters)
            throws IOException
        {
            super(aHideConnectingCharacters);
        }

        /**
         * Returns a fresh copy of all dictionary entries when a dictionary has been set, and
         * the word list of the super class otherwise.
         */
        @Override
        protected Set<String> getWordList()
            throws IOException
        {
            if (dict != null) {
                return new HashSet<String>(dict.getAll());
            }
            return super.getWordList();
        }
    }
}