/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter;
import java.util.ArrayList;
import java.util.List;
import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary;
import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes;
import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode;
/**
 * Implements a simple left-to-right split algorithm.
 *
 * The word is scanned from left to right. Whenever a prefix is found in the dictionary (either
 * directly or after removing a trailing linking morpheme), the remainder on the right side is
 * evaluated again from left to right. At the end we have a right-balanced tree. All leaves are the
 * smallest word fractions.
 *
 * The leaves are combined to all possible splits.
 *
 * Instances created with the empty constructor hold no dictionary and no linking morphemes; both
 * must be set through the setters before {@link #split(String)} is called on a non-empty word.
 */
public class LeftToRightSplitterAlgorithm
    implements SplitterAlgorithm
{
    private Dictionary dict;
    private LinkingMorphemes morphemes;

    /** Recursion in {@link #ltrSplit} stops once the depth exceeds this value. */
    private int maxTreeDepth = Integer.MAX_VALUE;

    /** A left word taken directly from the dictionary must have at least this many characters. */
    private int minWordLength = 1;

    /** A non-empty rest must be strictly longer than this value to be accepted. */
    private int minRestLength = 2;

    /**
     * Empty constructor.
     *
     * Before you use this class, set the dictionary and the linking morphemes with the setter
     * methods.
     */
    public LeftToRightSplitterAlgorithm()
    {
        // Nothing to do
    }

    /**
     * Creates an instance of the algorithm.
     *
     * @param aDict
     *            A dictionary with all words
     * @param aMorphemes
     *            A LinkingMorphemes class
     */
    public LeftToRightSplitterAlgorithm(Dictionary aDict, LinkingMorphemes aMorphemes)
    {
        setDictionary(aDict);
        setLinkingMorphemes(aMorphemes);
    }

    /**
     * Builds the decompounding tree for a word. The word is lower-cased, used as the root of a new
     * tree, and the first fragment of the root is flagged so that the recursive split starts
     * there.
     *
     * @param aWord
     *            the word to split.
     * @return the tree with the lower-cased word as root and all splits found below it.
     */
    @Override
    public DecompoundingTree split(String aWord)
    {
        // NOTE(review): toLowerCase() uses the JVM default locale. Confirm how the dictionary
        // entries are normalised before switching this to an explicit locale.
        String word = aWord.toLowerCase();

        DecompoundingTree tree = new DecompoundingTree(word);
        tree.getRoot().getValue().getSplits().get(0).setSplitAgain(true);

        ltrSplit(tree.getRoot(), 0);

        return tree;
    }

    /**
     * The basic split algorithm. Moves through the word from left to right and checks for valid
     * words.
     *
     * For every fragment of the parent that is flagged to be split again, all splits returned by
     * {@link #makeSplit(String)} are examined. A split with more than one fragment becomes a child
     * node (the fragment is replaced by the split) and its second fragment is split recursively. A
     * split with exactly one fragment becomes a child node only if it differs from the parent's
     * value; it is not split any further.
     *
     * @param aParent
     *            The parent node
     * @param aDepth
     *            the current depth. Nothing is done once this exceeds the maximal tree depth.
     */
    protected void ltrSplit(ValueNode<DecompoundedWord> aParent, int aDepth)
    {
        if (aDepth > maxTreeDepth) {
            return;
        }

        for (int i = 0; i < aParent.getValue().getSplits().size(); i++) {
            Fragment element = aParent.getValue().getSplits().get(i);
            if (!element.shouldSplitAgain()) {
                continue;
            }

            for (DecompoundedWord result : makeSplit(element.getWord())) {
                int fragmentCount = result.getSplits().size();

                if (fragmentCount > 1) {
                    // Only the right-hand part of the new split is examined again.
                    result.getSplits().get(1).setSplitAgain(true);
                    ltrSplit(addChildWithReplacement(aParent, i, result), aDepth + 1);
                }
                else if (fragmentCount == 1 && !result.equals(aParent.getValue())) {
                    addChildWithReplacement(aParent, i, result);
                }
            }
        }
    }

    /**
     * Adds a child to the parent whose value is a copy of the parent's value with one fragment
     * replaced by the given split.
     */
    private static ValueNode<DecompoundedWord> addChildWithReplacement(
            ValueNode<DecompoundedWord> aParent, int aIndex, DecompoundedWord aReplacement)
    {
        DecompoundedWord copy = aParent.getValue().createCopy();
        copy.replaceSplitElement(aIndex, aReplacement);

        ValueNode<DecompoundedWord> child = new ValueNode<DecompoundedWord>(copy);
        aParent.addChild(child);
        return child;
    }

    /**
     * Splits a word into two words.
     *
     * Every position of the word is tried as a split point. A split is produced when the rest on
     * the right is either empty or strictly longer than the minimal rest length, and the left part
     * is either a dictionary word of at least the minimal word length, or a non-empty dictionary
     * word followed by one of the linking morphemes. The minimal word length is not applied to the
     * stem in the linking-morpheme case.
     *
     * @param aWord
     *            a word.
     * @return the splits, ordered by split position; for each position the plain split comes
     *         before the splits with linking morphemes.
     */
    protected List<DecompoundedWord> makeSplit(String aWord)
    {
        List<DecompoundedWord> result = new ArrayList<DecompoundedWord>();

        for (int i = 0; i < aWord.length(); i++) {
            String rightWord = aWord.substring(i + 1);

            // Every candidate at this position needs an acceptable rest, so skip the dictionary
            // lookups entirely when the rest is unusable.
            boolean rightGood = rightWord.length() > minRestLength || rightWord.length() == 0;
            if (!rightGood) {
                continue;
            }

            String leftWord = aWord.substring(0, i + 1);

            if (leftWord.length() >= minWordLength && dict.contains(leftWord)) {
                // createFromString removes the trailing + if rightWord is empty.
                DecompoundedWord split = DecompoundedWord.createFromString(leftWord + "+"
                        + rightWord);
                split.setSplitPos(i);
                result.add(split);
            }

            // Check if left word contains linking morphemes
            for (String morpheme : morphemes.getAll()) {
                // The morpheme must be a proper suffix: something has to remain as the stem.
                if (morpheme.length() >= leftWord.length() || !leftWord.endsWith(morpheme)) {
                    continue;
                }

                String leftWithoutMorpheme = leftWord.substring(0,
                        leftWord.length() - morpheme.length());
                if (dict.contains(leftWithoutMorpheme)) {
                    DecompoundedWord split = DecompoundedWord
                            .createFromString(leftWithoutMorpheme + "(" + morpheme + ")+"
                                    + rightWord);
                    split.setSplitPos(i);
                    result.add(split);
                }
            }
        }

        return result;
    }

    @Override
    public void setDictionary(Dictionary aDict)
    {
        dict = aDict;
    }

    /**
     * @return the dictionary used to look up words, or null if none has been set.
     */
    public Dictionary getDictionary()
    {
        return dict;
    }

    @Override
    public void setLinkingMorphemes(LinkingMorphemes aLinkingMorphemes)
    {
        morphemes = aLinkingMorphemes;
    }

    /**
     * @return the linking morphemes, or null if none have been set.
     */
    public LinkingMorphemes getMorphemes()
    {
        return morphemes;
    }

    @Override
    public void setMaximalTreeDepth(int aDepth)
    {
        maxTreeDepth = aDepth;
    }

    /**
     * @return the depth after which the recursive split stops.
     */
    public int getMaxTreeDepth()
    {
        return maxTreeDepth;
    }

    /**
     * @param aMinWordLength
     *            the minimal number of characters a left word taken directly from the dictionary
     *            must have.
     */
    public void setMinWordLength(int aMinWordLength)
    {
        minWordLength = aMinWordLength;
    }

    /**
     * @return the minimal length of a left word taken directly from the dictionary.
     */
    public int getMinWordLength()
    {
        return minWordLength;
    }

    /**
     * @param aMinRestLength
     *            a non-empty rest is only accepted if it is strictly longer than this value.
     */
    public void setMinRestLength(int aMinRestLength)
    {
        minRestLength = aMinRestLength;
    }

    /**
     * @return the length a non-empty rest has to exceed.
     */
    public int getMinRestLength()
    {
        return minRestLength;
    }
}