/*
 * Copyright 2010
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.decompounding.splitter;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.Dictionary;
import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.LinkingMorphemes;
import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.SimpleDictionary;
import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.TrieStructure;
import de.tudarmstadt.ukp.dkpro.core.decompounding.trie.ValueNode;

/**
 * A data-driven splitting algorithm that uses a forward and a backward trie to look for split
 * positions in a word.
 */
public class DataDrivenSplitterAlgorithm
    implements SplitterAlgorithm
{
    private TrieStructure forwardTrie;
    private TrieStructure backwardTrie;
    private LinkingMorphemes morphemes;

    private int maxTreeDepth = Integer.MAX_VALUE;

    /**
     * Empty constructor.
     *
     * Before you use this class, set the dictionary and the linking morphemes with the setter
     * methods.
     */
    public DataDrivenSplitterAlgorithm()
    {
    }

    /**
     * Creates a splitter and configures it through {@link #setDictionary(Dictionary)} and
     * {@link #setLinkingMorphemes(LinkingMorphemes)}.
     *
     * @param aDictionary
     *            A simple dictionary object
     * @param aMorphemes
     *            A list of linking morphemes
     */
    public DataDrivenSplitterAlgorithm(SimpleDictionary aDictionary, LinkingMorphemes aMorphemes)
    {
        setDictionary(aDictionary);
        setLinkingMorphemes(aMorphemes);
    }

    /**
     * Lower-cases the word, wraps it in a {@link DecompoundingTree}, marks the single root
     * fragment as splittable and expands the tree recursively.
     *
     * @param aWord
     *            the word to split.
     * @return the tree of all splittings that were found.
     */
    @Override
    public DecompoundingTree split(String aWord)
    {
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale, so the
        // strings handed to the tries do not change with the platform configuration.
        String word = aWord.toLowerCase(Locale.ROOT);

        DecompoundingTree tree = new DecompoundingTree(word);
        tree.getRoot().getValue().getSplits().get(0).setSplitAgain(true);

        splitIt(tree.getRoot(), 0);

        return tree;
    }

    /**
     * Builds a splitting tree below the given node.
     *
     * For every fragment of the parent that is flagged to be split again, each two-part split
     * returned by {@link #makeSplit(String)} produces two children: one in which the left part
     * may be split further and one in which the right part may be split further.
     *
     * @param aParent
     *            The parent node
     * @param aDepth
     *            the current depth. Nothing is done once it exceeds the maximal tree depth.
     */
    protected void splitIt(ValueNode<DecompoundedWord> aParent, int aDepth)
    {
        if (aDepth > maxTreeDepth) {
            return;
        }

        // Iterate over all split elements
        for (int i = 0; i < aParent.getValue().getSplits().size(); i++) {
            Fragment element = aParent.getValue().getSplits().get(i);

            // Only elements flagged for further splitting are processed
            if (!element.shouldSplitAgain()) {
                continue;
            }

            for (DecompoundedWord result : makeSplit(element.getWord())) {
                // A result with a single element means that no split was found
                if (result.getSplits().size() > 1) {
                    // Left side first, then right side
                    addChildAndDescend(aParent, i, result, 0, aDepth);
                    addChildAndDescend(aParent, i, result, 1, aDepth);
                }
            }
        }
    }

    /**
     * Adds one child to the parent in which the parent's element at {@code aElementIndex} is
     * replaced by a copy of {@code aResult}, flags the chosen fragment of that copy for further
     * splitting and recurses into the new child.
     */
    private void addChildAndDescend(ValueNode<DecompoundedWord> aParent, int aElementIndex,
            DecompoundedWord aResult, int aFragmentIndex, int aDepth)
    {
        DecompoundedWord resultCopy = aResult.createCopy();
        resultCopy.getSplits().get(aFragmentIndex).setSplitAgain(true);

        DecompoundedWord parentCopy = aParent.getValue().createCopy();
        parentCopy.replaceSplitElement(aElementIndex, resultCopy);

        ValueNode<DecompoundedWord> child = new ValueNode<DecompoundedWord>(parentCopy);
        aParent.addChild(child);
        splitIt(child, aDepth + 1);
    }

    /**
     * Makes a single split on a given word. Returns all possible splittings. All splits consist
     * of two elements, unless no split position is found, in which case the only entry is the
     * unsplit word.
     *
     * For every prefix of at least three characters the forward trie is asked for its successor
     * value, and likewise the backward trie for every reversed suffix of at least three
     * characters. A position is a split point when the difference between neighbouring values
     * has a strict local maximum at that position in both directions.
     *
     * @param aWord
     *            a word.
     * @return all possible splittings.
     */
    protected List<DecompoundedWord> makeSplit(String aWord)
    {
        List<DecompoundedWord> returnList = new ArrayList<DecompoundedWord>();
        int length = aWord.length();

        // The local-maximum arrays have length - 5 entries, so shorter words cannot be analysed
        if (length <= 5) {
            returnList.add(unsplit(aWord));
            return returnList;
        }

        // Read successors from the tries: forward[k] belongs to the prefix of length k + 3,
        // backward[k] to the suffix starting at index k (looked up reversed).
        int[] forward = new int[length - 2];
        int[] backward = new int[length - 2];
        for (int i = 2; i < length; i++) {
            forward[i - 2] = forwardTrie.getSuccessors(aWord.substring(0, i + 1));
        }
        for (int i = length - 3; i > -1; i--) {
            String reversedSuffix = new StringBuilder(aWord.substring(i)).reverse().toString();
            backward[i] = backwardTrie.getSuccessors(reversedSuffix);
        }

        // Differences between neighbouring values; in both directions this is the value of the
        // shorter string minus the value of the string that is one character longer.
        int[] diffForward = new int[length - 3];
        int[] diffBackward = new int[length - 3];
        for (int i = 1; i < forward.length; i++) {
            diffForward[i - 1] = forward[i - 1] - forward[i];
        }
        for (int i = backward.length - 2; i > -1; i--) {
            diffBackward[i] = backward[i + 1] - backward[i];
        }

        // Mark local maxima
        boolean[] maxForward = markLocalMaxima(diffForward);
        boolean[] maxBackward = markLocalMaxima(diffBackward);

        // Get all split positions. maxForward[i] describes the prefix of length i + 4 and
        // maxBackward[i + 2] the suffix starting at index i + 4, so both refer to position
        // i + 4. The loop bound restricts positions to the range 4 .. length - 5.
        List<Integer> splitPos = new ArrayList<Integer>();
        for (int i = 0; i < maxForward.length - 3; i++) {
            if (maxForward[i] && maxBackward[i + 2]) {
                splitPos.add(i + 4);
            }
        }

        // Create all splits
        if (splitPos.isEmpty()) {
            returnList.add(unsplit(aWord));
        }
        else {
            for (Integer pos : splitPos) {
                DecompoundedWord split = new DecompoundedWord();
                split.appendSplitElement(new Fragment(aWord.substring(0, pos)));
                split.appendSplitElement(new Fragment(aWord.substring(pos)));
                returnList.addAll(checkForMorphemes(split));
            }
        }

        return returnList;
    }

    /**
     * Marks strict local maxima: entry {@code k} of the result is {@code true} when
     * {@code aValues[k + 1]} is greater than both of its neighbours. The result therefore has two
     * entries fewer than the input.
     */
    private static boolean[] markLocalMaxima(int[] aValues)
    {
        boolean[] maxima = new boolean[aValues.length - 2];
        for (int i = 1; i < aValues.length - 1; i++) {
            maxima[i - 1] = aValues[i - 1] < aValues[i] && aValues[i] > aValues[i + 1];
        }
        return maxima;
    }

    /**
     * Wraps a word into a {@link DecompoundedWord} that consists of a single fragment.
     */
    private static DecompoundedWord unsplit(String aWord)
    {
        DecompoundedWord word = new DecompoundedWord();
        word.appendSplitElement(new Fragment(aWord));
        return word;
    }

    /**
     * Checks whether the second fragment of a two-part split starts with a linking morpheme.
     *
     * The given split is always part of the result. If the linking morphemes report a positive
     * match length for the second fragment, a copy of the split is added in which the matched
     * characters are removed from the second fragment and stored as the morpheme of the first
     * fragment.
     *
     * @param aSplit
     *            a split with at least two fragments.
     * @return the given split, optionally followed by the variant with the morpheme separated.
     */
    protected List<DecompoundedWord> checkForMorphemes(DecompoundedWord aSplit)
    {
        List<DecompoundedWord> result = new ArrayList<DecompoundedWord>();
        result.add(aSplit);

        String word = aSplit.getSplits().get(1).getWord();
        int pos = morphemes.startsWith(word);
        if (pos > 0) {
            DecompoundedWord copy = aSplit.createCopy();
            copy.getSplits().get(0).setMorpheme(word.substring(0, pos));
            copy.getSplits().get(1).setWord(word.substring(pos));
            result.add(copy);
        }

        return result;
    }

    /**
     * Builds the forward and the backward trie from the given dictionary.
     *
     * @param aDict
     *            the dictionary to build the tries from.
     */
    @Override
    public void setDictionary(Dictionary aDict)
    {
        forwardTrie = TrieStructure.createForDict(aDict);
        backwardTrie = TrieStructure.createForDictReverse(aDict);
    }

    /**
     * Sets the linking morphemes used by {@link #checkForMorphemes(DecompoundedWord)}.
     *
     * @param aLinkingMorphemes
     *            the linking morphemes.
     */
    @Override
    public void setLinkingMorphemes(LinkingMorphemes aLinkingMorphemes)
    {
        morphemes = aLinkingMorphemes;
    }

    /**
     * Sets the depth beyond which {@link #splitIt(ValueNode, int)} stops expanding the tree.
     *
     * @param aDepth
     *            the maximal tree depth.
     */
    @Override
    public void setMaximalTreeDepth(int aDepth)
    {
        maxTreeDepth = aDepth;
    }
}