/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import de.tudarmstadt.ukp.dkpro.core.decompounding.dictionary.igerman98.Affix;
/**
 * The igerman98 dictionary from www.j3e.de/ispell/igerman98
 *
 * A current version of the German dictionary de_DE can be found in
 * /src/main/resources/de_DE.dic
 *
 * This class can also be used to read other ispell/hunspell dictionaries.
 *
 * The affix file is read first so that the flags attached to each dictionary entry can be
 * expanded into inflected forms while the dictionary file is read. All words are lower-cased
 * and only forms longer than two characters are kept.
 *
 * Loading is best effort: I/O problems and structurally broken affix files are reported via
 * {@link Throwable#printStackTrace()} and leave the dictionary with whatever could be read up
 * to that point.
 */
public class German98Dictionary
    extends SimpleDictionary
{
    private static final String PREFIX_KEY = "PFX";
    private static final String SUFFIX_KEY = "SFX";

    /** Affix rules read from the affix file, grouped by the flag character that selects them. */
    private final Map<Character, List<Affix>> affixes = new HashMap<Character, List<Affix>>();

    /**
     * Loads the dictionary from files. Both files are decoded with the platform default
     * charset and are closed again before the constructor returns.
     *
     * @param aDict
     *            the dictionary (.dic) file.
     * @param aAffix
     *            the affix (.aff) file.
     */
    public German98Dictionary(File aDict, File aAffix)
    {
        try {
            // The affix rules must be known before the dictionary entries are expanded.
            BufferedReader affixReader = new BufferedReader(new FileReader(aAffix));
            try {
                readAffixFile(affixReader);
            }
            finally {
                closeQuietly(affixReader);
            }

            BufferedReader dictReader = new BufferedReader(new FileReader(aDict));
            try {
                setWords(readFileToSet(dictReader));
            }
            finally {
                closeQuietly(dictReader);
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Loads the dictionary from streams. Both streams are decoded with the platform default
     * charset. The streams are not closed here; that remains the caller's responsibility.
     *
     * @param aDictStream
     *            stream providing the dictionary (.dic) data.
     * @param aAffixStream
     *            stream providing the affix (.aff) data.
     */
    public German98Dictionary(InputStream aDictStream, InputStream aAffixStream)
    {
        try {
            readAffixFile(new BufferedReader(new InputStreamReader(aAffixStream)));
            setWords(readFileToSet(new BufferedReader(new InputStreamReader(aDictStream))));
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the dictionary entries and expands them with the affix rules loaded so far.
     *
     * The first line (the entry count) is skipped, as are empty lines and lines starting with
     * a hash or a tab. Every other line is split at "/" into the word and its flags.
     *
     * @param aReader
     *            a reader positioned at the start of the dictionary data.
     * @return the lower-cased words and their inflected forms, each longer than two characters.
     * @throws IOException
     *             if an I/O error occurs.
     */
    @Override
    protected Set<String> readFileToSet(BufferedReader aReader)
        throws IOException
    {
        Set<String> words = new HashSet<String>();

        // First line contains number of entries -> skip
        aReader.readLine();

        String line;
        while ((line = aReader.readLine()) != null) {
            if (line.length() == 0 || line.startsWith("#") || line.startsWith("\t")) {
                // Ignore empty lines and lines starting with hash or tab (comments)
                continue;
            }

            String[] split = line.split("/");
            if (split.length == 0) {
                // A line made up of slashes only yields no fields at all, because split()
                // drops trailing empty strings. There is no word to add.
                continue;
            }

            String word = split[0].toLowerCase();
            char[] flags = {};
            if (split.length > 1) {
                flags = split[1].toCharArray();
            }

            if (word.length() > 2) {
                words.add(word);
            }

            if (flags.length > 0) {
                words.addAll(buildWords(word, flags));
            }
        }

        return words;
    }

    /**
     * Reads the affix file and processes the data. Only blocks introduced by a PFX or SFX
     * header are evaluated; all other lines are ignored.
     *
     * Errors are not propagated: the stack trace is printed and reading stops, keeping the
     * rules collected so far.
     *
     * @param aReader
     *            a reader.
     */
    protected void readAffixFile(BufferedReader aReader)
    {
        try {
            String line;
            while ((line = aReader.readLine()) != null) {
                if (line.startsWith(PREFIX_KEY) || line.startsWith(SUFFIX_KEY)) {
                    parseAffix(line, aReader);
                }
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parse an affix block in the affix file. The header has the form
     * {@code <PFX|SFX> <flag> <Y|N> <count>} and is followed by {@code count} rule lines of
     * the form {@code <PFX|SFX> <flag> <stripping> <affix> [<condition>]}.
     *
     * @param aHeader
     *            The header of the affix
     * @param aReader
     *            The file reader to read the rest of the affix
     * @throws IOException
     *             if an I/O error occurs, the file ends before all announced rules were read,
     *             or the header or a rule line lacks required fields.
     */
    private void parseAffix(String aHeader, BufferedReader aReader)
        throws IOException
    {
        String[] args = aHeader.split("\\s+");
        if (args.length < 4) {
            throw new IOException("Malformed affix header [" + aHeader + "]");
        }

        boolean crossProduct = args[2].equals("Y");
        int numLines;
        try {
            numLines = Integer.parseInt(args[3]);
        }
        catch (NumberFormatException e) {
            // Report a broken rule count the same way as other structural problems.
            throw new IOException("Malformed affix header [" + aHeader
                    + "]: rule count is not a number.");
        }

        for (int i = 0; i < numLines; i++) {
            String line = aReader.readLine();
            if (line == null) {
                throw new IOException("Unexpected end of file after reading [" + i +
                        "] lines. Expected were [" + numLines + "] lines.");
            }

            String[] ruleArgs = line.split("\\s+");
            if (ruleArgs.length < 4) {
                throw new IOException("Malformed affix rule [" + line + "]");
            }

            Character flag = ruleArgs[1].charAt(0);
            // A rule without a condition column applies unconditionally; "." is the
            // notation the affix format uses for "no condition".
            String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";

            Affix a = new Affix(args[0]);
            a.setCrossProduct(crossProduct);
            a.setFlag(flag);
            a.setStripping(ruleArgs[2]);
            a.setAffix(ruleArgs[3]);
            a.setCondition(condition);

            List<Affix> list = affixes.get(flag);
            if (list == null) {
                list = new ArrayList<Affix>();
                affixes.put(flag, list);
            }
            list.add(a);
        }
    }

    /**
     * Uses affixes to build new words. Flags without any known affix rule are ignored.
     *
     * @param aWord
     *            a word.
     * @param aFlags
     *            flags.
     * @return inflected word forms that are longer than two characters.
     */
    protected List<String> buildWords(String aWord, char[] aFlags)
    {
        List<String> words = new ArrayList<String>();
        for (char c : aFlags) {
            List<Affix> aff = affixes.get(c);
            if (aff == null) {
                continue;
            }
            for (Affix affix : aff) {
                String w = affix.handleWord(aWord);
                if (w != null && w.length() > 2) {
                    words.add(w);
                }
            }
        }
        return words;
    }

    /**
     * Closes a reader; a failure to close is only printed so that it cannot abort loading.
     */
    private static void closeQuietly(BufferedReader aReader)
    {
        try {
            aReader.close();
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }
}