/*
 * Copyright 2013
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.readability.measure;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

/**
 * Counts syllables in words using a simple vowel/consonant heuristic.
 *
 * <p>
 * This class is based on the functions 'syll_en' and 'syll_de' of the 'style' command
 * (part of the 'diction' package).
 *
 * <p>
 * The language code given at construction time selects the vowel set and the suffix rule:
 * <ul>
 * <li>{@code "en"}: vowels are a, e, i, o, u, y; a trailing "ed" is ignored.</li>
 * <li>{@code "de"}: vowels are a, e, i, o, u plus the three lowercase umlauts; a trailing
 * 'e' that follows a non-vowel counts as one syllable.</li>
 * <li>anything else (including {@code null}): vowels are a, e, i, o, u; no suffix rule.</li>
 * </ul>
 */
public class WordSyllableCounter {

    private static final String ENGLISH = "en";
    private static final String GERMAN = "de";

    /** Vowels used when the language is neither English nor German. */
    private static final Set<Character> DEFAULT_VOWELS = new HashSet<Character>(
            Arrays.asList('a', 'e', 'i', 'o', 'u'));

    /** English vowels; 'y' is treated as a vowel. */
    private static final Set<Character> EN_VOWELS = new HashSet<Character>(
            Arrays.asList('a', 'e', 'i', 'o', 'u', 'y'));

    /**
     * German vowels. The umlauts (a-umlaut, o-umlaut, u-umlaut) are written as Unicode
     * escapes so the set does not depend on the encoding used to compile this file.
     */
    private static final Set<Character> DE_VOWELS = new HashSet<Character>(
            Arrays.asList('a', 'e', 'i', 'o', 'u', '\u00e4', '\u00f6', '\u00fc'));

    private final String languageCode;

    /** Vowel set for {@link #languageCode}, chosen once in the constructor. */
    private final Set<Character> vowels;

    /**
     * Creates a counter for the given language.
     *
     * @param languageCode
     *            {@code "en"} or {@code "de"} to enable the language-specific rules; any
     *            other value, including {@code null}, selects the generic vowel set
     */
    public WordSyllableCounter(String languageCode) {
        this.languageCode = languageCode;
        // Constant-first equals() keeps a null language code from throwing later on.
        if (ENGLISH.equals(languageCode)) {
            vowels = EN_VOWELS;
        } else if (GERMAN.equals(languageCode)) {
            vowels = DE_VOWELS;
        } else {
            vowels = DEFAULT_VOWELS;
        }
    }

    /** Returns whether the (already lowercased) character is a vowel in this language. */
    private boolean isVowel(char character) {
        return vowels.contains(character);
    }

    /**
     * Sums the syllable counts of all given words.
     *
     * @param words
     *            the words to count; must not be {@code null} nor contain {@code null}
     * @return the sum of {@link #countSyllables(String)} over all words, 0 if there are none
     */
    public int countSyllables(Iterable<String> words) {
        int count = 0;
        for (String word : words) {
            count += countSyllables(word);
        }
        return count;
    }

    /**
     * Estimates the number of syllables in a single word.
     *
     * <p>
     * The word is lowercased, the language-specific suffix rule is applied, and then every
     * position where a vowel is immediately followed by a non-vowel counts as one syllable.
     * A vowel at the very end of the inspected range is therefore not counted.
     *
     * @param word
     *            the word to inspect; must not be {@code null}
     * @return the estimated syllable count, never less than 1 (also for the empty string)
     * @throws NullPointerException
     *             if {@code word} is {@code null}
     */
    public int countSyllables(String word) {
        // Locale.ROOT: the default-locale variant maps 'I' to a dotless i under Turkish and
        // Azeri locales, which would then not be recognised as a vowel.
        String lowcaseWord = word.toLowerCase(Locale.ROOT);
        int count = 0;

        // Exclusive end of the part of the word scanned below.
        int end = lowcaseWord.length();

        if (ENGLISH.equals(languageCode)) {
            // A trailing "ed" is dropped and contributes no syllable.
            if (lowcaseWord.endsWith("ed")) {
                end -= 2;
            }
        } else if (GERMAN.equals(languageCode)) {
            // A final 'e' after a non-vowel counts as one syllable; both characters are
            // then removed from the scan.
            if (end >= 2 && lowcaseWord.charAt(end - 1) == 'e'
                    && !isVowel(lowcaseWord.charAt(end - 2))) {
                count++;
                end -= 2;
            }
        }

        // Count every vowel -> non-vowel transition.
        for (int i = 0; i < end - 1; i++) {
            if (isVowel(lowcaseWord.charAt(i)) && !isVowel(lowcaseWord.charAt(i + 1))) {
                count++;
            }
        }

        // Every word has at least one syllable.
        return count == 0 ? 1 : count;
    }
}