/**
* OpenKM, Open Document Management System (http://www.openkm.com)
* Copyright (c) 2006-2011 Paco Avila & Josep Llort
*
* No bytes were intentionally harmed during the development of this application.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package com.openkm.kea.stemmers;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
* A stemmer for German words. The algorithm is based on the report
* "A Fast and Simple Stemming Algorithm for German Words" by Joerg
* Caumanns (joerg.caumanns@isst.fhg.de).
*
* Changed stem() from protected to public.
* Changed the coding of umlauts to Unicode escapes.
*
* @author Gerhard Schwarz
* @version $Id: GermanStemmer.java,v 1.1 2004/12/15 01:13:54 mdewsnip Exp $
*/
public class GermanStemmer extends Stemmer {

    /** Serialization version identifier. */
    private static final long serialVersionUID = 1L;

    /**
     * Working buffer that holds the term while it is being stemmed. It is
     * cleared and refilled on every call to {@link #stem(String)}.
     */
    private StringBuffer sb = new StringBuffer();

    /**
     * Whether the term currently being stemmed started with an upper-case
     * letter; such terms are handled as nouns (their trailing "t" is kept).
     */
    private boolean uppercase = false;

    /**
     * Number of substitutions counted by {@code substitute()} for the current
     * term. {@code strip()} adds it to the buffer length when applying its
     * length restrictions.
     */
    private int substCount = 0;

    /**
     * Stems the given term to a unique <em>discriminator</em>.
     *
     * <p>The term is lower-cased first. Terms that are empty or contain any
     * non-letter character are returned (lower-cased) without stemming. This
     * method mutates the instance fields {@code sb}, {@code uppercase} and
     * {@code substCount}, so one instance keeps per-call state.
     *
     * @param term the term that should be stemmed
     * @return the discriminator for {@code term}
     * @throws NullPointerException if {@code term} is {@code null}
     */
    public String stem(String term) {
        // An empty term has nothing to stem; returning early also avoids
        // charAt(0) failing on it.
        if (term.length() == 0) {
            return term;
        }

        // Mark a possible noun before the case information is lost.
        uppercase = Character.isUpperCase(term.charAt(0));
        term = term.toLowerCase();
        if (!isStemmable(term)) {
            return term;
        }

        // Reset the shared buffer and load the term.
        sb.setLength(0);
        sb.append(term);

        // The order matters: tokens created by substitute() are consumed by
        // strip()/optimize() and expanded again by resubstitute().
        substitute(sb);
        strip(sb);
        optimize(sb);
        resubstitute(sb);
        removeParticleDenotion(sb);
        return sb.toString();
    }

    /**
     * Checks whether a term can be stemmed.
     *
     * @param term the (already lower-cased) term to check
     * @return {@code true} if, and only if, every character of the term is a
     *         letter
     */
    private boolean isStemmable(String term) {
        for (int i = 0; i < term.length(); i++) {
            if (!Character.isLetter(term.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether the buffer currently ends with the given suffix.
     *
     * @param buffer the buffer to inspect
     * @param suffix the suffix to look for
     * @return {@code true} if the last characters of {@code buffer} equal
     *         {@code suffix}
     */
    private static boolean endsWith(StringBuffer buffer, String suffix) {
        int start = buffer.length() - suffix.length();
        if (start < 0) {
            return false;
        }
        for (int i = 0; i < suffix.length(); i++) {
            if (buffer.charAt(start + i) != suffix.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Strips suffixes from the term in the buffer. Stripping is reduced to the
     * seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd", from
     * which all regular suffixes are built. This simplification causes some
     * overstemming and more irregular stems, but still yields unique
     * discriminators in most of those cases. The algorithm is context free,
     * except for the length restrictions.
     *
     * <p>Suffixes are removed repeatedly until none matches or the buffer has
     * three characters or fewer.
     *
     * @param buffer the buffer holding the substituted term; modified in place
     */
    private void strip(StringBuffer buffer) {
        boolean doMore = true;
        while (doMore && buffer.length() > 3) {
            int len = buffer.length();
            char last = buffer.charAt(len - 1);

            if (len + substCount > 5 && endsWith(buffer, "nd")) {
                buffer.delete(len - 2, len);
            } else if (len + substCount > 4
                    && (endsWith(buffer, "em") || endsWith(buffer, "er"))) {
                buffer.delete(len - 2, len);
            } else if (last == 'e' || last == 's' || last == 'n'
                    // "t" occurs only as suffix of verbs, so keep it on nouns.
                    || (last == 't' && !uppercase)) {
                buffer.deleteCharAt(len - 1);
            } else {
                doMore = false;
            }
        }
    }

    /**
     * Applies some contextual optimizations to the term in the buffer.
     *
     * @param buffer the buffer holding the stripped term; modified in place
     */
    private void optimize(StringBuffer buffer) {
        // Additional step for female plurals of professions and inhabitants:
        // "erin*" is "erinn" after substitute() masked the doubled 'n'.
        if (buffer.length() > 5 && endsWith(buffer, "erin*")) {
            buffer.deleteCharAt(buffer.length() - 1);
            strip(buffer);
        }
        // Additional step for irregular plural nouns like "Matrizen -> Matrix".
        if (buffer.length() > 0 && buffer.charAt(buffer.length() - 1) == 'z') {
            buffer.setCharAt(buffer.length() - 1, 'x');
        }
    }

    /**
     * Removes a particle denotion ("ge") from a term: the first occurrence of
     * "gege" is shortened to "ge". Terms of four characters or fewer are left
     * untouched.
     *
     * @param buffer the buffer holding the term; modified in place
     */
    private void removeParticleDenotion(StringBuffer buffer) {
        if (buffer.length() > 4) {
            int pos = buffer.indexOf("gege");
            if (pos >= 0) {
                buffer.delete(pos, pos + 2);
            }
        }
    }

    /**
     * Replaces the two-character combination starting at {@code index} with a
     * single token character and counts the substitution.
     *
     * @param buffer the buffer to modify
     * @param index  position of the first character of the combination
     * @param token  the token that stands in for the combination
     */
    private void maskPair(StringBuffer buffer, int index, char token) {
        buffer.setCharAt(index, token);
        buffer.deleteCharAt(index + 1);
        substCount++;
    }

    /**
     * Does some substitutions on the term to reduce overstemming:
     *
     * <ul>
     * <li>Substitutes umlauts with their corresponding vowel
     *     (&auml;, &ouml;, &uuml; -&gt; a, o, u); "eszett" is substituted by
     *     "ss", including at the end of the term.</li>
     * <li>Substitutes the second char of a pair of equal characters with an
     *     asterisk: ?? -&gt; ?*</li>
     * <li>Substitutes some common character combinations with a token:
     *     sch/ch/ei/ie/ig/st -&gt; $/C/%/&amp;/#/!</li>
     * </ul>
     *
     * <p>Resets {@code substCount} and increments it for every substitution
     * that changes the buffer length.
     *
     * @param buffer the buffer holding the lower-cased term; modified in place
     */
    private void substitute(StringBuffer buffer) {
        substCount = 0;
        for (int c = 0; c < buffer.length(); c++) {
            char ch = buffer.charAt(c);

            // Single-character replacements; these do not need a following
            // character, so they also apply to the last position.
            if (c > 0 && ch == buffer.charAt(c - 1)) {
                // Second char of a pair of equal characters.
                buffer.setCharAt(c, '*');
            } else if (ch == '\u00E4') {
                buffer.setCharAt(c, 'a');
            } else if (ch == '\u00F6') {
                buffer.setCharAt(c, 'o');
            } else if (ch == '\u00FC') {
                buffer.setCharAt(c, 'u');
            } else if (ch == '\u00DF') {
                buffer.setCharAt(c, 's');
                buffer.insert(c + 1, 's');
                // NOTE(review): this expansion lengthens the buffer but is
                // counted like the shortening substitutions below; kept as
                // in the original - confirm whether that is intended.
                substCount++;
            }

            // Character combinations need at least one following character.
            if (c < buffer.length() - 1) {
                // Re-read: the character at c may have been replaced above.
                char cur = buffer.charAt(c);
                char next = buffer.charAt(c + 1);

                if (c < buffer.length() - 2 && cur == 's' && next == 'c'
                        && buffer.charAt(c + 2) == 'h') {
                    buffer.setCharAt(c, '$');
                    buffer.delete(c + 1, c + 3);
                    // Two characters were removed; accumulate rather than
                    // overwrite the running count.
                    substCount += 2;
                } else if (cur == 'c' && next == 'h') {
                    maskPair(buffer, c, 'C');
                } else if (cur == 'e' && next == 'i') {
                    maskPair(buffer, c, '%');
                } else if (cur == 'i' && next == 'e') {
                    maskPair(buffer, c, '&');
                } else if (cur == 'i' && next == 'g') {
                    maskPair(buffer, c, '#');
                } else if (cur == 's' && next == 't') {
                    maskPair(buffer, c, '!');
                }
            }
        }
    }

    /**
     * Undoes the changes made by {@code substitute()} for character pairs and
     * character combinations. Umlauts remain as their corresponding vowel and
     * "eszett" remains as "ss".
     *
     * @param buffer the buffer holding the stemmed, still tokenized term;
     *               modified in place
     */
    private void resubstitute(StringBuffer buffer) {
        for (int c = 0; c < buffer.length(); c++) {
            switch (buffer.charAt(c)) {
                case '*':
                    // Restore the doubled character from its left neighbour.
                    buffer.setCharAt(c, buffer.charAt(c - 1));
                    break;
                case '$':
                    buffer.replace(c, c + 1, "sch");
                    break;
                case 'C':
                    buffer.replace(c, c + 1, "ch");
                    break;
                case '%':
                    buffer.replace(c, c + 1, "ei");
                    break;
                case '&':
                    buffer.replace(c, c + 1, "ie");
                    break;
                case '#':
                    buffer.replace(c, c + 1, "ig");
                    break;
                case '!':
                    buffer.replace(c, c + 1, "st");
                    break;
                default:
                    break;
            }
        }
    }
}