/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.linguistic;
import java.util.EnumMap;
import org.carrot2.core.LanguageCode;
import org.carrot2.shaded.guava.common.base.Predicate;
import org.carrot2.shaded.guava.common.collect.Maps;
import org.carrot2.text.linguistic.lucene.ArabicStemmerAdapter;
import org.carrot2.text.linguistic.lucene.HindiStemmerAdapter;
import org.carrot2.text.linguistic.morfologik.MorfologikStemmerAdapter;
import org.carrot2.text.linguistic.snowball.SnowballProgram;
import org.carrot2.text.linguistic.snowball.stemmers.*;
import org.carrot2.util.annotations.ThreadSafe;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.factory.FallbackFactory;
import org.carrot2.util.factory.IFactory;
import org.carrot2.util.factory.NewClassInstanceFactory;
import org.carrot2.util.factory.SingletonFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * An {@link IStemmerFactory} that serves stemmers from a static table holding one
 * {@link IFactory} per {@link LanguageCode}. The table is built once, during class
 * initialization, and is only read afterwards.
 */
@Bindable
@ThreadSafe
public class DefaultStemmerFactory implements IStemmerFactory
{
    private static final Logger logger = LoggerFactory.getLogger(DefaultStemmerFactory.class);

    /**
     * Stemmer factories keyed by language. Assigned in the static initializer below;
     * {@link #createDefaultStemmers()} registers an entry for every value of
     * {@link LanguageCode}.
     */
    private static final EnumMap<LanguageCode, IFactory<IStemmer>> stemmerFactories;

    /**
     * Functional verification for {@link IStemmer}: stems a sample word and reports
     * success as long as the call does not throw.
     */
    private static final Predicate<IStemmer> stemmerVerifier = new Predicate<IStemmer>()
    {
        @Override
        public boolean apply(IStemmer stemmer)
        {
            // Assume functional if there's no exception; the stemmed value itself is ignored.
            stemmer.stem("verification");
            return true;
        }
    };

    /**
     * Initialize factories. This must stay below the declarations of {@link #logger} and
     * {@link #stemmerVerifier}, because {@link #createDefaultStemmers()} reads both.
     */
    static
    {
        stemmerFactories = createDefaultStemmers();
    }

    /**
     * Returns a stemmer created by the factory registered for the given language.
     *
     * @param languageCode the language to get a stemmer for; must not be <code>null</code>
     *        (there is no table entry for <code>null</code>, so the lookup result would be
     *        dereferenced as <code>null</code>)
     * @return the stemmer produced by the language's factory
     */
    @Override
    public IStemmer getStemmer(LanguageCode languageCode)
    {
        return stemmerFactories.get(languageCode).createInstance();
    }

    /**
     * Create default stemmer factories.
     *
     * @return a map with an entry for every {@link LanguageCode}: languages with a
     *         dedicated stemmer get that stemmer's factory wrapped in a
     *         {@link FallbackFactory}, all remaining languages get the shared identity
     *         stemmer factory
     */
    private static EnumMap<LanguageCode, IFactory<IStemmer>> createDefaultStemmers()
    {
        final IFactory<IStemmer> identity = new SingletonFactory<IStemmer>(new IdentityStemmer());
        final EnumMap<LanguageCode, IFactory<IStemmer>> map = Maps.newEnumMap(LanguageCode.class);

        // Adapters to third-party libraries.
        map.put(LanguageCode.POLISH, new NewClassInstanceFactory<IStemmer>(MorfologikStemmerAdapter.class));
        map.put(LanguageCode.ARABIC, new NewClassInstanceFactory<IStemmer>(ArabicStemmerAdapter.class));

        // Adapters to snowball.
        map.put(LanguageCode.DANISH, snowball(DanishStemmer.class));
        map.put(LanguageCode.DUTCH, snowball(DutchStemmer.class));
        map.put(LanguageCode.ENGLISH, snowball(EnglishStemmer.class));
        map.put(LanguageCode.FINNISH, snowball(FinnishStemmer.class));
        map.put(LanguageCode.FRENCH, snowball(FrenchStemmer.class));
        map.put(LanguageCode.GERMAN, snowball(GermanStemmer.class));
        map.put(LanguageCode.HUNGARIAN, snowball(HungarianStemmer.class));
        map.put(LanguageCode.ITALIAN, snowball(ItalianStemmer.class));
        map.put(LanguageCode.NORWEGIAN, snowball(NorwegianStemmer.class));
        map.put(LanguageCode.PORTUGUESE, snowball(PortugueseStemmer.class));
        map.put(LanguageCode.ROMANIAN, snowball(RomanianStemmer.class));
        map.put(LanguageCode.RUSSIAN, snowball(RussianStemmer.class));
        map.put(LanguageCode.SPANISH, snowball(SpanishStemmer.class));
        map.put(LanguageCode.SWEDISH, snowball(SwedishStemmer.class));
        map.put(LanguageCode.TURKISH, snowball(TurkishStemmer.class));

        // Identity stemming for Chinese.
        map.put(LanguageCode.CHINESE_SIMPLIFIED, identity);

        // Specialized stemming for Hindi (ported from Lucene)
        map.put(LanguageCode.HINDI, new NewClassInstanceFactory<IStemmer>(HindiStemmerAdapter.class));

        // Decorate everything with a fallback identity stemmer.
        for (LanguageCode lc : LanguageCode.values())
        {
            if (map.containsKey(lc))
            {
                IFactory<IStemmer> factory = map.get(lc);

                // Reference comparison on purpose: the shared identity factory needs no
                // fallback to itself.
                if (factory != identity)
                {
                    // NOTE(review): FallbackFactory is presumably expected to hand out the
                    // identity factory's stemmer and log the message below when the primary
                    // factory fails or its product does not pass stemmerVerifier -- confirm
                    // against FallbackFactory.
                    factory = new FallbackFactory<IStemmer>(
                        factory, identity, stemmerVerifier,
                        logger, "Stemmer for "
                            + lc.toString() + " (" + lc.getIsoCode() + ") is not available."
                            + " This may degrade clustering quality of "
                            + lc.toString() + " content. Cause: {}");
                    map.put(lc, factory);
                }
            }
            else
            {
                // No dedicated stemmer registered for this language.
                map.put(lc, identity);
            }
        }

        return map;
    }

    /**
     * Creates a factory that, on each call, instantiates the given snowball program
     * through its no-argument constructor and wraps it in a {@link SnowballStemmerAdapter}.
     *
     * @param clazz the snowball stemmer class to instantiate
     * @return a factory producing a new adapter, around a new stemmer instance, per call;
     *         the factory throws {@link RuntimeException} wrapping the reflective failure
     *         if the class cannot be instantiated
     */
    private static IFactory<IStemmer> snowball(final Class<? extends SnowballProgram> clazz)
    {
        return new IFactory<IStemmer>()
        {
            @Override
            public IStemmer createInstance()
            {
                try
                {
                    // Class.newInstance() is deprecated because it propagates checked
                    // exceptions thrown by the constructor without compile-time checking.
                    // Going through the Constructor reports them as
                    // InvocationTargetException, which is handled below.
                    return new SnowballStemmerAdapter(clazz.getDeclaredConstructor().newInstance());
                }
                catch (ReflectiveOperationException e)
                {
                    throw new RuntimeException(e);
                }
            }
        };
    }
}