package com.transmem.nlp;
import java.util.Hashtable;
import java.util.logging.Logger;
/**
* Wrap language tools in a object factory. The tools are used to segment a sentence,
* filter words by stoplist, stemming words for indexing, chunk-parsing, and tagging.
* This class offers static methods to be shared by all instances in the application scope.
*/
public class LanguageManager
{
public static final Logger log_ = Logger.getLogger(LanguageManager.class.getName());
public static final int SEGMENTER = 1;
public static final int FILTER = 2;
public static final int STEMMER = 4;
public static final int TAGGER = 8;
public static final int PARSER = 16;
public static final int ALIGNER = 32; //TODO: aligning involves 2 languages, consider later
public static final int INDEXER = SEGMENTER | FILTER | STEMMER; ///to be used for indexing a sentence
public static final int ALL = 255; //all tools included
private static Hashtable<String,String> langnames_ = null;
private static Hashtable<String,Object> tools_ = null;
public static void setLanguageNames(Hashtable<String,String> langnames)
{
langnames_ = langnames;
}
public static String getLangNameByCode(String langCode)
{
return langnames_.get(langCode);
}
public static void addLanguageName(String code, String name)
{
langnames_.put(code, name);
}
/**
* Create object for a class.
* @param className - class name as string.
* @return Object that can be casted to a derived class.
*/
private static Object createObject(String className) throws LanguageException
{
if (tools_ == null)
{
tools_ = new Hashtable<String,Object>();
}
else if (tools_.containsKey(className))
{
return tools_.get(className);
}
Class classObject = null;
Object object = null;
try
{
if (className.indexOf('.')<0)
{
classObject = Class.forName("com.transmem.nlp."+className);
}
else
{
classObject = Class.forName(className);
}
object = classObject.newInstance();
tools_.put(className,object);
return object;
}
catch (ClassNotFoundException cfe)
{
log_.severe("LanguageManager.createObject('"+className+"') class not found exception");
throw new LanguageException("Class '"+className+"' not found");
}
catch (Exception e)
{
//System.err.println(e.toString());
//log
log_.severe("LanguageManager.createObject('"+className+"') instantiate exception:"+e);
throw new LanguageException("Class '"+className+"' instantiation error: "+e);
}
}
/**
* Create ISegmenter object.
* @param name - language name such as Chinese.
*/
public static ISegmenter createSegmenter(String name) throws LanguageException
{
assert(name != null);
if (name.length()==2)
{
if (langnames_ == null)
{
loadLangNames();
}
name = langnames_.get(name);
if (name == null)
{
throw new LanguageException("createSegmenter("+name+") error: invalid name");
}
}
if (!name.endsWith("Segmenter"))
{
return (ISegmenter)createObject(name+"Segmenter");
}
else
{
return (ISegmenter)createObject(name);
}
}
/**
* Create IStemmer object.
* @param name - language name such as English
*/
public static IStemmer createStemmer(String name) throws LanguageException
{
assert(name != null);
if (name.length()==2)
{
if (langnames_ == null)
{
loadLangNames();
}
name = langnames_.get(name);
if (name == null)
{
throw new LanguageException("createStemmer("+name+") error: invalid name");
}
}
if (!name.endsWith("Stemmer"))
{
return (IStemmer)createObject(name+"Stemmer");
}
else
{
return (IStemmer)createObject(name);
}
}
/**
* Create IFilter object.
* @param name - English, Chinese
*/
public static IFilter createFilter(String name) throws LanguageException
{
assert(name != null);
if (name.length()==2)
{
if (langnames_ == null)
{
loadLangNames();
}
name = langnames_.get(name);
if (name == null)
{
throw new LanguageException("createFilter("+name+") error: invalid name");
}
}
if (!name.endsWith("Filter"))
{
return (IFilter)createObject(name+"Filter");
}
else
{
return (IFilter)createObject(name);
}
}
/**
* Create IAligner object.
* @param name - English, Chinese ?
*/
public static IAligner createAligner(String name) throws LanguageException
{
assert(name != null);
if (name.length()==2)
{
if (langnames_ == null)
{
loadLangNames();
}
name = langnames_.get(name);
if (name == null)
{
throw new LanguageException("createAligner("+name+") error: invalid name");
}
}
if (!name.endsWith("Aligner"))
{
return (IAligner)createObject(name+"Aligner");
}
else
{
return (IAligner)createObject(name);
}
}
/**
* Create ITagger object.
* @param name - English, Chinese
*/
public static ITagger createTagger(String name) throws LanguageException
{
assert(name != null);
if (name.length()==2)
{
if (langnames_ == null)
{
loadLangNames();
}
name = langnames_.get(name);
if (name == null)
{
throw new LanguageException("createTagger("+name+") error: invalid name");
}
}
if (!name.endsWith("Tagger"))
{
return (ITagger)createObject(name+"Tagger");
}
else
{
return (ITagger)createObject(name);
}
}
/**
* Create IParser object.
* @param name - language name such as English, Chinese.
*/
public static IParser createParser(String name) throws LanguageException
{
assert(name != null);
if (name.length()==2)
{
if (langnames_ == null)
{
loadLangNames();
}
name = langnames_.get(name);
if (name == null)
{
throw new LanguageException("createParser("+name+") error: invalid name");
}
}
if (!name.endsWith("Parser"))
{
return (IParser)createObject(name+"Parser");
}
else
{
return (IParser)createObject(name);
}
}
/**
* Create a ILinguist object for the given language and selected tools.
* The language can be specified by the 2-letter code such as 'EN' for English, 'ZH' for Chinese.
* The selected tools are specified as an int with wanted tools combined, for example,
* STEMMER | FILTER means stemmer and filter tools are wanted in the linguist object.
* @param langcode - 2-letter language code
* @param tools - int for tools
* @return ILinguist object
* @throws LanguageException for invalid langcode or failure of creating the tool object.
*/
public static ILinguist createLinguist(String langcode, int tools) throws LanguageException
{
if (langnames_ == null)
{
loadLangNames();
}
if (langnames_ == null)
{
throw new LanguageException("LanguageManager.createLinguist('"+langcode+"'), langnames not loaded from DB");
}
String langName = langnames_.get(langcode);
if (langName == null)
{
throw new LanguageException("LanguageManger.createLinguist('"+langcode+"') exception: langcode not found");
}
Linguist linguist = new Linguist(langcode, langName);
//System.out.println("langName="+langName);
if ((tools & LanguageManager.SEGMENTER) == LanguageManager.SEGMENTER)
{
linguist.setSegmenter(createSegmenter(langName));
}
if ((tools & LanguageManager.FILTER) == LanguageManager.FILTER)
{
linguist.setFilter(createFilter(langName));
}
if ((tools & LanguageManager.STEMMER) == LanguageManager.STEMMER)
{
linguist.setStemmer(createStemmer(langName));
}
//if ((tools & LanguageManager.ALIGNER) == LanguageManager.ALIGNER)
//{
// linguist.setAligner(createAligner(langName));
//}
if ((tools & LanguageManager.TAGGER) == LanguageManager.TAGGER)
{
linguist.setTagger(createTagger(langName));
}
if ((tools & LanguageManager.PARSER) == LanguageManager.PARSER)
{
linguist.setParser(createParser(langName));
}
return linguist;
}
/**
* Load language names into the static hashtable with the 2-letter language codes as the key.
* The language names should be capitalised (first-letter uppercase) single word (no spaces or symbols).
*/
public static void loadLangNames()
{
//load from DB, or simply hard-code it here for the moment
langnames_ = new Hashtable<String,String>();
langnames_.put("EN","English");
langnames_.put("ZH","Chinese");
langnames_.put("AR","Arabic");
langnames_.put("DE","German");
langnames_.put("EL","Greek");
langnames_.put("ES","Spanish");
langnames_.put("FR","French");
langnames_.put("IT","Italian");
langnames_.put("JA","Japanese");
langnames_.put("KO","Korean");
langnames_.put("RU","Russian");
//log_.info("langnames_ hashtable created with "+langnames_.size()+" instances added.");
}
}