/*
* #!
* Ontopia Classify
* #-
* Copyright (C) 2001 - 2013 The Ontopia Project
* #-
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* !#
*/
package net.ontopia.topicmaps.classify;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* INTERNAL: Object representing a particular language. The object is
* really just a container for a stemmer, a stop list, and a frequency
* analyzer.
*/
public class Language {

  // Define a logging category.
  static Logger log = LoggerFactory.getLogger(Language.class.getName());

  // Prefix shared by the per-language ".freq" and ".stop" resource names.
  private static final String RESOURCE_PREFIX =
    "net/ontopia/topicmaps/classify/lang/";

  // Languages considered by detectLanguage, in registration order.
  // The built-in languages are registered first so that they take
  // precedence when scores are tied.
  private static final List<Language> languages = new ArrayList<Language>();
  static {
    languages.add(Language.getLanguage("en"));
    languages.add(Language.getLanguage("no"));
  }

  protected String id;
  protected TermStemmerIF stemmer;
  protected StopList stoplist;
  protected FrequencyAnalyzer frequency;

  /**
   * INTERNAL: Creates a language from its id. The stemmer is a
   * SnowballStemmer created with the id, and the frequency analyzer
   * and stop list are created from the resource names
   * "net/ontopia/topicmaps/classify/lang/&lt;id&gt;.freq" and
   * "net/ontopia/topicmaps/classify/lang/&lt;id&gt;.stop".
   *
   * @param id the language id, e.g. "en" or "no"
   */
  Language(String id) {
    this.id = id;
    this.stemmer = new SnowballStemmer(id);
    this.frequency = new FrequencyAnalyzer(RESOURCE_PREFIX + id + ".freq");
    this.stoplist = new StopList(RESOURCE_PREFIX + id + ".stop");
  }

  /**
   * INTERNAL: Used to add additional languages by passing in all
   * parameters explicitly.
   *
   * @param id the language id, used by toString
   * @param stemmer the stemmer returned by getStemmer
   * @param stoplist the stop list returned by getStopListAnalyzer and
   *        used by getScore
   * @param frequency the analyzer returned by getFrequencyAnalyzer
   */
  public Language(String id, TermStemmerIF stemmer,
                  StopList stoplist, FrequencyAnalyzer frequency) {
    this.id = id;
    this.stemmer = stemmer;
    this.stoplist = stoplist;
    this.frequency = frequency;
  }

  /**
   * INTERNAL: Returns the stemmer of this language.
   */
  public TermStemmerIF getStemmer() {
    return stemmer;
  }

  /**
   * INTERNAL: Returns the stop list of this language as a term
   * analyzer.
   */
  public TermAnalyzerIF getStopListAnalyzer() {
    return stoplist;
  }

  /**
   * INTERNAL: Returns the frequency analyzer of this language.
   */
  public TermAnalyzerIF getFrequencyAnalyzer() {
    return frequency;
  }

  /**
   * INTERNAL: Scores the document against this language. The score
   * is the number of visited tokens of type Token.TYPE_VARIANT whose
   * value is a stop word in this language's stop list.
   *
   * @param doc the document whose tokens are visited
   * @return the number of stop words found in the document
   */
  public int getScore(Document doc) {
    // score is the number of stop words found in the document
    StopWordCounter slc = new StopWordCounter();
    slc.stoplist = stoplist;
    doc.visitTokens(slc);
    return slc.count;
  }

  @Override
  public String toString() {
    return "Language[" + id + "]";
  }

  /**
   * INTERNAL: Creates a new language object for the given id. A new
   * instance is created on every call.
   *
   * @param lang the language id, e.g. "en" or "no"
   */
  public static Language getLanguage(String lang) {
    return new Language(lang);
  }

  /**
   * INTERNAL: Registers a new language for use by detectLanguage.
   * <b>Warning:</b> this method is not idempotent.
   *
   * @param lang the language to register; must not be null
   * @throws NullPointerException if lang is null
   */
  public static void registerLanguage(Language lang) {
    // Fail here rather than later inside detectLanguage, where a null
    // entry would break detection for every caller.
    if (lang == null) {
      throw new NullPointerException("lang must not be null");
    }
    languages.add(lang);
  }

  /**
   * INTERNAL: Detects the language of the document based on the
   * built-in languages and new languages registered. The language
   * with the highest score (see getScore) is chosen; when several
   * languages have the same score the one registered first wins, so
   * the built-in languages take precedence over languages registered
   * later.
   *
   * @param doc the document to examine
   * @return the best-scoring registered language
   */
  public static Language detectLanguage(Document doc) {
    Language high = null;
    // Scores are never negative, so starting at -1 guarantees that the
    // first language is selected even when it scores zero.
    int highscore = -1;
    for (Language lang : languages) {
      int score = lang.getScore(doc);
      log.debug("Score '{}'={}", lang, score);
      // Strictly greater: a later language must beat, not merely
      // equal, the current best in order to replace it.
      if (score > highscore) {
        highscore = score;
        high = lang;
      }
    }
    log.debug("Detected language '{}'", high);
    return high;
  }

  /**
   * Token visitor that counts the visited tokens of type
   * Token.TYPE_VARIANT whose value is a stop word in the assigned
   * stop list. The stoplist field must be set before visiting.
   */
  static class StopWordCounter extends TokenVisitor {
    StopList stoplist;
    int count;

    public void visit(Token token) {
      if (token.getType() == Token.TYPE_VARIANT &&
          stoplist.isStopWord(token.getValue())) {
        count++;
      }
    }
  }
}