/*
* #!
* Ontopia Classify
* #-
* Copyright (C) 2001 - 2013 The Ontopia Project
* #-
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* !#
*/
package net.ontopia.topicmaps.classify;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.Locale;
import net.ontopia.utils.OntopiaRuntimeException;
import gnu.trove.map.hash.TObjectDoubleHashMap;
import au.com.bytecode.opencsv.CSVReader;
/**
 * INTERNAL: A frequency table giving the frequency with which a
 * particular word is used in a particular language.
 *
 * <p>The table is read once at construction time from a
 * semicolon-separated file of <code>term;factor</code> lines and is then
 * used by {@link #analyzeTerm(Term)} to scale term scores.
 */
public class FrequencyAnalyzer implements TermAnalyzerIF {

  /** Maps lower-cased terms to the factor read from the frequency table. */
  protected TObjectDoubleHashMap<String> freqs;

  /**
   * INTERNAL: Loads a frequency table as a resource. The format is a
   * plain text file where each line is 'term;factor' where factor is
   * a real in the range 0-1. The score of the term after
   * classification is multiplied with the factor. Thus, a factor of
   * 0.5 will reduce the score of the term by half.
   *
   * @param filename name of the resource, resolved through the class
   *        loader that loaded this class
   * @throws OntopiaRuntimeException if no class loader is available, the
   *         resource cannot be found, or the table cannot be read
   */
  public FrequencyAnalyzer(String filename) {
    ClassLoader cloader = FrequencyAnalyzer.class.getClassLoader();
    if (cloader == null) {
      throw new OntopiaRuntimeException("Cannot find class loader.");
    }
    InputStream istream = cloader.getResourceAsStream(filename);
    if (istream == null) {
      throw new OntopiaRuntimeException("Cannot find resource: " + filename);
    }
    // istream is closed inside load
    this.freqs = load(istream);
  }

  /**
   * INTERNAL: Loads a frequency table from a file. The format is a
   * plain text file where each line is 'term;factor' where factor is
   * a real in the range 0-1. The score of the term after
   * classification is multiplied with the factor. Thus, a factor of
   * 0.5 will reduce the score of the term by half.
   *
   * @param file the file to read the table from
   * @throws OntopiaRuntimeException if the file cannot be opened or read
   */
  public FrequencyAnalyzer(File file) {
    try {
      // istream is closed inside load
      FileInputStream istream = new FileInputStream(file);
      this.freqs = load(istream);
    } catch (IOException e) {
      throw new OntopiaRuntimeException(e);
    }
  }

  /**
   * Reads all 'term;factor' lines from the stream (decoded as UTF-8) into
   * a new table. Terms are lower-cased before being stored. Blank lines
   * are skipped. The stream is always closed before this method returns.
   *
   * @param istream the stream to read from; closed by this method
   * @return the table of lower-cased terms and their factors
   * @throws OntopiaRuntimeException if reading fails or a non-blank line
   *         does not contain both a term and a factor
   * @throws NumberFormatException if a factor is not a parsable number
   */
  private TObjectDoubleHashMap<String> load(InputStream istream) {
    try {
      BufferedReader reader = new BufferedReader(new InputStreamReader(istream, "utf-8"));
      TObjectDoubleHashMap<String> table = new TObjectDoubleHashMap<String>();
      char separator = ';';
      char quoteCharacter = '"';
      CSVReader csv = new CSVReader(reader, separator, quoteCharacter);
      try {
        String[] tuple;
        while ((tuple = csv.readNext()) != null) {
          // A blank line (e.g. a trailing newline at end of file) yields
          // no usable fields; skip it instead of failing on tuple[1].
          if (tuple.length == 0
              || (tuple.length == 1 && tuple[0].trim().length() == 0)) {
            continue;
          }
          if (tuple.length < 2) {
            throw new OntopiaRuntimeException(
                "Invalid frequency table line, expected 'term;factor': " + tuple[0]);
          }
          // Locale.ROOT keeps the key independent of the JVM's default
          // locale; analyzeTerm lower-cases its lookups the same way.
          String term = tuple[0].toLowerCase(Locale.ROOT);
          double factor = Double.parseDouble(tuple[1]);
          table.put(term, factor);
        }
      } finally {
        csv.close();
      }
      return table;
    } catch (IOException e) {
      throw new OntopiaRuntimeException(e);
    } finally {
      // Also covers the case where the readers were never constructed.
      try {
        istream.close();
      } catch (IOException e) {
        throw new OntopiaRuntimeException(e);
      }
    }
  }

  /**
   * Multiplies the score of the term by the average factor of its
   * variants. A variant whose lower-cased value has no positive factor in
   * the table contributes a neutral factor of 1. A term without variants
   * is left untouched.
   *
   * @param term the term whose score is adjusted
   */
  public void analyzeTerm(Term term) {
    Variant[] variants = term.getVariants();
    if (variants.length == 0) {
      // Nothing to average over; leave the score as it is.
      return;
    }

    double total = 0;
    for (int i = 0; i < variants.length; i++) {
      double freq = freqs.get(variants[i].getValue().toLowerCase(Locale.ROOT));
      // Unknown terms (and non-positive factors) count as "no adjustment".
      total += (freq > 0d) ? freq : 1d;
    }

    double average = total / variants.length;
    if (average > 0d) {
      term.multiplyScore(average, "frequency adjustment");
    }
  }

  /**
   * No-op: this analyzer needs no setup before analysis.
   *
   * @param tdb the term database about to be analyzed; not used
   */
  public void startAnalysis(TermDatabase tdb) {
  }

  /**
   * No-op: this analyzer holds no per-analysis state to release.
   */
  public void endAnalysis() {
  }
}