package com.cybozu.labs.langdetect;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import net.arnx.jsonic.JSON;
import net.arnx.jsonic.JSONException;
import com.cybozu.labs.langdetect.util.LangProfile;
/**
* Language Detector Factory Class
*
* This class manages an initialization and constructions of {@link Detector}.
*
* Before using language detection library,
* load profiles with {@link DetectorFactory#loadProfile(String)} method
* and set initialization parameters.
*
* When the language detection,
* construct Detector instance via {@link DetectorFactory#create()}.
* See also {@link Detector}'s sample code.
*
* <ul>
* <li>4x faster improvement based on Elmer Garduno's code. Thanks!</li>
* </ul>
*
* @see Detector
* @author Nakatani Shuyo
*/
public class DetectorFactory {
public HashMap<String, double[]> wordLangProbMap;
public ArrayList<String> langlist;
public Long seed = null;
public DetectorFactory() {
wordLangProbMap = new HashMap<String, double[]>();
langlist = new ArrayList<String>();
}
/**
* Load profiles from specified directory.
* This method must be called once before language detection.
*
* @param profileDirectory profile directory path
* @throws LangDetectException Can't open profiles(error code = {@link ErrorCode#FileLoadError})
* or profile's format is wrong (error code = {@link ErrorCode#FormatError})
*/
public void loadProfile(String profileDirectory) throws LangDetectException {
loadProfile(new File(profileDirectory));
}
/**
* Load profiles from specified directory.
* This method must be called once before language detection.
*
* @param profileDirectory profile directory path
* @throws LangDetectException Can't open profiles(error code = {@link ErrorCode#FileLoadError})
* or profile's format is wrong (error code = {@link ErrorCode#FormatError})
*/
public void loadProfile(File profileDirectory) throws LangDetectException {
File[] listFiles = profileDirectory.listFiles();
if (listFiles == null)
throw new LangDetectException(ErrorCode.NeedLoadProfileError, "Not found profile: " + profileDirectory);
int langsize = listFiles.length, index = 0;
for (File file: listFiles) {
if (file.getName().startsWith(".") || !file.isFile()) continue;
FileInputStream is = null;
try {
is = new FileInputStream(file);
LangProfile profile = JSON.decode(is, LangProfile.class);
addProfile(profile, index, langsize);
++index;
} catch (JSONException e) {
throw new LangDetectException(ErrorCode.FormatError, "profile format error in '" + file.getName() + "'");
} catch (IOException e) {
throw new LangDetectException(ErrorCode.FileLoadError, "can't open '" + file.getName() + "'");
} finally {
try {
if (is!=null) is.close();
} catch (IOException e) {}
}
}
}
/**
* Load profiles from specified directory.
* This method must be called once before language detection.
*
* @param profileDirectory profile directory path
* @throws LangDetectException Can't open profiles(error code = {@link ErrorCode#FileLoadError})
* or profile's format is wrong (error code = {@link ErrorCode#FormatError})
*/
public void loadProfile(List<String> json_profiles) throws LangDetectException {
int index = 0;
int langsize = json_profiles.size();
if (langsize < 2)
throw new LangDetectException(ErrorCode.NeedLoadProfileError, "Need more than 2 profiles");
for (String json: json_profiles) {
try {
LangProfile profile = JSON.decode(json, LangProfile.class);
addProfile(profile, index, langsize);
++index;
} catch (JSONException e) {
throw new LangDetectException(ErrorCode.FormatError, "profile format error");
}
}
}
/**
* @param profile
* @param langsize
* @param index
* @throws LangDetectException
*/
public void addProfile(LangProfile profile, int index, int langsize) throws LangDetectException {
String lang = profile.name;
if (this.langlist.contains(lang)) {
throw new LangDetectException(ErrorCode.DuplicateLangError, "duplicate the same language profile");
}
this.langlist.add(lang);
for (String word: profile.freq.keySet()) {
if (!this.wordLangProbMap.containsKey(word)) {
this.wordLangProbMap.put(word, new double[langsize]);
}
int length = word.length();
if (length >= 1 && length <= 3) {
double prob = profile.freq.get(word).doubleValue() / profile.n_words[length - 1];
this.wordLangProbMap.get(word)[index] = prob;
}
}
}
/**
* Clear loaded language profiles (reinitialization to be available)
*/
public void clear() {
this.langlist.clear();
this.wordLangProbMap.clear();
}
/**
* Construct Detector instance
*
* @return Detector instance
* @throws LangDetectException
*/
public Detector create() throws LangDetectException {
return createDetector();
}
/**
* Construct Detector instance with smoothing parameter
*
* @param alpha smoothing parameter (default value = 0.5)
* @return Detector instance
* @throws LangDetectException
*/
public Detector create(double alpha) throws LangDetectException {
Detector detector = createDetector();
detector.setAlpha(alpha);
return detector;
}
private Detector createDetector() throws LangDetectException {
if (this.langlist.size()==0)
throw new LangDetectException(ErrorCode.NeedLoadProfileError, "need to load profiles");
Detector detector = new Detector(this);
return detector;
}
public void setSeed(long seed) {
this.seed = seed;
}
public final List<String> getLangList() {
return Collections.unmodifiableList(this.langlist);
}
}