/*
* Copyright (C) 2008-2015 by Holger Arndt
*
* This file is part of the Universal Java Matrix Package (UJMP).
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership and licensing.
*
* UJMP is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* UJMP is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with UJMP; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301 USA
*/
package org.ujmp.core.text;
import java.io.File;
import org.ujmp.core.Matrix;
import org.ujmp.core.calculation.Calculation.Ret;
import org.ujmp.core.util.io.IntelligentFileReader;
/**
 * Languages for which character statistics are available, together with a
 * simple bigram-based language guesser.
 *
 * Each constant bundles the alphabet, the allowed characters, the reference
 * character and character-bigram frequency matrices, and the average word
 * length taken from the corresponding language class ({@link English},
 * {@link German}).
 */
public enum Language {

	ENGLISH(English.ALPHABET, English.ALLOWEDCHARACTERS, English.CHARFREQUENCIES,
			English.CHARBIGRAMFREQUENCIES, English.AVERAGE_WORD_LENGTH),

	GERMAN(German.ALPHABET, German.ALLOWEDCHARACTERS, German.CHARFREQUENCIES,
			German.CHARBIGRAMFREQUENCIES, German.AVERAGE_WORD_LENGTH);

	private final char[] alphabet;

	private final char[] allowedCharacters;

	private final Matrix charFrequencies;

	private final Matrix charBigramFrequencies;

	private final double averageWordLength;

	/**
	 * Stores the given statistics as-is; no argument is copied.
	 */
	private Language(char[] alphabet, char[] allowedCharacters, Matrix charFrequencies,
			Matrix charBigramFrequencies, double averageWordLength) {
		this.alphabet = alphabet;
		this.allowedCharacters = allowedCharacters;
		this.charFrequencies = charFrequencies;
		this.charBigramFrequencies = charBigramFrequencies;
		this.averageWordLength = averageWordLength;
	}

	/**
	 * Returns the alphabet of this language.
	 *
	 * @return the array supplied at construction time; it is not copied, so
	 *         callers must not modify it
	 */
	public char[] getAlphabet() {
		return alphabet;
	}

	/**
	 * Returns the characters allowed in this language.
	 *
	 * @return the array supplied at construction time; it is not copied, so
	 *         callers must not modify it
	 */
	public char[] getAllowedCharacters() {
		return allowedCharacters;
	}

	/**
	 * Loads the given file with {@link IntelligentFileReader} and guesses the
	 * language of its content.
	 *
	 * @param file
	 *            the file to read
	 * @return the best matching language, or <code>null</code> if none could
	 *         be determined
	 * @see #guess(String)
	 */
	public static final Language guess(File file) {
		return guess(IntelligentFileReader.load(file));
	}

	/**
	 * Returns the average word length stored for this language.
	 *
	 * @return the average word length
	 */
	public double getAverageWordLength() {
		return averageWordLength;
	}

	/**
	 * Guesses the language of a text by comparing its character bigram
	 * statistics with the reference statistics of every known language.
	 *
	 * For each language the bigrams of the text are counted over that
	 * language's alphabet and normalized by their total. The cosine similarity
	 * between the logarithm of these frequencies and the logarithm of the
	 * language's reference bigram frequencies is the score. The language with
	 * the highest score wins; on a tie the language declared first is kept.
	 *
	 * @param string
	 *            the text to classify
	 * @return the best matching language, or <code>null</code> if the text is
	 *         <code>null</code> or empty, or if no language reaches a
	 *         similarity greater than zero
	 */
	public static final Language guess(String string) {
		// Nothing to count: without this guard an empty text leads to a
		// division by a zero bigram sum below.
		if (string == null || string.length() == 0) {
			return null;
		}

		double bestSim = 0;
		Language bestLanguage = null;

		for (Language lang : values()) {
			Matrix count = TextUtil.getCharacterBigramFrequencies(string, lang.getAlphabet());
			double sum = count.getValueSum();

			// NOTE(review): the boolean arguments below are presumably the
			// "ignore NaN" flags of the UJMP calculations - confirm.
			Matrix freq = count.divide(Ret.NEW, true, sum).log(Ret.NEW);
			Matrix reference = lang.getCharacterBigramFrequencies().log(Ret.NEW);
			double sim = freq.cosineSimilarityTo(reference, true);

			// A NaN similarity never compares greater, so such a language is
			// skipped automatically.
			if (sim > bestSim) {
				bestSim = sim;
				bestLanguage = lang;
			}
		}

		return bestLanguage;
	}

	/**
	 * Returns the reference character frequencies of this language.
	 *
	 * @return the matrix supplied at construction time (not copied)
	 */
	public Matrix getCharacterFrequencies() {
		return charFrequencies;
	}

	/**
	 * Returns the reference character bigram frequencies of this language.
	 *
	 * @return the matrix supplied at construction time (not copied)
	 */
	public Matrix getCharacterBigramFrequencies() {
		return charBigramFrequencies;
	}

}