/* LanguageTool, a natural language style checker
* Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.eval;
import com.google.common.io.CharStreams;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.language.LanguageIdentifier;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Evaluate the quality of our language detection.
 *
 * <p>For every non-variant language, the evaluation data is read from the classpath resource
 * {@code /org/languagetool/dev/eval/lang/<shortCode>.txt} (one sample text per line, lines
 * starting with {@code #} are comments). Each sample is shortened word by word from the end
 * to find the shortest prefix for which the expected language is still detected.
 *
 * @since 2.9
 */
class LanguageDetectionEval {

  private final LanguageIdentifier languageIdentifier;

  /** Number of sample lines evaluated so far, across all languages. */
  private int totalInputs = 0;
  /** Number of sample lines whose full text was not detected as the expected language. */
  private int totalFailures = 0;

  LanguageDetectionEval() {
    languageIdentifier = new LanguageIdentifier();
  }

  /**
   * Runs the evaluation for one language and prints the result to stdout.
   * Language variants are skipped.
   *
   * @param language the language whose evaluation data is to be checked
   * @throws IOException if the evaluation data cannot be read
   * @throws RuntimeException if there is no evaluation data for the language
   */
  private void evaluate(Language language) throws IOException {
    if (language.isVariant()) {
      return;
    }
    String evalTextFile = "/org/languagetool/dev/eval/lang/" + language.getShortCode() + ".txt";
    System.out.println("=== " + language + " ===");
    List<String> list;
    // try-with-resources tolerates a null resource, so the null check can live inside
    // and the stream is reliably closed once the lines have been read
    try (InputStream stream = LanguageDetectionEval.class.getResourceAsStream(evalTextFile)) {
      if (stream == null) {
        throw new RuntimeException("No eval data found for " + language);
      }
      list = getLines(stream);
    }
    int minChars = 0;
    int failures = 0;
    for (String line : list) {
      try {
        minChars += getShortestCorrectDetection(line, language);
      } catch (DetectionException ignored) {
        // a failed detection is an expected evaluation outcome: it is only counted
        failures++;
      }
    }
    // Only successfully detected lines contribute to minChars, so only those may be
    // used as the divisor; this also avoids a division by zero for empty data files.
    int successes = list.size() - failures;
    if (successes > 0) {
      System.out.println("Average minimum size still correctly detected: " + (minChars / successes));
    } else {
      System.out.println("Average minimum size still correctly detected: n/a");
    }
    System.out.println("Detection failures: " + failures + " of " + list.size());
    totalFailures += failures;
  }

  /**
   * Shortens the given line token by token (from the end) and returns the length of the
   * shortest prefix that was not yet detected as a different language.
   *
   * <p>Prefixes for which the identifier returns {@code null} are not treated as a wrong
   * detection (unless it is the full line); the search simply continues with a shorter prefix.
   *
   * @param line the sample text
   * @param expectedLanguage the language the text is written in
   * @return the number of non-whitespace characters of the shortest accepted prefix
   * @throws DetectionException if not even the complete line is detected correctly
   */
  private int getShortestCorrectDetection(String line, Language expectedLanguage) {
    totalInputs++;
    String expectedCode = expectedLanguage.getShortCode();
    String[] tokens = line.split("\\s+");
    for (int i = tokens.length; i > 0; i--) {
      String text = String.join(" ", Arrays.asList(tokens).subList(0, i));
      Language detectedLangObj = languageIdentifier.detectLanguage(text);
      String detectedLang = detectedLangObj != null ? detectedLangObj.getShortCode() : null;
      boolean isFullLine = i == tokens.length;
      if (detectedLang == null) {
        if (isFullLine) {
          throw new DetectionException("Detection failed for '" + line + "', detected <null>");
        }
        continue;
      }
      if (!expectedCode.equals(detectedLang)) {
        if (isFullLine) {
          throw new DetectionException("Detection failed for '" + line + "', detected " + detectedLang);
        }
        // the prefix with i tokens is wrong, so the previous one (i + 1 tokens) was the
        // shortest one that had not been detected as a different language
        return getTextLength(tokens, i + 1);
      }
    }
    // even the first token alone was never detected as a different language
    return tokens[0].length();
  }

  /**
   * Sums up the lengths of the first {@code tokenCount} tokens. Whitespace between the
   * tokens is not counted.
   *
   * @param tokens all tokens of a line
   * @param tokenCount number of leading tokens to consider; values larger than the
   *                   array length are clamped
   * @return the total number of characters in the considered tokens
   */
  private int getTextLength(String[] tokens, int tokenCount) {
    int limit = Math.min(tokenCount, tokens.length);
    int charCount = 0;
    for (int i = 0; i < limit; i++) {
      charCount += tokens[i].length();
    }
    return charCount;
  }

  /**
   * Reads all sample lines from the stream, decoding it as UTF-8. Comment lines
   * (starting with {@code #}) and blank lines are dropped. The stream is not closed here.
   *
   * @param stream the evaluation data
   * @return the sample lines in file order
   * @throws IOException if reading fails
   */
  private List<String> getLines(InputStream stream) throws IOException {
    // explicit charset: the data is multilingual and must not depend on the platform default
    List<String> lines = CharStreams.readLines(new InputStreamReader(stream, StandardCharsets.UTF_8));
    List<String> result = new ArrayList<>();
    for (String line : lines) {
      // blank lines carry no text to detect and would otherwise be counted as failures
      if (!line.startsWith("#") && !line.trim().isEmpty()) {
        result.add(line);
      }
    }
    return result;
  }

  /**
   * Evaluates all languages known to {@link Languages#get()} and prints a summary.
   *
   * @param args ignored
   * @throws IOException if evaluation data cannot be read
   */
  public static void main(String[] args) throws IOException {
    LanguageDetectionEval eval = new LanguageDetectionEval();
    long startTime = System.currentTimeMillis();
    for (Language language : Languages.get()) {
      eval.evaluate(language);
    }
    long endTime = System.currentTimeMillis();
    System.out.println();
    System.out.println("Time: " + (endTime - startTime) + "ms");
    System.out.println("Total detection failures: " + eval.totalFailures + "/" + eval.totalInputs);
  }

  /**
   * Signals that the complete sample text was not detected as the expected language.
   * Declared static because it needs no access to the enclosing instance.
   */
  static class DetectionException extends RuntimeException {
    DetectionException(String s) {
      super(s);
    }
  }

}