/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.bigdata;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.store.FSDirectory;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.languagemodel.LuceneLanguageModel;
import org.languagetool.rules.ConfusionSet;
import org.languagetool.rules.ConfusionSetLoader;
import java.io.*;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Automatically run {@link ConfusionRuleEvaluator} on word pairs.
 *
 * <p>Reads candidate pairs (one semicolon-separated pair per line), skips pairs
 * that are already part of the loaded confusion sets, collects example sentences
 * for both words from a Lucene index, and prints every evaluation result that
 * reaches {@link #MIN_PRECISION} and {@link #MIN_RECALL}.
 *
 * @since 3.2
 */
@SuppressWarnings({"resource", "CallToPrintStackTrace"})
class AutomaticConfusionRuleEvaluator {

  /** Short code of the language to evaluate; also selects the confusion set resource. */
  private static final String LANGUAGE = "en";
  /** Passed to {@link ConfusionRuleEvaluator} and used to filter example sentences. */
  private static final boolean CASE_SENSITIVE = false;
  /** Upper limit of example sentences per word. */
  private static final int MAX_EXAMPLES = 1000;
  /** Factors handed to {@code ConfusionRuleEvaluator.run()}. */
  private static final List<Long> EVAL_FACTORS = Arrays.asList(10L, 100L, 1_000L, 10_000L, 100_000L, 1_000_000L, 10_000_000L);
  /** Minimum precision a result needs to be reported. */
  private static final float MIN_PRECISION = 0.99f;
  /** Minimum recall a result needs to be reported. */
  private static final float MIN_RECALL = 0.1f;

  private final IndexSearcher searcher;
  private final Map<String, List<ConfusionSet>> knownSets;

  /** Number of pairs skipped because they are already in the known confusion sets. */
  private int ignored = 0;

  /**
   * @param luceneIndexDir directory of the Lucene index that is searched for example sentences
   * @throws IOException if the index or the confusion set resource cannot be read
   */
  AutomaticConfusionRuleEvaluator(File luceneIndexDir) throws IOException {
    // The reader stays open for the lifetime of this object, as the searcher depends on it.
    DirectoryReader reader = DirectoryReader.open(FSDirectory.open(luceneIndexDir.toPath()));
    searcher = new IndexSearcher(reader);
    String confusionSetPath = "/" + LANGUAGE + "/confusion_sets.txt";
    // The stream is only needed while loading, so close it right afterwards.
    try (InputStream confusionSetStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(confusionSetPath)) {
      knownSets = new ConfusionSetLoader().loadConfusionSet(confusionSetStream);
    }
  }

  /**
   * Evaluates every pair in {@code lines}. Lines containing {@code #} are skipped.
   *
   * @param lines candidate pairs, each in the form {@code word1; word2}
   * @param indexDir directory passed to {@link LuceneLanguageModel}
   * @throws IOException if a line does not consist of exactly two semicolon-separated parts,
   *                     or if reading or writing data fails
   */
  private void run(List<String> lines, File indexDir) throws IOException {
    Language language = Languages.getLanguageForShortCode(LANGUAGE);
    LanguageModel lm = new LuceneLanguageModel(indexDir);
    ConfusionRuleEvaluator evaluator = new ConfusionRuleEvaluator(language, lm, CASE_SENSITIVE);
    for (String line : lines) {
      if (line.contains("#")) {
        System.out.println("Ignoring: " + line);
        continue;
      }
      String[] parts = line.split(";\\s*");
      if (parts.length != 2) {
        throw new IOException("Expected semicolon-separated input: " + line);
      }
      try {
        // exactly two parts are guaranteed above, so there is a single pair to compare
        runOnPair(evaluator, line, removeComment(parts[0]), removeComment(parts[1]));
      } catch (RuntimeException e) {
        // best effort: a failure for one pair should not stop the remaining pairs
        e.printStackTrace();
      }
    }
    System.out.println("Done. Ignored items because they are already known: " + ignored);
  }

  /** Returns {@code str} without the first {@code |} and everything following it. */
  private String removeComment(String str) {
    return str.replaceFirst("\\|.*", "");
  }

  /**
   * Evaluates a single pair and prints the results, unless the pair is already
   * part of a known confusion set.
   *
   * @param evaluator the evaluator to run
   * @param line the original input line, used for log output only
   * @param part1 first word of the pair
   * @param part2 second word of the pair
   */
  private void runOnPair(ConfusionRuleEvaluator evaluator, String line, String part1, String part2) throws IOException {
    if (isKnownPair(part1, part2)) {
      System.out.println("Ignoring: " + part1 + "/" + part2 + ", in active confusion sets already");
      ignored++;
      return;
    }
    System.out.println("Working on: " + line);
    File sentencesFile = writeExampleSentencesToTempFile(new String[]{part1, part2});
    List<String> input = Arrays.asList(sentencesFile.getAbsolutePath());
    Map<Long, ConfusionRuleEvaluator.EvalResult> results = evaluator.run(input, part1, part2, MAX_EXAMPLES, EVAL_FACTORS);
    Map<Long, ConfusionRuleEvaluator.EvalResult> bestResults = findBestFactor(results);
    if (bestResults.isEmpty()) {
      System.out.println("No good result found for " + part1 + "/" + part2);
    } else {
      for (ConfusionRuleEvaluator.EvalResult result : bestResults.values()) {
        System.out.println("=> " + result.getSummary());
      }
    }
  }

  /**
   * Returns whether one of the confusion sets registered under {@code part1}
   * contains both {@code part1} and {@code part2}.
   */
  private boolean isKnownPair(String part1, String part2) {
    // direct lookup instead of scanning all map entries for a matching key
    List<ConfusionSet> candidateSets = knownSets.get(part1);
    if (candidateSets == null) {
      return false;
    }
    List<String> pair = Arrays.asList(part1, part2);
    for (ConfusionSet set : candidateSets) {
      Set<String> stringSet = set.getSet().stream().map(l -> l.getString()).collect(Collectors.toSet());
      if (stringSet.containsAll(pair)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Returns the entries of {@code results} whose precision and recall reach
   * {@link #MIN_PRECISION} and {@link #MIN_RECALL}, in the iteration order of
   * {@code results}. The returned map is empty if no entry qualifies.
   */
  private Map<Long, ConfusionRuleEvaluator.EvalResult> findBestFactor(Map<Long, ConfusionRuleEvaluator.EvalResult> results) {
    // LinkedHashMap keeps the iteration order of the input so that the output is stable
    Map<Long, ConfusionRuleEvaluator.EvalResult> filteredResults = new LinkedHashMap<>();
    for (Map.Entry<Long, ConfusionRuleEvaluator.EvalResult> entry : results.entrySet()) {
      ConfusionRuleEvaluator.EvalResult result = entry.getValue();
      if (result.getPrecision() >= MIN_PRECISION && result.getRecall() >= MIN_RECALL) {
        filteredResults.put(entry.getKey(), result);
      }
    }
    return filteredResults;
  }

  /**
   * Writes example sentences for all {@code words} to a file in the directory
   * given by the {@code java.io.tmpdir} system property. The same file name is
   * used on every call, so a previous file is overwritten.
   *
   * @return the file the sentences were written to
   */
  private File writeExampleSentencesToTempFile(String[] words) throws IOException {
    // use the platform's temp directory instead of a hard-coded "/tmp"
    File tempFile = new File(System.getProperty("java.io.tmpdir"), "example-sentences.txt");
    try (FileWriter fw = new FileWriter(tempFile)) {
      for (String word : words) {
        findExampleSentences(word, fw);
      }
      System.out.println("Example sentences written to " + tempFile);
    }
    return tempFile;
  }

  /**
   * Searches the index for sentences containing {@code word} and writes at most
   * {@link #MAX_EXAMPLES} of them to {@code fw}, one per line.
   *
   * @param word the term to search for
   * @param fw the writer receiving the sentences; not closed by this method
   */
  private void findExampleSentences(String word, FileWriter fw) throws IOException {
    Term term = new Term(TextIndexCreator.FIELD, CASE_SENSITIVE ? word.toLowerCase() : word);
    // In case-sensitive mode hits are filtered below, so fetch all of them here.
    int maxHits = CASE_SENSITIVE ? Integer.MAX_VALUE : MAX_EXAMPLES;
    TopDocs topDocs = searcher.search(new TermQuery(term), maxHits);
    int count = 0;
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
      String sentence = searcher.doc(scoreDoc.doc).get(TextIndexCreator.FIELD);
      if (!CASE_SENSITIVE || sentence.contains(word)) {
        fw.write(sentence + "\n");
        count++;
      }
      // ">=" so that no more than MAX_EXAMPLES sentences are written
      if (count >= MAX_EXAMPLES) {
        break;
      }
    }
    System.out.println("Found " + count + " examples for " + word);
  }

  /**
   * Command line entry point, see the usage message for the expected arguments.
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 3) {
      System.out.println("Usage: " + AutomaticConfusionRuleEvaluator.class.getSimpleName() + " <confusionPairCandidates> <exampleSentenceIndexDir> <ngramDir>");
      System.out.println("   <confusionPairCandidates> is a semicolon-separated list of words (one pair per line)");
      System.out.println("   <exampleSentenceIndexDir> is a Lucene index created by TextIndexCreator");
      System.out.println("   <ngramDir> is the index directory passed to LuceneLanguageModel");
      System.exit(1);
    }
    List<String> lines;
    // close the input file once all lines are read
    try (InputStream candidateStream = new FileInputStream(args[0])) {
      lines = IOUtils.readLines(candidateStream, "utf-8");
    }
    AutomaticConfusionRuleEvaluator eval = new AutomaticConfusionRuleEvaluator(new File(args[1]));
    eval.run(lines, new File(args[2]));
  }

}