/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev;
import org.apache.commons.io.IOUtils;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.rules.CorrectExample;
import org.languagetool.rules.IncorrectExample;
import org.languagetool.rules.Rule;
import org.languagetool.rules.patterns.AbstractPatternRule;
import org.languagetool.rules.patterns.PatternRule;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* Finds and removes "useless" examples sentences. "Useless" are sentences
* of type="correct" that are already covered by an incorrect counterpart
* with a correction that, when applied, leads to the correct sentence.
* NOTE: this is an ugly hack, it might miss sentences or even remove ones
* that should not be removed.
*/
final class UselessExampleFinder {
private int uselessExampleCount;
private int removedLinesCount;
private void run(Language lang) throws IOException {
File basePath = new File("/lt/git/languagetool/languagetool-language-modules");
if (!basePath.exists()) {
throw new RuntimeException("basePath does not exist: " + basePath);
}
String langCode = lang.getShortCode();
File xml = new File(basePath, "/" + langCode + "/src/main/resources/org/languagetool/rules/" + langCode + "/grammar.xml");
List<String> xmlLines = IOUtils.readLines(new FileReader(xml));
JLanguageTool tool = new JLanguageTool(lang);
for (Rule rule : tool.getAllActiveRules()) {
if (!(rule instanceof PatternRule)) {
continue;
}
List<CorrectExample> correctExamples = rule.getCorrectExamples();
List<IncorrectExample> incorrectExamples = rule.getIncorrectExamples();
for (IncorrectExample incorrectExample : incorrectExamples) {
checkCorrections(rule, correctExamples, incorrectExample, xmlLines);
}
}
System.err.println("Useless examples: " + uselessExampleCount);
System.err.println("Removed lines: " + removedLinesCount);
for (String xmlLine : xmlLines) {
System.out.println(xmlLine);
}
}
private void checkCorrections(Rule rule, List<CorrectExample> correctExamplesObjs, IncorrectExample incorrectExample, List<String> xmlLines) {
List<String> correctExamples = correctExamplesObjs.stream().map(k -> k.getExample()).collect(Collectors.toList());
List<String> corrections = incorrectExample.getCorrections();
for (String correction : corrections) {
String fixedSentence = incorrectExample.getExample().replaceAll("<marker>.*?</marker>", "<marker>" + correction.replace("$", "\\$") + "</marker>");
String fixedSentenceNoMarker = incorrectExample.getExample().replaceAll("<marker>.*?</marker>", correction.replace("$", "\\$"));
if (correctExamples.contains(fixedSentence)) {
System.err.println("Useless: " + fixedSentence + " in " + rule.getId());
removeLinesFromXml(rule, fixedSentence, xmlLines);
uselessExampleCount++;
}
if (correctExamples.contains(fixedSentenceNoMarker)) {
System.err.println("Useless: " + fixedSentenceNoMarker + " in " + rule.getId());
removeLinesFromXml(rule, fixedSentenceNoMarker, xmlLines);
uselessExampleCount++;
}
}
}
// Note: this is a bad hack, we just iterate through the file's lines
private void removeLinesFromXml(Rule rule, String sentenceToRemove, List<String> xmlLines) {
List<Integer> linesToRemove = new ArrayList<>();
String currentRuleId = null;
Pattern pattern = Pattern.compile(".*id=[\"'](.*?)[\"'].*");
String expectedSubId = ((AbstractPatternRule) rule).getSubId();
int lineCount = 0;
int subRuleCount = 0;
int removedCount = 0;
boolean inRuleGroup = false;
for (String xmlLine : xmlLines) {
if (xmlLine.contains("<rulegroup")) {
subRuleCount = 0;
inRuleGroup = true;
} else if (xmlLine.contains("</rulegroup>")) {
subRuleCount = 0;
inRuleGroup = false;
} else if ((xmlLine.contains("<rule ")||xmlLine.contains("<rule>")) && inRuleGroup) {
subRuleCount++;
}
Matcher m = pattern.matcher(xmlLine);
if (m.matches()) {
currentRuleId = m.group(1);
}
if (!xmlLine.contains("correction=") && xmlLine.contains(sentenceToRemove + "</example>")) {
if (currentRuleId != null && !currentRuleId.equals(rule.getId())) {
lineCount++;
continue;
}
if (!inRuleGroup) {
subRuleCount = 1;
}
if (!expectedSubId.equals("0") && !expectedSubId.equals(String.valueOf(subRuleCount))) {
lineCount++;
continue;
}
linesToRemove.add(lineCount);
break;
}
lineCount++;
}
Collections.reverse(linesToRemove); // start from end, as we need to remove items
for (Integer s : linesToRemove) {
xmlLines.remove(s.intValue());
removedLinesCount++;
removedCount++;
}
if (removedCount == 0) {
System.err.println("No line removed: " + rule + "[" + expectedSubId + "]");
}
}
public static void main(String[] args) throws IOException {
UselessExampleFinder prg = new UselessExampleFinder();
prg.run(Languages.getLanguageForShortCode("de"));
}
}