/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev;
import com.google.common.base.Strings;
import com.google.common.xml.XmlEscapers;
import org.apache.commons.io.IOUtils;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.rules.Rule;
import org.languagetool.rules.patterns.PatternRule;
import org.languagetool.rules.patterns.PatternToken;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Replaces the 'pattern' element in simple rules with the 'regexp' element.
 * WARNING: this is a hack, the rules it produces need to be checked and modified manually!
 *
 * <p>The grammar file is only read; the modified lines are printed to STDOUT and
 * progress information is printed to STDERR.
 */
final class RuleSimplifier {

  private static final String BASE_PATH = "/lt/git/languagetool/languagetool-language-modules";
  private static final String DEFAULT_LANG_CODE = "de";

  // Line-based "parsing" of the grammar file. These are used with Matcher.matches(),
  // i.e. each expression has to cover the whole line.
  private static final Pattern ID_ATTRIBUTE = Pattern.compile(".*id=[\"'](.*?)[\"'].*");
  private static final Pattern ANTIPATTERN_START = Pattern.compile(".*<antipattern.*");
  private static final Pattern ANTIPATTERN_END = Pattern.compile(".*</antipattern.*");
  private static final Pattern TOKEN_OR_PATTERN_START = Pattern.compile(".*<(token|pattern).*");
  private static final Pattern MARKER = Pattern.compile("\\s*</?marker>.*");
  private static final Pattern PATTERN_END = Pattern.compile(".*</pattern.*");
  // A backslash followed by digits, e.g. "\1"; used with Matcher.find().
  private static final Pattern BACK_REFERENCE = Pattern.compile("\\\\\\d+");

  private int touchedRulesCount;

  /**
   * Loads the grammar.xml of the given language, tries to simplify every active
   * {@link PatternRule} and prints the resulting file content to STDOUT.
   *
   * @param lang the language whose grammar file is processed
   * @throws IOException if the grammar file cannot be read
   * @throws RuntimeException if the hard-coded base path does not exist
   */
  private void run(Language lang) throws IOException {
    File basePath = new File(BASE_PATH);
    if (!basePath.exists()) {
      throw new RuntimeException("basePath does not exist: " + basePath);
    }
    String langCode = lang.getShortCode();
    File xml = new File(basePath, "/" + langCode + "/src/main/resources/org/languagetool/rules/" + langCode + "/grammar.xml");
    // NOTE(review): assumes grammar.xml is UTF-8 encoded - confirm for all languages.
    // Reading via Files closes the file again (the former FileReader was never closed)
    // and does not depend on the platform default charset. The result is copied because
    // the list is modified in simplify().
    List<String> xmlLines = new ArrayList<>(Files.readAllLines(xml.toPath(), StandardCharsets.UTF_8));
    JLanguageTool tool = new JLanguageTool(lang);
    int totalRules = 0;
    for (Rule rule : tool.getAllActiveRules()) {
      if (!(rule instanceof PatternRule)) {
        continue;
      }
      PatternRule patternRule = (PatternRule) rule;
      String id = patternRule.getFullId();
      if (isSimple(patternRule)) {
        System.err.println("Simplifying: " + id);
        simplify(patternRule, xmlLines);
      } else {
        System.err.println("Can't simplify: " + id);
      }
      totalRules++;
    }
    System.err.println("touchedRulesCount: " + touchedRulesCount + " out of " + totalRules);
    // Write with the same encoding that was used for reading. The stream wraps System.out,
    // so it is only flushed, not closed.
    PrintStream out = new PrintStream(System.out, false, StandardCharsets.UTF_8.name());
    for (String xmlLine : xmlLines) {
      out.println(xmlLine);
    }
    out.flush();
  }

  /**
   * A rule is considered simple if all its tokens are simple and it has neither a
   * start nor an end position correction.
   */
  private boolean isSimple(PatternRule rule) {
    return rule.getPatternTokens().stream().allMatch(this::isSimple)
            && rule.getStartPositionCorrection() == 0
            && rule.getEndPositionCorrection() == 0;
  }

  /**
   * A token is considered simple if it uses none of: negation, POS negation, and/or groups,
   * exceptions, inflection, POS tags, references, sentence start, or a skip of at least 1.
   */
  private boolean isSimple(PatternToken t) {
    return !(t.getNegation() || t.getPOSNegation() || t.hasAndGroup() || t.hasExceptionList() ||
            t.hasNextException() || t.hasOrGroup() || t.isInflected() || t.isPOStagRegularExpression() ||
            t.getPOStag() != null || t.isReferenceElement() || t.isSentenceStart() ||
            t.getSkipNext() >= 1);
  }

  /**
   * Builds the 'regexp' XML element for the given rule. The token strings are copied
   * verbatim and joined with single spaces. If all tokens are case-sensitive, the element
   * gets {@code case_sensitive='yes'}; if only some are, the other tokens are wrapped
   * in {@code (?i:...)}.
   *
   * @return the complete, XML-escaped 'regexp' element without indentation
   */
  private String getRegex(PatternRule rule) {
    StringBuilder sb = new StringBuilder();
    List<PatternToken> tokens = rule.getPatternTokens();
    boolean hasCSParts = tokens.stream().anyMatch(PatternToken::isCaseSensitive);
    boolean allCSParts = tokens.stream().allMatch(PatternToken::isCaseSensitive);
    // If the message or the suggestions refer to tokens by number, every token becomes
    // its own group. This does not depend on the token, so it is computed only once.
    boolean setAllParenthesis = containsBackRef(rule.getMessage()) || containsBackRef(rule.getSuggestionsOutMsg());
    for (PatternToken patternToken : tokens) {
      String str = patternToken.getString();
      if (hasCSParts && !allCSParts && !patternToken.isCaseSensitive()) {
        sb.append("(?i:");
        appendTokenString(sb, str, setAllParenthesis);
        sb.append(")");
      } else {
        appendTokenString(sb, str, setAllParenthesis);
      }
      sb.append(" ");
    }
    String escapedRegex = XmlEscapers.xmlContentEscaper().escape(sb.toString().trim());
    if (allCSParts) {
      return "<regexp case_sensitive='yes'>" + escapedRegex + "</regexp>";
    }
    return "<regexp>" + escapedRegex + "</regexp>";
  }

  /**
   * Returns whether the text contains a backslash followed by at least one digit.
   * Unlike a {@code String.matches(".*...*")} check this also works for text that
   * contains line breaks. {@code null} is treated as containing no back reference.
   */
  private boolean containsBackRef(String str) {
    return str != null && BACK_REFERENCE.matcher(str).find();
  }

  /**
   * Appends the token string, put in parentheses if it contains an alternative
   * ('|') or if {@code setAllParenthesis} is set.
   */
  private void appendTokenString(StringBuilder sb, String str, boolean setAllParenthesis) {
    if (str.contains("|") || setAllParenthesis) {
      sb.append("(").append(str).append(")");
    } else {
      sb.append(str);
    }
  }

  /**
   * Removes the lines that make up the 'pattern' element of the given rule (pattern,
   * token and marker lines) from {@code xmlLines} and inserts a single 'regexp' line at
   * the position of the first removed line. Lines inside 'antipattern' elements are kept.
   * If nothing can be removed or no closing 'pattern' tag is found, the list is left
   * unchanged and a message is printed to STDERR.
   *
   * Note: this is a bad hack, we just iterate through the file's lines
   *
   * @param rule the rule to be rewritten
   * @param xmlLines the lines of the grammar file, modified in place
   */
  private void simplify(PatternRule rule, List<String> xmlLines) {
    List<Integer> linesToRemove = new ArrayList<>();
    String expectedSubId = rule.getSubId();
    String currentRuleId = null;
    int subRuleCount = 0;
    boolean inRuleGroup = false;
    boolean inAntiPattern = false;
    String newRegex = null;
    for (int lineIndex = 0; lineIndex < xmlLines.size(); lineIndex++) {
      String xmlLine = xmlLines.get(lineIndex);
      if (xmlLine.contains("<rulegroup")) {
        subRuleCount = 0;
        inRuleGroup = true;
      } else if (xmlLine.contains("</rulegroup>")) {
        subRuleCount = 0;
        inRuleGroup = false;
      } else if ((xmlLine.contains("<rule ") || xmlLine.contains("<rule>")) && inRuleGroup) {
        subRuleCount++;
      }
      // the most recently seen 'id' attribute decides which rule the line belongs to
      Matcher idMatcher = ID_ATTRIBUTE.matcher(xmlLine);
      if (idMatcher.matches()) {
        currentRuleId = idMatcher.group(1);
      }
      if (currentRuleId != null && !currentRuleId.equals(rule.getId())) {
        continue;
      }
      if (!inRuleGroup) {
        subRuleCount = 1;
      }
      if (!expectedSubId.equals("0") && !expectedSubId.equals(String.valueOf(subRuleCount))) {
        continue;
      }
      if (ANTIPATTERN_START.matcher(xmlLine).matches()) {
        inAntiPattern = true;
      }
      // The end tag has to be checked before skipping the line, otherwise the flag is
      // never reset and everything after the first antipattern is ignored. This also
      // covers an antipattern that starts and ends on the same line.
      boolean closesAntiPattern = ANTIPATTERN_END.matcher(xmlLine).matches();
      if (inAntiPattern || closesAntiPattern) {
        if (closesAntiPattern) {
          inAntiPattern = false;
        }
        continue;
      }
      boolean closesPattern = PATTERN_END.matcher(xmlLine).matches();
      // Every line is added at most once, even if it both opens and closes the pattern;
      // a duplicate index would remove an unrelated line later on.
      if (closesPattern
              || TOKEN_OR_PATTERN_START.matcher(xmlLine).matches()
              || MARKER.matcher(xmlLine).matches()) {
        linesToRemove.add(lineIndex);
      }
      if (closesPattern) {
        // the new element gets the indentation of the closing 'pattern' tag
        int lastTokenIndent = xmlLine.indexOf("<");
        newRegex = Strings.repeat(" ", lastTokenIndent) + getRegex(rule);
      }
    }
    if (linesToRemove.isEmpty()) {
      System.err.println("No line removed: " + rule + "[" + expectedSubId + "]");
      return;
    }
    if (newRegex == null) {
      // Without a closing tag there is no replacement line; removing the tokens anyway
      // would destroy the rule and insert a null element into the list.
      System.err.println("No closing pattern tag found, rule not changed: " + rule + "[" + expectedSubId + "]");
      return;
    }
    Collections.reverse(linesToRemove); // start from end, as we need to remove items
    for (Integer index : linesToRemove) {
      xmlLines.remove(index.intValue());
    }
    // after reversing, the last element is the index of the first removed line
    xmlLines.add(linesToRemove.get(linesToRemove.size() - 1), newRegex);
    touchedRulesCount++;
  }

  /**
   * Runs the simplifier.
   *
   * @param args optional: the short code of the language to process as first argument,
   *             "de" is used if no argument is given
   */
  public static void main(String[] args) throws IOException {
    String langCode = args.length > 0 ? args[0] : DEFAULT_LANG_CODE;
    RuleSimplifier prg = new RuleSimplifier();
    prg.run(Languages.getLanguageForShortCode(langCode));
  }
}