/* LanguageTool, a natural language style checker
* Copyright (C) 2010 Daniel Naber (http://www.languagetool.org)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.conversion;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.JLanguageTool;
public abstract class RuleConverter {
// Indent strings and the matching indent widths (in characters). Neither set is
// referenced inside this class; presumably subclasses use them when laying out
// generated rule XML (the widths fit the int "indent" argument of addToken/getSpace).
// NOTE(review): the four string literals below are identical, while the *IndentInt
// constants are 2/4/6/8 - confirm whether the literals were meant to grow per level.
protected static final String firstIndent = " ";
protected static final String secondIndent = " ";
protected static final String thirdIndent = " ";
protected static final String fourthIndent = " ";
protected static final int firstIndentInt = 2;
protected static final int secondIndentInt = 4;
protected static final int thirdIndentInt = 6;
protected static final int fourthIndentInt = 8;
// Input file name, output file name and rule (file) type; set by the three-argument
// constructor or through setInFile/setOutFile/setFileType.
protected String inFileName;
protected String outFileName;
protected String ruleType;
// Lists of rules. None of these is initialized in this class; they stay null until a
// subclass assigns them (presumably in parseRuleFile() - confirm against implementations).
protected List<? extends Object> ruleObjects;
protected ArrayList<List<String>> allLtRules;
protected ArrayList<List<String>> ltRules;
protected ArrayList<List<String>> disambiguationRules;
protected ArrayList<String> originalRuleStrings;
protected ArrayList<String[]> warnings; // list as long as allLtRules, holding the warning strings generated during the rule conversion process
// Counters for auto-generating id and name attributes; both start at 0.
protected int idIndex;
protected int nameIndex;
// Sentence boundary tag names. Kept as mutable instance fields (see setSentStart/setSentEnd)
// so that they can be set depending on the language.
protected String SENT_START = "SENT_START";
protected String SENT_END = "SENT_END";
// Matches a single regex metacharacter ( . ^ $ * + ? { } [ ] | ( ) ); isRegex() uses it
// to decide whether a token or POS tag has to be flagged as a regular expression.
private static final Pattern regex = Pattern.compile("[\\.\\^\\$\\*\\+\\?\\{\\}\\[\\]\\|\\(\\)]");
/**
 * XML prolog, stylesheet processing instructions, descriptive comment and opening
 * {@code <rules lang="en">} element of an English grammar rule file.
 * <p>
 * The {@code xmlns:xsi} namespace URI is kept on one line: a line break inside the
 * attribute value would be normalized to a space by XML parsers and yield a namespace
 * name that is not the XML Schema instance namespace.
 * <p>
 * The field stays non-final so that code assigning to it keeps compiling.
 */
public static String xmlHeader =
    "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
    "<?xml-stylesheet type=\"text/xsl\" href=\"../print.xsl\" ?>\n" +
    "<?xml-stylesheet type=\"text/css\" href=\"../rules.css\"\n" +
    "title=\"Easy editing stylesheet\" ?>\n" +
    "<!--\n" +
    "English Grammar and Typo Rules for LanguageTool\n" +
    "See tagset.txt for the meaning of the POS tags\n" +
    "Copyright (C) 2001-2007 Daniel Naber (http://www.danielnaber.de)\n" +
    "$Id: grammar.xml,v 1.129 2010-11-13 23:24:21 dnaber Exp $\n" +
    "-->\n" +
    "<!--suppress CheckTagEmptyBody -->\n" +
    "<rules lang=\"en\" xsi:noNamespaceSchemaLocation=\"../rules.xsd\" " +
    "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xmlns:xs=\"http://www.w3.org/2001/XMLSchema\">\n";
/**
 * Basic constructor: only resets the id/name counters to 0.
 */
public RuleConverter() {
  this.idIndex = 0;
  this.nameIndex = 0;
}
/**
 * Constructor with input and output rule files.
 *
 * @param inFileName  name of the input rule file
 * @param outFileName name of the output rule file
 * @param ruleType    rule type; {@code null} is stored as {@code "default"}
 */
public RuleConverter(String inFileName, String outFileName, String ruleType) {
  this();
  this.inFileName = inFileName;
  this.outFileName = outFileName;
  this.ruleType = (ruleType != null) ? ruleType : "default";
}
/** @return the {@code ruleObjects} list as currently stored (may be null) */
public List<? extends Object> getRules() {
  return ruleObjects;
}

/** @return the {@code allLtRules} list as currently stored (may be null) */
public ArrayList<List<String>> getAllLtRules() {
  return allLtRules;
}

/** @return the {@code ltRules} list as currently stored (may be null) */
public ArrayList<List<String>> getLtRules() {
  return ltRules;
}

/** @return the {@code disambiguationRules} list as currently stored (may be null) */
public ArrayList<List<String>> getDisambiguationRules() {
  return disambiguationRules;
}

/** @return the {@code originalRuleStrings} list as currently stored (may be null) */
public ArrayList<String> getOriginalRuleStrings() {
  return originalRuleStrings;
}

/** @return the {@code warnings} list as currently stored (may be null) */
public ArrayList<String[]> getWarnings() {
  return warnings;
}

/** @return the input file name */
public String getInFile() {
  return this.inFileName;
}

/** @return the output file name */
public String getOutFile() {
  return this.outFileName;
}

/** @return the rule type ({@code ruleType}) */
public String getFileType() {
  return this.ruleType;
}

/** @return the tag name currently used for the sentence start */
public String getSentStart() {
  return SENT_START;
}

/** @return the tag name currently used for the sentence end */
public String getSentEnd() {
  return SENT_END;
}

/** @param filename new input file name */
public void setInFile(String filename) {
  inFileName = filename;
}

/** @param filename new output file name */
public void setOutFile(String filename) {
  outFileName = filename;
}

/** @param fileType new rule type; stored as given, no null defaulting */
public void setFileType(String fileType) {
  ruleType = fileType;
}

/** @param sent_start new tag name for the sentence start */
public void setSentStart(String sent_start) {
  SENT_START = sent_start;
}

/** @param sent_end new tag name for the sentence end */
public void setSentEnd(String sent_end) {
  SENT_END = sent_end;
}
// Abstract methods
/**
 * The main method: parses the input file and populates the rule lists.
 * @throws IOException declared so that implementations can propagate I/O failures
 */
public abstract void parseRuleFile() throws IOException;
/**
 * Takes a rule object and returns the original string representation of the rule.
 * @param ruleObject element from getRules()
 * @return the original string representation of the rule
 */
public abstract String getOriginalRuleString(Object ruleObject);
/**
 * Takes a rule object (element from getRules()), an id, a name, and a rule type (this.ruleType) and returns
 * the rule in LanguageTool format as a list of strings. Per the original author, this is almost always
 * called by the methods that build the LT rule lists.
 * @param rule element from getRules()
 * @param id value for the rule's id
 * @param name value for the rule's name
 * @param type rule type (this.ruleType)
 * @return the rule in LanguageTool format, as a list of strings
 */
public abstract List<String> ltRuleAsList(Object rule, String id, String name, String type);
/**
 * Generates an id for the given rule object.
 * NOTE(review): presumably works together with idIndex to auto-generate ids - confirm against implementations.
 * @param ruleObject element from getRules()
 * @return the generated id
 */
public abstract String generateId(Object ruleObject);
/**
 * Generates a name for the given rule object.
 * NOTE(review): presumably works together with nameIndex to auto-generate names - confirm against implementations.
 * @param ruleObject element from getRules()
 * @return the generated name
 */
public abstract String generateName(Object ruleObject);
/**
 * Returns a list of acceptable file types.
 * @return the acceptable file types
 */
public abstract String[] getAcceptableFileTypes();
/**
 * Returns true if the rule object is a disambiguation rule (i.e. should go into the disambiguation.xml file).
 * @param ruleObject element from getRules()
 * @return true if the rule is a disambiguation rule
 */
public abstract boolean isDisambiguationRule(Object ruleObject);
/**
 * Builds one {@code <token>} element from the given parts and appends it to a LT rule list.
 * <p>
 * Attributes are written in this order: inflected, skip, regexp, postag, postag_regexp,
 * negate, negate_pos. The element content is the token text, then the optional "careful"
 * exception, then the optional exception(s).
 *
 * @param orig       rule list the element is appended to
 * @param token      token text; {@code ".*"} is written as an empty token; must not be null
 * @param postag     POS tag; null or empty means no {@code postag} attribute
 * @param exceptions null for none; used verbatim if it already contains {@code "<exception"},
 *                   otherwise wrapped in {@code <exception regexp="yes">}
 * @param careful    if true, adds an {@code <exception ... negate_pos="yes"/>} carrying the POS tag attributes
 * @param inflected  if true, adds {@code inflected="yes"}
 * @param negate     if true, adds {@code negate="yes"} (non-empty token only) and
 *                   {@code negate_pos="yes"} (only when a POS tag is present)
 * @param skip       only the value -1 is written (as {@code skip="-1"}); any other value is ignored
 * @param indent     number of spaces put in front of the element
 * @return the same list instance that was passed in as {@code orig}
 */
protected static ArrayList<String> addToken(ArrayList<String> orig, String token, String postag, String exceptions,
    boolean careful, boolean inflected, boolean negate, int skip, int indent) {
  // the "everything" token is expressed as an empty token
  final String text = token.equals(".*") ? "" : token;

  final boolean hasPostag = postag != null && !postag.isEmpty();
  final String postagAttr = hasPostag ? " postag=\"" + postag + "\"" : "";
  final String postagRegexpAttr = isRegex(postag) ? " postag_regexp=\"yes\"" : "";

  final StringBuilder element = new StringBuilder(getSpace(indent));
  element.append("<token");
  if (inflected) {
    element.append(" inflected=\"yes\"");
  }
  if (skip == -1) {
    element.append(" skip=\"-1\"");
  }
  if (isRegex(text)) {
    element.append(" regexp=\"yes\"");
  }
  element.append(postagAttr).append(postagRegexpAttr);
  if (negate && !text.isEmpty()) {
    element.append(" negate=\"yes\"");
  }
  if (negate && hasPostag) {
    element.append(" negate_pos=\"yes\"");
  }
  element.append('>').append(text);

  if (careful) {
    element.append("<exception").append(postagAttr).append(postagRegexpAttr).append(" negate_pos=\"yes\"/>");
  }
  if (exceptions != null) {
    if (exceptions.contains("<exception")) {
      // already a complete exception element (or several): use as is
      element.append(exceptions);
    } else {
      element.append("<exception regexp=\"yes\">").append(exceptions).append("</exception>");
    }
  }
  element.append("</token>");

  orig.add(element.toString());
  return orig;
}
/**
 * Takes a file from the resource directory and returns it as a list of strings, empty lines omitted.
 * Lines that contain only whitespace are not empty and are kept.
 *
 * @param filename resource path handed to the data broker's {@code getFromResourceDirAsStream}
 * @return the non-empty lines of the file, in file order
 * @throws RuntimeException if the resource cannot be opened or read; the cause is preserved
 */
public static ArrayList<String> fileToListNoBlanks(String filename) {
  ArrayList<String> lines = new ArrayList<String>();
  Scanner in = null;
  try {
    InputStream is = JLanguageTool.getDataBroker().getFromResourceDirAsStream(filename);
    // Decode explicitly instead of relying on the platform default charset.
    // NOTE(review): assumes the resource files are UTF-8 encoded - confirm.
    in = new Scanner(is, "UTF-8");
    while (in.hasNextLine()) {
      // nextLine() strips the line separator, so an empty check is all that is needed
      String line = in.nextLine();
      if (!line.isEmpty()) {
        lines.add(line);
      }
    }
    // Scanner swallows read errors and simply stops; surface them instead of
    // silently returning a truncated list.
    IOException readError = in.ioException();
    if (readError != null) {
      throw readError;
    }
  } catch (Exception e) {
    throw new RuntimeException("Could not load " + filename, e);
  } finally {
    // closing the scanner also closes the underlying stream
    if (in != null) {
      in.close();
    }
  }
  return lines;
}
/**
 * Tells whether the string contains a character that might indicate it's a regex.
 *
 * @param e string to inspect, may be null
 * @return true if {@code e} is non-null and contains at least one character matched by {@code regex}
 */
protected static boolean isRegex(String e) {
  return e != null && regex.matcher(e).find();
}
/**
 * Returns a string made of {@code indent} space characters.
 *
 * @param indent number of spaces; zero or a negative value yields the empty string
 * @return the padding string
 */
protected static String getSpace(int indent) {
  if (indent <= 0) {
    return "";
  }
  char[] padding = new char[indent];
  java.util.Arrays.fill(padding, ' ');
  return new String(padding);
}
/**
 * Joins the lines of a rule into a single string, terminating every line
 * (including the last one) with a newline.
 *
 * @param rule the rule as a list of lines; must not be null
 * @return the concatenated rule text; empty for an empty list
 */
public static String getRuleStringFromList(List<String> rule) {
  final StringBuilder joined = new StringBuilder();
  for (final String ruleLine : rule) {
    joined.append(ruleLine).append('\n');
  }
  return joined.toString();
}
// ** Helpers to "or" sets of words together
/**
 * Joins the words with {@code "|"} so they can be used as a regex alternation.
 * <p>
 * The separator is written only between words, never after the last one, so a list
 * holding a single empty word yields {@code ""} rather than a stray {@code "|"}.
 *
 * @param words words to join; null is treated like an empty list
 * @return the words separated by {@code "|"}, or the empty string if there are none
 */
public static String glueWords(ArrayList<String> words) {
  if (words == null) {
    return "";
  }
  StringBuilder sb = new StringBuilder();
  boolean first = true;
  for (String word : words) {
    if (!first) {
      sb.append('|');
    }
    sb.append(word);
    first = false;
  }
  return sb.toString();
}
/**
 * Joins the words with {@code "|"} so they can be used as a regex alternation.
 * <p>
 * The separator is written only between words, never after the last one, so an array
 * holding a single empty word yields {@code ""} rather than a stray {@code "|"}.
 *
 * @param words words to join; null is treated like an empty array
 * @return the words separated by {@code "|"}, or the empty string if there are none
 */
public static String glueWords(String[] words) {
  if (words == null) {
    return "";
  }
  StringBuilder sb = new StringBuilder();
  for (int i = 0; i < words.length; i++) {
    if (i > 0) {
      sb.append('|');
    }
    sb.append(words[i]);
  }
  return sb.toString();
}
}