/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.index;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Ignore;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.language.English;
import org.languagetool.language.German;
import org.languagetool.rules.IncorrectExample;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.patterns.Element;
import org.languagetool.rules.patterns.PatternRule;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static org.languagetool.dev.index.PatternRuleQueryBuilder.FIELD_NAME;
import static org.languagetool.dev.index.PatternRuleQueryBuilder.FIELD_NAME_LOWERCASE;
public class IndexerSearcherTest extends LuceneTestCase {
private final File ruleFile = new File("../languagetool-language-modules/en/src/main/resources/org/languagetool/rules/en/grammar.xml");
private Searcher errorSearcher;
private Directory directory;
/**
 * Runs the base class setup and then points {@code directory} at a new in-memory
 * {@link RAMDirectory}.
 */
@Override
public void setUp() throws Exception {
  super.setUp();
  // for debugging against an on-disk index use instead:
  //directory = FSDirectory.open(new File("/tmp/lucenetest"));
  this.directory = new RAMDirectory();
}
/**
 * Closes the index directory (if there is one) and then runs the base class teardown.
 * The directory is released first and the base teardown sits in a {@code finally} block,
 * so a failure in one step does not cause the other step to be skipped.
 */
@Override
public void tearDown() throws Exception {
  try {
    if (directory != null) {
      directory.close();
    }
  } finally {
    super.tearDown();
  }
}
/**
 * Indexes the incorrect examples of all active, default-on pattern rules of one language
 * and then searches the index rule by rule, checking that at least one returned sentence
 * carries a match of exactly that rule. Problems and exceptions are counted and printed
 * to stdout; nothing is asserted.
 */
@Ignore("ignored as long as it doesn't work 100%")
public void testAllRules() throws Exception {
  final long startTime = System.currentTimeMillis();
  // comment in to test with external index:
  //directory = new SimpleFSDirectory(new File("/media/external-disk/corpus/languagetool/fast-rule-evaluation-de/"));
  //errorSearcher = new Searcher(directory);
  // TODO: make this work for all languages
  final Language language = new English();
  //final Language language = new French();
  //final Language language = new Spanish();
  //final Language language = new Polish(); // TODO: still "Clauses must have same field"
  //final Language language = new German();
  final JLanguageTool lt = new JLanguageTool(language);
  lt.activateDefaultPatternRules();
  System.out.println("Creating index for " + language + "...");
  final int ruleCount = createIndex(lt);
  System.out.println("Index created with " + ruleCount + " rules");
  int ruleCounter = 0;
  int ruleProblems = 0;
  int exceptionCount = 0;
  for (Rule rule : lt.getAllActiveRules()) {
    if (!(rule instanceof PatternRule) || rule.isDefaultOff()) {
      continue;
    }
    final PatternRule patternRule = (PatternRule) rule;
    try {
      ruleCounter++;
      final SearcherResult searcherResult = errorSearcher.findRuleMatchesOnIndex(patternRule, language);
      final List<MatchingSentence> matchingSentences = searcherResult.getMatchingSentences();
      if (!containsMatchOfRule(matchingSentences, patternRule)) {
        System.out.println("Error: No match found for " + patternRule);
        System.out.println("Query : " + searcherResult.getRelaxedQuery().toString(FIELD_NAME_LOWERCASE));
        System.out.println("Matches : " + matchingSentences);
        System.out.println("Examples: " + rule.getIncorrectExamples());
        System.out.println();
        ruleProblems++;
      }
    } catch (UnsupportedPatternRuleException e) {
      // rules the searcher cannot handle are counted as problems, not as exceptions
      System.out.println("UnsupportedPatternRuleException searching for rule " + getFullId(patternRule) + ": " + e.getMessage());
      ruleProblems++;
    } catch (Exception e) {
      // keep going so that one failing rule does not hide the results for the others
      System.out.println("Exception searching for rule " + getFullId(patternRule) + ": " + e.getMessage());
      e.printStackTrace(System.out);
      exceptionCount++;
    }
  }
  System.out.println(language + ": problems: " + ruleProblems + ", total rules: " + ruleCounter);
  System.out.println(language + ": exceptions: " + exceptionCount + " (including timeouts)");
  System.out.println("Total time: " + (System.currentTimeMillis() - startTime) + "ms");
}

/**
 * Returns true if at least one of the given sentences has a rule match whose full id
 * equals the full id of the given rule.
 */
private boolean containsMatchOfRule(List<MatchingSentence> matchingSentences, PatternRule patternRule) {
  final String expectedId = getFullId(patternRule);
  for (MatchingSentence matchingSentence : matchingSentences) {
    // TODO: there can be more than one expected match, can't it?
    if (getRuleMatchIds(matchingSentence.getRuleMatches()).contains(expectedId)) {
      return true;
    }
  }
  return false;
}
/** Builds the full id of a rule in the form {@code id[subId]}. */
private String getFullId(PatternRule patternRule) {
  final StringBuilder fullId = new StringBuilder();
  fullId.append(patternRule.getId());
  fullId.append("[").append(patternRule.getSubId()).append("]");
  return fullId.toString();
}
/**
 * Collects the full ids (see {@code getFullId}) of all given matches whose rule is a
 * {@link PatternRule}; matches of any other rule type are skipped.
 */
private List<String> getRuleMatchIds(List<RuleMatch> ruleMatches) {
  final List<String> fullIds = new ArrayList<String>();
  for (RuleMatch match : ruleMatches) {
    final Rule matchedRule = match.getRule();
    if (!(matchedRule instanceof PatternRule)) {
      continue;
    }
    fullIds.add(getFullId((PatternRule) matchedRule));
  }
  return fullIds;
}
/**
 * Writes one document per active, default-on pattern rule into {@code directory}: the
 * rule's full id plus all of its incorrect examples (with the {@code <marker>} tags
 * stripped). Afterwards {@code errorSearcher} is re-created on top of that directory.
 *
 * @param lt the tool whose language and active rules are indexed
 * @return the number of rules that were added to the index
 * @throws IOException if indexing or opening the searcher fails
 */
private int createIndex(JLanguageTool lt) throws IOException {
  // The field configurations never change, so they are built once up front instead of
  // once per rule / per example. They are also built before the indexer is opened.
  final FieldType idType = new FieldType();
  idType.setStored(true);
  idType.setTokenized(false);
  final FieldType exampleType = new FieldType();
  exampleType.setStored(true);
  exampleType.setTokenized(true);
  exampleType.setIndexed(true);

  final Indexer indexer = new Indexer(directory, lt.getLanguage());
  int ruleCount = 0;
  try {
    for (Rule rule : lt.getAllActiveRules()) {
      if (!(rule instanceof PatternRule) || rule.isDefaultOff()) {
        continue;
      }
      final PatternRule patternRule = (PatternRule) rule;
      final List<IncorrectExample> incorrectExamples = rule.getIncorrectExamples();
      final Document doc = new Document();
      doc.add(new Field("ruleId", getFullId(patternRule), idType));
      for (IncorrectExample incorrectExample : incorrectExamples) {
        final String example = incorrectExample.getExample().replaceAll("</?marker>", "");
        doc.add(new Field(FIELD_NAME, example, exampleType));
        // no lowercase here, it would lowercase the input to the LT analysis, leading to wrong POS tags:
        doc.add(new Field(FIELD_NAME_LOWERCASE, example, exampleType));
      }
      indexer.add(doc);
      ruleCount++;
    }
  } finally {
    indexer.close();
  }
  errorSearcher = new Searcher(directory);
  return ruleCount;
}
/**
 * For manual debugging only (note that the method name does not start with "test"):
 * indexes a single short sentence and prints the sentences matched by the I_THIN rule.
 */
public void IGNOREtestForDebugging() throws Exception {
  createIndex("I thin so");
  // NOTE(review): the rule is read from the English grammar file but the search is run
  // with German - confirm this is intended
  final SearcherResult result =
      errorSearcher.findRuleMatchesOnIndex(getRule("I_THIN", ruleFile), new German());
  System.out.println("Matches: " + result.getMatchingSentences());
  assertEquals(1, result.getMatchingSentences().size());
}
/**
 * Indexes two English sentences and searches them with three rules from the grammar
 * file: each search must check both sentences without hitting the time limit and return
 * the expected number of matching sentences. Finally an unknown rule id must lead to a
 * PatternRuleNotFoundException.
 */
public void testIndexerSearcherWithEnglish() throws Exception {
  // Note that the second sentence ends with "lid" instead of "lids" (the inflected one)
  createIndex("How to move back and fourth from linux to xmb? Calcium deposits on eye lid.");

  // rule id -> number of sentences expected to match (same index in both arrays)
  final String[] ruleIds = {"BACK_AND_FOURTH", "EYE_BROW", "ALL_OVER_THE_WORD"};
  final int[] expectedMatchingSentences = {1, 1, 0};
  for (int i = 0; i < ruleIds.length; i++) {
    final SearcherResult result =
        errorSearcher.findRuleMatchesOnIndex(getRule(ruleIds[i]), new English());
    assertEquals(2, result.getCheckedSentences());
    assertFalse(result.isResultIsTimeLimited());
    assertEquals(expectedMatchingSentences[i], result.getMatchingSentences().size());
  }

  try {
    errorSearcher.findRuleMatchesOnIndex(getRule("Invalid Rule Id"), new English());
    fail("Exception should be thrown for invalid rule id.");
  } catch (PatternRuleNotFoundException expected) {
    // this is the outcome the test wants
  }
}
/** Looks up the rule with the given id in the default grammar file ({@code ruleFile}). */
private PatternRule getRule(String ruleId) throws IOException {
  return getRule(ruleId, ruleFile);
}
/** Asks the current {@code errorSearcher} for the rule with the given id from the given grammar file. */
private PatternRule getRule(String ruleId, File grammarFile) throws IOException {
  final PatternRule rule = errorSearcher.getRuleById(ruleId, grammarFile);
  return rule;
}
/**
 * Builds a two-token rule ("move", "back") in code instead of loading it from the grammar
 * file and expects exactly one match of it in the single indexed sentence.
 */
public void testWithNewRule() throws Exception {
  createIndex("How to move back and fourth from linux to xmb?");

  final Element moveToken = new Element("move", false, false, false);
  final Element backToken = new Element("back", false, false, false);
  final List<Element> pattern = Arrays.asList(moveToken, backToken);
  final PatternRule moveBackRule = new PatternRule("RULE1", new English(), pattern, "desc", "msg", "shortMsg");

  // a searcher of its own, named so that it does not hide the errorSearcher field
  final Searcher searcher = new Searcher(directory);
  final SearcherResult result = searcher.findRuleMatchesOnIndex(moveBackRule, new English());

  assertEquals(1, result.getCheckedSentences());
  final List<MatchingSentence> sentences = result.getMatchingSentences();
  assertEquals(1, sentences.size());
  final List<RuleMatch> matches = sentences.get(0).getRuleMatches();
  assertEquals(1, matches.size());
  assertEquals("RULE1", matches.get(0).getRule().getId());
}
/**
 * Same as the plain two-token case, but the second element is the alternation
 * "forth|back" with the third constructor flag switched on; exactly one match of the
 * rule is expected in the single indexed sentence.
 */
public void testWithRegexRule() throws Exception {
  createIndex("How to move back and fourth from linux to xmb?");

  final List<Element> pattern = Arrays.asList(
      new Element("move", false, false, false),
      new Element("forth|back", false, true, false));
  final PatternRule regexRule = new PatternRule("RULE1", new English(), pattern, "desc", "msg", "shortMsg");

  // a searcher of its own, named so that it does not hide the errorSearcher field
  final Searcher searcher = new Searcher(directory);
  final SearcherResult result = searcher.findRuleMatchesOnIndex(regexRule, new English());

  assertEquals(1, result.getCheckedSentences());
  final List<MatchingSentence> sentences = result.getMatchingSentences();
  assertEquals(1, sentences.size());
  final List<RuleMatch> matches = sentences.get(0).getRuleMatches();
  assertEquals(1, matches.size());
  final Rule matchedRule = matches.get(0).getRule();
  assertEquals("RULE1", matchedRule.getId());
}
/**
 * Checks patterns that contain an apostrophe as a token of its own: the pattern
 * Bleed / ' / s must match the indexed text exactly once, the pattern Bleed / ' / x must
 * not match at all.
 */
public void testApostropheElement() throws Exception {
  createIndex("Daily Bleed's Anarchist Encyclopedia");

  final List<Element> matchingPattern = Arrays.asList(
      new Element("Bleed", false, false, false),
      new Element("'", false, false, false),
      new Element("s", false, false, false));
  final PatternRule matchingRule = new PatternRule("RULE1", new English(), matchingPattern, "desc", "msg", "shortMsg");

  final List<Element> nonMatchingPattern = Arrays.asList(
      new Element("Bleed", false, false, false),
      new Element("'", false, false, false),
      new Element("x", false, false, false));
  final PatternRule nonMatchingRule = new PatternRule("RULE", new English(), nonMatchingPattern, "desc", "msg", "shortMsg");

  // first rule: one sentence, one match, reported under the rule's own id
  final SearcherResult matchingResult = errorSearcher.findRuleMatchesOnIndex(matchingRule, new English());
  final List<MatchingSentence> sentences = matchingResult.getMatchingSentences();
  assertEquals(1, sentences.size());
  final List<RuleMatch> matches = sentences.get(0).getRuleMatches();
  assertEquals(1, matches.size());
  assertEquals("RULE1", matches.get(0).getRule().getId());

  // second rule: nothing may be found
  final SearcherResult nonMatchingResult = errorSearcher.findRuleMatchesOnIndex(nonMatchingRule, new English());
  assertEquals(0, nonMatchingResult.getMatchingSentences().size());
}
/**
 * Searches with a two-element rule whose second element ("forth|back") additionally
 * carries an exception set via {@code setStringPosException}; the rule is still expected
 * to match the single indexed sentence exactly once.
 */
public void testWithException() throws Exception {
  createIndex("How to move back and fourth from linux to xmb?");

  final Element forthOrBack = new Element("forth|back", false, true, false);
  forthOrBack.setStringPosException("exception", false, false, false, false, false, "POS", false, false);
  final List<Element> pattern = Arrays.asList(
      new Element("move", false, false, false),
      forthOrBack);
  final PatternRule ruleWithException = new PatternRule("RULE1", new English(), pattern, "desc", "msg", "shortMsg");

  // a searcher of its own, named so that it does not hide the errorSearcher field
  final Searcher searcher = new Searcher(directory);
  final SearcherResult result = searcher.findRuleMatchesOnIndex(ruleWithException, new English());

  assertEquals(1, result.getCheckedSentences());
  final List<MatchingSentence> sentences = result.getMatchingSentences();
  assertEquals(1, sentences.size());
  final List<RuleMatch> matches = sentences.get(0).getRuleMatches();
  assertEquals(1, matches.size());
  assertEquals("RULE1", matches.get(0).getRule().getId());
}
/**
 * Searches with a rule whose first element is negated and whose second element is "How",
 * the first word of the indexed sentence; exactly one match is expected.
 */
public void testNegatedMatchAtSentenceStart() throws Exception {
  createIndex("How to move?");

  final Element notNegated = new Element("Negated", false, false, false);
  notNegated.setNegation(true);
  final List<Element> pattern = Arrays.asList(
      notNegated,
      new Element("How", false, false, false));

  // a searcher of its own, named so that it does not hide the errorSearcher field
  final Searcher searcher = new Searcher(directory);
  final PatternRule negationRule = new PatternRule("RULE1", new English(), pattern, "desc", "msg", "shortMsg");
  final SearcherResult result = searcher.findRuleMatchesOnIndex(negationRule, new English());

  assertEquals(1, result.getCheckedSentences());
  final List<MatchingSentence> sentences = result.getMatchingSentences();
  assertEquals(1, sentences.size());
  final List<RuleMatch> matches = sentences.get(0).getRuleMatches();
  assertEquals(1, matches.size());
  final Rule matchedRule = matches.get(0).getRule();
  assertEquals("RULE1", matchedRule.getId());
}
/**
 * A rule that consists of a single element with an empty token string plus an exception
 * must be rejected by the searcher with an UnsupportedPatternRuleException.
 */
public void testWithOneElementWithException() throws Exception {
  createIndex("How to move back and fourth from linux to xmb?");

  final Element emptyTokenWithException = new Element("", false, true, false);
  emptyTokenWithException.setStringPosException("exception", false, false, false, false, false, "POS", false, false);
  final List<Element> pattern = Arrays.asList(emptyTokenWithException);
  final PatternRule unsupportedRule = new PatternRule("RULE1", new English(), pattern, "desc", "msg", "shortMsg");

  // a searcher of its own, named so that it does not hide the errorSearcher field
  final Searcher searcher = new Searcher(directory);
  try {
    searcher.findRuleMatchesOnIndex(unsupportedRule, new English());
    fail();
  } catch (UnsupportedPatternRuleException expected) {
    // this is the outcome the test wants
  }
}
/**
 * Replaces {@code directory} with a new in-memory directory, indexes the given text into
 * it as English and re-creates {@code errorSearcher} on top of it.
 *
 * @param content the text to index
 * @throws IOException if closing the previous directory, indexing or opening the searcher fails
 */
private void createIndex(String content) throws IOException {
  // setUp() or an earlier call has already opened a directory; release it before the
  // reference is overwritten, otherwise it would never be closed
  if (directory != null) {
    directory.close();
  }
  directory = new RAMDirectory();
  //directory = FSDirectory.open(new File("/tmp/lucenetest")); // for debugging
  Indexer.run(content, directory, new English(), false);
  errorSearcher = new Searcher(directory);
}
/*public void testForManualDebug() throws Exception {
createIndex("How to move back and fourth from linux to xmb?");
final Searcher errorSearcher = new Searcher();
SearcherResult searcherResult = errorSearcher.findRuleMatchesOnIndex(getRule("IS_EVEN_WORST"), Language.ENGLISH);
System.out.println(searcherResult.getCheckedSentences());
System.out.println(searcherResult.isRelaxedQuery());
}*/
}