/* LanguageTool, a natural language style checker * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool.dev.index; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.Version; import org.languagetool.JLanguageTool; import org.languagetool.Language; import org.languagetool.language.English; import org.languagetool.rules.patterns.PatternRule; import org.languagetool.rules.patterns.PatternRuleLoader; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.util.HashMap; import java.util.List; import java.util.Map; import static org.languagetool.dev.index.PatternRuleQueryBuilder.FIELD_NAME; import static org.languagetool.dev.index.PatternRuleQueryBuilder.FIELD_NAME_LOWERCASE; public class PatternRuleQueryBuilderTest extends LuceneTestCase { private static final Version LUCENE_VERSION = Version.LUCENE_41; private IndexSearcher searcher; private DirectoryReader reader; private Directory directory; private Language language; @Override public void setUp() throws Exception { super.setUp(); language = new English(); directory = new RAMDirectory(); /*final File indexPath = new File("/tmp/lucene"); if (indexPath.exists()) { FileUtils.deleteDirectory(indexPath); } directory = FSDirectory.open(indexPath);*/ // TODO: avoid duplication - use Indexer.java! final Map<String, Analyzer> analyzerMap = new HashMap<String, Analyzer>(); analyzerMap.put(FIELD_NAME, new LanguageToolAnalyzer(LUCENE_VERSION, new JLanguageTool(language), false)); analyzerMap.put(FIELD_NAME_LOWERCASE, new LanguageToolAnalyzer(LUCENE_VERSION, new JLanguageTool(language), true)); final Analyzer analyzer = new PerFieldAnalyzerWrapper(new DoNotUseAnalyzer(), analyzerMap); final IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, analyzer); final IndexWriter writer = new IndexWriter(directory, config); try { addDocument(writer, "How do you thin about this wonderful idea?"); addDocument(writer, "The are several grammar checkers for English, E.G. LanguageTool."); } finally { writer.close(); } reader = DirectoryReader.open(directory); searcher = newSearcher(reader); } @Override public void tearDown() throws Exception { super.tearDown(); if (reader != null) { reader.close(); } if (directory != null) { directory.close(); } } private void addDocument(IndexWriter writer, String content) throws IOException { final Document doc = new Document(); final FieldType type = new FieldType(); type.setStored(true); type.setIndexed(true); type.setTokenized(true); doc.add(new Field(FIELD_NAME, content, type)); doc.add(new Field(FIELD_NAME_LOWERCASE, content, type)); writer.addDocument(doc); } public void testQueryBuilder() throws Exception { final StringBuilder sb = new StringBuilder(); sb.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?> <rules lang=\"en\"> <category name=\"Test\"> <rule id=\"TEST_RULE\" name=\"test\"> <pattern>"); // TODO: use makeRule() sb.append("<token skip=\"-1\">How</token>"); // match "How" sb.append("<token postag=\"PRP\"></token>");// match"you/[PRP]" sb.append("<token skip=\"1\">thin</token>"); // match "thin" sb.append("<token postag_regexp=\"yes\" postag=\"JJ|DT\">this</token>"); // match "this/[DT]" sb.append("<token regexp=\"yes\" negate=\"yes\">bad|good</token>"); // match "wonderful" sb.append("<token regexp=\"yes\">idea|proposal</token>"); // match "idea" sb.append("</pattern> </rule> </category> </rules>"); final InputStream input = new ByteArrayInputStream(sb.toString().getBytes()); final PatternRuleLoader ruleLoader = new PatternRuleLoader(); final List<PatternRule> rules = ruleLoader.getRules(input, "test.xml"); final PatternRuleQueryBuilder patternRuleQueryBuilder = new PatternRuleQueryBuilder(language); final Query query = patternRuleQueryBuilder.buildRelaxedQuery(rules.get(0)); assertEquals("+fieldLowercase:how +fieldLowercase:_pos_prp +fieldLowercase:thin " + "+spanNear([fieldLowercase:this, SpanMultiTermQueryWrapper(fieldLowercase:/_pos_(jj|dt)/)], 0, false) " + "+fieldLowercase:/idea|proposal/", query.toString()); } public void testCaseSensitive() throws Exception { final StringBuilder sb = new StringBuilder(); sb.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?> <rules lang=\"en\"> <category name=\"Test\">"); sb.append("<rule id=\"TEST_RULE_1\" name=\"test_1\"> <pattern case_sensitive=\"yes\">"); sb.append(" <token>How</token>"); sb.append("</pattern> </rule>"); sb.append("<rule id=\"TEST_RULE_2\" name=\"test_2\"> <pattern case_sensitive=\"yes\">"); sb.append(" <token>how</token>"); sb.append("</pattern> </rule>"); sb.append("<rule id=\"TEST_RULE_3\" name=\"test_3\"> <pattern>"); sb.append(" <token>How</token>"); sb.append("</pattern> </rule>"); sb.append("<rule id=\"TEST_RULE_4\" name=\"test_4\"> <pattern>"); sb.append(" <token>how</token>"); sb.append("</pattern> </rule>"); sb.append("</category> </rules>"); final InputStream input = new ByteArrayInputStream(sb.toString().getBytes()); final PatternRuleLoader ruleLoader = new PatternRuleLoader(); final List<PatternRule> rules = ruleLoader.getRules(input, "test.xml"); final PatternRuleQueryBuilder patternRuleQueryBuilder = new PatternRuleQueryBuilder(language); Query query = patternRuleQueryBuilder.buildRelaxedQuery(rules.get(0)); assertEquals(1, searcher.search(query, null, 1000).totalHits); query = patternRuleQueryBuilder.buildRelaxedQuery(rules.get(1)); assertEquals(0, searcher.search(query, null, 1000).totalHits); query = patternRuleQueryBuilder.buildRelaxedQuery(rules.get(2)); assertEquals(1, searcher.search(query, null, 1000).totalHits); query = patternRuleQueryBuilder.buildRelaxedQuery(rules.get(3)); assertEquals(1, searcher.search(query, null, 1000).totalHits); } public void testUnsupportedPatternRule() throws Exception { final PatternRule patternRule = makeRule("<token skip='-1'><exception>and</exception></token>", false); final PatternRuleQueryBuilder patternRuleQueryBuilder = new PatternRuleQueryBuilder(language); try { patternRuleQueryBuilder.buildRelaxedQuery(patternRule); fail("Exception should be thrown for unsupported PatternRule"); } catch (UnsupportedPatternRuleException expected) {} } public void testSpecialRegexSyntax() throws Exception { final PatternRule patternRule = makeRule("<token regexp='yes'>\\p{Punct}</token>", false); final PatternRuleQueryBuilder queryBuilder = new PatternRuleQueryBuilder(language); final Query query = queryBuilder.buildRelaxedQuery(patternRule); assertEquals("+fieldLowercase:\\p{Punct}", query.toString()); assertMatches(patternRule, 2); } public void testSeveralElements() throws Exception { // See setup() for the texts we can match assertMatches(makeRule("<token>How</token>"), 1); assertMatches(makeRule("<token>how</token>"), 1); assertMatches(makeRule("<token>LanguageTool</token>"), 1); assertMatches(makeRule("<token>UnknownWord</token>"), 0); assertMatches(makeCaseSensitiveRule("<token>How</token>"), 1); assertMatches(makeCaseSensitiveRule("<token>how</token>"), 0); assertMatches(makeRule("<token regexp=\"yes\">Foo|How</token>"), 1); assertMatches(makeRule("<token regexp=\"yes\">Foo|how</token>"), 1); assertMatches(makeRule("<token regexp=\"yes\">Foo|Bar</token>"), 0); assertMatches(makeCaseSensitiveRule("<token regexp=\"yes\">Foo|How</token>"), 1); assertMatches(makeCaseSensitiveRule("<token regexp=\"yes\">foo|HOW</token>"), 0); assertMatches(makeCaseSensitiveRule("<token regexp=\"yes\">foo|how</token>"), 0); assertMatches(makeRule("<token postag=\"WRB\"></token>"), 1); assertMatches(makeRule("<token postag=\"FOO\"></token>"), 0); assertMatches(makeRule("<token postag=\"[XW]RB\" postag_regexp=\"yes\"></token>"), 1); assertMatches(makeRule("<token postag=\"FOO|WRB\" postag_regexp=\"yes\"></token>"), 1); assertMatches(makeRule("<token postag=\"WRB|FOO\" postag_regexp=\"yes\"></token>"), 1); assertMatches(makeRule("<token postag=\"[XY]OO\" postag_regexp=\"yes\"></token>"), 0); // inflected assertMatches(makeRule("<token>grammar</token><token>checker</token>"), 0); assertMatches(makeRule("<token>grammar</token><token>checkers</token>"), 1); assertMatches(makeRule("<token>grammar</token><token inflected='yes'>checker</token>"), 1); // combine term and POS tag: assertMatches(makeRule("<token postag=\"WRB\">How</token>"), 1); assertMatches(makeRule("<token postag=\"[XW]RB\" postag_regexp=\"yes\">How</token>"), 1); assertMatches(makeRule("<token postag=\"WRB\">Foo</token>"), 0); assertMatches(makeRule("<token postag=\"FOO\">How</token>"), 0); // rules with more than one token: assertMatches(makeRule("<token>How</token> <token>do</token>"), 1); //assertMatches(makeRule("<token>do</token> <token>How</token>"), 0); assertMatches(makeRule("<token>How</token> <token>foo</token>"), 0); assertMatches(makeRule("<token>How</token> <token>do</token> <token>you</token>"), 1); assertMatches(makeRule("<token>How</token> <token>do</token> <token>foo</token>"), 0); assertMatches(makeRule("<token regexp=\"yes\">Foo|How</token> <token>do</token>"), 1); assertMatches(makeRule("<token skip=\"-1\">How</token> <token>wonderful</token>"), 1); //assertMatches(makeRule("<token skip=\"-1\">wonderful</token> <token>How</token>"), 0); assertMatches(makeRule("<token skip=\"6\">How</token> <token>wonderful</token>"), 1); assertMatches(makeRule("<token skip=\"5\">How</token> <token>wonderful</token>"), 1); //assertMatches(makeRule("<token skip=\"4\">How</token> <token>wonderful</token>"), 0); assertMatches(makeRule("<token>How</token> <token skip=\"-1\">do</token> <token>wonderful</token>"), 1); assertMatches(makeRule("<token>How</token> <token skip=\"4\">do</token> <token>wonderful</token>"), 1); //assertMatches(makeRule("<token>How</token> <token skip=\"3\">do</token> <token>wonderful</token>"), 0); assertMatches(makeRule("<token skip=\"-1\">How</token> <token skip=\"-1\">thin</token> <token>wonderful</token>"), 1); assertMatches(makeRule("<token skip=\"3\">How</token> <token skip=\"3\">thin</token> <token>wonderful</token>"), 1); assertMatches(makeRule("<token skip=\"3\">How</token> <token skip=\"3\">thin</token> <token>foo</token>"), 0); assertMatches(makeRule("<token>E</token> <token>.</token> <token>G</token> <token>.</token>"), 1); assertMatches(makeRule("<token>X</token> <token>.</token> <token>G</token> <token>.</token>"), 0); //assertMatches(makeRule("<token>E</token> <token>,</token> <token>G</token> <token>.</token>"), 0); assertMatches(makeRule("<token>E</token> <token>.</token> <token>G</token> <token>.</token> <token>LanguageTool</token>"), 1); assertMatches(makeRule("<token>E</token> <token>.</token> <token>G</token> <token>.</token> <token>foo</token>"), 0); // negation: assertMatches(makeRule("<token>How</token> <token negate=\"yes\">foo</token>"), 1); assertMatches(makeRule("<token>How</token> <token negate=\"yes\">do</token>"), 1); // known overmatching assertMatches(makeRule("<token>How</token> <token>do</token> <token negate=\"yes\">foo</token>"), 1); assertMatches(makeRule("<token>How</token> <token negate=\"yes\">foo</token> <token>you</token>"), 1); assertMatches(makeRule("<token>How</token> <token>do</token> <token negate=\"yes\">you</token>"), 1); // known overmatching assertMatches(makeRule("<token>How</token> <token negate=\"yes\">do</token> <token>you</token>"), 1); // known overmatching assertMatches(makeRule("<token>How</token> <token negate=\"yes\">do</token> <token negate=\"yes\">you</token>"), 1); // known overmatching } private void assertMatches(PatternRule patternRule, int expectedMatches) throws Exception { final PatternRuleQueryBuilder queryBuilder = new PatternRuleQueryBuilder(language); final Query query = queryBuilder.buildRelaxedQuery(patternRule); //System.out.println("QUERY: " + query); final int matches = searcher.search(query, null, 1000).totalHits; assertEquals("Query failed: " + query, expectedMatches, matches); } private PatternRule makeCaseSensitiveRule(String ruleXml) throws IOException { return makeRule(ruleXml, true); } private PatternRule makeRule(String ruleXml) throws IOException { return makeRule(ruleXml, false); } private PatternRule makeRule(String ruleXml, boolean caseSensitive) throws IOException { final StringBuilder sb = new StringBuilder(); sb.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); sb.append("<rules lang=\"en\"> <category name=\"Test\"> <rule id=\"TEST_RULE\" name=\"test\">"); if (caseSensitive) { sb.append("<pattern case_sensitive=\"yes\">"); } else { sb.append("<pattern>"); } sb.append(ruleXml); sb.append("</pattern> </rule> </category> </rules>"); final InputStream input = new ByteArrayInputStream(sb.toString().getBytes()); final PatternRuleLoader ruleLoader = new PatternRuleLoader(); final List<PatternRule> rules = ruleLoader.getRules(input, "test.xml"); assertEquals(1, rules.size()); return rules.get(0); } }