/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
package org.languagetool.dev.index;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import org.languagetool.Language;
import org.languagetool.language.English;
import org.languagetool.rules.patterns.AbstractPatternRule;
import org.languagetool.rules.patterns.PatternRuleLoader;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;

import static org.languagetool.dev.index.PatternRuleQueryBuilder.FIELD_NAME;
import static org.languagetool.dev.index.PatternRuleQueryBuilder.FIELD_NAME_LOWERCASE;

/**
 * Tests {@link PatternRuleQueryBuilder#buildRelaxedQuery} against a small in-memory
 * Lucene index that holds the two English sentences added in {@link #setUp()}.
 */
public class PatternRuleQueryBuilderTest extends LuceneTestCase {

  private IndexSearcher searcher;
  private DirectoryReader reader;
  private Directory directory;
  private Language language;

  /**
   * Builds a fresh in-memory index with two documents and opens a searcher on it.
   */
  @Override
  public void setUp() throws Exception {
    super.setUp();
    language = new English();
    directory = new RAMDirectory();
    /*File indexPath = new File("/tmp/lucene");
    if (indexPath.exists()) {
      FileUtils.deleteDirectory(indexPath);
    }
    directory = FSDirectory.open(indexPath);*/
    Analyzer analyzer = Indexer.getAnalyzer(language);
    IndexWriterConfig config = Indexer.getIndexWriterConfig(analyzer);
    try (IndexWriter writer = new IndexWriter(directory, config)) {
      addDocument(writer, "How do you thin about this wonderful idea?");
      addDocument(writer, "The are several grammar checkers for English, E.G. LanguageTool 123.");
    }
    reader = DirectoryReader.open(directory);
    searcher = newSearcher(reader);
  }

  /**
   * Releases the reader and the directory. The nested {@code finally} blocks make sure
   * that each resource is closed even if an earlier tear-down step throws.
   */
  @Override
  public void tearDown() throws Exception {
    try {
      super.tearDown();
    } finally {
      try {
        if (reader != null) {
          reader.close();
        }
      } finally {
        if (directory != null) {
          directory.close();
        }
      }
    }
  }

  /**
   * Adds one document that stores {@code content} in both the case-sensitive and the
   * lowercase field, indexed with positions and offsets.
   */
  private void addDocument(IndexWriter writer, String content) throws IOException {
    Document doc = new Document();
    FieldType type = new FieldType();
    type.setStored(true);
    type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    type.setTokenized(true);
    doc.add(new Field(FIELD_NAME, content, type));
    doc.add(new Field(FIELD_NAME_LOWERCASE, content, type));
    writer.addDocument(doc);
  }

  /**
   * Checks the textual form of the query generated for a six-token rule. Note that the
   * expected query string contains no clause for the negated token.
   */
  public void testQueryBuilder() throws Exception {
    String ruleXml =
        "<token skip='-1'>How</token>" // match "How"
        + "<token postag='PRP'></token>" // match "you/[PRP]"
        + "<token skip='1'>thin</token>" // match "thin"
        + "<token postag_regexp='yes' postag='JJ|DT'>this</token>" // match "this/[DT]"
        + "<token regexp='yes' negate='yes'>bad|good</token>" // match "wonderful"
        + "<token regexp='yes'>idea|proposal</token>"; // match "idea"
    AbstractPatternRule patternRule = makeRule(ruleXml);
    PatternRuleQueryBuilder patternRuleQueryBuilder = new PatternRuleQueryBuilder(language, searcher);
    Query query = patternRuleQueryBuilder.buildRelaxedQuery(patternRule);
    assertEquals("+fieldLowercase:how +fieldLowercase:_pos_prp +fieldLowercase:thin "
        + "+spanNear([fieldLowercase:this, SpanMultiTermQueryWrapper(fieldLowercase:/_pos_(jj|dt)/)], 0, false) "
        + "+fieldLowercase:/idea|proposal/", query.toString());
  }

  /**
   * Loads four rules from one XML document and checks how the {@code case_sensitive}
   * pattern attribute affects the number of hits for "How" vs. "how".
   */
  public void testCaseSensitive() throws Exception {
    String xml = "<?xml version='1.0' encoding='UTF-8'?> <rules lang='en'> <category name='Test'>"
        + "<rule id='TEST_RULE_1' name='test_1'> <pattern case_sensitive='yes'><token>How</token></pattern> </rule>"
        + "<rule id='TEST_RULE_2' name='test_2'> <pattern case_sensitive='yes'><token>how</token>"
        + "</pattern> </rule>"
        + "<rule id='TEST_RULE_3' name='test_3'> <pattern><token>How</token></pattern> </rule>"
        + "<rule id='TEST_RULE_4' name='test_4'> <pattern><token>how</token></pattern> </rule>"
        + "</category> </rules>";
    // The XML prolog declares UTF-8, so encode explicitly instead of using the platform default.
    InputStream input = new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8));
    PatternRuleLoader ruleLoader = new PatternRuleLoader();
    List<AbstractPatternRule> rules = ruleLoader.getRules(input, "test.xml");
    PatternRuleQueryBuilder patternRuleQueryBuilder = new PatternRuleQueryBuilder(language, searcher);

    Query query1 = patternRuleQueryBuilder.buildRelaxedQuery(rules.get(0));
    assertEquals(1, searcher.search(query1, 1000).totalHits);

    Query query2 = patternRuleQueryBuilder.buildRelaxedQuery(rules.get(1));
    assertEquals(0, searcher.search(query2, 1000).totalHits);

    Query query3 = patternRuleQueryBuilder.buildRelaxedQuery(rules.get(2));
    assertEquals(1, searcher.search(query3, 1000).totalHits);

    Query query4 = patternRuleQueryBuilder.buildRelaxedQuery(rules.get(3));
    assertEquals(1, searcher.search(query4, 1000).totalHits);
  }

  public void testUnsupportedPatternRule() throws Exception {
    assertUnsupported("<token skip='-1'><exception>and</exception></token>");
  }

  public void testUnsupportedBackReferencePatternRule() throws Exception {
    assertUnsupported("<token>\\1</token>");
  }

  public void testSpecialRegexSyntax() throws Exception {
    assertUnsupported("<token regexp='yes'>\\p{Punct}</token>");
  }

  public void testSpecialRegexSyntax2() throws Exception {
    assertUnsupported("<token regexp='yes' inflected='yes'>\\p{Lu}\\p{Ll}+</token>");
  }

  public void testNumberRegex() throws Exception {
    assertMatches(makeRule("<token regexp='yes'>13\\d</token>"), 0);
    assertMatches(makeRule("<token regexp='yes'>12\\d</token>"), 1);
  }

  public void testIgnoreOptionalTokens() throws Exception {
    // "optional" occurs in neither document, yet one hit is expected because of min='0'
    assertMatches(makeRule("<token min='0'>optional</token><token>idea</token>"), 1);
  }

  public void testOnlyInflected() throws Exception {
    assertMatches(makeRule("<token inflected='yes'>think</token>"), 0);
    assertMatches(makeRule("<token inflected='yes'>LanguageTool</token>"), 1);
    assertMatches(makeRule("<token inflected='yes'>checker</token>"), 1);
  }

  public void testInflectedAndRegex() throws Exception {
    assertMatches(makeRule("<token inflected='yes' regexp='yes'>foo|bar</token>"), 0);
    assertMatches(makeRule("<token inflected='yes' regexp='yes'>walk|be</token>"), 1);
    assertMatches(makeRule("<token inflected='yes' regexp='yes'>somefoo|wonderful</token>"), 1);
    assertMatches(makeRule("<token inflected='yes' regexp='yes'>somefoo|wonderf.l</token>"), 1);
    assertMatches(makeRule("<token inflected='yes' regexp='yes'>somefoo|wonderX.l</token>"), 0);
  }

  /**
   * Runs a broad set of single- and multi-token rules against the index. Assertions that
   * are commented out were already disabled and are kept for reference; presumably the
   * relaxed query over-matches in those cases (TODO confirm).
   */
  public void testSeveralElements() throws Exception {
    // See setUp() for the texts we can match
    assertMatches(makeRule("<token>How</token>"), 1);
    assertMatches(makeRule("<token>how</token>"), 1);
    assertMatches(makeRule("<token>LanguageTool</token>"), 1);
    assertMatches(makeRule("<token>UnknownWord</token>"), 0);

    assertMatches(makeCaseSensitiveRule("<token>How</token>"), 1);
    assertMatches(makeCaseSensitiveRule("<token>how</token>"), 0);

    assertMatches(makeRule("<token regexp='yes'>Foo|How</token>"), 1);
    assertMatches(makeRule("<token regexp='yes'>Foo|how</token>"), 1);
    assertMatches(makeRule("<token regexp='yes'>Foo|Bar</token>"), 0);

    assertMatches(makeCaseSensitiveRule("<token regexp='yes'>Foo|How</token>"), 1);
    assertMatches(makeCaseSensitiveRule("<token regexp='yes'>foo|HOW</token>"), 0);
    assertMatches(makeCaseSensitiveRule("<token regexp='yes'>foo|how</token>"), 0);

    assertMatches(makeRule("<token postag='WRB'></token>"), 1);
    assertMatches(makeRule("<token postag='FOO'></token>"), 0);
    assertMatches(makeRule("<token postag='[XW]RB' postag_regexp='yes'></token>"), 1);
    assertMatches(makeRule("<token postag='FOO|WRB' postag_regexp='yes'></token>"), 1);
    assertMatches(makeRule("<token postag='WRB|FOO' postag_regexp='yes'></token>"), 1);
    assertMatches(makeRule("<token postag='[XY]OO' postag_regexp='yes'></token>"), 0);

    // inflected
    assertMatches(makeRule("<token>grammar</token><token>checker</token>"), 0);
    assertMatches(makeRule("<token>grammar</token><token>checkers</token>"), 1);
    assertMatches(makeRule("<token>grammar</token><token inflected='yes'>checker</token>"), 1);

    // combine term and POS tag:
    assertMatches(makeRule("<token postag='WRB'>How</token>"), 1);
    assertMatches(makeRule("<token postag='[XW]RB' postag_regexp='yes'>How</token>"), 1);
    assertMatches(makeRule("<token postag='WRB'>Foo</token>"), 0);
    assertMatches(makeRule("<token postag='FOO'>How</token>"), 0);

    // rules with more than one token:
    assertMatches(makeRule("<token>How</token> <token>do</token>"), 1);
    //assertMatches(makeRule("<token>do</token> <token>How</token>"), 0);
    assertMatches(makeRule("<token>How</token> <token>foo</token>"), 0);
    assertMatches(makeRule("<token>How</token> <token>do</token> <token>you</token>"), 1);
    assertMatches(makeRule("<token>How</token> <token>do</token> <token>foo</token>"), 0);
    assertMatches(makeRule("<token regexp='yes'>Foo|How</token> <token>do</token>"), 1);

    assertMatches(makeRule("<token skip='-1'>How</token> <token>wonderful</token>"), 1);
    //assertMatches(makeRule("<token skip='-1'>wonderful</token> <token>How</token>"), 0);
    assertMatches(makeRule("<token skip='6'>How</token> <token>wonderful</token>"), 1);
    assertMatches(makeRule("<token skip='5'>How</token> <token>wonderful</token>"), 1);
    //assertMatches(makeRule("<token skip='4'>How</token> <token>wonderful</token>"), 0);

    assertMatches(makeRule("<token>How</token> <token skip='-1'>do</token> <token>wonderful</token>"), 1);
    assertMatches(makeRule("<token>How</token> <token skip='4'>do</token> <token>wonderful</token>"), 1);
    //assertMatches(makeRule("<token>How</token> <token skip='3'>do</token> <token>wonderful</token>"), 0);

    assertMatches(makeRule("<token skip='-1'>How</token> <token skip='-1'>thin</token> <token>wonderful</token>"), 1);
    assertMatches(makeRule("<token skip='3'>How</token> <token skip='3'>thin</token> <token>wonderful</token>"), 1);
    assertMatches(makeRule("<token skip='3'>How</token> <token skip='3'>thin</token> <token>foo</token>"), 0);

    assertMatches(makeRule("<token>E</token> <token>.</token> <token>G</token> <token>.</token>"), 1);
    assertMatches(makeRule("<token>X</token> <token>.</token> <token>G</token> <token>.</token>"), 0);
    //assertMatches(makeRule("<token>E</token> <token>,</token> <token>G</token> <token>.</token>"), 0);
    assertMatches(makeRule("<token>E</token> <token>.</token> <token>G</token> <token>.</token> <token>LanguageTool</token>"), 1);
    assertMatches(makeRule("<token>E</token> <token>.</token> <token>G</token> <token>.</token> <token>foo</token>"), 0);

    // negation:
    assertMatches(makeRule("<token>How</token> <token negate='yes'>foo</token>"), 1);
    assertMatches(makeRule("<token>How</token> <token negate='yes'>do</token>"), 1); // known overmatching
    assertMatches(makeRule("<token>How</token> <token>do</token> <token negate='yes'>foo</token>"), 1);
    assertMatches(makeRule("<token>How</token> <token negate='yes'>foo</token> <token>you</token>"), 1);
    assertMatches(makeRule("<token>How</token> <token>do</token> <token negate='yes'>you</token>"), 1); // known overmatching
    assertMatches(makeRule("<token>How</token> <token negate='yes'>do</token> <token>you</token>"), 1); // known overmatching
    assertMatches(makeRule("<token>How</token> <token negate='yes'>do</token> <token negate='yes'>you</token>"), 1); // known overmatching
  }

  /**
   * Builds the relaxed query for {@code patternRule}, runs it against the test index and
   * asserts the total number of hits.
   *
   * @param patternRule the rule to convert into a query
   * @param expectedMatches the expected total hit count
   */
  private void assertMatches(AbstractPatternRule patternRule, int expectedMatches) throws Exception {
    PatternRuleQueryBuilder queryBuilder = new PatternRuleQueryBuilder(language, searcher);
    Query query = queryBuilder.buildRelaxedQuery(patternRule);
    //System.out.println("QUERY: " + query);
    int matches = searcher.search(query, 1000).totalHits;
    assertEquals("Query failed: " + query, expectedMatches, matches);
  }

  /**
   * Asserts that building a relaxed query for the given pattern content is rejected with
   * an {@link UnsupportedPatternRuleException}.
   *
   * @param ruleXml the {@code <token>} elements to place inside a case-insensitive pattern
   */
  private void assertUnsupported(String ruleXml) throws Exception {
    AbstractPatternRule patternRule = makeRule(ruleXml, false);
    PatternRuleQueryBuilder queryBuilder = new PatternRuleQueryBuilder(language, searcher);
    try {
      queryBuilder.buildRelaxedQuery(patternRule);
      fail("Exception should be thrown for unsupported PatternRule: " + ruleXml);
    } catch (UnsupportedPatternRuleException expected) {
      // this is the outcome the test is looking for
    }
  }

  private AbstractPatternRule makeCaseSensitiveRule(String ruleXml) throws IOException {
    return makeRule(ruleXml, true);
  }

  private AbstractPatternRule makeRule(String ruleXml) throws IOException {
    return makeRule(ruleXml, false);
  }

  /**
   * Wraps {@code ruleXml} in a minimal rule file, loads it and returns the single
   * resulting rule.
   *
   * @param ruleXml the {@code <token>} elements that form the pattern
   * @param caseSensitive whether to set {@code case_sensitive='yes'} on the pattern
   * @return the one rule defined by the generated XML
   */
  private AbstractPatternRule makeRule(String ruleXml, boolean caseSensitive) throws IOException {
    StringBuilder sb = new StringBuilder();
    sb.append("<?xml version='1.0' encoding='UTF-8'?>");
    sb.append("<rules lang='en'> <category name='Test'> <rule id='TEST_RULE' name='test'>");
    if (caseSensitive) {
      sb.append("<pattern case_sensitive='yes'>");
    } else {
      sb.append("<pattern>");
    }
    sb.append(ruleXml);
    sb.append("</pattern> </rule> </category> </rules>");
    // The XML prolog declares UTF-8, so encode explicitly instead of using the platform default.
    InputStream input = new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8));
    PatternRuleLoader ruleLoader = new PatternRuleLoader();
    List<AbstractPatternRule> rules = ruleLoader.getRules(input, "test.xml");
    assertEquals(1, rules.size());
    return rules.get(0);
  }

}