/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.dev.index;

import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.jetbrains.annotations.Nullable;
import org.languagetool.AnalyzedToken;
import org.languagetool.Language;
import org.languagetool.rules.patterns.AbstractPatternRule;
import org.languagetool.rules.patterns.PatternToken;
import org.languagetool.synthesis.Synthesizer;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

import static org.languagetool.dev.index.LanguageToolFilter.LEMMA_PREFIX;
import static org.languagetool.dev.index.LanguageToolFilter.POS_PREFIX;

/**
 * A factory class for building a Lucene Query from a PatternRule. The query
 * requires an index where each document contains only one sentence. It returns
 * potential matches, i.e. LanguageTool still needs to run over the matches
 * to make sure there is indeed an error.
 *
 * @author Tao Lin
 * @author Daniel Naber
 */
public class PatternRuleQueryBuilder {

  public static final String FIELD_NAME = "field";
  public static final String SOURCE_FIELD_NAME = "source";
  public static final String FIELD_NAME_LOWERCASE = "fieldLowercase";

  /** Matches a token text that consists only of a match reference, e.g. {@code \1}. */
  private static final Pattern MATCH_REFERENCE = Pattern.compile("\\\\\\d+");

  private final Language language;
  private final IndexSearcher indexSearcher;

  public PatternRuleQueryBuilder(Language language, IndexSearcher indexSearcher) {
    this.language = language;
    this.indexSearcher = indexSearcher;
  }

  /**
   * Iterate over all elements, ignore those not supported, add the other ones to a BooleanQuery.
   *
   * @param rule the rule whose pattern tokens are turned into query clauses
   * @return a query with one clause per supported pattern token
   * @throws UnsupportedPatternRuleException if no query could be created for the rule
   */
  public Query buildRelaxedQuery(AbstractPatternRule rule) throws UnsupportedPatternRuleException {
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    for (PatternToken patternToken : rule.getPatternTokens()) {
      try {
        builder.add(makeQuery(patternToken));
      } catch (UnsupportedPatternRuleException ignored) {
        // cannot handle - okay to ignore, as we may return too broad matches
      } catch (Exception e) {
        throw new RuntimeException("Could not create query for rule " + rule.getId(), e);
      }
    }
    BooleanQuery query = builder.build();
    if (query.clauses().isEmpty()) {
      throw new UnsupportedPatternRuleException("No items found in rule that can be used to build a search query: " + rule);
    }
    return query;
  }

  /**
   * Builds the clause for a single pattern token from its term and/or its POS tag.
   *
   * @throws UnsupportedPatternRuleException if the token uses an unsupported feature or
   *         yields neither a term clause nor a POS clause
   */
  private BooleanClause makeQuery(PatternToken patternToken) throws UnsupportedPatternRuleException {
    checkUnsupportedElement(patternToken);
    BooleanClause termQuery = getTermQueryOrNull(patternToken, patternToken.getString());
    BooleanClause posQuery = getPosQueryOrNull(patternToken, patternToken.getPOStag());
    if (termQuery == null && posQuery == null) {
      throw new UnsupportedPatternRuleException("Neither POS tag nor term set for element: " + patternToken);
    }
    if (posQuery == null) {
      return termQuery;
    }
    if (termQuery == null) {
      return posQuery;
    }
    // if both term and POS are set, we create a query where both are at the same position
    if (!mustOccur(termQuery) || !mustOccur(posQuery)) {
      // should not happen, we always use Occur.MUST:
      throw new UnsupportedPatternRuleException("Term/POS combination not supported yet: " + patternToken);
    }
    SpanQuery[] spanClauses = {asSpanQuery(termQuery), asSpanQuery(posQuery)};
    return new BooleanClause(new SpanNearQuery(spanClauses, 0, false), BooleanClause.Occur.MUST);
  }

  /**
   * Converts the query of the given clause into a span query: multi-term queries are wrapped,
   * any other query must resolve to exactly one term.
   */
  private SpanQuery asSpanQuery(BooleanClause query) {
    Query inner = query.getQuery();
    if (inner instanceof MultiTermQuery) {
      return new SpanMultiTermQueryWrapper<>((MultiTermQuery) inner);
    }
    Set<Term> terms = new HashSet<>();
    try {
      indexSearcher.createWeight(inner, false).extractTerms(terms);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    if (terms.size() != 1) {
      throw new RuntimeException("Expected term set of size 1: " + terms);
    }
    return new SpanTermQuery(terms.iterator().next());
  }

  private boolean mustOccur(BooleanClause query) {
    return query != null && query.getOccur() == BooleanClause.Occur.MUST;
  }

  /**
   * Builds the clause for the token's text.
   *
   * @return the clause, or {@code null} if no text is set, if the token is negated or optional,
   *         or if the token is inflected (non-regex) and the language has no synthesizer
   * @throws UnsupportedPatternRuleException if the token's regex cannot be expressed as a Lucene query
   */
  @Nullable
  private BooleanClause getTermQueryOrNull(PatternToken patternToken, String termStr) throws UnsupportedPatternRuleException {
    if (termStr == null || termStr.isEmpty()) {
      return null;
    }
    if (patternToken.getNegation() || patternToken.getMinOccurrence() == 0) {
      // we need to ignore this - negation, if any, must happen at the same position
      return null;
    }
    boolean isRegex = patternToken.isRegularExpression();
    if (patternToken.isInflected()) {
      if (isRegex) {
        Term lemmaQueryTerm = getQueryTerm(patternToken, LEMMA_PREFIX + "(", simplifyRegex(termStr), ")");
        Query regexpQuery = getRegexQuery(lemmaQueryTerm, termStr, patternToken);
        return new BooleanClause(regexpQuery, BooleanClause.Occur.MUST);
      }
      return getInflectedTermQueryOrNull(patternToken, termStr);
    }
    Term termQueryTerm = getTermQueryTerm(patternToken, termStr);
    Query termQuery = isRegex
        ? getRegexQuery(termQueryTerm, termStr, patternToken)
        : new TermQuery(termQueryTerm);
    return new BooleanClause(termQuery, BooleanClause.Occur.MUST);
  }

  /**
   * Builds the clause for an inflected, non-regex token by querying for the forms the
   * language's synthesizer returns for the token text.
   *
   * @return the clause, or {@code null} if the language has no synthesizer
   */
  @Nullable
  private BooleanClause getInflectedTermQueryOrNull(PatternToken patternToken, String termStr) {
    // A plain TermQuery on (LEMMA_PREFIX + termStr) would be simpler, but leads to problems
    // with e.g. German rules ZEITLICH_SYNCHRON and GEWISSEN_SUBST.
    Synthesizer synthesizer = language.getSynthesizer();
    if (synthesizer == null) {
      return null;
    }
    try {
      String[] synthesized = synthesizer.synthesize(new AnalyzedToken(termStr, null, termStr), ".*", true);
      Query query;
      if (synthesized.length == 0) {
        query = new TermQuery(getTermQueryTerm(patternToken, termStr));
      } else {
        // NOTE(review): the forms are joined into a regex without escaping - confirm that
        // synthesized forms never contain regex meta characters
        query = new RegexpQuery(getTermQueryTerm(patternToken, StringUtils.join(synthesized, "|")));
      }
      return new BooleanClause(query, BooleanClause.Occur.MUST);
    } catch (IOException e) {
      throw new RuntimeException("Could not build Lucene query for '" + patternToken + "' and '" + termStr + "'", e);
    }
  }

  // regex syntax not supported, but doesn't matter - remove or simplify it:
  private String simplifyRegex(String regex) {
    return regex.replace("(?:", "(").replace("\\d", "[0-9]").replace("\\w", "[a-zA-Z_0-9]");
  }

  // the new and fast Regex query of Lucene doesn't support full Java regex syntax:
  private boolean needsSimplification(String regex) {
    return regex.contains("(?:") || regex.contains("\\d") || regex.contains("\\w");
  }

  /**
   * Builds the clause for the token's POS tag.
   *
   * @return the clause, or {@code null} if no POS tag is set or if the token is
   *         POS-negated or optional
   * @throws UnsupportedPatternRuleException if the POS regex cannot be expressed as a Lucene query
   */
  @Nullable
  private BooleanClause getPosQueryOrNull(PatternToken patternToken, String pos) throws UnsupportedPatternRuleException {
    if (pos == null || pos.isEmpty()) {
      return null;
    }
    if (patternToken.getPOSNegation() || patternToken.getMinOccurrence() == 0) {
      // we need to ignore this - negation, if any, must happen at the same position
      return null;
    }
    Query posQuery;
    if (patternToken.isPOStagRegularExpression()) {
      Term posQueryTerm = getQueryTerm(patternToken, POS_PREFIX + "(", pos, ")");
      posQuery = getRegexQuery(posQueryTerm, pos, patternToken);
    } else {
      posQuery = new TermQuery(getQueryTerm(patternToken, POS_PREFIX, pos, ""));
    }
    return new BooleanClause(posQuery, BooleanClause.Occur.MUST);
  }

  /** Creates the term for a token text, using the lowercase field unless the token is case-sensitive. */
  private Term getTermQueryTerm(PatternToken patternToken, String str) {
    if (patternToken.isCaseSensitive()) {
      return new Term(FIELD_NAME, str);
    } else {
      return new Term(FIELD_NAME_LOWERCASE, str.toLowerCase());
    }
  }

  /**
   * Creates the term for {@code prefix + str + suffix}, using the lowercase field (and
   * lowercasing all three parts) unless the token is case-sensitive.
   */
  private Term getQueryTerm(PatternToken patternToken, String prefix, String str, String suffix) {
    if (patternToken.isCaseSensitive()) {
      return new Term(FIELD_NAME, prefix + str + suffix);
    } else {
      return new Term(FIELD_NAME_LOWERCASE, prefix.toLowerCase() + str.toLowerCase() + suffix.toLowerCase());
    }
  }

  /**
   * Creates a Lucene regex query for the given term.
   *
   * @param term the term (field plus regex text) to query
   * @param str the original regex of the pattern token, used to decide about simplification
   * @param patternToken the token the regex comes from, used for error messages only
   * @throws UnsupportedPatternRuleException if the regex uses constructs Lucene cannot handle
   */
  private Query getRegexQuery(Term term, String str, PatternToken patternToken) throws UnsupportedPatternRuleException {
    // Check this first: otherwise a regex that also needs simplification would skip this check
    // and be turned into a query although it contains an unsupported inline flag.
    if (str.contains("?iu") || str.contains("?-i")) {
      // Lucene's RegexpQuery doesn't seem to support this
      throw new UnsupportedPatternRuleException("Regex constructs like '?iu' and '?-i' are not supported: " + patternToken);
    }
    try {
      if (needsSimplification(str)) {
        Term newTerm = new Term(term.field(), simplifyRegex(term.text()));
        return new RegexpQuery(newTerm);
      }
      return new RegexpQuery(term);
    } catch (IllegalArgumentException e) {
      // constructs like "\p{Punct}" not supported by Lucene RegExp:
      throw new UnsupportedPatternRuleException("Advanced regex like '\\p{Punct}' are not supported: " + patternToken);
    }
  }

  /**
   * Rejects pattern tokens that use features for which no query can be built.
   *
   * @throws UnsupportedPatternRuleException if the token is part of an {@code <or>} group,
   *         is unified, or consists only of a match reference
   */
  private void checkUnsupportedElement(PatternToken patternToken) throws UnsupportedPatternRuleException {
    if (patternToken.hasOrGroup()) {
      // TODO: this is not enough: the first of the tokens in the <or> group will not get into this branch
      throw new UnsupportedPatternRuleException("<or> not yet supported.");
    }
    if (patternToken.isUnified()) {
      throw new UnsupportedPatternRuleException("Elements with unified tokens are not supported.");
    }
    // the token text is treated as nullable elsewhere in this class, so guard against null here, too
    String tokenStr = patternToken.getString();
    if (tokenStr != null && MATCH_REFERENCE.matcher(tokenStr).matches()) {  // e.g. "\1"
      throw new UnsupportedPatternRuleException("Elements with only match references (e.g. \\1) are not supported.");
    }
  }

}