package org.cdlib.xtf.textEngine;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Vector;

import org.apache.lucene.bigram.BigramQueryRewriter;
import org.apache.lucene.chunk.SpanChunkedNotQuery;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.cdlib.xtf.textEngine.SpanExactQuery;
import org.cdlib.xtf.util.Tester;
import org.cdlib.xtf.util.Trace;

/**
 * Rewrites a query to eliminate stop words by combining them with
 * adjacent non-stop-words, forming "bi-grams". This is a fairly in-depth
 * process, as bi-gramming across NEAR and OR queries is complex.
 *
 * This subclass adds two things on top of {@link BigramQueryRewriter}:
 * span queries on fields that are not in the tokenized-field set are left
 * untouched, and the XTF-specific query types (section type, exact,
 * "more like this" and numeric range) are dispatched to their own
 * rewrite methods.
 */
public class XtfBigramQueryRewriter extends BigramQueryRewriter {
  /** Names of the fields that are tokenized; only these get rewritten. */
  private Set tokenizedFields;

  /**
   * Constructs a rewriter using the given stopword set.
   *
   * @param stopSet   Set of stopwords to remove or bi-gram. This can be
   *                  constructed easily by calling
   *                  {@link #makeStopSet(String)}.
   * @param maxSlop   Maximum slop to allow in a query, based on the index
   *                  being queried.
   * @param tokFields List of fields that are tokenized. We won't rewrite
   *                  queries for non-tokenized fields.
   */
  public XtfBigramQueryRewriter(Set stopSet, int maxSlop, Set tokFields) {
    super(stopSet, maxSlop);
    tokenizedFields = tokFields;
  } // constructor

  /**
   * Rewrite a query of any supported type. Stop words will either be
   * removed or bi-grammed. Skips all queries for un-tokenized fields.
   *
   * @param q   Query to rewrite
   * @return    A new query, or 'q' unchanged if no change was needed.
   */
  public Query rewriteQuery(Query q) {
    // Skip all queries for non-tokenized fields. Only span queries expose
    // a field here, so other query types always fall through.
    if (q instanceof SpanQuery &&
        !tokenizedFields.contains(((SpanQuery)q).getField()))
      return q;

    // Handle our special XTF queries.
    if (q instanceof SpanSectionTypeQuery)
      return rewrite((SpanSectionTypeQuery)q);
    else if (q instanceof SpanExactQuery)
      return rewrite((SpanExactQuery)q);
    else if (q instanceof MoreLikeThisQuery)
      return rewrite((MoreLikeThisQuery)q);
    else if (q instanceof NumericRangeQuery)
      return rewrite((NumericRangeQuery)q);

    // Punt to normal handling.
    return super.rewriteQuery(q);
  } // rewriteQuery()

  /**
   * Rewrite a section type query. It's very simple: simply rewrite the
   * sub-queries.
   *
   * @param stq  The query to rewrite
   * @return     Rewritten version, or 'stq' unchanged if no change is
   *             needed.
   */
  protected Query rewrite(SpanSectionTypeQuery stq) {
    // Rewrite the sub-queries
    SpanQuery textQuery = (SpanQuery)rewriteQuery(stq.getTextQuery());
    SpanQuery secTypeQuery =
        (SpanQuery)rewriteQuery(stq.getSectionTypeQuery());

    // If the sub-queries didn't change, then neither does the main query,
    // unless a rewrite is being forced. The forceRewrite() check mirrors
    // the one made by the "more like this" and numeric range rewrites in
    // this class.
    if (textQuery == stq.getTextQuery() &&
        secTypeQuery == stq.getSectionTypeQuery() &&
        !forceRewrite(stq))
      return stq;

    // Make a new query
    Query newq = new SpanSectionTypeQuery(textQuery, secTypeQuery);
    copyBoost(stq, newq);
    return newq;
  } // rewrite()

  /**
   * Rewrite a span EXACT query. Stop words will be bi-grammed into adjacent
   * terms.
   *
   * @param q  The query to rewrite
   * @return   Rewritten version, or 'q' unchanged if no change is needed.
   */
  protected Query rewrite(SpanExactQuery q) {
    // Rewrite each clause. Allow single clauses to be promoted, and
    // do perform bi-gramming.
    //
    // NOTE(review): the trailing 0 is presumably the slop; confirm against
    // the inherited rewriteClauses() signature.
    return rewriteClauses(q, q.getClauses(), true, true, 0,
      new SpanClauseJoiner() {
        public SpanQuery join(SpanQuery[] clauses) {
          return new SpanExactQuery(clauses);
        }
      });
  } // rewrite()

  /**
   * Rewrite a "more like this" query by rewriting its sub-query.
   *
   * @param mlt  The query to rewrite
   * @return     A new query wrapping the rewritten sub-query with the
   *             boost copied over, or 'mlt' itself if the sub-query is
   *             unchanged and no rewrite is forced.
   */
  protected Query rewrite(MoreLikeThisQuery mlt) {
    Query rewrittenSub = rewriteQuery(mlt.getSubQuery());
    if (rewrittenSub == mlt.getSubQuery() && !forceRewrite(mlt))
      return mlt;
    return copyBoost(mlt, new MoreLikeThisQuery(rewrittenSub));
  } // rewrite()

  /**
   * Rewrite a numeric range query. The range itself is never altered.
   *
   * @param nrq  The query to rewrite
   * @return     A clone of 'nrq' if a rewrite is forced, else 'nrq' itself.
   */
  protected Query rewrite(NumericRangeQuery nrq) {
    if (!forceRewrite(nrq))
      return nrq;
    return (NumericRangeQuery)nrq.clone();
  } // rewrite()

  /**
   * Basic regression test
   *
   * NOTE(review): testQuery() and testUnchanged() build a plain
   * BigramQueryRewriter rather than an XtfBigramQueryRewriter, so the
   * tokenized-field filter and the XTF-specific rewrites above are not
   * exercised here. Presumably intentional; confirm.
   */
  public static final Tester tester = new Tester("XtfBigramStopFilter") {
    /** Stop words used by the tests; replaced at the start of testImpl(). */
    private Set stopSet = new HashSet();

    /**
     * Renders a query as compact text so that rewrite results can be
     * compared against an expected string. A boost other than 1 is
     * appended as "^N", truncated to an integer.
     */
    private String queryToText(Query q) {
      StringBuffer buf = new StringBuffer();

      // Render a boosted query by temporarily resetting its boost to 1,
      // rendering the un-boosted form, then restoring the boost.
      if (q.getBoost() != 1.0f) {
        float boost = q.getBoost();
        q.setBoost(1.0f);
        buf.append(queryToText(q));
        q.setBoost(boost);
        buf.append("^" + (int)boost);
        return buf.toString();
      }

      if (q instanceof SpanTermQuery)
        return ((SpanTermQuery)q).getTerm().text();

      if (q instanceof TermQuery)
        return ((TermQuery)q).getTerm().text();

      if (q instanceof SpanNearQuery) {
        SpanQuery[] clauses = ((SpanNearQuery)q).getClauses();
        int slop = ((SpanNearQuery)q).getSlop();
        buf.append("\"");
        for (int i = 0; i < clauses.length; i++) {
          // Length > 1 means something follows the opening delimiter,
          // so a separator is needed.
          if (buf.length() > 1)
            buf.append(" ");
          buf.append(queryToText(clauses[i]));
        }
        buf.append("\"");
        if (slop != 0)
          buf.append("~" + slop);
        return buf.toString();
      }

      if (q instanceof SpanOrQuery) {
        SpanQuery[] clauses = ((SpanOrQuery)q).getClauses();
        buf.append("(");
        for (int i = 0; i < clauses.length; i++) {
          if (buf.length() > 1)
            buf.append(" OR ");
          buf.append(queryToText(clauses[i]));
        }
        buf.append(")");
        return buf.toString();
      }

      if (q instanceof SpanChunkedNotQuery) {
        SpanChunkedNotQuery nq = (SpanChunkedNotQuery)q;
        buf.append("(");
        buf.append(queryToText(nq.getInclude()));
        buf.append(" NOT ");
        buf.append(queryToText(nq.getExclude()));
        buf.append(")~" + nq.getSlop());
        return buf.toString();
      }

      if (q instanceof BooleanQuery) {
        BooleanClause[] clauses = ((BooleanQuery)q).getClauses();
        buf.append("(");
        for (int i = 0; i < clauses.length; i++) {
          if (buf.length() > 1)
            buf.append(" ");
          // MUST is shown as "+", MUST_NOT as "-", anything else bare.
          if (clauses[i].getOccur() == BooleanClause.Occur.MUST)
            buf.append("+");
          else if (clauses[i].getOccur() == BooleanClause.Occur.MUST_NOT)
            buf.append("-");
          buf.append(queryToText(clauses[i].getQuery()));
        }
        buf.append(")");
        return buf.toString();
      }

      // Unknown query type: fall back on its own string form.
      return q.toString();
    } // queryToText()

    /** Makes a span term query on the "text" field. */
    private SpanQuery term(String text) {
      return new SpanTermQuery(new Term("text", text));
    }

    /** Splits 'text' on whitespace and makes one span term per token. */
    private SpanQuery[] terms(String text) {
      Vector v = new Vector();
      StringTokenizer st = new StringTokenizer(text);
      while (st.hasMoreTokens())
        v.add(term(st.nextToken()));
      return (SpanQuery[])v.toArray(new SpanQuery[v.size()]);
    }

    /** Makes a span OR query over the clauses. */
    private SpanQuery or(SpanQuery[] clauses) {
      return new SpanOrQuery(clauses);
    }

    /** Makes a chunked span NOT query with the given slop. */
    private SpanQuery not(int slop, SpanQuery include, SpanQuery exclude) {
      return new SpanChunkedNotQuery(include, exclude, slop);
    }

    /** Makes an un-ordered span NEAR query with the given slop. */
    private SpanQuery near(int slop, SpanQuery[] clauses) {
      return new SpanNearQuery(clauses, slop, false);
    }

    /**
     * AND is modeled as a NEAR with slop 20, the same value passed as
     * maxSlop to the rewriters built by testQuery() and testUnchanged().
     */
    private SpanQuery and(SpanQuery[] clauses) {
      return near(20, clauses);
    }

    /** A phrase is modeled as a NEAR with zero slop. */
    private SpanQuery phrase(SpanQuery[] clauses) {
      return near(0, clauses);
    }

    /** Packs two queries into an array. */
    private SpanQuery[] join(SpanQuery q1, SpanQuery q2) {
      SpanQuery[] array = new SpanQuery[2];
      array[0] = q1;
      array[1] = q2;
      return array;
    }

    /** Packs three queries into an array. */
    private SpanQuery[] join(SpanQuery q1, SpanQuery q2, SpanQuery q3) {
      SpanQuery[] array = new SpanQuery[3];
      array[0] = q1;
      array[1] = q2;
      array[2] = q3;
      return array;
    }

    /**
     * Makes a boolean query from two or three clauses; the third clause
     * is skipped when 'q3' is null.
     */
    private Query bool(Query q1, BooleanClause.Occur occur1,
                       Query q2, BooleanClause.Occur occur2,
                       Query q3, BooleanClause.Occur occur3) {
      BooleanQuery q = new BooleanQuery();
      q.add(q1, occur1);
      q.add(q2, occur2);
      if (q3 != null)
        q.add(q3, occur3);
      return q;
    }

    /** Makes a regular (non-span) term query on the "text" field. */
    private Query regTerm(String text) {
      return new TermQuery(new Term("text", text));
    }

    /** Sets the boost on a span query in place and returns it. */
    private SpanQuery boost(float factor, SpanQuery q) {
      q.setBoost(factor);
      return q;
    }

    /** Sets the boost on a query in place and returns it. */
    private Query boost(float factor, Query q) {
      q.setBoost(factor);
      return q;
    }

    /**
     * Rewrites 'query' and asserts that the text form of the result
     * equals 'expectedResult'.
     */
    private void testQuery(Query query, String expectedResult) {
      BigramQueryRewriter rewriter = new BigramQueryRewriter(stopSet, 20);
      Query newQ = rewriter.rewriteQuery(query);
      String result = queryToText(newQ);
      Trace.debug(queryToText(query) + " --> " + result);
      assert result.equals(expectedResult)
          : "Rewrite of " + queryToText(query) + " produced '" + result +
            "' but expected '" + expectedResult + "'";
    } // testQuery()

    /**
     * Rewrites 'query' and asserts that the very same object comes back,
     * i.e. that the rewriter saw nothing to change.
     */
    private void testUnchanged(Query query) {
      BigramQueryRewriter rewriter = new BigramQueryRewriter(stopSet, 20);
      Query newQ = rewriter.rewriteQuery(query);
      assert query == newQ
          : "Query " + queryToText(query) +
            " should not have been rewritten";
    } // testUnchanged()

    /**
     * Run the test.
     */
    protected void testImpl() {
      stopSet = BigramQueryRewriter.makeStopSet("a and it is the of");

      ////////////////////////////////////////////////////////////////////////
      // PHRASE QUERIES
      ////////////////////////////////////////////////////////////////////////

      // Start with some simple ones
      testUnchanged(phrase(terms("hello there")));
      testQuery(phrase(terms("man of war")),
                "\"man~of of~war\"");
      testQuery(phrase(terms("man of the world")),
                "\"man~of of~the the~world\"");
      testQuery(phrase(terms("when it is a problem")),
                "\"when~it it~is is~a a~problem\"");
      testQuery(phrase(terms("and martha is")),
                "\"and~martha martha~is\"");

      // Test phrase queries with non-term clauses.
      testQuery(
        phrase(join(term("the"), or(terms("white beige")), term("rabbit"))),
        "\"(the~white OR the~beige) rabbit\"");

      // It would be a huge pain to deal with trying to apply inner stop
      // words from an OR query to the outer terms. So we just don't.
      //
      testQuery(phrase(join(term("eat"), or(terms("the a")), term("rabbit"))),
                "\"eat rabbit\"");

      // Test boost propagation
      testQuery(phrase(join(term("eat"), boost(5, term("the")), term("wave"))),
                "\"eat~the^5 the~wave^5\"");
      testQuery(phrase(join(term("eat"), term("the"), boost(5, term("wave")))),
                "\"eat~the the~wave^5\"");

      ////////////////////////////////////////////////////////////////////////
      // AND QUERIES
      ////////////////////////////////////////////////////////////////////////

      // Start with simple ones
      testUnchanged(and(terms("hello there")));
      testQuery(and(terms("man of war")),
                "\"(man^0 OR man~of) (of~war OR war^0)\"~20");

      // Test AND queries with non-term clauses.
      testQuery(and(join(term("the"), or(terms("white beige")), term("rabbit"))),
                "\"((the~white OR the~beige) OR (white OR beige)^0) rabbit\"~20");

      // Test boost propagation
      testQuery(boost(2,
                      and(join(term("eat"), boost(5, term("the")), term("wave")))),
                "\"(eat^0 OR eat~the^5) (the~wave^5 OR wave^0)\"~20^2");
      testQuery(boost(5,
                      and(
                        join(boost(2, term("eat")), boost(3, or(terms("the a")))))),
                "eat^10");

      ////////////////////////////////////////////////////////////////////////
      // NEAR QUERIES
      ////////////////////////////////////////////////////////////////////////

      testUnchanged(near(5, terms("three freezy trees")));
      testUnchanged(near(5,
                         join(term("three"), or(terms("freezy breezy")),
                              term("trees"))));
      testQuery(near(5, terms("man of war")),
                "\"(man^0 OR man~of) (of~war OR war^0)\"~5");
      testQuery(
        near(5, terms("when it is a problem")),
        "(\"when~it it~is is~a a~problem\"~5 OR " +
        "\"(when^0 OR when~it) (a~problem OR problem^0)\"~5^0)");
      testQuery(near(5, terms("it is a problem")),
                "(\"it~is is~a a~problem\"~5 OR (a~problem OR problem^0)^0)");
      testQuery(near(5, terms("when it is a")),
                "(\"when~it it~is is~a\"~5 OR (when^0 OR when~it)^0)");

      // Try some near queries with non-term clauses.
      testQuery(near(5, join(or(terms("shake bake")), term("it"))),
                "((shake OR bake)^0 OR (shake~it OR bake~it))");
      testQuery(near(5, join(or(terms("shake bake")), term("it"), term("now"))),
                "\"((shake OR bake)^0 OR (shake~it OR bake~it)) " +
                "(it~now OR now^0)\"~5");
      testQuery(near(5,
                     join(term("jeff"), or(terms("shakes bakes")), term("it"))),
                "\"jeff ((shakes OR bakes)^0 OR (shakes~it OR bakes~it))\"~5");

      // Test boost propagation
      testQuery(
        boost(2,
              near(5,
                   join(boost(3,
                              or(join(boost(4, term("shake")),
                                      boost(5, term("bake"))))),
                        boost(6, term("it")),
                        boost(7, term("now"))))),
        "\"((shake^4 OR bake^5)^2 OR " +
        "(shake~it^6 OR bake~it^6)^3) " +
        "(it~now^7 OR now^5)\"~5^2");

      testQuery(
        boost(7,
              near(5,
                   join(boost(6,
                              or(join(boost(5, term("shake")),
                                      boost(4, term("bake"))))),
                        boost(3, term("it")),
                        boost(2, term("now"))))),
        "\"((shake^5 OR bake^4)^4 OR " +
        "(shake~it^5 OR bake~it^4)^6) " +
        "(it~now^3 OR now^1)\"~5^7");

      ////////////////////////////////////////////////////////////////////////
      // OR QUERIES
      ////////////////////////////////////////////////////////////////////////

      testUnchanged(or(join(term("foo"), and(terms("bar gaz")))));

      testQuery(or(join(term("arf"), and(terms("the dog")), term("said"))),
                "(arf OR (the~dog OR dog^0) OR said)");

      testQuery(or(join(term("the"), and(terms("very nice")), term("rabbit"))),
                "(\"very nice\"~20 OR rabbit)");

      // Test boost propagation
      testQuery(boost(5,
                      or(join(boost(2, term("the")),
                              boost(3, term("happy")),
                              boost(4, term("couple"))))),
                "(happy^3 OR couple^4)^5");
      testQuery(
        boost(5,
              or(join(boost(2, term("the")),
                      boost(3, term("happy")),
                      boost(4, term("it"))))),
        "happy^15");

      ////////////////////////////////////////////////////////////////////////
      // NOT QUERIES
      ////////////////////////////////////////////////////////////////////////

      testUnchanged(not(5, term("hello"), term("there")));

      testQuery(not(5, and(terms("the cow")), and(terms("the dog"))),
                "((the~cow OR cow^0) NOT (the~dog OR dog^0))~5");

      testQuery(
        and(join(term("like"), term("a"), not(5, term("cow"), term("dog")))),
        "\"(like^0 OR like~a) ((a~cow NOT dog)~5 OR (cow NOT dog)~5^0)\"~20");

      // A couple tests anticipating future support for case sensitivity and
      // accent insensitivity.
      //
      testQuery(
        and(join(term("the"),
                 not(0, term("hat"), or(terms("hat~p hat~c"))),
                 term("trick"))),
        "\"((the~hat NOT (hat~p OR hat~c))~0 OR " +
        "(hat NOT (hat~p OR hat~c))~0^0) trick\"~20");

      testQuery(
        and(join(term("hank"),
                 not(0, term("hat"), or(terms("hat~p hat~c"))),
                 term("is"))),
        "\"hank ((hat NOT (hat~p OR hat~c))~0^0 OR " +
        "(hat~is NOT (hat~p OR hat~c))~0)\"~20");

      ////////////////////////////////////////////////////////////////////////
      // BOOLEAN QUERIES
      ////////////////////////////////////////////////////////////////////////

      testUnchanged(bool(regTerm("hello"), BooleanClause.Occur.MUST,
                         regTerm("kitty"), BooleanClause.Occur.MUST_NOT,
                         regTerm("pencil"), BooleanClause.Occur.MUST));

      testQuery(bool(regTerm("cats"), BooleanClause.Occur.MUST,
                     regTerm("and"), BooleanClause.Occur.MUST_NOT,
                     regTerm("hats"), BooleanClause.Occur.MUST),
                "(+cats +hats)");

      testQuery(bool(regTerm("cats"), BooleanClause.Occur.MUST,
                     regTerm("and"), BooleanClause.Occur.SHOULD,
                     regTerm("hats"), BooleanClause.Occur.MUST),
                "(+cats +hats)");

      testQuery(bool(regTerm("is"), BooleanClause.Occur.MUST,
                     regTerm("it"), BooleanClause.Occur.MUST,
                     regTerm("fun"), BooleanClause.Occur.SHOULD),
                "(fun)");

      // Test BooleanQuery with non-term clauses
      testQuery(bool(regTerm("whip"), BooleanClause.Occur.MUST,
                     or(terms("it them")), BooleanClause.Occur.MUST,
                     regTerm("good"), BooleanClause.Occur.MUST),
                "(+whip +them +good)");

      // Test boost propagation
      testQuery(boost(2,
                      bool(boost(3, regTerm("it")), BooleanClause.Occur.SHOULD,
                           boost(4, regTerm("and")), BooleanClause.Occur.SHOULD,
                           boost(5, regTerm("harry")), BooleanClause.Occur.MUST)),
                "harry^10");
    } // testImpl()
  }; // Tester
} // class XtfBigramQueryRewriter