/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.searcher.basic; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.TermQuery; import org.apache.nutch.analysis.NutchDocumentAnalyzer; import org.apache.nutch.analysis.CommonGrams; import org.apache.nutch.searcher.QueryFilter; import org.apache.nutch.searcher.Query; import org.apache.nutch.searcher.Query.*; import org.apache.hadoop.conf.Configuration; /** The default query filter. Query terms in the default query field are * expanded to search the url, anchor and content document fields.*/ public class BasicQueryFilter implements QueryFilter { private static final int URL_BOOST = 0; private static final int ANCHOR_BOOST = 1; private static final int CONTENT_BOOST = 2; private static final int TITLE_BOOST = 3; private static final int HOST_BOOST = 4; private static int SLOP = Integer.MAX_VALUE; private float PHRASE_BOOST; private static final String[] FIELDS = { "url", "anchor", "content", "title", "host" }; private float[] FIELD_BOOSTS = new float[5]; /** * Set the boost factor for url matches, relative to content and anchor * matches */ public void setUrlBoost(float boost) { FIELD_BOOSTS[URL_BOOST] = boost; } /** Set the boost factor for title/anchor matches, relative to url and * content matches. */ public void setAnchorBoost(float boost) { FIELD_BOOSTS[ANCHOR_BOOST] = boost; } /** Set the boost factor for sloppy phrase matches relative to unordered term * matches. */ public void setPhraseBoost(float boost) { PHRASE_BOOST = boost; } /** Set the maximum number of terms permitted between matching terms in a * sloppy phrase match. */ public void setSlop(int slop) { SLOP = slop; } private Configuration conf; public BooleanQuery filter(Query input, BooleanQuery output) { addTerms(input, output); addSloppyPhrases(input, output); return output; } private void addTerms(Query input, BooleanQuery output) { Clause[] clauses = input.getClauses(); for (int i = 0; i < clauses.length; i++) { Clause c = clauses[i]; if (!c.getField().equals(Clause.DEFAULT_FIELD)) continue; // skip non-default fields BooleanQuery out = new BooleanQuery(); for (int f = 0; f < FIELDS.length; f++) { Clause o = c; if (c.isPhrase()) { // optimize phrase clauses String[] opt = new CommonGrams(getConf()).optimizePhrase(c.getPhrase(), FIELDS[f]); if (opt.length==1) { o = new Clause(new Term(opt[0]), c.isRequired(), c.isProhibited(), getConf()); } else { o = new Clause(new Phrase(opt), c.isRequired(), c.isProhibited(), getConf()); } } out.add(o.isPhrase() ? exactPhrase(o.getPhrase(), FIELDS[f], FIELD_BOOSTS[f]) : termQuery(FIELDS[f], o.getTerm(), FIELD_BOOSTS[f]), BooleanClause.Occur.SHOULD); } output.add(out, (c.isProhibited() ? BooleanClause.Occur.MUST_NOT : (c.isRequired() ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD ))); } } private void addSloppyPhrases(Query input, BooleanQuery output) { Clause[] clauses = input.getClauses(); for (int f = 0; f < FIELDS.length; f++) { PhraseQuery sloppyPhrase = new PhraseQuery(); sloppyPhrase.setBoost(FIELD_BOOSTS[f] * PHRASE_BOOST); sloppyPhrase.setSlop("anchor".equals(FIELDS[f]) ? NutchDocumentAnalyzer.INTER_ANCHOR_GAP : SLOP); int sloppyTerms = 0; for (int i = 0; i < clauses.length; i++) { Clause c = clauses[i]; if (!c.getField().equals(Clause.DEFAULT_FIELD)) continue; // skip non-default fields if (c.isPhrase()) // skip exact phrases continue; if (c.isProhibited()) // skip prohibited terms continue; sloppyPhrase.add(luceneTerm(FIELDS[f], c.getTerm())); sloppyTerms++; } if (sloppyTerms > 1) output.add(sloppyPhrase, BooleanClause.Occur.SHOULD); } } private org.apache.lucene.search.Query termQuery(String field, Term term, float boost) { TermQuery result = new TermQuery(luceneTerm(field, term)); result.setBoost(boost); return result; } /** Utility to construct a Lucene exact phrase query for a Nutch phrase. */ private org.apache.lucene.search.Query exactPhrase(Phrase nutchPhrase, String field, float boost) { Term[] terms = nutchPhrase.getTerms(); PhraseQuery exactPhrase = new PhraseQuery(); for (int i = 0; i < terms.length; i++) { exactPhrase.add(luceneTerm(field, terms[i])); } exactPhrase.setBoost(boost); return exactPhrase; } /** Utility to construct a Lucene Term given a Nutch query term and field. */ private static org.apache.lucene.index.Term luceneTerm(String field, Term term) { return new org.apache.lucene.index.Term(field, term.toString()); } public void setConf(Configuration conf) { this.conf = conf; this.FIELD_BOOSTS[URL_BOOST] = conf.getFloat("query.url.boost", 4.0f); this.FIELD_BOOSTS[ANCHOR_BOOST] = conf.getFloat("query.anchor.boost", 2.0f); this.FIELD_BOOSTS[CONTENT_BOOST] = conf.getFloat("query.content.boost", 1.0f); this.FIELD_BOOSTS[TITLE_BOOST] = conf.getFloat("query.title.boost", 1.5f); this.FIELD_BOOSTS[HOST_BOOST] = conf.getFloat("query.host.boost", 2.0f); this.PHRASE_BOOST = conf.getFloat("query.phrase.boost", 1.0f); } public Configuration getConf() { return this.conf; } }