/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.searcher.basic;

import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.analysis.CommonGrams;

import org.apache.nutch.searcher.QueryFilter;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Query.*;
import org.apache.hadoop.conf.Configuration;

/**
 * The default query filter. Query terms in the default query field are
 * expanded to search the url, anchor, content, title and host document
 * fields.
 *
 * <p>Additional fields can be added by specifying parameters of the form
 * <code>query.basic.(fieldname).boost</code> in the configuration files (see
 * nutch-default.xml for an example). Such fields are used in the clauses
 * generated by this filter; e.g. for a user query <code>A B</code> it
 * generates <code>+(field1:A field2:A ...) +(field1:B field2:B ...)</code>.
 * If you don't want the additional fields to be included in the clauses you
 * will need to implement a custom query filter instead.
 */
public class BasicQueryFilter implements QueryFilter {

  /** Positions of the built-in fields inside {@link #DEFAULT_FIELDS}. */
  private static final int URL_BOOST = 0;
  private static final int ANCHOR_BOOST = 1;
  private static final int CONTENT_BOOST = 2;
  private static final int TITLE_BOOST = 3;
  private static final int HOST_BOOST = 4;

  /** Built-in fields every default-field clause is expanded to. Never modified. */
  private static final String[] DEFAULT_FIELDS =
    { "url", "anchor", "content", "title", "host" };

  /** Matches configuration keys that declare an additional searchable field. */
  private static final Pattern ADDITIONAL_FIELD_PATTERN =
    Pattern.compile("query\\.basic\\.(.+)\\.boost");

  /** Field name that is reserved and therefore never treated as a custom field. */
  private static final String RESERVED_FIELD_NAME = "phrase";

  /** Boost used for a custom field (second argument passed to getFloat). */
  private static final float DEFAULT_CUSTOM_BOOST = 2.0f;

  /**
   * Slop applied to sloppy phrases on every field except "anchor". Kept per
   * instance so that {@link #setSlop(int)} only affects the filter it is
   * called on.
   */
  private int slop = Integer.MAX_VALUE;

  /** Multiplier applied on top of the field boost for sloppy phrases. */
  private float phraseBoost;

  /** Fields searched: the defaults followed by any configured custom fields. */
  private String[] fields = DEFAULT_FIELDS.clone();

  /** Boost for each entry of {@link #fields}, index-aligned with it. */
  private float[] fieldBoosts = new float[DEFAULT_FIELDS.length];

  private Configuration conf;

  /**
   * Set the boost factor for url matches, relative to content and anchor
   * matches.
   *
   * @param boost boost stored for the "url" field
   */
  public void setUrlBoost(float boost) {
    fieldBoosts[URL_BOOST] = boost;
  }

  /**
   * Set the boost factor for anchor matches, relative to url and content
   * matches.
   *
   * @param boost boost stored for the "anchor" field
   */
  public void setAnchorBoost(float boost) {
    fieldBoosts[ANCHOR_BOOST] = boost;
  }

  /**
   * Set the boost factor for sloppy phrase matches relative to unordered term
   * matches.
   *
   * @param boost factor multiplied with each field boost for sloppy phrases
   */
  public void setPhraseBoost(float boost) {
    phraseBoost = boost;
  }

  /**
   * Set the maximum number of terms permitted between matching terms in a
   * sloppy phrase match. The "anchor" field ignores this value and always
   * uses {@link NutchDocumentAnalyzer#INTER_ANCHOR_GAP}.
   *
   * @param slop slop handed to the sloppy phrase queries of this filter
   */
  public void setSlop(int slop) {
    this.slop = slop;
  }

  /**
   * Translates the default-field clauses of <code>input</code> into Lucene
   * clauses and appends them to <code>output</code>: first one boolean clause
   * per Nutch clause, then the optional sloppy phrases.
   *
   * @param input  the Nutch query to translate
   * @param output the Lucene query the generated clauses are added to
   * @return the same <code>output</code> instance
   */
  public BooleanQuery filter(Query input, BooleanQuery output) {
    addTerms(input, output);
    addSloppyPhrases(input, output);
    return output;
  }

  /**
   * For every clause in the default field, adds a nested boolean query that
   * matches the clause in any of the searched fields. The nested query is
   * attached as MUST_NOT, MUST or SHOULD depending on the clause flags.
   */
  private void addTerms(Query input, BooleanQuery output) {
    Clause[] clauses = input.getClauses();

    // Created on first use and then shared by all phrase clauses and fields.
    // NOTE(review): assumes CommonGrams.optimizePhrase keeps no state between
    // calls - confirm against CommonGrams.
    CommonGrams commonGrams = null;

    for (int i = 0; i < clauses.length; i++) {
      Clause c = clauses[i];

      if (!c.getField().equals(Clause.DEFAULT_FIELD))
        continue;                                 // skip non-default fields

      BooleanQuery out = new BooleanQuery();
      for (int f = 0; f < fields.length; f++) {

        Clause o = c;
        if (c.isPhrase()) {                       // optimize phrase clauses
          if (commonGrams == null) {
            commonGrams = new CommonGrams(getConf());
          }
          String[] opt = commonGrams.optimizePhrase(c.getPhrase(), fields[f]);
          if (opt.length == 1) {
            // the phrase collapsed into a single token: query it as a term
            o = new Clause(new Term(opt[0]),
                           c.isRequired(), c.isProhibited(), getConf());
          } else {
            o = new Clause(new Phrase(opt),
                           c.isRequired(), c.isProhibited(), getConf());
          }
        }

        out.add(o.isPhrase()
                ? exactPhrase(o.getPhrase(), fields[f], fieldBoosts[f])
                : termQuery(fields[f], o.getTerm(), fieldBoosts[f]),
                BooleanClause.Occur.SHOULD);
      }
      output.add(out, occurFor(c));
    }
  }

  /** Maps the prohibited/required flags of a clause to a Lucene occurrence. */
  private static BooleanClause.Occur occurFor(Clause c) {
    if (c.isProhibited()) {
      return BooleanClause.Occur.MUST_NOT;
    }
    return c.isRequired()
      ? BooleanClause.Occur.MUST
      : BooleanClause.Occur.SHOULD;
  }

  /**
   * For each searched field, builds one sloppy phrase out of all
   * non-prohibited, non-phrase terms of the default field and adds it as an
   * optional clause. A phrase is only added when it holds at least two terms.
   */
  private void addSloppyPhrases(Query input, BooleanQuery output) {
    Clause[] clauses = input.getClauses();
    for (int f = 0; f < fields.length; f++) {

      PhraseQuery sloppyPhrase = new PhraseQuery();
      sloppyPhrase.setBoost(fieldBoosts[f] * phraseBoost);
      sloppyPhrase.setSlop("anchor".equals(fields[f])
                           ? NutchDocumentAnalyzer.INTER_ANCHOR_GAP
                           : slop);
      int sloppyTerms = 0;

      for (int i = 0; i < clauses.length; i++) {
        Clause c = clauses[i];

        if (!c.getField().equals(Clause.DEFAULT_FIELD))
          continue;                               // skip non-default fields

        if (c.isPhrase())                         // skip exact phrases
          continue;

        if (c.isProhibited())                     // skip prohibited terms
          continue;

        sloppyPhrase.add(luceneTerm(fields[f], c.getTerm()));
        sloppyTerms++;
      }

      if (sloppyTerms > 1)
        output.add(sloppyPhrase, BooleanClause.Occur.SHOULD);
    }
  }

  /** Utility to construct a boosted Lucene term query for a Nutch term. */
  private org.apache.lucene.search.Query termQuery(String field, Term term,
                                                   float boost) {
    TermQuery result = new TermQuery(luceneTerm(field, term));
    result.setBoost(boost);
    return result;
  }

  /** Utility to construct a Lucene exact phrase query for a Nutch phrase. */
  private org.apache.lucene.search.Query exactPhrase(Phrase nutchPhrase,
                                                     String field,
                                                     float boost) {
    Term[] terms = nutchPhrase.getTerms();
    PhraseQuery exactPhrase = new PhraseQuery();
    for (int i = 0; i < terms.length; i++) {
      exactPhrase.add(luceneTerm(field, terms[i]));
    }
    exactPhrase.setBoost(boost);
    return exactPhrase;
  }

  /**
   * Utility to construct a Lucene Term given a Nutch query term and field.
   */
  private static org.apache.lucene.index.Term luceneTerm(String field,
                                                         Term term) {
    return new org.apache.lucene.index.Term(field, term.toString());
  }

  /**
   * Stores the configuration and (re)initialises every boost from it.
   *
   * <p>The searched fields are rebuilt from the built-in defaults on each
   * call, so the state of the filter depends only on the configuration passed
   * in and not on earlier calls. Keys read: <code>query.url.boost</code>,
   * <code>query.anchor.boost</code>, <code>query.content.boost</code>,
   * <code>query.title.boost</code>, <code>query.host.boost</code>,
   * <code>query.phrase.boost</code> and every
   * <code>query.basic.(fieldname).boost</code>.
   *
   * @param conf the configuration to read the boosts from
   */
  public void setConf(Configuration conf) {
    this.conf = conf;

    float[] boosts = new float[DEFAULT_FIELDS.length];
    boosts[URL_BOOST] = conf.getFloat("query.url.boost", 4.0f);
    boosts[ANCHOR_BOOST] = conf.getFloat("query.anchor.boost", 2.0f);
    boosts[CONTENT_BOOST] = conf.getFloat("query.content.boost", 1.0f);
    boosts[TITLE_BOOST] = conf.getFloat("query.title.boost", 1.5f);
    boosts[HOST_BOOST] = conf.getFloat("query.host.boost", 2.0f);

    // start again from the defaults so custom fields of a previous
    // configuration cannot linger
    this.fields = DEFAULT_FIELDS.clone();
    this.fieldBoosts = boosts;
    this.phraseBoost = conf.getFloat("query.phrase.boost", 1.0f);

    findAdditionalFields(conf);
  }

  /** @return the configuration last passed to {@link #setConf(Configuration)} */
  public Configuration getConf() {
    return this.conf;
  }

  /**
   * Searches for parameters of the form
   * <code>query.basic.(fieldname).boost</code> and appends each field name,
   * with its boost, to the searched fields. Names that are already searched
   * and the reserved name "phrase" are ignored.
   */
  private void findAdditionalFields(Configuration conf) {
    List<String> existingFields = Arrays.asList(fields);
    List<String> extraNames = new ArrayList<String>();
    List<Float> extraBoosts = new ArrayList<Float>();

    Iterator<?> entries = conf.iterator();
    while (entries.hasNext()) {
      Map.Entry<?, ?> entry = (Map.Entry<?, ?>) entries.next();
      String key = entry.getKey().toString();

      Matcher match = ADDITIONAL_FIELD_PATTERN.matcher(key);
      if (!match.matches())
        continue;

      // group(1) is always set once the whole pattern has matched
      String fieldName = match.group(1);

      // one of the fields which are searched by default
      if (existingFields.contains(fieldName))
        continue;

      // reserved keyword
      if (RESERVED_FIELD_NAME.equals(fieldName))
        continue;

      extraNames.add(fieldName);
      extraBoosts.add(Float.valueOf(conf.getFloat(key, DEFAULT_CUSTOM_BOOST)));
    }

    if (extraNames.isEmpty())
      return;

    // append the additional field names and boosts after the current ones
    int base = fields.length;
    String[] allNames = new String[base + extraNames.size()];
    float[] allBoosts = new float[base + extraNames.size()];
    System.arraycopy(fields, 0, allNames, 0, base);
    System.arraycopy(fieldBoosts, 0, allBoosts, 0, base);
    for (int i = 0; i < extraNames.size(); i++) {
      allNames[base + i] = extraNames.get(i);
      allBoosts[base + i] = extraBoosts.get(i).floatValue();
    }

    // replace original fields
    this.fields = allNames;
    this.fieldBoosts = allBoosts;
  }
}