/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.searcher.basic;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.analysis.CommonGrams;
import org.apache.nutch.searcher.QueryFilter;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Query.*;
import org.apache.hadoop.conf.Configuration;
/** The default query filter. Query terms in the default query field are
* expanded to search the url, anchor, content, title and host document fields.
* Additional fields can be added by specifying parameters of the form
* query.basic.(fieldname).boost in the configuration files (see
* nutch-default.xml for an example). Such fields are used in the clauses
* generated by the BasicQueryFilter, e.g. for a user query A B it generates
* +(field1:A field2:A ...) +(field1:B field2:B ...).
* If you don't want the additional fields to be included in the clauses, you
* will need to implement a custom query filter.
**/
public class BasicQueryFilter implements QueryFilter {
private static final int URL_BOOST = 0;
private static final int ANCHOR_BOOST = 1;
private static final int CONTENT_BOOST = 2;
private static final int TITLE_BOOST = 3;
private static final int HOST_BOOST = 4;
private static int SLOP = Integer.MAX_VALUE;
private float PHRASE_BOOST;
private String[] FIELDS =
{ "url", "anchor", "content", "title", "host" };
private float[] FIELD_BOOSTS = new float[5];
/**
* Set the boost factor for url matches, relative to content and anchor
* matches
*/
public void setUrlBoost(float boost) { FIELD_BOOSTS[URL_BOOST] = boost; }
/** Set the boost factor for title/anchor matches, relative to url and
* content matches. */
public void setAnchorBoost(float boost) { FIELD_BOOSTS[ANCHOR_BOOST] = boost; }
/** Set the boost factor for sloppy phrase matches relative to unordered term
* matches. */
public void setPhraseBoost(float boost) { PHRASE_BOOST = boost; }
/** Set the maximum number of terms permitted between matching terms in a
* sloppy phrase match. */
public void setSlop(int slop) { SLOP = slop; }
private Configuration conf;
public BooleanQuery filter(Query input, BooleanQuery output) {
addTerms(input, output);
addSloppyPhrases(input, output);
return output;
}
private void addTerms(Query input, BooleanQuery output) {
Clause[] clauses = input.getClauses();
for (int i = 0; i < clauses.length; i++) {
Clause c = clauses[i];
if (!c.getField().equals(Clause.DEFAULT_FIELD))
continue; // skip non-default fields
BooleanQuery out = new BooleanQuery();
for (int f = 0; f < FIELDS.length; f++) {
Clause o = c;
if (c.isPhrase()) { // optimize phrase clauses
String[] opt = new CommonGrams(getConf()).optimizePhrase(c.getPhrase(), FIELDS[f]);
if (opt.length==1) {
o = new Clause(new Term(opt[0]), c.isRequired(), c.isProhibited(), getConf());
} else {
o = new Clause(new Phrase(opt), c.isRequired(), c.isProhibited(), getConf());
}
}
out.add(o.isPhrase()
? exactPhrase(o.getPhrase(), FIELDS[f], FIELD_BOOSTS[f])
: termQuery(FIELDS[f], o.getTerm(), FIELD_BOOSTS[f]),
BooleanClause.Occur.SHOULD);
}
output.add(out, (c.isProhibited()
? BooleanClause.Occur.MUST_NOT
: (c.isRequired()
? BooleanClause.Occur.MUST
: BooleanClause.Occur.SHOULD
)));
}
}
private void addSloppyPhrases(Query input, BooleanQuery output) {
Clause[] clauses = input.getClauses();
for (int f = 0; f < FIELDS.length; f++) {
PhraseQuery sloppyPhrase = new PhraseQuery();
sloppyPhrase.setBoost(FIELD_BOOSTS[f] * PHRASE_BOOST);
sloppyPhrase.setSlop("anchor".equals(FIELDS[f])
? NutchDocumentAnalyzer.INTER_ANCHOR_GAP
: SLOP);
int sloppyTerms = 0;
for (int i = 0; i < clauses.length; i++) {
Clause c = clauses[i];
if (!c.getField().equals(Clause.DEFAULT_FIELD))
continue; // skip non-default fields
if (c.isPhrase()) // skip exact phrases
continue;
if (c.isProhibited()) // skip prohibited terms
continue;
sloppyPhrase.add(luceneTerm(FIELDS[f], c.getTerm()));
sloppyTerms++;
}
if (sloppyTerms > 1)
output.add(sloppyPhrase, BooleanClause.Occur.SHOULD);
}
}
private org.apache.lucene.search.Query
termQuery(String field, Term term, float boost) {
TermQuery result = new TermQuery(luceneTerm(field, term));
result.setBoost(boost);
return result;
}
/** Utility to construct a Lucene exact phrase query for a Nutch phrase. */
private org.apache.lucene.search.Query
exactPhrase(Phrase nutchPhrase,
String field, float boost) {
Term[] terms = nutchPhrase.getTerms();
PhraseQuery exactPhrase = new PhraseQuery();
for (int i = 0; i < terms.length; i++) {
exactPhrase.add(luceneTerm(field, terms[i]));
}
exactPhrase.setBoost(boost);
return exactPhrase;
}
/** Utility to construct a Lucene Term given a Nutch query term and field. */
private static org.apache.lucene.index.Term luceneTerm(String field,
Term term) {
return new org.apache.lucene.index.Term(field, term.toString());
}
public void setConf(Configuration conf) {
this.conf = conf;
this.FIELD_BOOSTS[URL_BOOST] = conf.getFloat("query.url.boost", 4.0f);
this.FIELD_BOOSTS[ANCHOR_BOOST] = conf.getFloat("query.anchor.boost", 2.0f);
this.FIELD_BOOSTS[CONTENT_BOOST] = conf.getFloat("query.content.boost", 1.0f);
this.FIELD_BOOSTS[TITLE_BOOST] = conf.getFloat("query.title.boost", 1.5f);
this.FIELD_BOOSTS[HOST_BOOST] = conf.getFloat("query.host.boost", 2.0f);
this.PHRASE_BOOST = conf.getFloat("query.phrase.boost", 1.0f);
findAdditionalFields(conf);
}
public Configuration getConf() {
return this.conf;
}
/** Searches for parameters of the form : query.basic.(fieldname).boost
* and adds the fielname to the list of default fields.
**/
private void findAdditionalFields(Configuration conf) {
// get additional fields specified in parameters
Pattern pat = Pattern.compile("query\\.basic\\.(.+)\\.boost");
Iterator confEntriesIterator = conf.iterator();
List existingFields = java.util.Arrays.asList(FIELDS);
ArrayList tempfieldNames = new ArrayList();
ArrayList tempfieldBoosts = new ArrayList();
while (confEntriesIterator.hasNext()){
Map.Entry entry = (Map.Entry) confEntriesIterator.next();
String key = entry.getKey().toString();
Matcher match = pat.matcher(key);
if (!match.matches())continue;
String fieldName = match.group(1);
if (fieldName!=null){
// check whether it matches one of the fields which are used by default
if (existingFields.contains(fieldName)) continue;
// reserved keyword
if (fieldName.equals("phrase")) continue;
float boostCustomField = conf.getFloat(key, 2.0f);
tempfieldNames.add(fieldName);
tempfieldBoosts.add(Float.valueOf(boostCustomField));
}
}
if (tempfieldNames.size()==0) return;
// store additional fields names and boost values in corresponding fields
String[] tempNames = new String[5+tempfieldNames.size()];
float[] tempBoosts = new float[5+tempfieldNames.size()];
System.arraycopy(FIELDS, 0,tempNames, 0, 5);
System.arraycopy(this.FIELD_BOOSTS, 0,tempBoosts, 0, 5);
for (int newF=0; newF < tempfieldNames.size();newF++){
tempNames[5+newF]=(String) tempfieldNames.get(newF);
tempBoosts[5+newF]= ((Float)tempfieldBoosts.get(newF)).floatValue();
}
// replace original fields
this.FIELDS = tempNames;
this.FIELD_BOOSTS = tempBoosts;
}
}