package org.apache.nutch.searcher.custom;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.util.StringUtils;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.nutch.indexer.field.CustomFields;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryException;
import org.apache.nutch.searcher.QueryFilter;
import org.apache.nutch.searcher.Query.Clause;

/**
 * Query filter that turns Nutch query clauses on configured custom fields
 * into Lucene {@link TermQuery} clauses.
 *
 * <p>The set of handled fields is read in {@link #setConf(Configuration)}
 * from an XML properties resource on the classpath. Every property whose key
 * ends in <code>.name</code> declares one field (the property value is the
 * field name); an optional property with the same key prefix and the suffix
 * <code>.boost</code> supplies a float boost for that field.</p>
 */
public class CustomFieldQueryFilter implements QueryFilter {

  public static final Log LOG = LogFactory.getLog(CustomFieldQueryFilter.class);

  /** Suffix of property keys that declare a custom field name. */
  private static final String NAME_SUFFIX = ".name";

  /** Suffix of property keys that declare a custom field boost. */
  private static final String BOOST_SUFFIX = ".boost";

  private Configuration conf;

  /** Names of the fields this filter handles. */
  private final List<String> fieldNames = new ArrayList<String>();

  /** Optional per-field boosts, keyed by field name. */
  private final Map<String, Float> boosts = new HashMap<String, Float>();

  public CustomFieldQueryFilter() {
  }

  /**
   * Stores the configuration and (re)loads the custom field definitions.
   *
   * <p>The resource name is taken from the <code>custom.fields.config</code>
   * setting and defaults to <code>custom-fields.xml</code>. Loading is
   * best-effort: any failure is logged and not rethrown. The definitions are
   * collected into temporary collections and only swapped in after a
   * successful load, so a failed call leaves the previously loaded fields
   * untouched and repeated calls do not accumulate duplicates.</p>
   *
   * @param conf the configuration to read the resource name from
   */
  public void setConf(Configuration conf) {
    this.conf = conf;

    List<String> loadedNames = new ArrayList<String>();
    Map<String, Float> loadedBoosts = new HashMap<String, Float>();

    try {
      String configFile = conf.get("custom.fields.config", "custom-fields.xml");
      LOG.info("Reading configuration field configuration from " + configFile);

      Properties customFieldProps = loadProperties(configFile);
      for (Object key : customFieldProps.keySet()) {
        String prop = (String) key;
        if (!prop.endsWith(NAME_SUFFIX)) {
          continue;
        }

        String propName = prop.substring(0, prop.length() - NAME_SUFFIX.length());
        String name = customFieldProps.getProperty(prop);
        loadedNames.add(name);

        String boostValue = customFieldProps.getProperty(propName + BOOST_SUFFIX);
        if (boostValue != null) {
          try {
            loadedBoosts.put(name, Float.valueOf(Float.parseFloat(boostValue)));
          } catch (NumberFormatException e) {
            // A single malformed boost must not discard the remaining fields;
            // the field stays searchable, just without a boost.
            LOG.warn("Ignoring invalid boost '" + boostValue
                + "' for custom field " + name);
          }
        }
      }
    } catch (Exception e) {
      LOG.error("Error loading custom field properties:\n"
          + StringUtils.stringifyException(e));
      return;
    }

    fieldNames.clear();
    fieldNames.addAll(loadedNames);
    boosts.clear();
    boosts.putAll(loadedBoosts);
  }

  /**
   * Reads an XML properties resource through the class loader of
   * {@link CustomFields}.
   *
   * @param configFile name of the classpath resource
   * @return the properties parsed from the resource
   * @throws IOException if the resource cannot be found, read or parsed
   */
  private static Properties loadProperties(String configFile) throws IOException {
    InputStream in =
        CustomFields.class.getClassLoader().getResourceAsStream(configFile);
    if (in == null) {
      throw new IOException("Was unable to open " + configFile);
    }

    Properties props = new Properties();
    try {
      props.loadFromXML(in);
    } finally {
      // loadFromXML is only documented to close the stream after it returns;
      // closing here also releases it when parsing fails.
      in.close();
    }
    return props;
  }

  public Configuration getConf() {
    return this.conf;
  }

  /**
   * Adds one Lucene term query to <code>output</code> for every clause of
   * <code>input</code> whose field is one of the configured custom fields.
   * Clauses on other fields are skipped.
   *
   * @param input the Nutch query to examine
   * @param output the Lucene query to add clauses to
   * @return <code>output</code>, after any matching clauses have been added
   * @throws QueryException declared by the {@link QueryFilter} contract
   */
  public BooleanQuery filter(Query input, BooleanQuery output)
      throws QueryException {

    // examine each clause in the Nutch query
    for (Clause c : input.getClauses()) {

      // skip clauses on fields this filter does not handle
      String fieldName = c.getField();
      if (!fieldNames.contains(fieldName)) {
        continue;
      }

      // NOTE(review): toLowerCase() uses the JVM default locale; confirm this
      // matches how the field values are lowercased at indexing time.
      String value = c.getTerm().toString().toLowerCase();

      // add a Lucene TermQuery for this clause
      TermQuery clause = new TermQuery(new Term(fieldName, value));

      // apply the configured boost, if any
      Float boost = boosts.get(fieldName);
      if (boost != null) {
        clause.setBoost(boost.floatValue());
      }

      // keep the prohibited / required / optional status of the clause
      BooleanClause.Occur occur;
      if (c.isProhibited()) {
        occur = BooleanClause.Occur.MUST_NOT;
      } else if (c.isRequired()) {
        occur = BooleanClause.Occur.MUST;
      } else {
        occur = BooleanClause.Occur.SHOULD;
      }
      output.add(clause, occur);
    }

    // return the modified Lucene query
    return output;
  }
}