/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.indexer.field;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.document.Document;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.ExtensionPoint;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.plugin.PluginRuntimeException;
import org.apache.nutch.util.ObjectCache;

/**
 * The FieldFilters class provides a standard way to collect, order, and run
 * all FieldFilter implementations that are active in the plugin system.
 *
 * <p>The resolved filter array is stored in the {@link ObjectCache} obtained
 * for the configuration, under the key <code>FieldFilter.class.getName()</code>.
 * Plugin discovery only happens when that cache entry is absent.</p>
 */
public class FieldFilters {

  /**
   * Configuration property holding a whitespace separated list of FieldFilter
   * class names, in the order in which they should run.
   */
  public static final String FIELD_FILTER_ORDER = "field.filter.order";

  public final static Log LOG = LogFactory.getLog(FieldFilters.class);

  /** The filters to run, in execution order. */
  private FieldFilter[] fieldFilters;

  /**
   * Configurable constructor.
   *
   * <p>Uses the cached filter array when one exists. Otherwise every extension
   * registered at the {@link FieldFilter#X_POINT_ID} extension point is
   * instantiated, keeping one instance per implementation class name.</p>
   *
   * <p>When {@link #FIELD_FILTER_ORDER} is set to a non-blank value, only the
   * filters named there are used, in the listed order; a name that matches no
   * loaded filter is skipped with a warning. When it is unset or blank, all
   * loaded filters are used in the iteration order of a {@link HashMap}, which
   * is unspecified.</p>
   *
   * @param conf The configuration providing the filter order, the object
   * cache, and the plugin repository.
   *
   * @throws RuntimeException If the extension point is not found, or wrapping
   * a PluginRuntimeException raised while loading the filters.
   */
  public FieldFilters(Configuration conf) {

    // get the field filter order, the cache, and any already cached filters
    String order = conf.get(FIELD_FILTER_ORDER);
    ObjectCache objectCache = ObjectCache.get(conf);
    String cacheKey = FieldFilter.class.getName();
    this.fieldFilters = (FieldFilter[])objectCache.getObject(cacheKey);
    if (this.fieldFilters != null) {
      return;
    }

    // trim before splitting, otherwise leading whitespace yields an empty
    // first token that would be reported as an unknown filter below
    String[] orderedFilters = null;
    if (order != null) {
      String trimmedOrder = order.trim();
      if (trimmedOrder.length() > 0) {
        orderedFilters = trimmedOrder.split("\\s+");
      }
    }

    try {

      // get the field filter extension point
      ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
        FieldFilter.X_POINT_ID);
      if (point == null) {
        throw new RuntimeException(FieldFilter.X_POINT_ID + " not found.");
      }

      // get all of the field filter plugins, one instance per class name
      Extension[] extensions = point.getExtensions();
      HashMap<String, FieldFilter> filterMap =
        new HashMap<String, FieldFilter>();
      for (Extension extension : extensions) {
        FieldFilter filter = (FieldFilter)extension.getExtensionInstance();
        String filterName = filter.getClass().getName();
        if (!filterMap.containsKey(filterName)) {
          // only report filters that are actually registered
          LOG.info("Adding " + filterName);
          filterMap.put(filterName, filter);
        }
      }

      // order the filters if necessary
      if (orderedFilters == null) {
        objectCache.setObject(cacheKey,
          filterMap.values().toArray(new FieldFilter[0]));
      }
      else {
        ArrayList<FieldFilter> filters = new ArrayList<FieldFilter>();
        for (String filterName : orderedFilters) {
          FieldFilter filter = filterMap.get(filterName);
          if (filter == null) {
            // surface misconfiguration instead of silently dropping the name
            LOG.warn(filterName + " is listed in " + FIELD_FILTER_ORDER
              + " but was not found among the active field filters, ignoring.");
            continue;
          }
          filters.add(filter);
        }
        objectCache.setObject(cacheKey,
          filters.toArray(new FieldFilter[filters.size()]));
      }
    }
    catch (PluginRuntimeException e) {
      throw new RuntimeException(e);
    }

    // read the filters back so this instance holds what the cache now stores
    this.fieldFilters = (FieldFilter[])objectCache.getObject(cacheKey);
  }

  /**
   * Runs all FieldFilter extensions, feeding the document returned by each
   * filter into the next one. Processing stops at the first filter that
   * returns null.
   *
   * @param url The url to index.
   * @param doc The lucene index document.
   * @param fields The lucene fields.
   *
   * @return The filtered document, or null to not index this document and url.
   *
   * @throws IndexingException If an error occurs while running filters.
   */
  public Document filter(String url, Document doc, List<FieldWritable> fields)
    throws IndexingException {

    // loop through and run the field filters
    for (FieldFilter fieldFilter : this.fieldFilters) {
      doc = fieldFilter.filter(url, doc, fields);
      if (doc == null) {
        return null;
      }
    }
    return doc;
  }
}