/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.scoring;

import java.util.Collection;
import java.util.List;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.plugin.Pluggable;
import org.apache.nutch.protocol.Content;

/**
 * Extension-point contract that every scoring plugin implements.
 *
 * <p>
 * Scoring filters manipulate the scoring variables kept in {@link CrawlDatum}
 * and in the resulting search indexes. Several filters may be chained in a
 * specific order so that scoring is adjusted in multiple stages.
 * </p>
 *
 * @author Andrzej Bialecki
 */
public interface ScoringFilter extends Configurable, Pluggable {

  /** The name of the extension point. */
  String X_POINT_ID = ScoringFilter.class.getName();

  /**
   * Assigns the initial score of a newly injected page.
   *
   * <p>
   * Newly injected pages may have no inlinks, so implementations may wish to
   * use a non-zero value here in order to give such pages some initial
   * credit.
   * </p>
   *
   * @param url
   *          url of the page
   * @param datum
   *          the new datum; filters modify it in-place
   * @throws ScoringFilterException
   *           if the implementation reports a scoring failure
   */
  void injectedScore(Text url, CrawlDatum datum)
      throws ScoringFilterException;

  /**
   * Assigns the initial score of a newly discovered page.
   *
   * <p>
   * A newly discovered page has at least one inlink carrying a score
   * contribution. Implementations may therefore choose to start from zero
   * (unknown value) and let the inlink contribution set the "real" value of
   * the new page.
   * </p>
   *
   * @param url
   *          url of the page
   * @param datum
   *          the new datum; filters modify it in-place
   * @throws ScoringFilterException
   *           if the implementation reports a scoring failure
   */
  void initialScore(Text url, CrawlDatum datum)
      throws ScoringFilterException;

  /**
   * Prepares the sort value used for sorting and selecting the top N scoring
   * pages during fetchlist generation.
   *
   * @param url
   *          url of the page
   * @param datum
   *          datum of the page; should not be modified
   * @param initSort
   *          initial sort value, or the value produced by previous filters in
   *          the chain
   * @return the sort value prepared for this page
   * @throws ScoringFilterException
   *           if the implementation reports a scoring failure
   */
  float generatorSortValue(Text url, CrawlDatum datum, float initSort)
      throws ScoringFilterException;

  /**
   * Copies all relevant score information from the current datum (which comes
   * from a generated fetchlist) into the metadata of
   * {@link org.apache.nutch.protocol.Content}.
   *
   * <p>
   * This is required so that the value(s) can be handed to the mechanism that
   * distributes them to outlinked pages.
   * </p>
   *
   * @param url
   *          url of the page
   * @param datum
   *          source datum. NOTE: modifications to this value are not
   *          persisted.
   * @param content
   *          content instance. Implementations may modify it in-place,
   *          primarily by setting some metadata properties.
   * @throws ScoringFilterException
   *           if the implementation reports a scoring failure
   */
  void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
      throws ScoringFilterException;

  /**
   * Carries the score information over from the original content to the parse
   * result.
   *
   * <p>
   * Currently a part of score distribution is performed using only data that
   * comes from the parsing process; this method exists to ensure that score
   * data is present in those steps.
   * </p>
   *
   * @param url
   *          page url
   * @param content
   *          original content. NOTE: modifications to this value are not
   *          persisted.
   * @param parse
   *          target instance that receives the score information.
   *          Implementations may modify it in-place, primarily by setting
   *          some metadata properties.
   * @throws ScoringFilterException
   *           if the implementation reports a scoring failure
   */
  void passScoreAfterParsing(Text url, Content content, Parse parse)
      throws ScoringFilterException;

  /**
   * Distributes the score value of the current page to all of its outlinked
   * pages.
   *
   * @param fromUrl
   *          url of the source page
   * @param parseData
   *          ParseData instance which stores the relevant score value(s) in
   *          its metadata. NOTE: filters may modify this in-place, all
   *          changes will be persisted.
   * @param targets
   *          &lt;url, CrawlDatum&gt; pairs. NOTE: filters can modify this
   *          in-place, all changes will be persisted.
   * @param adjust
   *          a CrawlDatum instance, initially null, which implementations may
   *          use to pass adjustment values to the original CrawlDatum. When
   *          creating this instance, set its status to
   *          {@link CrawlDatum#STATUS_LINKED}.
   * @param allCount
   *          number of all collected outlinks from the source page
   * @return if needed, an instance of CrawlDatum with status
   *         {@link CrawlDatum#STATUS_LINKED} that contains adjustments to be
   *         applied to the score(s) and metadata of the original CrawlDatum.
   *         This can be null if not needed.
   * @throws ScoringFilterException
   *           if the implementation reports a scoring failure
   */
  CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
      Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
      int allCount) throws ScoringFilterException;

  /**
   * Calculates the new score of a CrawlDatum during CrawlDb update, based on
   * the initial value of the original CrawlDatum and on the score values
   * contributed by inlinked pages.
   *
   * @param url
   *          url of the page
   * @param old
   *          original datum, with original score. May be null if this is a
   *          newly discovered page. If not null, filters should take the
   *          score values of this parameter as their starting values - the
   *          <code>datum</code> parameter may contain values that are no
   *          longer valid, if other updates occurred between generation and
   *          this update.
   * @param datum
   *          the new datum, with the original score saved at the time when
   *          the fetchlist was generated. Filters should update this
   *          in-place, and it will be saved in the crawldb.
   * @param inlinked
   *          (partial) list of CrawlDatum-s (with their scores) from links
   *          pointing to this page, found in the current update batch.
   * @throws ScoringFilterException
   *           if the implementation reports a scoring failure
   */
  void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
      List<CrawlDatum> inlinked) throws ScoringFilterException;

  /**
   * Calculates a Lucene document boost.
   *
   * @param url
   *          url of the page
   * @param doc
   *          Lucene document. NOTE: this already contains all information
   *          collected by indexing filters. Implementations may modify this
   *          instance, in order to store/remove some information.
   * @param dbDatum
   *          current page from CrawlDb. NOTE: changes made to this instance
   *          are not persisted.
   * @param fetchDatum
   *          datum from FetcherOutput (containing among others the fetching
   *          status)
   * @param parse
   *          parsing result. NOTE: changes made to this instance are not
   *          persisted.
   * @param inlinks
   *          current inlinks from LinkDb. NOTE: changes made to this instance
   *          are not persisted.
   * @param initScore
   *          initial boost value for the Lucene document.
   * @return boost value for the Lucene document. This value is passed as an
   *         argument to the next scoring filter in the chain. NOTE:
   *         implementations may also express other scoring strategies by
   *         modifying the Lucene document directly.
   * @throws ScoringFilterException
   *           if the implementation reports a scoring failure
   */
  float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
      throws ScoringFilterException;
}