/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.scoring;
import java.util.Collection;
import java.util.List;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.plugin.Pluggable;
import org.apache.nutch.protocol.Content;
/**
 * Extension-point contract that every scoring plugin implements.
 *
 * <p>A scoring filter adjusts the scoring variables kept in
 * {@link CrawlDatum} and in the resulting search indexes. Several filters
 * may be chained in a specific order so that scoring is adjusted in
 * multiple stages.
 *
 * @author Andrzej Bialecki
 */
public interface ScoringFilter extends Configurable, Pluggable {

  /** Name of the extension point (the fully qualified name of this interface). */
  String X_POINT_ID = ScoringFilter.class.getName();

  /**
   * Assigns the initial score of a newly injected page.
   *
   * <p>Injected pages may have no inlinks, so an implementation may wish to
   * use a non-zero value here in order to give such pages some initial
   * credit.
   *
   * @param url url of the page
   * @param datum the new datum; filters modify it in-place
   * @throws ScoringFilterException if the filter fails
   */
  void injectedScore(Text url, CrawlDatum datum)
      throws ScoringFilterException;

  /**
   * Assigns the initial score of a newly discovered page.
   *
   * <p>A discovered page has at least one inlink carrying a score
   * contribution, so an implementation may choose to start from zero
   * (unknown value) and let the inlink contribution establish the "real"
   * value of the new page.
   *
   * @param url url of the page
   * @param datum the new datum; filters modify it in-place
   * @throws ScoringFilterException if the filter fails
   */
  void initialScore(Text url, CrawlDatum datum)
      throws ScoringFilterException;

  /**
   * Prepares the sort value used to order pages and select the top N
   * scoring pages during fetchlist generation.
   *
   * @param url url of the page
   * @param datum the page's datum; should not be modified
   * @param initSort initial sort value, or the value produced by previous
   *        filters in the chain
   * @return the sort value computed for this page
   * @throws ScoringFilterException if the filter fails
   */
  float generatorSortValue(Text url, CrawlDatum datum, float initSort)
      throws ScoringFilterException;

  /**
   * Copies all relevant score information from the current datum (which
   * comes from a generated fetchlist) into {@link Content} metadata.
   *
   * <p>This is required so that the value(s) can reach the mechanism that
   * distributes score to outlinked pages.
   *
   * @param url url of the page
   * @param datum source datum. NOTE: modifications to this value are not
   *        persisted.
   * @param content content instance. Implementations may modify it
   *        in-place, primarily by setting some metadata properties.
   * @throws ScoringFilterException if the filter fails
   */
  void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
      throws ScoringFilterException;

  /**
   * Carries score data over from the content to the parse result.
   *
   * <p>Part of score distribution is currently performed using only data
   * that comes from the parsing process; this method ensures that score
   * data is present in those steps.
   *
   * @param url page url
   * @param content original content. NOTE: modifications to this value are
   *        not persisted.
   * @param parse target instance that receives the score information.
   *        Implementations may modify it in-place, primarily by setting
   *        some metadata properties.
   * @throws ScoringFilterException if the filter fails
   */
  void passScoreAfterParsing(Text url, Content content, Parse parse)
      throws ScoringFilterException;

  /**
   * Distributes the score value of the current page to all of its
   * outlinked pages.
   *
   * @param fromUrl url of the source page
   * @param parseData ParseData instance holding the relevant score value(s)
   *        in its metadata. NOTE: filters may modify this in-place; all
   *        changes will be persisted.
   * @param targets {@code <url, CrawlDatum>} pairs. NOTE: filters can
   *        modify this in-place; all changes will be persisted.
   * @param adjust a CrawlDatum instance, initially null, which
   *        implementations may use to pass adjustment values to the
   *        original CrawlDatum. When creating this instance, set its status
   *        to {@link CrawlDatum#STATUS_LINKED}.
   * @param allCount number of all collected outlinks from the source page
   * @return if needed, a CrawlDatum with status
   *         {@link CrawlDatum#STATUS_LINKED} containing adjustments to be
   *         applied to the original CrawlDatum score(s) and metadata; may
   *         be null if no adjustment is needed
   * @throws ScoringFilterException if the filter fails
   */
  CrawlDatum distributeScoreToOutlinks(
      Text fromUrl,
      ParseData parseData,
      Collection<Entry<Text, CrawlDatum>> targets,
      CrawlDatum adjust,
      int allCount) throws ScoringFilterException;

  /**
   * Calculates the new score of a CrawlDatum during CrawlDb update, based
   * on the initial value of the original CrawlDatum and on the score
   * values contributed by inlinked pages.
   *
   * @param url url of the page
   * @param old original datum, with original score. May be null if this is
   *        a newly discovered page. If not null, filters should use the
   *        score values from this parameter as the starting values - the
   *        <code>datum</code> parameter may contain values that are no
   *        longer valid, if other updates occurred between generation and
   *        this update.
   * @param datum the new datum, with the original score saved at the time
   *        when the fetchlist was generated. Filters should update this
   *        in-place, and it will be saved in the crawldb.
   * @param inlinked (partial) list of CrawlDatum-s (with their scores) from
   *        links pointing to this page, found in the current update batch
   * @throws ScoringFilterException if the filter fails
   */
  void updateDbScore(
      Text url,
      CrawlDatum old,
      CrawlDatum datum,
      List<CrawlDatum> inlinked) throws ScoringFilterException;

  /**
   * Calculates a Lucene document boost.
   *
   * @param url url of the page
   * @param doc Lucene document. NOTE: this already contains all
   *        information collected by indexing filters. Implementations may
   *        modify this instance, in order to store/remove some information.
   * @param dbDatum current page from CrawlDb. NOTE: changes made to this
   *        instance are not persisted.
   * @param fetchDatum datum from FetcherOutput (containing among others
   *        the fetching status)
   * @param parse parsing result. NOTE: changes made to this instance are
   *        not persisted.
   * @param inlinks current inlinks from LinkDb. NOTE: changes made to this
   *        instance are not persisted.
   * @param initScore initial boost value for the Lucene document
   * @return boost value for the Lucene document. This value is passed as an
   *         argument to the next scoring filter in chain. NOTE:
   *         implementations may also express other scoring strategies by
   *         modifying the Lucene document directly.
   * @throws ScoringFilterException if the filter fails
   */
  float indexerScore(
      Text url,
      NutchDocument doc,
      CrawlDatum dbDatum,
      CrawlDatum fetchDatum,
      Parse parse,
      Inlinks inlinks,
      float initScore) throws ScoringFilterException;
}