/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.scoring.opic;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.List;
import java.util.Map.Entry;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.util.LogUtil;

/**
 * This plugin implements a variant of an Online Page Importance Computation
 * (OPIC) score, described in this paper:
 * <a href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html">
 * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003),
 * Adaptive On-Line Page Importance Computation
 * </a>.
* * @author Andrzej Bialecki */ public class OPICScoringFilter implements ScoringFilter { private final static Log LOG = LogFactory.getLog(OPICScoringFilter.class); private Configuration conf; private float scoreInjected; private float scorePower; private float internalScoreFactor; private float externalScoreFactor; private boolean countFiltered; public Configuration getConf() { return conf; } public void setConf(Configuration conf) { this.conf = conf; scoreInjected = conf.getFloat("db.score.injected", 1.0f); scorePower = conf.getFloat("indexer.score.power", 0.5f); internalScoreFactor = conf.getFloat("db.score.link.internal", 1.0f); externalScoreFactor = conf.getFloat("db.score.link.external", 1.0f); countFiltered = conf.getBoolean("db.score.count.filtered", false); } /** Set to the value defined in config, 1.0f by default. */ public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException { datum.setScore(scoreInjected); } /** Set to 0.0f (unknown value) - inlink contributions will bring it to * a correct level. Newly discovered pages have at least one inlink. */ public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException { datum.setScore(0.0f); } /** Use {@link CrawlDatum#getScore()}. */ public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException { return datum.getScore() * initSort; } /** Increase the score by a sum of inlinked scores. */ public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException { float adjust = 0.0f; for (int i = 0; i < inlinked.size(); i++) { CrawlDatum linked = (CrawlDatum)inlinked.get(i); adjust += linked.getScore(); } if (old == null) old = datum; datum.setScore(old.getScore() + adjust); } /** Store a float value of CrawlDatum.getScore() under Fetcher.SCORE_KEY. 
*/ public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) { content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore()); } /** Copy the value from Content metadata under Fetcher.SCORE_KEY to parseData. */ public void passScoreAfterParsing(Text url, Content content, Parse parse) { parse.getData().getContentMeta().set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY)); } /** Get a float value from Fetcher.SCORE_KEY, divide it by the number of outlinks and apply. */ public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount) throws ScoringFilterException { float score = scoreInjected; String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY); if (scoreString != null) { try { score = Float.parseFloat(scoreString); } catch (Exception e) { e.printStackTrace(LogUtil.getWarnStream(LOG)); } } int validCount = targets.size(); if (countFiltered) { score /= allCount; } else { if (validCount == 0) { // no outlinks to distribute score, so just return adjust return adjust; } score /= validCount; } // internal and external score factor float internalScore = score * internalScoreFactor; float externalScore = score * externalScoreFactor; for (Entry<Text, CrawlDatum> target : targets) { try { String toHost = new URL(target.getKey().toString()).getHost(); String fromHost = new URL(fromUrl.toString()).getHost(); if(toHost.equalsIgnoreCase(fromHost)){ target.getValue().setScore(internalScore); } else { target.getValue().setScore(externalScore); } } catch (MalformedURLException e) { e.printStackTrace(LogUtil.getWarnStream(LOG)); target.getValue().setScore(externalScore); } } // XXX (ab) no adjustment? I think this is contrary to the algorithm descr. // XXX in the paper, where page "loses" its score if it's distributed to // XXX linked pages... 
return adjust; } /** Dampen the boost value by scorePower.*/ public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException { return (float)Math.pow(dbDatum.getScore(), scorePower) * initScore; } }