package com.trylog.scoring;
import java.util.Collection;
import java.util.List;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;
import org.apache.nutch.admin.scores.Modification;
import org.apache.nutch.admin.scores.ScoreUpdater;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.admin.scores.Modification;
/**
 * Nutch {@link ScoringFilter} that derives a page score from two values stored in the
 * {@link CrawlDatum} metadata: a pagerank entry ({@code Modification.META_PAGERANK}) and a
 * vote-count entry ({@code Modification.META_VOTES}).
 *
 * <p>The combined score is {@code pagerank + pagerankToVotesRatio * votes}. It is written to
 * the datum in {@link #updateDbScore} and, multiplied by a normalization factor, returned from
 * {@link #indexerScore}.
 *
 * <p>Configuration keys read in {@link #setConf}:
 * <ul>
 *   <li>{@code trylog.scoring.normalize.score} (default 1.0) - multiplier applied at index time</li>
 *   <li>{@code trylog.scoring.injected.score} (default 1.0) - score given to injected URLs</li>
 *   <li>{@code trylog.scoring.pagerank.votes.ratio} (default 0.01) - weight of one vote</li>
 * </ul>
 */
public class TrylogScoringFilter implements ScoringFilter {

    private static final Logger LOG = Logger.getLogger(TrylogScoringFilter.class.getName());

    private static final String NORMALIZED_SCORE_KEY = "trylog.scoring.normalize.score";
    private static final String INJECTED_SCORE_KEY = "trylog.scoring.injected.score";
    // Previously the ratio was (mistakenly) read from INJECTED_SCORE_KEY, so setting the
    // injected score silently overrode the vote weight. It now has a key of its own.
    private static final String PAGERANK_TO_VOTES_RATIO_KEY = "trylog.scoring.pagerank.votes.ratio";

    private Configuration conf;

    // NOTE(review): this initializer (0.001) differs from the fallback used in setConf (1.0);
    // the setConf fallback wins as soon as a configuration is supplied - confirm which is intended.
    private float scoreInjected = 0.001f;
    private float normalizedScore = 1.00f;
    private float pagerankToVotesRatio = 0.01f;

    /** Creates a filter with built-in defaults; {@link #setConf} overrides them. */
    public TrylogScoringFilter() {
    }

    /**
     * Returns the configuration last passed to {@link #setConf}.
     *
     * @return the stored configuration, or {@code null} if none has been set
     */
    public Configuration getConf() {
        return conf;
    }

    /**
     * Stores the configuration and reloads the three scoring parameters from it.
     *
     * @param conf configuration to read the {@code trylog.scoring.*} keys from
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
        normalizedScore = conf.getFloat(NORMALIZED_SCORE_KEY, 1.00f);
        scoreInjected = conf.getFloat(INJECTED_SCORE_KEY, 1.00f);
        pagerankToVotesRatio = conf.getFloat(PAGERANK_TO_VOTES_RATIO_KEY, 0.01f);
    }

    /**
     * Does not distribute any score to outlinks; the adjustment datum is returned untouched.
     *
     * @return {@code adjust}, unchanged
     */
    public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
            Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount)
            throws ScoringFilterException {
        return adjust;
    }

    /**
     * Computes the generator sort value as the datum's current score times {@code initSort}.
     *
     * @return {@code datum.getScore() * initSort}
     */
    public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
            throws ScoringFilterException {
        return datum.getScore() * initSort;
    }

    /**
     * Computes the index-time score from the metadata of {@code dbDatum}, scaled by the
     * configured normalization factor. {@code initScore} is not used.
     *
     * @return {@code normalizedScore * (pagerank + pagerankToVotesRatio * votes)}
     */
    public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum,
            Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
        float newScore = getScoreFromMetas(url, dbDatum);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Trylog scoring filter new score : " + url + " -> " + newScore);
        }
        return normalizedScore * newScore;
    }

    /** Sets the score of a newly discovered (non-injected) page to zero. */
    public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException {
        datum.setScore(0.0f);
    }

    /** Sets the score of an injected page to the configured injected score. */
    public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException {
        datum.setScore(scoreInjected);
    }

    /** Copies the {@code Nutch.SCORE_KEY} entry from the content metadata to the parse metadata. */
    public void passScoreAfterParsing(Text url, Content content, Parse parse)
            throws ScoringFilterException {
        String score = content.getMetadata().get(Nutch.SCORE_KEY);
        parse.getData().getContentMeta().set(Nutch.SCORE_KEY, score);
    }

    /** Stores the datum's score, as a string, under {@code Nutch.SCORE_KEY} in the content metadata. */
    public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
            throws ScoringFilterException {
        content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
    }

    /**
     * Reads the pagerank and vote entries from the datum's metadata and combines them.
     * A missing entry counts as zero.
     *
     * @param url   URL the datum belongs to (not used in the computation)
     * @param datum datum whose metadata is read; a {@code null} datum yields a score of 0
     * @return {@code pagerank + pagerankToVotesRatio * votes}
     */
    private float getScoreFromMetas(Text url, CrawlDatum datum) {
        if (datum == null) {
            // Nothing to read from; treat as "no pagerank, no votes" instead of throwing an NPE.
            return 0.0f;
        }
        org.apache.hadoop.io.MapWritable meta = datum.getMetaData();
        FloatWritable pagerank = (FloatWritable) meta.get(new Text(Modification.META_PAGERANK));
        FloatWritable nbVotes = (FloatWritable) meta.get(new Text(Modification.META_VOTES));
        float pr = (pagerank != null) ? pagerank.get() : 0.0f;
        float votes = (nbVotes != null) ? nbVotes.get() : 0.0f;
        return pr + pagerankToVotesRatio * votes;
    }

    /**
     * Overwrites the score of {@code datum} with the value computed from its own metadata.
     * {@code old} and {@code inlinked} do not contribute to the result.
     */
    public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked)
            throws ScoringFilterException {
        float newScore = getScoreFromMetas(url, datum);
        if (LOG.isDebugEnabled()) {
            LOG.debug("TrylogScoringFilter :: updateDbScore");
            LOG.debug("Trylog scoring filter new score : " + url + " -> " + newScore);
        }
        datum.setScore(newScore);
    }
}