package org.apache.nutch.admin.pageranks;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import javax.servlet.ServletContext;
import javax.servlet.ServletContextEvent;
import javax.servlet.ServletContextListener;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Scanner;
import org.apache.hadoop.hbase.io.RowResult;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Logger;
import org.apache.nutch.admin.GuiComponent;
import org.apache.nutch.admin.TaskThread;
import org.apache.nutchbase.util.hbase.RowPart;

/**
 * Background task that scans the HBase "webtable" and fills in missing
 * pagerank values by querying a {@link PageRankService}.
 */
public class PageranksThread extends TaskThread {

  private static final Logger LOG = Logger.getLogger(PageranksThread.class.getName());

  public static final String KEY = "Pageranks_thread";

  /** Delay (ms) between two pagerank lookups, to avoid hammering the service. */
  public static final int PAGERANK_POLL_DELAY = 800;

  Configuration configuration;

  // Note: instanceFolder and modifs are accepted for parity with the GUI
  // wiring in PageranksThreadConstructor but are not used by this thread.
  public PageranksThread(Configuration configuration, Path instanceFolder,
      Map<String, Float> modifs) {
    super(configuration);
    this.configuration = configuration;
  }

  /** Publishes a status message to the GUI via the inherited fMessage field. */
  private String log(String msg) {
    this.fMessage = msg;
    return msg;
  }

  private String fatal(String msg) {
    this.fMessage = "FATAL ERROR : " + msg;
    LOG.fatal(this.fMessage);
    return msg;
  }

  @Override
  public void run() {
    log("Pagerank thread starting...");
    PageRankService pageRankService = new PageRankService();
    // Loop forever instead of recursing: the original tail call to run() at
    // the end of each pass would grow the stack and eventually overflow.
    while (true) {
      try {
        HTable table = new HTable(new HBaseConfiguration(), "webtable");
        String[] scannedColumns = new String[] {"score:", "pagerank:"};
        Scanner scanner = table.getScanner(scannedColumns);
        for (RowResult rowResult : scanner) {
          RowPart row = new RowPart(rowResult);
          String url = Bytes.toString(row.getRowId());
          float pagerank = row.getPagerank();
          log("url [" + pagerank + "] : " + url);
          // Only rows without a pagerank yet are looked up.
          if (pagerank == 0.0f) {
            // Clamp the fetched value to at most 0.1f.
            float pr = Math.min((float) pageRankService.getPR(url), 0.1f);
            log(" ==> Got pagerank : " + pr);
            row.setPagerank(pr);
            table.commit(row.makeBatchUpdate());
            try {
              Thread.sleep(PAGERANK_POLL_DELAY);
            } catch (InterruptedException e) {
              fatal("Pagerank job error : \n" + StringUtils.stringifyException(e));
            }
          }
        }
      } catch (IOException e1) {
        log("Pagerank job error : \n" + StringUtils.stringifyException(e1));
      }
      log("Pagerank thread pass terminated. starting over...");
    }
  }

  /**
   * Servlet context listener that creates and starts the singleton
   * {@link PageranksThread} when the web application is deployed.
   */
  public static class PageranksThreadConstructor implements ServletContextListener {

    public void contextDestroyed(ServletContextEvent sce) {
    }

    public void contextInitialized(ServletContextEvent sce) {
      LOG.info("Creating new pagerank fetcher thread...");
      final ServletContext app = sce.getServletContext();
      GuiComponent component = (GuiComponent) app.getAttribute("component");
      Path instanceFolder = component.getNutchInstance().getInstanceFolder();
      Configuration configuration = component.getNutchInstance().getConfiguration();
      @SuppressWarnings("unchecked")
      Map<String, Float> modifs =
          (Map<String, Float>) app.getAttribute(PageranksViewer.OPERATIONS_QUEUE_KEY);
      if (modifs == null) {
        modifs = new HashMap<String, Float>();
        app.setAttribute(PageranksViewer.OPERATIONS_QUEUE_KEY, modifs);
      }
      // Only one thread per web application: guard creation with a lock on
      // the shared component and stash the instance in the servlet context.
      synchronized (component) {
        PageranksThread thread = (PageranksThread) app.getAttribute(KEY);
        if (thread == null) {
          thread = new PageranksThread(configuration, instanceFolder, modifs);
          app.setAttribute(KEY, thread);
          thread.setDaemon(true);
          thread.start();
        }
      }
      LOG.info("Done.");
    }
  }
}
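
// For the listener above to fire, it has to be registered with the servlet
// container. A minimal sketch of the web.xml entry, assuming this class is
// packaged inside the Nutch admin GUI webapp (the deployment descriptor for
// this project is not shown here, so treat the snippet as illustrative):
//
//   <listener>
//     <listener-class>
//       org.apache.nutch.admin.pageranks.PageranksThread$PageranksThreadConstructor
//     </listener-class>
//   </listener>
//
// Note the '$' separator: PageranksThreadConstructor is a nested class, so
// its binary name uses Outer$Inner rather than a dot.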