package org.apache.nutch.admin.scores; import java.io.File; import java.io.IOException; import java.io.PrintWriter; import java.util.*; import javax.servlet.RequestDispatcher; import javax.servlet.Servlet; import javax.servlet.ServletContextEvent; import javax.servlet.ServletException; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import javax.servlet.http.HttpSession; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.MapFile; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.UTF8; import org.apache.hadoop.io.MapWritable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.MapFile.Reader; import org.apache.hadoop.mapred.MapFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileRecordReader; import org.apache.hadoop.mapred.lib.HashPartitioner; import org.apache.log4j.Logger; import org.apache.lucene.util.ArrayUtil; import org.apache.nutch.admin.DefaultGuiComponent; import org.apache.nutch.admin.GuiComponent; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.CrawlDb; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.searcher.Hit; import org.apache.nutch.searcher.HitDetails; import org.apache.nutch.searcher.Hits; //import org.apache.nutch.searcher.NutchBean; import org.apache.nutchbase.searcher.NutchBeanHbase; import org.apache.nutch.searcher.Query; import org.apache.nutch.searcher.Summary; //import org.apache.nutch.searcher.NutchBean.NutchBeanConstructor; import org.apache.nutchbase.searcher.NutchBeanHbase.NutchBeanConstructor; import 
org.apache.nutchbase.util.hbase.ImmutableRowPart;
import org.apache.nutch.util.NutchConfiguration;

/**
 * Admin GUI servlet that runs a search query against the HBase-backed Nutch
 * searcher and forwards the results (hit details, summaries, explanations,
 * pageranks and votes) to {@code url_list.jsp} for rendering.
 */
public class UrlWithScore extends HttpServlet {

  private static final Logger LOG = Logger.getLogger(UrlWithScore.class.getName());

  /** Configuration in use; refreshed from the GUI component on every request. */
  private Configuration configuration;

  public static final int PAGE_SIZE = 20;

  public void init() {
    init(NutchConfiguration.create());
  }

  public void init(Configuration conf) {
    // Keep the supplied configuration as a default; doGet() replaces it with
    // the instance-specific configuration on each request.
    this.configuration = conf;
  }

  /**
   * Handles a search request: parses the {@code query} parameter, searches via
   * {@link NutchBeanHbase}, gathers per-hit scoring data and forwards to the
   * {@code url_list.jsp} view.
   *
   * @param req  incoming request; reads the {@code query} parameter
   * @param resp response the JSP view is written to
   * @throws ServletException propagated from the forwarded view
   * @throws IOException on I/O failure while rendering the view
   */
  protected void doGet(HttpServletRequest req, HttpServletResponse resp)
      throws ServletException, IOException {
    // Initialization: pull the per-instance configuration from the GUI component.
    GuiComponent component = (GuiComponent) getServletContext().getAttribute("component");
    configuration = component.getNutchInstance().getConfiguration();

    // Get (or lazily create) the search bean used to run queries.
    NutchBeanHbase bean = NutchBeanHbase.get(getServletContext(), configuration);
    if (getServletContext().getAttribute("forceReload") != null) {
      LOG.info("Forcing crawlDb reload");
      LOG.info("Forcing searcher index reload");
      // Guard against NPE: a forced reload may be requested before any bean exists.
      if (bean != null) {
        bean.close();
      }
      bean = null;
      getServletContext().removeAttribute("forceReload");
    }
    if (bean == null) {
      bean = new NutchBeanHbase(configuration, "webtable");
      getServletContext().setAttribute(NutchBeanHbase.KEY, bean);
    }
    LOG.info("Bean : " + bean);

    int start = 0;
    int hitsPerPage = 30;
    int hitsToRetrieve = hitsPerPage;
    int hitsPerSite = 2;
    String sort = null;
    boolean reverse = false;

    // Treat request parameters; an absent query searches for everything.
    String query = req.getParameter("query");
    LOG.info("query: " + query);
    if (query == null) {
      query = "";
    }
    Query query2 = Query.parse(query, "", configuration);

    Hits hits;
    try {
      hits = bean.search(query2, start + hitsToRetrieve, hitsPerSite, "site", sort, reverse);
    } catch (IOException ex) {
      // Best effort: log and show an empty result page instead of failing the request.
      LOG.error("Search failed : \n "
          + org.apache.hadoop.util.StringUtils.stringifyException(ex));
      hits = new Hits(0, new Hit[0]);
    }

    int realEnd = (int) Math.min(hits.getLength(), start + hitsToRetrieve);
    Hit[] show = hits.getHits(start, realEnd - start);
    HitDetails[] details = bean.getDetails(show);

    Summary[] summaries;
    try {
      summaries = bean.getSummary(details, query2);
    } catch (Exception e) {
      // Best effort: fall back to empty summaries rather than failing the page.
      LOG.error("Impossible to retrieve summaries : \n "
          + org.apache.hadoop.util.StringUtils.stringifyException(e));
      summaries = new Summary[show.length];
      for (int i = 0; i < show.length; i++) {
        summaries[i] = new Summary();
      }
    }
    LOG.info("total hits: " + hits.getTotal());

    // Collect per-hit scoring data for the view.
    List<String> explanations = new ArrayList<String>();
    List<Float> pageranks = new ArrayList<Float>();
    List<Float> votes = new ArrayList<Float>();
    for (int i = 0; i < show.length; i++) {
      Hit hit = show[i];
      HitDetails detail = details[i];
      String url = detail.getValue("url");
      LOG.debug("#" + i);
      LOG.debug(detail.getValue("title") + "(" + url + ")");
      explanations.add(bean.getExplanation(query2, hit));
      LOG.debug("BOOST : " + detail.getValue("boost"));
      ImmutableRowPart row = bean.getRow(detail);
      pageranks.add(row.getPagerank());
      votes.add(row.getVotes());
    }

    // Display results through the JSP view.
    req.setAttribute("results", Arrays.asList(details));
    req.setAttribute("summaries", Arrays.asList(summaries));
    req.setAttribute("explanations", explanations);
    req.setAttribute("pageranks", pageranks);
    req.setAttribute("votes", votes);
    RequestDispatcher view = req.getRequestDispatcher("url_list.jsp");
    view.forward(req, resp);
  }

  /** Debug helper: logs every request and servlet-context attribute name. */
  private void dumpServletInfo(HttpServletRequest req) {
    LOG.info("Request attributes : ");
    // Raw Enumeration: the old servlet API returns an unparameterized type.
    Enumeration e = req.getAttributeNames();
    while (e.hasMoreElements()) {
      LOG.info("name : " + (String) e.nextElement());
    }
    LOG.info("---");
    LOG.info("Servlet context attributes : ");
    e = getServletContext().getAttributeNames();
    while (e.hasMoreElements()) {
      LOG.info("name : " + (String) e.nextElement());
    }
    LOG.info("---");
  }

  /** POST behaves exactly like GET. */
  protected void doPost(HttpServletRequest req, HttpServletResponse resp)
      throws ServletException, IOException {
    doGet(req, resp);
  }
}