/** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.admin.crawldb; import java.io.File; import java.io.IOException; import java.io.PrintWriter; import java.net.URL; import java.util.*; import javax.servlet.RequestDispatcher; import javax.servlet.Servlet; import javax.servlet.ServletException; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import javax.servlet.http.HttpSession; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.client.Scanner; import org.apache.hadoop.hbase.io.RowResult; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.MapFile; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.UTF8; import org.apache.hadoop.io.MapWritable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.MapFile.Reader; import org.apache.hadoop.mapred.MapFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileRecordReader; import org.apache.log4j.Logger; import org.apache.nutch.admin.DefaultGuiComponent; import org.apache.nutch.admin.GuiComponent; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.CrawlDb; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutchbase.util.hbase.TableUtil; public class UrlWithStatus extends HttpServlet { private static final Logger LOG = Logger.getLogger(UrlWithStatus.class.getName()); public static final int PAGE_SIZE = 100; private Path crawlDb; private Configuration configuration; public void init() { init(NutchConfiguration.create()); } public void init(Configuration conf) {} private String testUrl(RowResult rowResult, boolean filterUrl, String urlFilter) { if (rowResult == null) { return null; } if (filterUrl) { String url = TableUtil.unreverseUrl(Bytes.toString(rowResult.getRow())); return url.contains(urlFilter) ? url : null; } else { return TableUtil.unreverseUrl(Bytes.toString(rowResult.getRow())); } } protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { PrintWriter out = resp.getWriter(); // Initialization GuiComponent component = (GuiComponent) getServletContext().getAttribute("component"); Path instanceFolder = component.getNutchInstance().getInstanceFolder(); configuration = component.getNutchInstance().getConfiguration(); // Status filter drop-down String statusdrop = req.getParameter("statusdrop"); LOG.info("Statusdrop : " + statusdrop); boolean filterStatus = false; byte statusFilter = Byte.MAX_VALUE; if (statusdrop == null) { statusdrop = "all"; } if (!"".equals(statusdrop) && !"all".equals(statusdrop)) { filterStatus = true; statusFilter = Byte.parseByte(statusdrop); } req.setAttribute("statusdrop", statusdrop); req.setAttribute("statusFilter", Byte.toString(statusFilter)); String statusMenu = req.getParameter("status"); String urlFilter = req.getParameter("urlFilter"); if (urlFilter == null) { urlFilter = ""; } urlFilter = urlFilter.toLowerCase(); boolean filterUrl = !"".equals(urlFilter); req.setAttribute("urlFilter", urlFilter); System.out.println("urlFilter : (" + Boolean.toString(filterUrl) + ")'" + urlFilter+ "'"); System.out.println("statusMenu : " + statusMenu); int pageIndex; String pageIndexName = req.getParameter("pageIndex"); if ( pageIndexName == null || "".equals( pageIndexName ) ){ pageIndex = 0; }else{ pageIndex = Integer.parseInt( pageIndexName ); } req.setAttribute("pageIndex", new Integer(pageIndex)); System.out.println("pageIndex : " + pageIndex); HTable table = new HTable(new HBaseConfiguration(), "webtable"); String[] scannedColumns = new String[] {"status:", "score:" }; // Delete a domain if asked by the user String urltodelete = req.getParameter("urltodelete"); // LOG.info("URL for domain deletion : " + urltodelete); if (urltodelete != null) { URL u = new URL(urltodelete); String domaintodelete= u.getProtocol() + "://" + u.getAuthority(); // LOG.info("domain for domain deletion : " + domaintodelete); String prefix = TableUtil.reverseUrl(domaintodelete); Scanner scanner = table.getScanner(scannedColumns); for (RowResult rowResult : scanner) { String url = Bytes.toString(rowResult.getRow()); if (url.startsWith(prefix)) { table.deleteAll(url); } } } // Get the matching urls Scanner scanner = table.getScanner(scannedColumns); Map<String, CrawlDatum> map = new TreeMap<String, CrawlDatum>(); int to_skip = pageIndex * PAGE_SIZE, found_urls = 0; System.out.println("Going to skip : " + to_skip + " urls"); RowResult rowResult; String url; do { rowResult = scanner.next(); url = testUrl(rowResult, filterUrl, urlFilter); if (url != null) { to_skip--; } } while (to_skip > 0 && rowResult != null); // System.out.println("Urls skipped"); do { rowResult = scanner.next(); url = testUrl(rowResult, filterUrl, urlFilter); if (url != null) { // System.out.println("url : " + url); Byte st = rowResult.get( Bytes.toBytes("status:")).getValue()[0]; float score = Bytes.toFloat(rowResult.get( Bytes.toBytes("score:")).getValue()); CrawlDatum crawlDatum = new CrawlDatum((int)st, 0, score); map.put(url, crawlDatum); found_urls++; } } while (found_urls < PAGE_SIZE && rowResult != null); scanner.close(); // display results req.setAttribute("map", map); RequestDispatcher view = req.getRequestDispatcher("url_list.jsp"); view.forward(req, resp); } protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { doGet(req, resp); } }