package edu.umd.cloud9.collection; import java.io.IOException; import java.io.PrintWriter; import java.net.InetAddress; import java.util.Arrays; import java.util.Random; import javax.servlet.ServletException; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.log4j.Logger; import org.mortbay.jetty.Server; import org.mortbay.jetty.servlet.ServletHolder; import edu.umd.cloud9.mapreduce.NullInputFormat; import edu.umd.cloud9.mapreduce.NullMapper; /** * Web server for providing access to documents in a collection. * * @author Jimmy Lin */ public class DocumentForwardIndexHttpServer extends Configured implements Tool { private static final Logger LOG = Logger.getLogger(DocumentForwardIndexHttpServer.class); private static DocumentForwardIndex<Indexable> INDEX; // Keys for passing data into mapper via conf object. private static final String INDEX_KEY = "index"; private static final String DOCNO_MAPPING_KEY = "docnoMapping"; private static final String TMP_KEY = "tmp"; @SuppressWarnings({ "unchecked", "rawtypes" }) private static class MyMapper extends NullMapper { @Override public void runSafely(Mapper.Context context) { try { int port = 8888; Configuration conf = context.getConfiguration(); String indexFile = conf.get(INDEX_KEY); String mappingFile = conf.get(DOCNO_MAPPING_KEY); Path tmpPath = new Path(conf.get(TMP_KEY)); String host = InetAddress.getLocalHost().toString(); LOG.info("host: " + host); LOG.info("port: " + port); LOG.info("forward index: " + indexFile); FileSystem fs = FileSystem.get(conf); FSDataInputStream in = fs.open(new Path(indexFile)); String indexClass = in.readUTF(); in.close(); LOG.info("index class: " + indexClass); INDEX = (DocumentForwardIndex<Indexable>) Class.forName(indexClass).newInstance(); INDEX.loadIndex(new Path(indexFile), new Path(mappingFile), fs); Server server = new Server(port); org.mortbay.jetty.servlet.Context root = new org.mortbay.jetty.servlet.Context(server, "/", org.mortbay.jetty.servlet.Context.SESSIONS); root.addServlet(new ServletHolder(new FetchDocidServlet()), "/fetch_docid"); root.addServlet(new ServletHolder(new FetchDocnoServlet()), "/fetch_docno"); root.addServlet(new ServletHolder(new HomeServlet()), "/"); FSDataOutputStream out = FileSystem.get(conf).create(tmpPath, true); out.writeUTF(host); out.close(); try { server.start(); } catch (Exception e) { e.printStackTrace(); } while (true); } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e); } } } private DocumentForwardIndexHttpServer() {} // This must be public. public static class HomeServlet extends HttpServlet { static final long serialVersionUID = 8253865405L; static final Random r = new Random(); public void doGet(HttpServletRequest req, HttpServletResponse res) throws ServletException, IOException { res.setContentType("text/html"); PrintWriter out = res.getWriter(); out.println("<html><head><title>Collection Access: " + INDEX.getCollectionPath() + "</title><head>"); out.println("<body>"); out.println("<h3>Collection Access: " + INDEX.getCollectionPath() + "</h3>"); int firstDocno = INDEX.getFirstDocno(); int lastDocno = INDEX.getLastDocno(); int numDocs = lastDocno - firstDocno; LOG.info("first docno: " + firstDocno); LOG.info("last docno: " + lastDocno); String firstDocid = INDEX.getDocid(firstDocno); String lastDocid = INDEX.getDocid(lastDocno); out.println("First document: docno <a href=\"/fetch_docno?docno=" + firstDocno + "\">" + firstDocno + "</a> or <a href=\"/fetch_docid?docid=" + firstDocid + "\">" + firstDocid + "</a><br/>"); out.println("Last document: docno <a href=\"/fetch_docno?docno=" + lastDocno + "\">" + lastDocno + "</a> or <a href=\"/fetch_docid?docid=" + lastDocid + "\">" + lastDocid + "</a>"); out.println("<h3>Fetch a docid</h3>"); String id; out.println("<p>(random examples: "); id = INDEX.getDocid(r.nextInt(numDocs) + firstDocno); out.println("<a href=\"/fetch_docid?docid=" + id + "\">" + id + "</a>, "); id = INDEX.getDocid(r.nextInt(numDocs) + firstDocno); out.println("<a href=\"/fetch_docid?docid=" + id + "\">" + id + "</a>, "); id = INDEX.getDocid(r.nextInt(numDocs) + firstDocno); out.println("<a href=\"/fetch_docid?docid=" + id + "\">" + id + "</a>)</p>"); out.println("<form method=\"post\" action=\"fetch_docid\">"); out.println("<input type=\"text\" name=\"docid\" size=\"60\" />"); out.println("<input type=\"submit\" value=\"Fetch!\" />"); out.println("</form>"); out.println("</p>"); out.println("<h3>Fetch a docno</h3>"); int n; out.println("<p>(random examples: "); n = r.nextInt(numDocs) + firstDocno; out.println("<a href=\"/fetch_docno?docno=" + n + "\">" + n + "</a>, "); n = r.nextInt(numDocs) + firstDocno; out.println("<a href=\"/fetch_docno?docno=" + n + "\">" + n + "</a>, "); n = r.nextInt(numDocs) + firstDocno; out.println("<a href=\"/fetch_docno?docno=" + n + "\">" + n + "</a>)</p>"); out.println("<p>"); out.println("<form method=\"post\" action=\"fetch_docno\">"); out.println("<input type=\"text\" name=\"docno\" size=\"60\" />"); out.println("<input type=\"submit\" value=\"Fetch!\" />"); out.println("</form>"); out.println("</p>"); out.print("</body></html>\n"); out.close(); } } // this has to be public public static class FetchDocidServlet extends HttpServlet { static final long serialVersionUID = 3986721097L; public void doGet(HttpServletRequest req, HttpServletResponse res) throws ServletException, IOException { doPost(req, res); } public void doPost(HttpServletRequest req, HttpServletResponse res) throws ServletException, IOException { LOG.info("triggered servlet for fetching document by docid"); String docid = null; try { if (req.getParameterValues("docid") != null) docid = req.getParameterValues("docid")[0]; Indexable doc = INDEX.getDocument(docid); if (doc != null) { LOG.info("fetched: " + doc.getDocid()); res.setContentType(doc.getDisplayContentType()); PrintWriter out = res.getWriter(); out.print(doc.getDisplayContent()); out.close(); } else { throw new Exception(); } } catch (Exception e) { // catch-all, in case anything goes wrong LOG.info("trapped error fetching " + docid); res.setContentType("text/html"); PrintWriter out = res.getWriter(); out.print("<html><head><title>Invalid docid!</title><head>\n"); out.print("<body>\n"); out.print("<h1>Error!</h1>\n"); out.print("<h3>Invalid docid: " + docid + "</h3>\n"); out.print("</body></html>\n"); out.close(); } } } // this has to be public public static class FetchDocnoServlet extends HttpServlet { static final long serialVersionUID = 5970126341L; public void doGet(HttpServletRequest req, HttpServletResponse res) throws ServletException, IOException { doPost(req, res); } public void doPost(HttpServletRequest req, HttpServletResponse res) throws ServletException, IOException { LOG.info("triggered servlet for fetching document by docno"); int docno = 0; try { if (req.getParameterValues("docno") != null) docno = Integer.parseInt(req.getParameterValues("docno")[0]); Indexable doc = INDEX.getDocument(docno); if (doc != null) { LOG.info("fetched: " + doc.getDocid() + " = docno " + docno); res.setContentType(doc.getDisplayContentType()); PrintWriter out = res.getWriter(); out.print(doc.getDisplayContent()); out.close(); } else { throw new Exception(); } } catch (Exception e) { LOG.info("trapped error fetching " + docno); res.setContentType("text/html"); PrintWriter out = res.getWriter(); out.print("<html><head><title>Invalid docno!</title><head>\n"); out.print("<body>\n"); out.print("<h1>Error!</h1>\n"); out.print("<h3>Invalid docno: " + docno + "</h3>\n"); out.print("</body></html>\n"); out.close(); } } } public static final String INDEX_OPTION = "index"; public static final String MAPPING_OPTION = "docnoMapping"; @SuppressWarnings("static-access") public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg() .withDescription("(required) forward index path").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg() .withDescription("(required) DocnoMapping data path").create(MAPPING_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INDEX_OPTION) || !cmdline.hasOption(MAPPING_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String indexFile = cmdline.getOptionValue(INDEX_OPTION); String mappingFile = cmdline.getOptionValue(MAPPING_OPTION); LOG.info("Launching DocumentForwardIndexHttpServer"); LOG.info(" - index file: " + indexFile); LOG.info(" - docno mapping data file: " + mappingFile); Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); Random rand = new Random(); int r = rand.nextInt(); // This tmp file as a rendezvous point. Path tmpPath = new Path("/tmp/" + r); if (fs.exists(tmpPath)) { fs.delete(tmpPath, true); } Job job = new Job(conf, DocumentForwardIndexHttpServer.class.getSimpleName()); job.setJarByClass(DocumentForwardIndexHttpServer.class); job.getConfiguration().set("mapred.child.java.opts", "-Xmx1024m"); job.getConfiguration().set(INDEX_KEY, indexFile); job.getConfiguration().set(DOCNO_MAPPING_KEY, mappingFile); job.getConfiguration().set(TMP_KEY, tmpPath.toString()); job.setNumReduceTasks(0); job.setInputFormatClass(NullInputFormat.class); job.setOutputFormatClass(NullOutputFormat.class); job.setMapperClass(MyMapper.class); job.submit(); LOG.info("Waiting for server to start up..."); while (!fs.exists(tmpPath)) { Thread.sleep(50000); LOG.info("..."); } FSDataInputStream in = fs.open(tmpPath); String host = in.readUTF(); in.close(); LOG.info("host: " + host); LOG.info("port: 8888"); return 0; } /** * Dispatches command-line arguments to the tool via the <code>ToolRunner</code>. */ public static void main(String[] args) throws Exception { LOG.info("Running " + DocumentForwardIndexHttpServer.class.getCanonicalName() + " with args " + Arrays.toString(args)); ToolRunner.run(new DocumentForwardIndexHttpServer(), args); } }