/*
* Cloud9: A MapReduce Library for Hadoop
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package edu.umd.cloud9.webgraph.driver;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Random;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Logger;
import org.mortbay.jetty.Server;
import org.mortbay.jetty.servlet.Context;
import org.mortbay.jetty.servlet.ServletHolder;
import edu.umd.cloud9.collection.DocumentForwardIndex;
import edu.umd.cloud9.collection.Indexable;
import edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping;
import edu.umd.cloud9.mapred.NullInputFormat;
import edu.umd.cloud9.mapred.NullMapper;
import edu.umd.cloud9.mapred.NullOutputFormat;
import edu.umd.cloud9.webgraph.data.IndexableAnchorTextForwardIndex;
/**
* <p>
* Runs an HTTP server to explore an anchor text/web graph collection.
* Command-line arguments are as follows:
* </p>
*
* <ul>
* <li>[index-file]: the forward index file for the anchor text collection</li>
* <li>[docno-mapping]: ClueWeb09 docno mapping file</li>
* <li>[ClueWeb09-index-files-base]: the path to ClueWeb09 forward indexes</li>
* </ul>
*
* <p>
* ClueWeb09 forward indexes should be stored at <code>[ClueWeb09-index-files-base]/findex.en.XX</code>,
* where <code>XX</code> is the segment number.
* </p>
*
* @author Nima Asadi
*
*/
@SuppressWarnings("deprecation")
public class ClueWebAnchorTextForwardIndexHttpServer {
private static final Logger LOG = Logger.getLogger(ClueWebAnchorTextForwardIndexHttpServer.class);
private static final String SEPARATOR = ",";
private static final int[] lastDocs = new int[10];
private static final ArrayList<String> clueweb = new ArrayList<String>();
private static IndexableAnchorTextForwardIndex sForwardIndex;
private static DocumentForwardIndex<Indexable>[] docForwardIndex;
@SuppressWarnings("unchecked")
private static class ServerMapper extends NullMapper {
public void run(JobConf conf, Reporter reporter) throws IOException {
int port = 8888;
String indexFile = conf.get("IndexFile");
String mappingFile = conf.get("DocnoMappingDataFile");
Path tmpPath = new Path(conf.get("TmpPath"));
String[] cluewebIndexFiles = conf.get("ClueWebIndexFiles").split(SEPARATOR);
String host = InetAddress.getLocalHost().toString();
LOG.info("host: " + host);
LOG.info("port: " + port);
LOG.info("forward index: " + indexFile);
FSDataInputStream in = FileSystem.get(conf).open(new Path(indexFile));
String indexClass = in.readUTF();
in.close();
LOG.info("index class: " + indexClass);
try {
sForwardIndex = new IndexableAnchorTextForwardIndex(new ClueWarcDocnoMapping());
sForwardIndex.loadIndex(new Path(indexFile), new Path(mappingFile), FileSystem.get(conf));
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Error initializing forward index!");
}
for(String s : cluewebIndexFiles)
clueweb.add(s.trim());
//opening clueweb forward index files
docForwardIndex = new DocumentForwardIndex[clueweb.size()];
for(int i = 0; i < clueweb.size(); i++) {
in = FileSystem.get(conf).open(new Path(clueweb.get(i)));
String indexClueWebClass = in.readUTF();
in.close();
try {
docForwardIndex[i] = (DocumentForwardIndex<Indexable>) Class.forName(indexClueWebClass).newInstance();
docForwardIndex[i].loadIndex(new Path(clueweb.get(i)), new Path(mappingFile), FileSystem.get(conf));
lastDocs[i] = docForwardIndex[i].getLastDocno();
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Error initializing forward index!");
}
}
Server server = new Server(port);
Context root = new Context(server, "/", Context.SESSIONS);
root.addServlet(new ServletHolder(new FetchDocidServlet()), "/fetch_docid");
root.addServlet(new ServletHolder(new FetchDocnoServlet()), "/fetch_docno");
root.addServlet(new ServletHolder(new FetchDocContentServlet()), "/fetch_content");
root.addServlet(new ServletHolder(new HomeServlet()), "/");
FSDataOutputStream out = FileSystem.get(conf).create(tmpPath, true);
out.writeUTF(host);
out.close();
try {
server.start();
} catch (Exception e) {
e.printStackTrace();
}
while (true)
;
}
}
private ClueWebAnchorTextForwardIndexHttpServer() {
}
// this has to be public
public static class HomeServlet extends HttpServlet {
static final long serialVersionUID = 8253865405L;
static final Random r = new Random();
public void doGet(HttpServletRequest req, HttpServletResponse res) throws ServletException,
IOException {
res.setContentType("text/html");
PrintWriter out = res.getWriter();
out.println("<html><head><title>Collection Access: "
+ sForwardIndex.getCollectionPath() + "</title><head>");
out.println("<body>");
out.println("<h3>Collection Access: " + sForwardIndex.getCollectionPath() + "</h3>");
int firstDocno = sForwardIndex.getFirstDocno();
int lastDocno = sForwardIndex.getLastDocno();
int numDocs = lastDocno - firstDocno;
String firstDocid = sForwardIndex.getDocid(firstDocno);
String lastDocid = sForwardIndex.getDocid(lastDocno);
out.println("First document: docno <a href=\"/fetch_docno?docno=" + firstDocno + "\">"
+ firstDocno + "</a> or <a href=\"/fetch_docid?docid=" + firstDocid + "\">"
+ firstDocid + "</a><br/>");
out.println("Last document: docno <a href=\"/fetch_docno?docno=" + lastDocno + "\">"
+ lastDocno + "</a> or <a href=\"/fetch_docid?docid=" + lastDocid + "\">"
+ lastDocid + "</a>");
out.println("<h3>Fetch a docid</h3>");
String id;
out.println("<p>(random examples: ");
id = sForwardIndex.getDocid(r.nextInt(numDocs) + firstDocno);
out.println("<a href=\"/fetch_docid?docid=" + id + "\">" + id + "</a>, ");
id = sForwardIndex.getDocid(r.nextInt(numDocs) + firstDocno);
out.println("<a href=\"/fetch_docid?docid=" + id + "\">" + id + "</a>, ");
id = sForwardIndex.getDocid(r.nextInt(numDocs) + firstDocno);
out.println("<a href=\"/fetch_docid?docid=" + id + "\">" + id + "</a>)</p>");
out.println("<form method=\"post\" action=\"fetch_docid\">");
out.println("<input type=\"text\" name=\"docid\" size=\"60\" />");
out.println("<input type=\"submit\" value=\"Fetch!\" />");
out.println("</form>");
out.println("</p>");
out.println("<p>");
out.println("<form method=\"post\" action=\"fetch_content\">");
out.println("<input type=\"text\" name=\"docid\" size=\"60\" />");
out.println("<input type=\"submit\" value=\"Fetch Content!\" />");
out.println("</form>");
out.println("</p>");
out.println("<h3>Fetch a docno</h3>");
int n;
out.println("<p>(random examples: ");
n = r.nextInt(numDocs) + firstDocno;
out.println("<a href=\"/fetch_docno?docno=" + n + "\">" + n + "</a>, ");
n = r.nextInt(numDocs) + firstDocno;
out.println("<a href=\"/fetch_docno?docno=" + n + "\">" + n + "</a>, ");
n = r.nextInt(numDocs) + firstDocno;
out.println("<a href=\"/fetch_docno?docno=" + n + "\">" + n + "</a>)</p>");
out.println("<p>");
out.println("<form method=\"post\" action=\"fetch_docno\">");
out.println("<input type=\"text\" name=\"docno\" size=\"60\" />");
out.println("<input type=\"submit\" value=\"Fetch!\" />");
out.println("</form>");
out.println("</p>");
out.println("<p>");
out.println("<form method=\"post\" action=\"fetch_content\">");
out.println("<input type=\"text\" name=\"docno\" size=\"60\" />");
out.println("<input type=\"submit\" value=\"Fetch Content!\" />");
out.println("</form>");
out.println("</p>");
out.print("</body></html>\n");
out.close();
}
}
public static class FetchDocContentServlet extends HttpServlet {
static final long serialVersionUID = 5970126341L;
public void doGet(HttpServletRequest req, HttpServletResponse res) throws ServletException,
IOException {
doPost(req, res);
}
public void doPost(HttpServletRequest req, HttpServletResponse res)
throws ServletException, IOException {
LOG.info("triggered servlet for fetching document content");
int docno = 0;
try {
if (req.getParameterValues("docno") != null)
docno = Integer.parseInt(req.getParameterValues("docno")[0]);
else if (req.getParameterValues("docid") != null)
docno = sForwardIndex.getDocno(req.getParameterValues("docid")[0]);
Indexable doc = null;
int i = 0;
for(i = 0; i < lastDocs.length; i++)
if(docno <= lastDocs[i]) {
doc = docForwardIndex[i].getDocument(docno);
break;
}
if (doc != null) {
LOG.info("fetched: " + doc.getDocid() + " = docno " + docno);
res.setContentType(doc.getDisplayContentType());
PrintWriter out = res.getWriter();
out.print(doc.getContent().replaceAll("<\\s*/\\s*[bB][oO][dD][Yy]\\s*>", "<br><br><a href=\"/fetch_docno?docno=" + docno +
"\"> Fetch anchor text for docno: " + docno + "</a></body>"));
out.close();
} else {
throw new Exception();
}
} catch (Exception e) {
LOG.info("trapped error fetching " + docno);
res.setContentType("text/html");
PrintWriter out = res.getWriter();
out.print("<html><head><title>Invalid docno!</title><head>\n");
out.print("<body>\n");
out.print("<h1>Error!</h1>\n");
out.print("<h3>Invalid docno: " + docno + "</h3>\n");
out.print("</body></html>\n");
out.close();
}
}
}
// this has to be public
public static class FetchDocidServlet extends HttpServlet {
static final long serialVersionUID = 3986721097L;
public void doGet(HttpServletRequest req, HttpServletResponse res) throws ServletException,
IOException {
doPost(req, res);
}
public void doPost(HttpServletRequest req, HttpServletResponse res)
throws ServletException, IOException {
LOG.info("triggered servlet for fetching document by docid");
String docid = null;
try {
if (req.getParameterValues("docid") != null)
docid = req.getParameterValues("docid")[0];
Indexable doc = sForwardIndex.getDocument(docid);
if (doc != null) {
LOG.info("fetched: " + doc.getDocid());
res.setContentType(doc.getDisplayContentType());
PrintWriter out = res.getWriter();
out.print(doc.getContent().replace("<body>", "<body><a href=\"/fetch_content?docid=" + docid +
"\"> Fetch content for docid: " + docid + "</a><br><br>"));
out.close();
} else {
throw new Exception();
}
} catch (Exception e) {
// catch-all, in case anything goes wrong
LOG.info("trapped error fetching " + docid);
res.setContentType("text/html");
PrintWriter out = res.getWriter();
out.print("<html><head><title>Invalid docid!</title><head>\n");
out.print("<body>\n");
out.print("<h1>Error!</h1>\n");
out.print("<h3>Invalid docid: " + docid + "</h3>\n");
out.print("</body></html>\n");
out.close();
}
}
}
// this has to be public
public static class FetchDocnoServlet extends HttpServlet {
static final long serialVersionUID = 5970126341L;
public void doGet(HttpServletRequest req, HttpServletResponse res) throws ServletException,
IOException {
doPost(req, res);
}
public void doPost(HttpServletRequest req, HttpServletResponse res)
throws ServletException, IOException {
LOG.info("triggered servlet for fetching document by docno");
int docno = 0;
try {
if (req.getParameterValues("docno") != null)
docno = Integer.parseInt(req.getParameterValues("docno")[0]);
Indexable doc = sForwardIndex.getDocument(docno);
if (doc != null) {
LOG.info("fetched: " + doc.getDocid() + " = docno " + docno);
res.setContentType(doc.getDisplayContentType());
PrintWriter out = res.getWriter();
out.print(doc.getContent().replace("<body>", "<body><a href=\"/fetch_content?docno=" + docno +
"\"> Fetch content for docno: " + docno + "</a><br><br>"));
out.close();
} else {
throw new Exception();
}
} catch (Exception e) {
LOG.info("trapped error fetching " + docno);
res.setContentType("text/html");
PrintWriter out = res.getWriter();
out.print("<html><head><title>Invalid docno!</title><head>\n");
out.print("<body>\n");
out.print("<h1>Error!</h1>\n");
out.print("<h3>Invalid docno: " + docno + "</h3>\n");
out.print("</body></html>\n");
out.close();
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 3) {
System.out.println("usage: [index-file] [docno-mapping-data-files] [clue-forward-index-root]");
//[clue-forward-index-root: /shared/ClueWeb09/collection.compressed.block/
System.exit(-1);
}
String indexFile = otherArgs[0];
String mappingFile = otherArgs[1];
String clueIndexRoot = otherArgs[2].endsWith("/") ? otherArgs[2] : otherArgs[2] + "/";
String cluewebForwardIndex = "";
for(int i = 1; i < 10; i++)
cluewebForwardIndex += clueIndexRoot + "findex.en.0" + i + ".dat" + SEPARATOR + " ";
cluewebForwardIndex += clueIndexRoot + "findex.en.10.dat";
LOG.info("Launching DocumentForwardIndexHttpServer");
LOG.info(" - index file: " + indexFile);
LOG.info(" - docno mapping data file: " + mappingFile);
LOG.info(" - ClueWeb09 index root:" + clueIndexRoot);
FileSystem fs = FileSystem.get(conf);
Random rand = new Random();
int r = rand.nextInt();
// this tmp file as a rendezvous point
Path tmpPath = new Path("/tmp/" + r);
if (fs.exists(tmpPath)) {
fs.delete(tmpPath, true);
}
JobConf job = new JobConf(conf, ClueWebAnchorTextForwardIndexHttpServer.class);
job.setJobName("ForwardIndexServer:" + indexFile);
job.set("mapred.child.java.opts", "-Xmx2048m");
job.setNumMapTasks(1);
job.setNumReduceTasks(0);
job.setInputFormat(NullInputFormat.class);
job.setOutputFormat(NullOutputFormat.class);
job.setMapperClass(ServerMapper.class);
job.set("IndexFile", indexFile);
job.set("DocnoMappingDataFile", mappingFile);
job.set("TmpPath", tmpPath.toString());
job.set("ClueWebIndexFiles", cluewebForwardIndex);
JobClient client = new JobClient(job);
client.submitJob(job);
LOG.info("Waiting for server to start up...");
while (!fs.exists(tmpPath)) {
Thread.sleep(50000);
LOG.info("...");
}
FSDataInputStream in = fs.open(tmpPath);
String host = in.readUTF();
in.close();
LOG.info("host: " + host);
LOG.info("port: 8888");
}
}