/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.service.listcrawler; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Set; import javax.servlet.ServletException; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.util.Shell; import org.commoncrawl.service.listcrawler.CrawlListDatabaseRecord; import org.commoncrawl.util.CCStringUtils; import com.google.common.collect.ImmutableSet; /** * Servlet used to upload crawl lists to a crawler server * * @author rana * */ @SuppressWarnings("serial") public class ListUploadServlet extends HttpServlet { public static final Log LOG = LogFactory.getLog(ListUploadServlet.class); public static class ListRequeueServlet extends HttpServlet { public static final Log LOG = LogFactory.getLog(ListRequeueServlet.class); @Override protected void doGet(HttpServletRequest req, HttpServletResponse resp)throws ServletException, IOException { String listId = req.getParameter("listId"); String listFileName = req.getParameter("urlFile"); File listFile = new File(ProxyServer.getSingleton().getCrawlHistoryDataDir(),listFileName); LOG.info("###LISTUPLOADER: Requeue Request- ListId:" + listId + " listFileName:" + listFileName); if (listFile.exists()) { ProxyServer.getSingleton().requeueList(Long.parseLong(listId), listFile); } } } public static class RequeueBrokenListsServlet extends HttpServlet { public static final Log LOG = LogFactory.getLog(RequeueBrokenListsServlet.class); @Override protected void doGet(HttpServletRequest req, HttpServletResponse resp)throws ServletException, IOException { long idsToFix[] = { 1286466854056L,1286733537313L, 1286467182139L,1286733537315L, 1286467448576L,1286733537316L, 1286467734918L,1286733537318L, 1286468071056L,1286733537319L, 1286468376989L,1286733537321L, 1286468673896L,1286733537322L, 1286469018206L,1286733537324L, 1286469408437L,1286733537327L, 1286469703877L,1286733537329L, 1286469965566L,1286733537331L, 1286470262212L,1286733537332L, 1286470558900L,1286733537334L, 1286470853220L,1286733537360L }; for (int i=0;i<idsToFix.length;) { File listFile = new File(ProxyServer.getSingleton().getCrawlHistoryDataDir(),"listURLS-" + idsToFix[i++]); if (listFile.exists()) { LOG.info("Reloading List File:" + listFile.getAbsolutePath()); ProxyServer.getSingleton().requeueList(idsToFix[i++], listFile); } } } } public static class ListUploadForm extends HttpServlet { public static final Log LOG = LogFactory.getLog(ListUploadForm.class); @Override protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { resp.setContentType("text/html"); PrintWriter writer = resp.getWriter(); writer.println("<HTML>"); writer.println("<form method='post' action='/ListUploader' enctype='multipart/form-data'>"); writer.println("<table border=0>"); writer.println("<tr><td>CutomerId:<td><input name='customerId' type='text' width=20 /></tr>"); writer.println("<tr><td>List Name:<td><input name='listName' type='text' width=100 /></tr>"); writer.println("<tr><td>List File:<td><input name='listFile' type='file' /></tr>"); writer.println("<tr><td colspan=2> </tr>"); writer.println("<tr><td colspan=2><input type='submit' /></tr>"); writer.println("</table>"); writer.println("</HTML>"); writer.flush(); } } private final static String FILES ="org.mortbay.servlet.MultiPartFilter.files"; private static Set<String> customers = new ImmutableSet.Builder<String>() .add("foobar") .build(); @Override protected void doPut(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { String customerId = req.getParameter("customerId"); String listName = req.getParameter("listName"); String incomingFileName = req.getParameter("fileName"); LOG.info("###LISTUPLOADER: GOT PUT Customer Id:" + customerId + " ListName:" + listName + " FileName:" + incomingFileName); if (customerId == null || !customers.contains(customerId) || listName == null || listName.length() == 0) { LOG.error("###LISTUPLOADER:No Customer Id or Invalid Customer Id:" + customerId + " ListId:" + listName); resp.sendError(500,"Invalid Customer Id or Invalid List Name!" + customerId + ":" + listName); return; } else if (incomingFileName == null || incomingFileName.length() == 0) { LOG.error("###LISTUPLOADER:No IncomingFilename"); resp.sendError(500,"Invalid Filename"); return; } else { // get the server ... ProxyServer server = ProxyServer.getSingleton(); // get the crawl history data directory ... File dataDir = server.getCrawlHistoryDataDir(); // create import file ... File importFile = new File(dataDir,incomingFileName + "-" + System.currentTimeMillis()); LOG.info("###LISTUPLOADER:Filename:" + incomingFileName + " Customer:" + customerId + " List:" + listName + " outputFile:" + importFile.getAbsolutePath()); // open a handle to it BufferedOutputStream outputStream = new BufferedOutputStream(new FileOutputStream(importFile),1 << 20); // allocate a buffer ... byte incomingBuffer[] = new byte[ 1 << 19 ]; int bytesRead = -1; int totalBytesRead = 0; // get input stream InputStream input = req.getInputStream(); try { try { while ((bytesRead = input.read(incomingBuffer)) != -1) { LOG.info("Read:" + bytesRead + " bytes from:" + incomingFileName); outputStream.write(incomingBuffer,0,bytesRead); totalBytesRead += bytesRead; } } finally { outputStream.flush(); outputStream.close(); } LOG.info("###LISTUPLOADER:List:" + listName + " Finished download filename:" + incomingFileName + " TotalBytesRead:" + totalBytesRead + "-Inserting Record"); // won't reach here unless write succeeded ... // create a database record CrawlListDatabaseRecord databaseRecord = new CrawlListDatabaseRecord(); databaseRecord.setListName(listName); databaseRecord.setCustomerName(customerId); databaseRecord.setSourceFileName(incomingFileName); databaseRecord.setTempFileName(importFile.getName()); long listId = server.queueListImportRequest(databaseRecord); LOG.info("###LISTUPLOADER:Queueing List:" + listName + " ListID:"+ listId); if (listId == -1) { LOG.error("###LISTUPLOADER:Queueing For List:" + listName + " Failed!"); resp.sendError(500,"Queue Request Failed!"); } else { resp.setContentType("text/plain"); resp.getWriter().print(Long.toString(listId)); resp.getWriter().flush(); } } catch (IOException e) { LOG.error("###LISTUPLOADER: IOException processing List:" + listName); LOG.error(CCStringUtils.stringifyException(e)); importFile.delete(); } } } @Override protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { String customerId = req.getParameter("customerId"); String listName = req.getParameter("listName"); LOG.info("###LISTUPLOADER: GOT POST CustomerId:" + customerId + " ListName:" + listName); if (customerId == null || !customers.contains(customerId) || listName == null || listName.length() == 0) { resp.sendError(500,"Invalid Customer Id or Invalid List Name!" + customerId + ":" + listName); } else { ArrayList<MultiPartFilter.UploadFileData> files= (ArrayList<MultiPartFilter.UploadFileData>) req.getAttribute(FILES); if (files == null || files.size() == 0) { LOG.error("###LISTUPLOADER: CustomerId:" + customerId + " ListName:" + listName + " No Files in Mutlipart Body!"); resp.sendError(500,"No File Selected!"); return; } else { MultiPartFilter.UploadFileData uploadData = files.get(0); if (uploadData.incomingContentType == null || !uploadData.incomingContentType.equals("text/plain")) { LOG.error("###LISTUPLOADER: CustomerId:" + customerId + " ListName:" + listName + " incoming MimeType:" + uploadData.incomingContentType + " NOT text/plain!"); resp.sendError(500,"Only Text Files Supported For Now :-("); return; } else{ // get the server ... ProxyServer server = ProxyServer.getSingleton(); // get the crawl history data directory ... File dataDir = server.getCrawlHistoryDataDir(); LOG.info("###LISTUPLOADER: CustomerId:" + customerId + " ListName:" + listName + "Incoming FileName is:" + uploadData.incomingFile.getAbsolutePath()); // move the file File importFile = new File(dataDir,uploadData.incomingFilename + "-" + System.currentTimeMillis()); LOG.info("###LISTUPLOADER: CustomerId:" + customerId + " ListName:" + listName +"Renaming Incoming File to:" + importFile.getAbsolutePath()); int retryCount = 0; boolean renameFailed = false; while (!importFile.exists()) { LOG.info("###LISTUPLOADER: CustomerId:" + customerId + " ListName:" + listName +" Moving Temp File"); Shell.execCommand(new String[] {"mv", uploadData.incomingFile.getAbsolutePath(),importFile.getAbsolutePath() } ); if (!importFile.exists()) { if (++retryCount == 10) { renameFailed = true; LOG.error("###LISTUPLOADER: CustomerId:" + customerId + " ListName:" + listName +" Rename Failed. Bailing!"); break; } LOG.error("###LISTUPLOADER: CustomerId:" + customerId + " ListName:" + listName +" Rename Failed. Retrying"); try { Thread.sleep(1000); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } } else { break; } } if (renameFailed) { LOG.error("###LISTUPLOADER: CustomerId:" + customerId + " ListName:" + listName +" Gave Up Trying to Move File!"); resp.sendError(500,"Failed to Copy Temp File!"); return; } LOG.info("###LISTUPLOADER: CustomerId:" + customerId + " ListName:" + listName +" Queueing Database Record"); // create a database record CrawlListDatabaseRecord databaseRecord = new CrawlListDatabaseRecord(); databaseRecord.setListName(listName); databaseRecord.setCustomerName(customerId); databaseRecord.setSourceFileName(uploadData.incomingFilename); databaseRecord.setTempFileName(importFile.getName()); long listId = server.queueListImportRequest(databaseRecord); if (listId == -1) { LOG.error("###LISTUPLOADER: CustomerId:" + customerId + " ListName:" + listName +" List Queueing Failed!"); resp.sendError(500,"Queue Request Failed!"); } else { LOG.info("###LISTUPLOADER: CustomerId:" + customerId + " ListName:" + listName +" ListId:" + listId); resp.setContentType("text/plain"); resp.getWriter().print(Long.toString(listId)); resp.getWriter().flush(); } } } } } }