/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.util;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Vector;
import java.util.concurrent.Semaphore;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.BlackListRecord;
import org.commoncrawl.mapred.BlackListSimilarityMatch;
import org.commoncrawl.mapred.BlackListURLPattern;
import org.commoncrawl.mapred.PatternMatchDetails;
import org.commoncrawl.db.RecordStore;
import org.commoncrawl.server.AsyncWebServerRequest;
import org.commoncrawl.server.CommonCrawlServer;
import org.commoncrawl.util.URLPattern.URLPatternBuilder;
import org.commoncrawl.util.URLPattern.URLPatternMatcher;

import com.google.gson.stream.JsonWriter;

public class PatternListEditor extends CommonCrawlServer {

  @Override
  protected String getDefaultDataDir() {
    return "data";
  }

  @Override
  protected String getDefaultHttpInterface() {
    return "localhost";
  }

  @Override
  protected int getDefaultHttpPort() {
    return 8033;
  }

  @Override
  protected String getDefaultLogFileName() {
    return "patternEditor.log";
  }

  @Override
  protected String getDefaultRPCInterface() {
    return "localhost";
  }

  @Override
  protected int getDefaultRPCPort() {
    return 8034;
  }

  @Override
  protected String getWebAppName() {
    return null;
  }

  static PatternListEditor _server;

  RecordStore _recordStore;
  RecordStore _oldRecordStore;

  @Override
  protected boolean initServer() {
    _server = this;
    try {
      getWebServer().addServlet("validatePatterns", "/validate", ValidatePatternServlet.class);
      getWebServer().addServlet("bulkValidate", "/bulk", BulkValidate.class);
      getWebServer().addServlet("importOld", "/importOld", LoadFromOldDatabase.class);
      getWebServer().addServlet("genFile", "/genFile", GenerateFilterFile.class);

      _webServer.start();

      _recordStore = new RecordStore();
      _oldRecordStore = new RecordStore();

      File patternDB = new File(getDataDirectory(), "pattern.db");
      File oldPatternDB = new File(getDataDirectory(), "blacklist.db");

      _recordStore.initialize(patternDB, null);
      _oldRecordStore.initialize(oldPatternDB, null);
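
      // Load any existing pattern records; if the store is empty, seed it from the
      // pattern.db.source sequence file via initializePatternDB() below.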
      Vector<Long> recordIds = _recordStore.getChildRecordsByParentId("patterns");

      if (recordIds.size() == 0) {
        initializePatternDB();
      } else {
        /*
        LOG.info("There are:" + recordIds.size() + " patterns in the database");
        for (long recordId : recordIds) {
          PatternMatchDetails detail = (PatternMatchDetails) _recordStore.getRecordById(recordId);
          if (detail.getStatus() == 0) {
            LOG.info("Pattern:" + detail.getRegEx() + " is unmodified. Adding to Queue");
            _unmodifiedPatterns.add(recordId);
          } else {
            LOG.info("Skipping Modified Pattern:" + detail.getRegEx() + " with status:"
                + PatternMatchDetails.Status.toString(detail.getStatus()));
          }
        }
        */
      }
    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
    return true;
  }

  void initializePatternDB() throws IOException {
    File sourcePath = new File(getDataDirectory(), "pattern.db.source");
    File destPath = new File(getDataDirectory(), "pattern.db");

    SequenceFile.Reader reader = new Reader(FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
        new Path(sourcePath.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());

    Text key = new Text();
    PatternMatchDetails details = new PatternMatchDetails();

    _recordStore.beginTransaction();
    while (reader.next(key, details)) {
      LOG.info("Inserting Pattern:" + key.toString());
      _recordStore.insertRecord("patterns", key.toString(), details);
    }
    _recordStore.commitTransaction();
  }

  @Override
  protected boolean parseArguements(String[] argv) {
    return true;
  }

  @Override
  protected void printUsage() {
    // TODO Auto-generated method stub
  }

  @Override
  protected boolean startDaemons() {
    // TODO Auto-generated method stub
    return false;
  }

  @Override
  protected void stopDaemons() {
    // TODO Auto-generated method stub
  }

  public static class GenerateFilterFile extends HttpServlet {

    private static class PatternMatch extends URLPatternMatcher {
      public String sourceExpression;
      public int matchCount = 0;
      public int attributionCount = 0;

      public PatternMatch(String regularExpression) throws PatternSyntaxException {
        super(regularExpression);
        sourceExpression = regularExpression;
      }
    }

    private static class DomainRecord {
      public DomainRecord(String domainName) {
        this.domainName = domainName;
      }

      ArrayList<PatternMatch> patternList = new ArrayList<PatternMatch>();
      TreeSet<String> urls = new TreeSet<String>();
      String domainName;
    }

    private static final String PARENT_REC_ID = "PARENT_REC_ID";
    private static final String BlackListRecordPrefix = "BlackListRecordPrefix_";

    private static void populatePatternsFromOld(TreeMap<String,DomainRecord> domainMap, RecordStore recordStore) throws IOException {
      for (long recordId : recordStore.getChildRecordsByParentId(PARENT_REC_ID)) {
        BlackListRecord blackListRecord = (BlackListRecord) recordStore.getRecordById(recordId);

        String domain = blackListRecord.getDomainName();
        String rootDomain = URLUtils.extractRootDomainName(domain);

        DomainRecord domainObject = domainMap.get(rootDomain);
        if (domainObject == null) {
          domainObject = new DomainRecord(rootDomain);
          domainMap.put(rootDomain, domainObject);
        }

        if (blackListRecord.getStatus() == BlackListRecord.Status.blacklisted) {
          PatternMatch matchObject = new PatternMatch("http://[^/]*.*");
          matchObject.matchCount = Integer.MAX_VALUE;
          matchObject.attributionCount = Integer.MAX_VALUE;
          domainObject.patternList.add(matchObject);
        } else {
          for (BlackListURLPattern pattern : blackListRecord.getPatterns()) {
            if (pattern.getStatus() == BlackListURLPattern.Status.blacklist) {
              // collect urls ...
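              // each similarity match contributes both of its document URLs; the
              // URLPatternBuilder below consolidates that URL set into candidate regexes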
              Set<String> urlSet = new HashSet<String>();
              for (BlackListSimilarityMatch match : pattern.getMatches()) {
                urlSet.add(match.getDocument1URL());
                urlSet.add(match.getDocument2URL());
              }

              URLPatternBuilder builder = new URLPatternBuilder();
              for (String url : urlSet) {
                builder.addPath(url);
                if (url.contains(";www.") || url.contains(";http")) {
                  domainObject.urls.add(url);
                }
              }
              builder.consolidatePatterns();

              int origSetSize = urlSet.size();

              for (URLPattern patternObj : builder.getPatterns()) {
                String regularExpresion = patternObj.generateRegEx();
                URLPatternMatcher matcher = new URLPatternMatcher(regularExpresion);

                int matchCount = 0;
                Set<String> mismatches = new HashSet<String>();
                Set<String> matches = new HashSet<String>();

                // ok now validate against collected urls
                for (String url : urlSet) {
                  if (matcher.matches(url)) {
                    ++matchCount;
                    matches.add(url);
                  } else {
                    mismatches.add(url);
                  }
                }
                urlSet.clear();
                urlSet.addAll(mismatches);

                if (matchCount != 0) {
                  domainObject.patternList.add(new PatternMatch(regularExpresion));
                }
              }
            }
          }
        }
      }
    }

    private static void populatePatternsFromNew(TreeMap<String,DomainRecord> domainMap, RecordStore recordStore) throws IOException {
      Vector<Long> recordIds = recordStore.getChildRecordsByParentId("patterns");

      LOG.info("There are:" + recordIds.size() + " patterns in the database");

      for (long recordId : recordIds) {
        PatternMatchDetails detail = (PatternMatchDetails) recordStore.getRecordById(recordId);

        GoogleURL urlObject = new GoogleURL(detail.getUrls().get(0).toString());
        String rootDomain = URLUtils.extractRootDomainName(urlObject.getHost());

        DomainRecord domainObject = domainMap.get(rootDomain);
        if (domainObject == null) {
          domainObject = new DomainRecord(rootDomain);
          domainMap.put(rootDomain, domainObject);
        }

        try {
          Pattern pattern = Pattern.compile(detail.getRegEx());

          domainObject.patternList.add(new PatternMatch(detail.getRegEx()));

          for (TextBytes urlBytes : detail.getUrls()) {
            String url = urlBytes.toString();
            if (url.contains(";www.") || url.contains(";http")) {
              LOG.error("Skipping BAD URL:" + url);
            } else {
              domainObject.urls.add(url);
            }
          }
        } catch (PatternSyntaxException e) {
          LOG.error(CCStringUtils.stringifyException(e));
        }
      }
    }

    @Override
    protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {

      RecordStore newRecordStore = _server._recordStore;
      RecordStore oldRecordStore = _server._oldRecordStore;

      TreeMap<String,DomainRecord> domainMap = new TreeMap<String,DomainRecord>();

      populatePatternsFromOld(domainMap, oldRecordStore);
      populatePatternsFromNew(domainMap, newRecordStore);

      // ok for each domain now ...
      for (DomainRecord domainObject : domainMap.values()) {
        // two passes
        for (int pass=0;pass<2;++pass) {
          if (pass == 0) {
            for (String url : domainObject.urls) {
              for (PatternMatch pattern : domainObject.patternList) {
                if (pattern.matches(url)) {
                  // increment match count
                  if (pattern.matchCount != Integer.MAX_VALUE) {
                    pattern.matchCount++;
                  }
                }
              }
            }
          } else {
            // sort patterns by match count
            Collections.sort(domainObject.patternList, new Comparator<PatternMatch>() {
              @Override
              public int compare(PatternMatch o1, PatternMatch o2) {
                return ((Integer)o2.matchCount).compareTo(o1.matchCount);
              }
            });
            // ok now iterate urls ...
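            // attribution pass: with patterns sorted by descending match count, each URL
            // is credited to the first pattern that matches it, so overlapping patterns
            // are not double counted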
            for (String url : domainObject.urls) {
              for (PatternMatch pattern : domainObject.patternList) {
                if (pattern.matches(url)) {
                  // increment count
                  if (pattern.attributionCount != Integer.MAX_VALUE) {
                    pattern.attributionCount++;
                  }
                  break;
                }
              }
            }

            Collections.sort(domainObject.patternList, new Comparator<PatternMatch>() {
              @Override
              public int compare(PatternMatch o1, PatternMatch o2) {
                return ((Integer)o2.attributionCount).compareTo(o1.attributionCount);
              }
            });
          }
        }
      }

      PrintWriter writer = new PrintWriter(new File(System.currentTimeMillis() + "-patterns.txt"), "UTF-8");

      // ok time to write things out ...
      for (DomainRecord domainObj : domainMap.values()) {
        for (PatternMatch match : domainObj.patternList) {
          if (match.attributionCount != 0) {
            writer.println(domainObj.domainName + "," + match.sourceExpression);
          }
        }
      }
      writer.flush();
      writer.close();
    }
  }

  public static class LoadFromOldDatabase extends HttpServlet {

    private static final String PARENT_REC_ID = "PARENT_REC_ID";
    private static final String BlackListRecordPrefix = "BlackListRecordPrefix_";

    @Override
    protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {

      for (Object parameter : req.getParameterMap().keySet()) {
        System.out.println("Param:" + parameter.toString());
      }

      RecordStore recordStore = _server._oldRecordStore;

      PrintWriter writer = resp.getWriter();

      writer.println("<HTML>");
      writer.println("<TABLE border=1>");

      for (long recordId : recordStore.getChildRecordsByParentId(PARENT_REC_ID)) {
        BlackListRecord blackListRecord = (BlackListRecord) recordStore.getRecordById(recordId);

        if (blackListRecord.getStatus() == BlackListRecord.Status.blacklisted) {
          writer.println("<TR><TD colspan=3 style='background-color:red'>");
          writer.println("Domain:" + blackListRecord.getDomainName());
          writer.println("</TR>");
        } else {
          for (BlackListURLPattern pattern : blackListRecord.getPatterns()) {
            if (pattern.getStatus() == BlackListURLPattern.Status.blacklist) {
              // collect urls ...
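              // same URL consolidation as GenerateFilterFile, but rendered as an HTML
              // report showing match/mismatch counts for each generated expression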
              Set<String> urlSet = new HashSet<String>();
              for (BlackListSimilarityMatch match : pattern.getMatches()) {
                urlSet.add(match.getDocument1URL());
                urlSet.add(match.getDocument2URL());
              }

              URLPatternBuilder builder = new URLPatternBuilder();
              for (String url : urlSet) {
                builder.addPath(url);
              }
              builder.consolidatePatterns();

              int origSetSize = urlSet.size();

              for (URLPattern patternObj : builder.getPatterns()) {
                URLPatternMatcher matcher = new URLPatternMatcher(patternObj.generateRegEx());

                int matchCount = 0;
                Set<String> mismatches = new HashSet<String>();
                Set<String> matches = new HashSet<String>();

                // ok now validate against collected urls
                for (String url : urlSet) {
                  if (matcher.matches(url)) {
                    ++matchCount;
                    matches.add(url);
                  } else {
                    mismatches.add(url);
                  }
                }
                urlSet.clear();
                urlSet.addAll(mismatches);

                writer.println("<TR><TD>" + blackListRecord.getDomainName());
                writer.println("<TD>" + matchCount + "/" + urlSet.size());
                writer.println("<TD>" + patternObj.generateRegEx());
                if (matches.size() != 0) {
                  writer.println("<BR>" + matches.iterator().next());
                }
                writer.println("</TR>");
              }

              if (urlSet.size() != 0) {
                writer.println("<TR><TD>Mismatches:");
                writer.println("<TD colspan=2>");
                writer.println("<TABLE BORDER=1>");
                for (String url : urlSet) {
                  writer.println("<TR><TD>" + url);
                }
                writer.println("</TABLE>");
                writer.println("</TR>");
              }
            }
          }
        }
      }
    }
  }

  public static class BulkValidate extends HttpServlet {

    @Override
    protected void doPost(final HttpServletRequest req, final HttpServletResponse resp) throws ServletException, IOException {

      AsyncWebServerRequest request = new AsyncWebServerRequest("") {

        @Override
        public boolean handleRequest(Semaphore completionSemaphore) throws IOException {

          String action = req.getParameter("action");

          if (action.equals("apply")) {

            String mode = req.getParameter("mode");
            if (mode == null || mode.equals("")) {
              mode = "apply";
            }

            RecordStore recordStore = _server._recordStore;

            for (Object parameterName : req.getParameterMap().keySet()) {
              String param = parameterName.toString();
              if (param.startsWith("PatternId_")) {
                recordStore.beginTransaction();

                long patternId = Long.parseLong(param.substring("PatternId_".length()));
                PatternMatchDetails detail = (PatternMatchDetails) recordStore.getRecordById(patternId);

                if (mode.equals("apply")) {
                  detail.setStatus(PatternMatchDetails.Status.Apply);
                } else if (mode.equals("ignore")) {
                  detail.setStatus(PatternMatchDetails.Status.Ignore);
                } else if (mode.equals("sessionid")) {
                  detail.setStatus(PatternMatchDetails.Status.SessionID);
                }

                LOG.info("Setting Pattern:" + detail.getRegEx() + " to:" + PatternMatchDetails.Status.toString(detail.getStatus()));

                recordStore.updateRecordById(patternId, detail);
                recordStore.commitTransaction();
              }
            }
            resp.sendRedirect("/bulk?action=submit&PATTERN=" + req.getParameter("PATTERN") + "&mode=" + req.getParameter("mode"));
          }
          return false;
        }
      };
      request.dispatch(_server.getEventLoop());
    }

    @Override
    protected void doGet(final HttpServletRequest req, final HttpServletResponse resp) throws ServletException, IOException {

      AsyncWebServerRequest request = new AsyncWebServerRequest("") {

        @Override
        public boolean handleRequest(Semaphore completionSemaphore) throws IOException {

          String action = req.getParameter("action");
          String mode = req.getParameter("mode");

          PrintWriter writer = resp.getWriter();

          if (action == null) {
            writer.println("<HTML>");
            writer.println("<script src='http://ajax.googleapis.com/ajax/libs/jquery/1.3.2/jquery.min.js' type='text/javascript'></script>");
            writer.println("<FORM action='/bulk' method=GET>");
            writer.println("<INPUT TYPE=HIDDEN NAME='action' VALUE='submit'></INPUT>");
            writer.println("Search for Pattern:<INPUT TYPE=TEXT NAME=PATTERN> </INPUT>");
            writer.println("Mode:<SELECT NAME=mode>");
            writer.println("<OPTION value='apply' " + ((mode == null || mode.equals("apply")) ? "selected" : "") + ">apply</OPTION>");
            writer.println("<OPTION value='ignore' " + ((mode != null && mode.equals("ignore")) ? "selected" : "") + ">ignore</OPTION>");
            writer.println("<OPTION value='sessionid' " + ((mode != null && mode.equals("sessionid")) ? "selected" : "") + ">sessionid</OPTION>");
            writer.println("</SELECT>");
            writer.println("<INPUT TYPE=SUBMIT>");
            writer.println("</FORM>");
            writer.println("</HTML>");
          } else if (action.equals("submit")) {

            String selectionPattern = req.getParameter("PATTERN");
            Pattern selectionPatternObj = Pattern.compile(selectionPattern);

            writer.println("<HTML>");
            writer.println("<script src='http://ajax.googleapis.com/ajax/libs/jquery/1.3.2/jquery.min.js' type='text/javascript'></script>");
            writer.println("<FORM action='/bulk' method=GET>");
            writer.println("<INPUT TYPE=HIDDEN NAME='action' VALUE='submit'></INPUT>");
            writer.println("Search for Pattern:<INPUT TYPE=TEXT NAME=PATTERN VALUE='" + selectionPattern + "'> </INPUT>");
            writer.println("Mode:<SELECT id=modeSelector NAME=mode>");
            writer.println("<OPTION value='apply' " + ((mode == null || mode.equals("apply")) ? "selected" : "") + ">apply</OPTION>");
            writer.println("<OPTION value='ignore' " + ((mode != null && mode.equals("ignore")) ? "selected" : "") + ">ignore</OPTION>");
            writer.println("<OPTION value='sessionid' " + ((mode != null && mode.equals("sessionid")) ? "selected" : "") + ">sessionid</OPTION>");
            writer.println("</SELECT>");
            writer.println("<INPUT TYPE=SUBMIT>");
            writer.println("</FORM>");

            writer.println("<script>");
            writer.println("$(document).ready(function() { $('#modeSelector').change(function() { ");
            writer.println("$('#modeSelector option:selected').each(function () {");
            writer.println("$('#hiddenMode').val($(this).val()); } ); } ); } );");
            writer.println("</script>");

            writer.println("<FORM action='/bulk' method=POST>");
            writer.println("<INPUT TYPE=HIDDEN NAME='action' VALUE='apply'></INPUT>");
            writer.println("<INPUT TYPE=HIDDEN NAME='PATTERN' value='" + selectionPattern + "'> </INPUT>");
            writer.println("<INPUT id='hiddenMode' TYPE=HIDDEN NAME='mode' value='" + mode + "'> </INPUT>");
            writer.println("<INPUT TYPE=SUBMIT></INPUT>");
            writer.println("<TABLE BORDER=1>");

            RecordStore recordStore = _server._recordStore;

            int unmodifiedCount = 0;
            int selectedCount = 0;
            int badPatternCount = 0;

            for (long patternId : recordStore.getChildRecordsByParentId("patterns")) {
              PatternMatchDetails detail = (PatternMatchDetails) recordStore.getRecordById(patternId);

              if (detail.getStatus() == PatternMatchDetails.Status.UnModified) {
                unmodifiedCount++;

                Matcher selectionMatcher = selectionPatternObj.matcher(detail.getUrls().get(0).toString().toLowerCase());

                if (selectionMatcher.find()) {
                  URLPatternMatcher matcher = new URLPatternMatcher(detail.getRegEx());

                  if (matcher.matches(detail.getUrls().get(0).toString())) {
                    selectedCount++;
                    writer.println("<TR><TD><INPUT class='applyCheckbox' TYPE=CHECKBOX NAME='PatternId_" + patternId + "' CHECKED></INPUT></TD>");
                    writer.println("<TD> <a target=newwindow href='" + detail.getUrls().get(0) + "'>" + detail.getUrls().get(0) + "</a>");
                    writer.println("</TR>");
                  } else {
                    badPatternCount++;
                  }
                }
              }
            }
            writer.println("</TABLE>");
            writer.println("<INPUT TYPE=SUBMIT></INPUT>");
            writer.println("<P><a href=\"javascript:$('.applyCheckbox').attr('checked',false);\">UnCheckAll</a> <a href=\"javascript:$('.applyCheckbox').attr('checked',true);\">CheckAll</a>");
            writer.println("</FORM>");
            writer.println("<P><B>UnmodifiedCount:" + unmodifiedCount + " SelectedCount:" + selectedCount + " BadCount:" + badPatternCount);
            writer.println("</HTML>");
          }
          return false;
        }
      };
      request.dispatch(_server.getEventLoop());
    }
  }

  public static class ValidatePatternServlet extends HttpServlet {

    @Override
    protected void doGet(final HttpServletRequest req, final HttpServletResponse resp) throws ServletException, IOException {

      RecordStore recordStore = _server._recordStore;

      Vector<Long> recordIds = recordStore.getChildRecordsByParentId("patterns");

      LOG.info("There are:" + recordIds.size() + " patterns in the database");

      for (long recordId : recordIds) {
        PatternMatchDetails detail = (PatternMatchDetails) recordStore.getRecordById(recordId);

        LOG.info("Validating Pattern:" + detail.getRegEx());

        boolean isValid;
        int passNumber = 0;

        do {
          isValid = false;
          try {
            Pattern pattern = Pattern.compile(detail.getRegEx());

            // on the first pass isValid stays false, which forces the regex to be
            // regenerated from the pattern's URL set below; the second pass then
            // verifies the regenerated expression
            isValid = (passNumber != 0);

            for (TextBytes urlBytes : detail.getUrls()) {
              String url = urlBytes.toString();
              if (url.contains(";www.")) {
                LOG.error("Skipping BAD URL:" + url);
              } else {
                url = URLPattern.normalizeQueryURL(url);
                if (pattern.matcher(url).matches()) {
                  LOG.info("URL:" + url + " matches!");
                } else {
                  LOG.error("URL:" + url + " does not match pattern:" + detail.getRegEx());
                  isValid = false;
                  break;
                }
              }
            }
          } catch (PatternSyntaxException e) {
            LOG.error(CCStringUtils.stringifyException(e));
          }

          if (!isValid) {
            if (passNumber++ == 0) {
              URLPatternBuilder builder = new URLPatternBuilder();
              for (TextBytes url : detail.getUrls()) {
                builder.addPath(url.toString());
              }
              builder.consolidatePatterns();

              URLPattern patternObj = builder.getPatterns().get(0);
              // generate regular expression
              detail.setRegEx(patternObj.generateRegEx());
            } else {
              LOG.info("Failed to fix pattern:" + detail.getRegEx() + " on second pass. Marking Bad");
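              // the regenerated expression also failed to match; flag the pattern as bad
              // so the do/while loop below terminates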
              detail.setPatternIsBad(true);
            }

            recordStore.beginTransaction();
            recordStore.updateRecordById(detail.getRecordId(), detail);
            recordStore.commitTransaction();
          }
        } while (!isValid && !detail.getPatternIsBad());
      }
    }
  }

  public static class BlackListDatabaseServlet extends HttpServlet {

    private static RecordStore _recordStore;
    private static TreeMap<String,BlackListRecord> _recordMap = new TreeMap<String,BlackListRecord>();

    private static final String PARENT_REC_ID = "PARENT_REC_ID";
    private static final String BlackListRecordPrefix = "BlackListRecordPrefix_";

    public static final Log LOG = LogFactory.getLog(BlackListDatabaseServlet.class);

    public static void initialize(PatternListEditor server) throws IOException {
      File databaseFile = new File(server.getDataDirectory(), "blacklist.db");
      _recordStore = new RecordStore();
      _recordStore.initialize(databaseFile, null);
      intializeDatabase();
    }

    @Override
    protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {

      URL url = new URL(req.getRequestURL().toString());

      LOG.info("Incoming URL:" + url.toString());

      Pattern pattern = Pattern.compile(".*/blackList/(.*)");
      Matcher matcher = pattern.matcher(url.getPath());

      if (matcher.matches()) {

        resp.setCharacterEncoding("UTF-8");

        String action = matcher.group(1);

        if (action.equals("loadData")) {
          LOG.info("Merging Databases");
          mergeDatabase();
          LOG.info("Done Merging Databases");
          resp.getWriter().print("Merge Complete");
        } else if (action.equals("getRecordSet")) {
          LOG.info("Generating RecordSet. Request:" + req.toString());
          generateBigDataset(req, resp, resp.getWriter());
        } else if (action.equals("updateDomainStatus")) {
          LOG.info("Received UpdateDomainStatus Request:" + req.toString());
          updateDomainStatus(req, resp, resp.getWriter());
        } else if (action.equals("updatePatternStatus")) {
          LOG.info("Received UpdatePatternStatus Request:" + req.toString());
          updatePatternStatus(req, resp, resp.getWriter());
        } else if (action.equals("getMatchList")) {
          LOG.info("Received getMatchList Request:" + req.toString());
          getMatchList(req, resp, resp.getWriter());
        } else {
          resp.sendError(500, "Bad Request");
        }
      }
    }

    private static void intializeDatabase() throws IOException {
      for (long recordId : _recordStore.getChildRecordsByParentId(PARENT_REC_ID)) {
        BlackListRecord blackListRecord = (BlackListRecord) _recordStore.getRecordById(recordId);
        _recordMap.put(blackListRecord.getDomainName(), blackListRecord);
      }
    }

    private void mergeDatabase() throws IOException {

      FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
      Path path = new Path("crawl/scratch/sample001Result/part-00000");

      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, CrawlEnvironment.getHadoopConfig());

      try {
        Text domainName = new Text();
        BlackListRecord newRecord = new BlackListRecord();

        while (reader.next(domainName, newRecord)) {

          LOG.info("Domain Name:" + domainName.toString() + " recordName:" + newRecord.getDomainName());

          BlackListRecord existingRecord = _recordMap.get(domainName.toString());

          if (existingRecord != null) {
            // take status from existing record ...
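            // carry the reviewed domain status forward and reconcile pattern-level
            // blacklist decisions against the freshly imported pattern set below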
            newRecord.setStatus(existingRecord.getStatus());

            // create a map of patterns of new set
            TreeMap<String,BlackListURLPattern> newPatterns = new TreeMap<String,BlackListURLPattern>();
            for (BlackListURLPattern newPattern : newRecord.getPatterns()) {
              newPatterns.put(newPattern.getPattern(), newPattern);
            }

            // walk patterns in existing set
            for (BlackListURLPattern existingPattern : existingRecord.getPatterns()) {
              // if the pattern's status has been modified ...
              if (existingPattern.getStatus() == BlackListURLPattern.Status.blacklist) {
                BlackListURLPattern newPatternObj = newPatterns.get(existingPattern.getPattern());
                if (newPatternObj != null) {
                  // honor old status
                  newPatternObj.setStatus(existingPattern.getStatus());
                } else {
                  // need to add this pattern to new set
                  newRecord.getPatterns().add(existingPattern);
                }
              }
            }

            // ok delete old record and insert new record ...
            _recordStore.beginTransaction();
            _recordStore.updateRecordByKey(BlackListRecordPrefix + newRecord.getDomainName(), newRecord);
            _recordStore.commitTransaction();
          } else {
            _recordStore.beginTransaction();
            _recordStore.insertRecord(PARENT_REC_ID, BlackListRecordPrefix + newRecord.getDomainName(), newRecord);
            _recordStore.commitTransaction();
          }

          // either way update map ...
          _recordMap.put(newRecord.getDomainName(), newRecord);

          newRecord = new BlackListRecord();
        }
      } finally {
        reader.close();
      }
    }

    private void getMatchList(HttpServletRequest req, HttpServletResponse resp, PrintWriter writer) throws IOException {

      String domain = req.getParameter("domain");
      int patternIdx = Integer.parseInt(req.getParameter("patternIdx"));

      resp.setContentType("application/x-javascript; charset=utf-8");

      writer.println(req.getParameter("callback") + "(\n");

      JsonWriter jsonWriter = new JsonWriter(writer);

      try {
        jsonWriter.beginObject();
        jsonWriter.name("results");
        jsonWriter.beginArray();

        BlackListRecord record = _recordMap.get(domain);

        if (patternIdx >= 0 && patternIdx < record.getPatterns().size()) {
          BlackListURLPattern pattern = record.getPatterns().get(patternIdx);
          for (BlackListSimilarityMatch match : pattern.getMatches()) {
            jsonWriter.beginObject();
            jsonWriter.name("matchURL1");
            jsonWriter.value(match.getDocument1URL());
            jsonWriter.name("matchURL2");
            jsonWriter.value(match.getDocument2URL());
            jsonWriter.endObject();
          }
        }
        jsonWriter.endArray();
        jsonWriter.endObject();
      } catch (Exception e) {
        throw new IOException(e);
      }
      writer.println(");\n");
    }

    private void updatePatternStatus(HttpServletRequest req, HttpServletResponse resp, PrintWriter writer) throws IOException {

      String domain = req.getParameter("domain");
      int patternIdx = Integer.parseInt(req.getParameter("patternIdx"));
      String status = req.getParameter("status");

      resp.setContentType("application/x-javascript; charset=utf-8");

      writer.println(req.getParameter("callback") + "(\n");

      JsonWriter jsonWriter = new JsonWriter(writer);

      try {
        boolean success = false;

        jsonWriter.beginObject();

        LOG.info("Domain:" + domain + " new status:" + status);

        BlackListRecord record = _recordMap.get(domain);

        if (record != null) {
          if (patternIdx >= 0 && patternIdx < record.getPatterns().size()) {
            success = true;
            if (status.equalsIgnoreCase("unmodified")) {
              record.getPatterns().get(patternIdx).setStatus(BlackListURLPattern.Status.unmodified);
            } else if (status.equalsIgnoreCase("blacklist")) {
              record.getPatterns().get(patternIdx).setStatus(BlackListURLPattern.Status.blacklist);
            } else {
              success = false;
            }

            if (success) {
              LOG.info("Updating Record:" + domain);
              _recordStore.beginTransaction();
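              // persist the pattern status change before reporting it back to the JSONP caller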
              _recordStore.updateRecordByKey(BlackListRecordPrefix + record.getDomainName(), record);
              _recordStore.commitTransaction();
              LOG.info("Updated Record:" + domain);

              jsonWriter.name("status");
              jsonWriter.value(BlackListURLPattern.Status.toString(record.getPatterns().get(patternIdx).getStatus()));
            }
          }
        }
        jsonWriter.name("success");
        jsonWriter.value(success);
        jsonWriter.endObject();

        writer.println(");\n");
      } catch (Exception e) {
        throw new IOException(e);
      }
    }

    private void updateDomainStatus(HttpServletRequest req, HttpServletResponse resp, PrintWriter writer) throws IOException {

      String domain = req.getParameter("domain");
      String status = req.getParameter("status");

      resp.setContentType("application/x-javascript; charset=utf-8");

      writer.println(req.getParameter("callback") + "(\n");

      JsonWriter jsonWriter = new JsonWriter(writer);

      try {
        jsonWriter.beginObject();

        LOG.info("Domain:" + domain + " new status:" + status);

        BlackListRecord record = _recordMap.get(domain);

        if (record != null) {
          jsonWriter.name("success");
          jsonWriter.value(true);

          if (status.equalsIgnoreCase("unmodified")) {
            record.setStatus(BlackListRecord.Status.unmodified);
          } else if (status.equalsIgnoreCase("modified")) {
            record.setStatus(BlackListRecord.Status.modified);
          } else if (status.equalsIgnoreCase("blacklisted")) {
            record.setStatus(BlackListRecord.Status.blacklisted);
          }

          LOG.info("Updating Record:" + domain);
          _recordStore.beginTransaction();
          _recordStore.updateRecordByKey(BlackListRecordPrefix + record.getDomainName(), record);
          _recordStore.commitTransaction();
          LOG.info("Updated Record:" + domain);

          jsonWriter.name("status");
          jsonWriter.value(BlackListRecord.Status.toString(record.getStatus()));
        } else {
          jsonWriter.name("success");
          jsonWriter.value(false);
        }
        jsonWriter.endObject();

        writer.println(")\n");
      } catch (Exception e) {
        throw new IOException(e);
      }
    }

    private void generateBigDataset(HttpServletRequest req, HttpServletResponse resp, PrintWriter writer) throws IOException {

      resp.setContentType("application/x-javascript; charset=utf-8");

      writer.println(req.getParameter("callback") + "(\n");

      JsonWriter jsonWriter = new JsonWriter(writer);

      try {
        jsonWriter.beginObject();
        jsonWriter.name("results");
        jsonWriter.beginArray();

        int recordCount = 0;

        BlackListRecord records[] = _recordMap.values().toArray(new BlackListRecord[0]);

        Arrays.sort(records, new Comparator<BlackListRecord>() {
          @Override
          public int compare(BlackListRecord o1, BlackListRecord o2) {
            return ((Integer)o2.getUrlCount()).compareTo(o1.getUrlCount());
          }
        });

        for (BlackListRecord record : records) {
          // write json record
          jsonWriter.beginObject();

          jsonWriter.name("name");
          jsonWriter.value(record.getDomainName());
          jsonWriter.name("href");
          jsonWriter.value(record.getDomainName() + "_Index.html");
          jsonWriter.name("logFileCount");
          jsonWriter.value(record.getLogFileCount());
          jsonWriter.name("urlCount");
          jsonWriter.value(record.getUrlCount());
          jsonWriter.name("status");
          jsonWriter.value(BlackListRecord.Status.toString(record.getStatus()));

          jsonWriter.name("patterns");
          jsonWriter.beginArray();

          for (BlackListURLPattern pattern : record.getPatterns()) {
            jsonWriter.beginObject();
            jsonWriter.name("pattern");
            jsonWriter.value(pattern.getPattern());
            jsonWriter.name("status");
            jsonWriter.value(BlackListURLPattern.Status.toString(pattern.getStatus()));
            jsonWriter.name("matchCount");
            jsonWriter.value(pattern.getTotalMatchCount());
            jsonWriter.name("avgJSC");
            jsonWriter.value(pattern.getAvgJSC());

            /*
            jsonWriter.key("matches");
            jsonWriter.array();
            for (BlackListSimilarityMatch match : pattern.getMatches()) {
              jsonWriter.object();
              jsonWriter.key("doc1URL");
              jsonWriter.value(match.getDocument1URL());
              jsonWriter.key("doc2URL");
              jsonWriter.value(match.getDocument2URL());
              jsonWriter.key("doc1Location");
              jsonWriter.value(match.getDocument1Locaiton());
              jsonWriter.key("doc2Location");
              jsonWriter.value(match.getDocument2Location());
              jsonWriter.key("hammingDistance");
              jsonWriter.value(match.getHammingDistance());
              jsonWriter.key("jsc");
              jsonWriter.value(match.getJsc());
              jsonWriter.endObject();
            }
            jsonWriter.endArray();
            */

            jsonWriter.endObject();
          }
          jsonWriter.endArray();

          jsonWriter.endObject();
        }
        jsonWriter.endArray();
        jsonWriter.endObject();

        writer.println(")\n");
      } catch (Exception e) {
        LOG.error(CCStringUtils.stringifyException(e));
        throw new IOException(e);
      }
    }
  }
}