/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.util;

import java.net.MalformedURLException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.junit.Test;

/**
 * Removes session-identifier fragments (PHPSESSID, osCsid, sid, mscsid,
 * jsessionid, ...) from URL strings using a fixed sequence of string
 * heuristics. The first heuristic that recognises a session id wins; when
 * none does, the input string is returned unchanged.
 *
 * <p>Markers are matched case-insensitively (ASCII only); the returned string
 * is always cut from the original, so the case of everything that is kept is
 * preserved.
 *
 * <p>Note: for the marker-plus-value forms (PHPSESSID, sid, oscsid, mscsid)
 * the single character in front of the marker is removed together with the
 * id. For {@code page?sid=<id>&b=1} the result is therefore {@code page&b=1}.
 *
 * @author rana
 */
public class SessionIDURLNormalizer {

  // URL shapes this class was written against (regex -> sample URL):
  //
  // ^(.*php.*)/osCsid/([0-9a-fA-F]*)$ -> http://www.bearcountryuk.com/index.php/cPath/50/teddy+bear+name/Accessories/osCsid/4d4b2659aa5f1a39d907d315cf0a5209
  // ^(.*php.*)/([0-9a-fA-F]*)$ -> http://www.myredpacket.co.uk/section.php/25/12/birthday-gifts-and-presents/d1b4c32d834a331b63109589ef730c27
  // ^(.*)/PHPSESSID/([0-9a-fA-F]*).html$ -> http://www.minshuku-web.com/catalog/869/PHPSESSID/96bc0d2490b3ce6206d04c1ed7ccfb26.html
  // ^(.*/)sessions/[0-9a-fA-F]*/(.*)$ -> http://ifshinviolins.com/sessions/dd603a0a691faeb744db3f72212ca888/store
  // ^(.*);\$sessionid\$([0-9a-zA-Z]*)$ -> http://www.iexplore.co.uk/travel-photos/French+Polynesia/1;$sessionid$BHZYH4QAAMGH5TBKYHVCFEQ
  // ^(.*)/session_id/([0-9a-zA-Z]*)(.*)$ -> http://www.reinke.com/index.html/session_id/d606e74935a60c04d9989082b2fb624d/screen/interesting_links
  // ^(.*)--session_id.[0-9a-zA-Z]*(.*)$ -> http://www.iboats.com/Portable_Above_Deck_Fuel_Tanks/dm/cart_id.726334728--category_id.238165--search_type.category--session_id.729524783--view_id.238165
  //
  // The second shape (a bare hex segment after a .php path, without an osCsid
  // marker) is currently left untouched by normalize().

  /** Hex digits only (also matches the empty string). */
  private static final Pattern HEX_ID = Pattern.compile("^[0-9a-fA-F]*$");
  /** ASCII letters and digits only (also matches the empty string). */
  private static final Pattern ALNUM_ID = Pattern.compile("[0-9a-zA-Z]*");
  /** Decimal digits only (also matches the empty string). */
  private static final Pattern NUMERIC_ID = Pattern.compile("[0-9]*");
  /** Hex digits and dashes (also matches the empty string). */
  private static final Pattern HEX_DASH_ID = Pattern.compile("^[0-9a-fA-F\\-]*$");

  // Markers, all lower case: they are searched in the lower-cased URL.
  private static final String OSCSID = "oscsid";
  private static final String PHPSESSID = "phpsessid";
  private static final String OSCSID_W_DASH = "-oscsid-";
  private static final String SESSIONS = "/sessions/";
  private static final String DOLLAR_SESSIONID = ";$sessionid$";
  private static final String SESSION_ID = "/session_id/";
  private static final String DASHDASH_SESSIONID = "--session_id.";
  private static final String JSESSIONID = ";jsessionid=";
  private static final String SID = "sid";
  private static final String MSCSID = "mscsid";

  /** Markers tried, in this order, by the generic "sid" heuristic. */
  private static final String[] SID_MARKERS = { SID, OSCSID, MSCSID };

  /** Characters that may separate a marker from its id value. */
  private static final String ID_INTRODUCERS = "=.-+/";
  /** Characters that end a PHPSESSID value. */
  private static final String PHPSESSID_TERMINATORS = "&.-+/";
  /** Characters that end a sid/oscsid/mscsid value ('-' is allowed inside the id). */
  private static final String SID_TERMINATORS = "&.+/";
  /** Characters that must directly precede a sid/oscsid/mscsid marker. */
  private static final String SID_LEADERS = "/?&+";

  private static final int HEX_ID_LENGTH = 32;
  private static final int HEX_DASH_ID_MIN_LENGTH = 36;
  private static final int ALNUM_ID_LENGTH = 26;

  private Configuration config;

  /**
   * Strips the first recognised session identifier from {@code urlString}.
   *
   * <p>The heuristics are evaluated in a fixed order (PHPSESSID, osCsid path
   * forms, {@code /sessions/}, {@code ;$sessionid$}, {@code /session_id/},
   * {@code --session_id.}, sid/oscsid/mscsid, jsessionid) and the first one
   * that matches produces the result.
   *
   * @param urlString the URL to normalize; must not be {@code null}
   * @param scope currently ignored
   * @return the URL with the session id removed, or {@code urlString} itself
   *         when no heuristic matches
   * @throws MalformedURLException declared for callers; never thrown by this
   *         implementation
   */
  public String normalize(String urlString, String scope) throws MalformedURLException {
    final String original = urlString;
    // ASCII-only lower-casing keeps the string length unchanged, so every
    // index found in 'lower' is also a valid index into 'original', and it
    // does not depend on the JVM default locale.
    final String lower = toAsciiLowerCase(urlString);

    String stripped = stripPhpSessionId(original, lower);
    if (stripped == null) {
      stripped = stripTrailingOscsidOnPhpUrl(original, lower);
    }
    if (stripped == null) {
      stripped = stripOscsidPathSuffix(original, lower);
    }
    if (stripped == null) {
      stripped = stripDashedOscsidBeforeHtml(original, lower);
    }
    if (stripped == null) {
      stripped = stripHexPathSegment(original, lower, SESSIONS);
    }
    if (stripped == null) {
      stripped = stripDollarSessionId(original, lower);
    }
    if (stripped == null) {
      stripped = stripHexPathSegment(original, lower, SESSION_ID);
    }
    if (stripped == null) {
      stripped = stripDashDashSessionId(original, lower);
    }
    if (stripped == null) {
      stripped = stripSidValue(original, lower);
    }
    if (stripped == null) {
      stripped = stripJSessionId(original, lower);
    }
    return stripped != null ? stripped : original;
  }

  /**
   * Handles {@code PHPSESSID<sep><id>} at its last occurrence. The id must be
   * exactly 32 hex characters or, when it is not pure hex, exactly 26
   * alphanumeric characters. Returns {@code null} when nothing was stripped.
   */
  private static String stripPhpSessionId(String original, String lower) {
    int markerIdx = lower.lastIndexOf(PHPSESSID);
    if (markerIdx == -1) {
      return null;
    }
    int introducerIdx = markerIdx + PHPSESSID.length();
    if (!hasIdIntroducerAt(lower, introducerIdx)) {
      return null;
    }
    int idStart = introducerIdx + 1;
    int idEnd = scanToTerminator(lower, idStart, PHPSESSID_TERMINATORS);
    String id = lower.substring(idStart, idEnd);

    boolean accepted;
    if (matchesFully(HEX_ID, id)) {
      accepted = id.length() == HEX_ID_LENGTH;
    } else {
      accepted = id.length() == ALNUM_ID_LENGTH && matchesFully(ALNUM_ID, id);
    }
    if (!accepted) {
      return null;
    }
    // Also drop the delimiter in front of the marker. Clamp to 0 so a marker
    // at the very start of the string cannot produce a negative index.
    return cut(original, Math.max(markerIdx - 1, 0), idEnd);
  }

  /**
   * Handles URLs containing {@code .php} whose last two path segments are
   * {@code /oscsid.../<32 hex>}; both segments are removed. A trailing hex
   * segment without the osCsid marker is left alone (the code that stripped
   * it was commented out). Returns {@code null} when nothing was stripped.
   */
  private static String stripTrailingOscsidOnPhpUrl(String original, String lower) {
    if (lower.indexOf(".php") == -1) {
      return null;
    }
    int lastSlash = lower.lastIndexOf('/');
    if (lastSlash == -1 || !isHexId32(lower.substring(lastSlash + 1))) {
      return null;
    }
    int previousSlash = lower.lastIndexOf('/', lastSlash - 1);
    if (previousSlash == -1 || !lower.startsWith(OSCSID, previousSlash + 1)) {
      return null;
    }
    return original.substring(0, previousSlash);
  }

  /**
   * Handles {@code <sep>oscsid.../<32 hex>} at the end of the URL, using the
   * last occurrence of the marker. Everything from the character in front of
   * the marker onwards is removed. Returns {@code null} when nothing was
   * stripped.
   */
  private static String stripOscsidPathSuffix(String original, String lower) {
    int markerIdx = lower.lastIndexOf(OSCSID);
    if (markerIdx == -1) {
      return null;
    }
    int slashIdx = lower.indexOf('/', markerIdx + OSCSID.length());
    if (slashIdx == -1 || !isHexId32(lower.substring(slashIdx + 1))) {
      return null;
    }
    // Clamp to 0: the marker may sit at the very start of the string.
    return original.substring(0, Math.max(markerIdx - 1, 0));
  }

  /**
   * Handles {@code -oscsid-<32 hex>.html}: removes the marker and the id but
   * keeps everything from the last {@code .html} onwards. Returns
   * {@code null} when nothing was stripped.
   */
  private static String stripDashedOscsidBeforeHtml(String original, String lower) {
    int markerIdx = lower.lastIndexOf(OSCSID_W_DASH);
    if (markerIdx == -1) {
      return null;
    }
    int htmlIdx = lower.lastIndexOf(".html");
    if (htmlIdx <= markerIdx) {
      return null;
    }
    String id = lower.substring(markerIdx + OSCSID_W_DASH.length(), htmlIdx);
    if (!isHexId32(id)) {
      return null;
    }
    return cut(original, markerIdx, htmlIdx);
  }

  /**
   * Handles {@code <marker><32 hex>/rest} where {@code marker} both starts and
   * ends with a slash (e.g. {@code /sessions/}); the marker and the id are
   * removed and the slash that follows the id is kept. Uses the first
   * occurrence of the marker. Returns {@code null} when nothing was stripped.
   */
  private static String stripHexPathSegment(String original, String lower, String marker) {
    int markerIdx = lower.indexOf(marker);
    if (markerIdx == -1) {
      return null;
    }
    int idStart = markerIdx + marker.length();
    int slashIdx = lower.indexOf('/', idStart);
    if (slashIdx == -1 || !isHexId32(lower.substring(idStart, slashIdx))) {
      return null;
    }
    return cut(original, markerIdx, slashIdx);
  }

  /**
   * Handles a trailing {@code ;$sessionid$<alnum>}: the URL is truncated at
   * the marker when the rest of the string is purely alphanumeric (possibly
   * empty). Returns {@code null} when nothing was stripped.
   */
  private static String stripDollarSessionId(String original, String lower) {
    int markerIdx = lower.indexOf(DOLLAR_SESSIONID);
    if (markerIdx == -1) {
      return null;
    }
    String rest = lower.substring(markerIdx + DOLLAR_SESSIONID.length());
    return matchesFully(ALNUM_ID, rest) ? original.substring(0, markerIdx) : null;
  }

  /**
   * Handles {@code --session_id.<digits>--rest}: removes the marker and the
   * numeric id, keeping the {@code --} that follows. Returns {@code null}
   * when nothing was stripped.
   */
  private static String stripDashDashSessionId(String original, String lower) {
    int markerIdx = lower.indexOf(DASHDASH_SESSIONID);
    if (markerIdx == -1) {
      return null;
    }
    int idStart = markerIdx + DASHDASH_SESSIONID.length();
    int nextDashDash = lower.indexOf("--", idStart);
    if (nextDashDash == -1 || !matchesFully(NUMERIC_ID, lower.substring(idStart, nextDashDash))) {
      return null;
    }
    return cut(original, markerIdx, nextDashDash);
  }

  /**
   * Handles {@code <lead>sid<sep><id>}, {@code <lead>oscsid<sep><id>} and
   * {@code <lead>mscsid<sep><id>}, where {@code lead} is one of
   * {@code / ? & +}. For each marker only its last occurrence is examined,
   * and the first marker (in that order) with a valid lead character is used.
   * Returns {@code null} when nothing was stripped.
   */
  private static String stripSidValue(String original, String lower) {
    String marker = null;
    int markerIdx = -1;
    for (String candidate : SID_MARKERS) {
      int idx = lower.lastIndexOf(candidate);
      // idx > 0: a lead character must exist in front of the marker.
      if (idx > 0 && SID_LEADERS.indexOf(lower.charAt(idx - 1)) != -1) {
        marker = candidate;
        markerIdx = idx;
        break;
      }
    }
    if (marker == null) {
      return null;
    }
    int introducerIdx = markerIdx + marker.length();
    if (!hasIdIntroducerAt(lower, introducerIdx)) {
      return null;
    }
    int idStart = introducerIdx + 1;
    int idEnd = scanToTerminator(lower, idStart, SID_TERMINATORS);
    if (!isSidValue(lower.substring(idStart, idEnd))) {
      return null;
    }
    return cut(original, markerIdx - 1, idEnd);
  }

  /**
   * Handles {@code ;jsessionid=...}: removes everything from the marker up to
   * (not including) the next {@code '?'}, or up to the end of the string when
   * there is no query. Returns {@code null} when the marker is absent.
   */
  private static String stripJSessionId(String original, String lower) {
    int markerIdx = lower.indexOf(JSESSIONID);
    if (markerIdx == -1) {
      return null;
    }
    int queryIdx = lower.indexOf('?', markerIdx);
    if (queryIdx == -1) {
      return original.substring(0, markerIdx);
    }
    return cut(original, markerIdx, queryIdx);
  }

  /**
   * Acceptance rule for sid/oscsid/mscsid values: pure hex needs at least 32
   * characters, hex-with-dashes at least 36, any other alphanumeric value at
   * least 26. The first character class that fits decides which minimum
   * applies.
   */
  private static boolean isSidValue(String id) {
    if (matchesFully(HEX_ID, id)) {
      return id.length() >= HEX_ID_LENGTH;
    }
    if (matchesFully(HEX_DASH_ID, id)) {
      return id.length() >= HEX_DASH_ID_MIN_LENGTH;
    }
    return matchesFully(ALNUM_ID, id) && id.length() >= ALNUM_ID_LENGTH;
  }

  /** True when {@code id} is exactly 32 hex characters. */
  private static boolean isHexId32(String id) {
    return id.length() == HEX_ID_LENGTH && matchesFully(HEX_ID, id);
  }

  /** True when {@code pattern} matches the whole of {@code value}. */
  private static boolean matchesFully(Pattern pattern, String value) {
    Matcher matcher = pattern.matcher(value);
    return matcher.matches();
  }

  /** True when {@code index} is inside {@code s} and holds one of {@code = . - + /}. */
  private static boolean hasIdIntroducerAt(String s, int index) {
    return index < s.length() && ID_INTRODUCERS.indexOf(s.charAt(index)) != -1;
  }

  /**
   * Returns the index of the first character at or after {@code from} that is
   * contained in {@code terminators}, or {@code s.length()} if there is none.
   */
  private static int scanToTerminator(String s, int from, String terminators) {
    int end = from;
    while (end < s.length() && terminators.indexOf(s.charAt(end)) == -1) {
      end++;
    }
    return end;
  }

  /** Returns {@code s} with the characters in {@code [from, to)} removed. */
  private static String cut(String s, int from, int to) {
    return s.substring(0, from) + s.substring(to);
  }

  /** Lower-cases {@code A-Z} only; the result has the same length as the input. */
  private static String toAsciiLowerCase(String s) {
    char[] chars = s.toCharArray();
    for (int i = 0; i < chars.length; i++) {
      char c = chars[i];
      if (c >= 'A' && c <= 'Z') {
        chars[i] = (char) (c + ('a' - 'A'));
      }
    }
    return new String(chars);
  }

  /** Returns the configuration last passed to {@link #setConf}, or {@code null}. */
  public Configuration getConf() {
    return config;
  }

  /** Stores {@code conf}; {@link #normalize} does not read it. */
  public void setConf(Configuration conf) {
    config = conf;
  }

  /** Sample URLs printed (with their normalized form) by {@link #unitTest()}. */
  static String testStrings[] = {
    "http://www.bearcountryuk.com/images/bc0059.jpg/osCsid/96a7bddc9c8a4249dbabd862f859e9e1",
    "http://www.jileyes.com/lingerie_category-cat-26-name-Inseparables___ensembles__soutien_gorge-osCsid-3416a5c31a2013e37cf87ca963c6c99f.html",
    "http://www.construfacil.com/index.php/P/search/PHPSESSID/015c350a9dcead350788459fe27e1d2c", //*
    "http://www.didglobal.com/page/PHPSESSID/db2efa56f2d298cbed0f27be2574cbfe/home",
    "http://www.lot-tissimo.com/zf/1/PHPSESSID/gfhte7m6riss8a57kt8hou7bl6/",
    "http://www.droles-blagues.com/news+index.storytopic+0+start+10+PHPSESSID+dabb2d0c754e989167997c0f6cca69b3.htm",
    "http://relax-navi.net/formmail+index.id_form+1+PHPSESSID+8014724e439c07d12e0bb63599af99e1.htm",
    "http://www.tagtag.com/site/mobile/terms/PHPSESSID/a82av7cnicjak8t8gcq9ss8lg6",
    "http://www.nblskil.org/ct/wffaq+index.PHPSESSID+7f1426a7e7d6f8717a05028335811b9e.htm",
    "http://www.soft-news.net/m-news+index+PHPSESSID-7375c6f2abc8237cefb6a19012281821.html",
    "http://www.horizon-etudiant.com/news+index.PHPSESSID+df70913950e6a2aeca5049f6ccbf2a46.htm",
    "http://www.classicsilks.com/catalog/images//osCsid/1eccdf955e1accf18372a3e12aa92fd6",
    "http://www.bearcountryuk.com/index.php/cPath/50/teddy+bear+name/Accessories/osCsid/4d4b2659aa5f1a39d907d315cf0a5209",
    "http://www.myredpacket.co.uk/section.php/25/12/birthday-gifts-and-presents/d1b4c32d834a331b63109589ef730c27",
    "http://www.minshuku-web.com/catalog/869/PHPSESSID/96bc0d2490b3ce6206d04c1ed7ccfb26.html",
    "http://ifshinviolins.com/sessions/dd603a0a691faeb744db3f72212ca888/store",
    "http://www.iexplore.co.uk/travel-photos/French+Polynesia/1;$sessionid$BHZYH4QAAMGH5TBKYHVCFEQ",
    "http://www.reinke.com/index.html/session_id/d606e74935a60c04d9989082b2fb624d/screen/interesting_links",
    "http://www.iboats.com/Portable_Above_Deck_Fuel_Tanks/dm/cart_id.726334728--category_id.238165--search_type.category--session_id.729524783--view_id.238165",
    "http://quote.yahoo.com/tech-ticker/article/37053/VMware-Tanks-as-CEO-Greene-Gets-Ousted;_ylt=An1dUveIfo30T0EBvyw6_US7YWsA?tickers=vmw",
    "https://www.harrahs.com/AvailabilityCalendar.do?propCode=PLV",
    "http://www.google.com/search?hl=en&q=st+jude+hospital+fullerton&btnG=Google+Search",
    "http://www.bearcountryuk.com/index.ddd/cPath/50/teddy+bear+name/Accessories/osCsid/4d4b2659aa5f1a39d907d315cf0a5209",
    "http://www.bearcountryuk.com/index.ddd;jsessionid=08301521611089820628281",
    "http://www.myredpacket.co.uk/section.php/25/12/birthday-gifts-and-presents;JSESSIONID=08301521611089820628281",
    "http://www1.cimaglobal.com/cps/rde/xchg/SID-0AE7C4D1-E388165B/live/root.xsl/13928.htm",
    "http://www.placidway.com/treatment-detail/20/Orthopedic/Knee-Surgery-Treatment-Abroad//?PHPSESSID=c83e4440fdb325634206cda3482aa758",
    "http://www.allacademic.com/one/www/www/index.php?cmd=www&PHPSESSID=e563c9711d20c906de543d52a1633072",
    "http://boards.bootsnall.com/the-team.html?sid=f52964b93dcfeb6a9ba43b0caf44d752",
    "http://www.fnac.com/livre.asp?SID=2f3f0314-8164-f087-e7e9-4ed9487391c8&UID=0B3FF5542-5944-146B-8EEB-ECDB3218C6AF&Origin=FnacAff&OrderInSession=0&TTL=040520100324&bl=2%5b1pro%5dliv",
    "http://forums-test.mozillazine.org/memberlist.php?mode=viewprofile&u=261941&sid=dd4c61187cd950ad4b64b8e4da7c20a9",
    "http://www.rainbowresource.com/prodlist.php?sid=1257592724-171162",
    "http://www.eloan.com/s/show/glossary?context=refi&lockdays=30&sid=B456E0E99B62D31EAB4274D8B59B944A&user=&mcode=&vid=",
    "http://www.motherwear.com/cs/sizechart.cfm?cid=107&sid=25046",
    "http://www.trainpetdog.com/store/terms-of-use.php?osCsid=b27eecba862e5c723c05b2f4245c06ea",
    "http://alumni.byu.edu/s/1085/03-provo-Alumni/index.aspx?sid=1085&gid=7&pgid=60&cid=169&referer=&query=emeriti%2fpdf%2femeritiwinter09.pdf",
    "http://www.couponchief.com/coupons/submit?sid=4422",
    "http://www.emeraldinsight.com/Insight/menuNavigation.do;jsessionid=A17FC93E864C2F8B3709F63558BA69DB?hdAction=InsightHome",
    "http://www.lakeshorelearning.com/order/onlineOrder.jsp;jsessionid=KxMMpRGgPpC1ktZ1pJJCZF1MmmFxZHPnyrNJhBmWJGHkhcL5Hd4p!-617247554!NONE?FOLDER%3C%3Efolder_id=2534374302096766&ASSORTMENT%3C%3East_id=1408474395181113&bmUID=1257311436941"
  };

  /**
   * Normalizes every entry of {@link #testStrings} and prints the elapsed
   * nanoseconds, the source URL and the result, prefixing lines whose URL was
   * changed with {@code *}. Makes no assertions.
   */
  @Test
  public void unitTest() throws Exception {
    long totalTimeStart = System.currentTimeMillis();
    for (String url : testStrings) {
      long nanoSecsStart = System.nanoTime();
      String result = normalize(url, "");
      // Plain subtraction is correct for nanoTime even across numeric overflow.
      long nanoTime = System.nanoTime() - nanoSecsStart;

      if (!result.equals(url)) {
        System.out.print("*");
      }
      System.out.println("Time:" + nanoTime + "Source:" + url + " Resolved to:" + result);
    }
    long totalTimeEnd = System.currentTimeMillis();
    System.out.println("Total Time:" + (totalTimeEnd - totalTimeStart));
  }

  /** Runs {@link #unitTest()} from the command line; {@code args} is ignored. */
  public static void main(String[] args) {
    try {
      new SessionIDURLNormalizer().unitTest();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}