/*
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
* NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
package org.apache.pig.piggybank.evaluation.util.apachelogparser;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.pig.EvalFunc;
import org.apache.pig.FuncSpec;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
* SearchEngineExtractor takes a url string and extracts the search engine. For example, given
*
* http://www.google.com/search?hl=en&safe=active&rls=GGLG,GGLG:2005-24,GGLG:en&q=purpose+of+life&btnG=Search
*
* then
*
* Google
*
* would be extracted.
*
* From pig latin, usage looks something like
*
* searchEngine = FOREACH row GENERATE
* org.apache.pig.piggybank.evaluation.util.apachelogparser.SearchEngineExtractor(referer);
*
* Supported search engines include abacho.com, alice.it, alltheweb.com, altavista.com, aolsearch.aol.com,
* as.starware.com, ask.com, blogs.icerocket.com, blogsearch.google.com, blueyonder.co.uk, busca.orange.es,
* buscador.lycos.es, buscador.terra.es, buscar.ozu.es, categorico.it, cuil.com, excite.com, excite.it,
* fastweb.it, feedster.com, godado.com, godado.it, google.ad, google.ae, google.af, google.ag, google.am,
* google.as, google.at, google.az, google.ba, google.be, google.bg, google.bi, google.biz, google.bo,
* google.bs, google.bz, google.ca, google.cc, google.cd, google.cg, google.ch, google.ci, google.cl,
* google.cn, google.co.at, google.co.bi, google.co.bw, google.co.ci, google.co.ck, google.co.cr,
* google.co.gg, google.co.gl, google.co.gy, google.co.hu, google.co.id, google.co.il, google.co.im,
* google.co.in, google.co.it, google.co.je, google.co.jp, google.co.ke, google.co.kr, google.co.ls,
* google.co.ma, google.co.mu, google.co.mw, google.co.nz, google.co.pn, google.co.th, google.co.tt,
* google.co.ug, google.co.uk, google.co.uz, google.co.ve, google.co.vi, google.co.za, google.co.zm,
* google.co.zw, google.com, google.com.af, google.com.ag, google.com.ai, google.com.ar, google.com.au,
* google.com.az, google.com.bd, google.com.bh, google.com.bi, google.com.bn, google.com.bo, google.com.br,
* google.com.bs, google.com.bz, google.com.cn, google.com.co, google.com.cu, google.com.do, google.com.ec,
* google.com.eg, google.com.et, google.com.fj, google.com.ge, google.com.gh, google.com.gi, google.com.gl,
* google.com.gp, google.com.gr, google.com.gt, google.com.gy, google.com.hk, google.com.hn, google.com.hr,
* google.com.jm, google.com.jo, google.com.kg, google.com.kh, google.com.ki, google.com.kz, google.com.lk,
* google.com.lv, google.com.ly, google.com.mt, google.com.mu, google.com.mw, google.com.mx, google.com.my,
* google.com.na, google.com.nf, google.com.ng, google.com.ni, google.com.np, google.com.nr, google.com.om,
* google.com.pa, google.com.pe, google.com.ph, google.com.pk, google.com.pl, google.com.pr, google.com.pt,
* google.com.py, google.com.qa, google.com.ru, google.com.sa, google.com.sb, google.com.sc, google.com.sg,
* google.com.sv, google.com.tj, google.com.tr, google.com.tt, google.com.tw, google.com.uy, google.com.uz,
* google.com.ve, google.com.vi, google.com.vn, google.com.ws, google.cz, google.de, google.dj, google.dk,
* google.dm, google.ec, google.ee, google.es, google.fi, google.fm, google.fr, google.gd, google.ge,
* google.gf, google.gg, google.gl, google.gm, google.gp, google.gr, google.gy, google.hk, google.hn,
* google.hr, google.ht, google.hu, google.ie, google.im, google.in, google.info, google.is, google.it,
* google.je, google.jo, google.jobs, google.jp, google.kg, google.ki, google.kz, google.la, google.li,
* google.lk, google.lt, google.lu, google.lv, google.ma, google.md, google.mn, google.mobi, google.ms,
* google.mu, google.mv, google.mw, google.net, google.nf, google.nl, google.no, google.nr, google.nu,
* google.off.ai, google.ph, google.pk, google.pl, google.pn, google.pr, google.pt, google.ro, google.ru,
* google.rw, google.sc, google.se, google.sg, google.sh, google.si, google.sk, google.sm, google.sn,
* google.sr, google.st, google.tk, google.tm, google.to, google.tp, google.tt, google.tv, google.tw,
* google.ug, google.us, google.uz, google.vg, google.vn, google.vu, google.ws, gps.virgin.net, hotbot.com,
* ilmotore.com, ithaki.net, kataweb.it, libero.it, lycos.it, mamma.com, megasearching.net, mirago.co.uk,
* netscape.com, search.aol.co.uk, search.arabia.msn.com, search.bbc.co.uk, search.conduit.com,
* search.icq.com, search.live.com, search.lycos.co.uk, search.lycos.com, search.msn.co.uk, search.msn.com,
* search.myway.com, search.mywebsearch.com, search.ntlworld.com, search.orange.co.uk, search.prodigy.msn.com,
* search.sweetim.com, search.virginmedia.com, search.yahoo.co.jp, search.yahoo.com, search.yahoo.jp,
* simpatico.ws, soso.com, suche.fireball.de, suche.t-online.de, suche.web.de, technorati.com, tesco.net,
* thespider.it, tiscali.co.uk, uk.altavista.com, uk.ask.com, uk.search.yahoo.com
*
* Thanks to Spiros Denaxas for his URI::ParseSearchString, which is the basis for the lookups.
*/
public class SearchEngineExtractor extends EvalFunc<String> {

    /** Host prefix that is stripped (literally, not as a regex) before the table lookup. */
    private static final String WWW_PREFIX = "www.";

    /**
     * Lookup table from a lower-cased host name (without a leading "www.") to the display name of the
     * search engine. Keys must not contain surrounding whitespace, otherwise they can never match a host.
     * The display names are kept verbatim from the original table, including a few with trailing
     * whitespace or unusual spellings, so that the values emitted by this function stay stable.
     */
    private static final Map<String, String> searchEngines = new HashMap<String, String>();

    static {
        searchEngines.put("abacho.com", "Abacho");
        searchEngines.put("alice.it", "Alice.it");
        searchEngines.put("alltheweb.com", "AllTheWeb");
        searchEngines.put("altavista.com", "Altavista");
        searchEngines.put("aolsearch.aol.com", "AOL Search");
        searchEngines.put("as.starware.com", "Starware");
        searchEngines.put("ask.com", "Ask dot com");
        searchEngines.put("blogs.icerocket.com", "IceRocket");
        searchEngines.put("blogsearch.google.com", "Google Blogsearch");
        searchEngines.put("blueyonder.co.uk", "Blueyonder");
        searchEngines.put("busca.orange.es", "Orange ES");
        searchEngines.put("buscador.lycos.es", "Lycos ES");
        searchEngines.put("buscador.terra.es", "Terra ES");
        searchEngines.put("buscar.ozu.es", "Ozu ES");
        searchEngines.put("categorico.it", "Categorico IT");
        searchEngines.put("cuil.com", "Cuil");
        searchEngines.put("excite.com", "Excite");
        searchEngines.put("excite.it", "Excite IT");
        searchEngines.put("fastweb.it", "Fastweb IT");
        searchEngines.put("feedster.com", "Feedster");
        searchEngines.put("godado.com", "Godado");
        searchEngines.put("godado.it", "Godado (IT)");
        searchEngines.put("google.ad", "Google Andorra");
        searchEngines.put("google.ae", "Google United Arab Emirates");
        searchEngines.put("google.af", "Google Afghanistan");
        searchEngines.put("google.ag", "Google Antiqua and Barbuda");
        searchEngines.put("google.am", "Google Armenia");
        searchEngines.put("google.as", "Google American Samoa");
        searchEngines.put("google.at", "Google Austria");
        searchEngines.put("google.az", "Google Azerbaijan");
        searchEngines.put("google.ba", "Google Bosnia and Herzegovina");
        searchEngines.put("google.be", "Google Belgium");
        searchEngines.put("google.bg", "Google Bulgaria");
        searchEngines.put("google.bi", "Google Burundi");
        searchEngines.put("google.biz", "Google dot biz");
        searchEngines.put("google.bo", "Google Bolivia");
        searchEngines.put("google.bs", "Google Bahamas");
        searchEngines.put("google.bz", "Google Belize");
        searchEngines.put("google.ca", "Google Canada");
        searchEngines.put("google.cc", "Google Cocos Islands");
        searchEngines.put("google.cd", "Google Dem Rep of Congo");
        searchEngines.put("google.cg", "Google Rep of Congo");
        searchEngines.put("google.ch", "Google Switzerland");
        searchEngines.put("google.ci", "Google Cote dIvoire");
        searchEngines.put("google.cl", "Google Chile");
        searchEngines.put("google.cn", "Google China");
        // Key previously carried a trailing space and therefore never matched.
        searchEngines.put("google.co.at", "Google Austria");
        searchEngines.put("google.co.bi", "Google Burundi");
        searchEngines.put("google.co.bw", "Google Botswana");
        searchEngines.put("google.co.ci", "Google Ivory Coast");
        searchEngines.put("google.co.ck", "Google Cook Islands");
        searchEngines.put("google.co.cr", "Google Costa Rica");
        searchEngines.put("google.co.gg", "Google Guernsey");
        searchEngines.put("google.co.gl", "Google Greenland");
        searchEngines.put("google.co.gy", "Google Guyana");
        searchEngines.put("google.co.hu", "Google Hungary ");
        searchEngines.put("google.co.id", "Google Indonesia");
        searchEngines.put("google.co.il", "Google Israel");
        searchEngines.put("google.co.im", "Google Isle of Man");
        searchEngines.put("google.co.in", "Google India");
        searchEngines.put("google.co.it", "Google Italy");
        searchEngines.put("google.co.je", "Google Jersey");
        searchEngines.put("google.co.jp", "Google Japan");
        searchEngines.put("google.co.ke", "Google Kenya");
        searchEngines.put("google.co.kr", "Google South Korea");
        searchEngines.put("google.co.ls", "Google Lesotho");
        searchEngines.put("google.co.ma", "Google Morocco");
        searchEngines.put("google.co.mu", "Google Mauritius");
        searchEngines.put("google.co.mw", "Google Malawi");
        searchEngines.put("google.co.nz", "Google New Zeland");
        searchEngines.put("google.co.pn", "Google Pitcairn Islands");
        searchEngines.put("google.co.th", "Google Thailand");
        searchEngines.put("google.co.tt", "Google Trinidad and Tobago");
        searchEngines.put("google.co.ug", "Google Uganda");
        searchEngines.put("google.co.uk", "Google UK");
        searchEngines.put("google.co.uz", "Google Uzbekistan");
        searchEngines.put("google.co.ve", "Google Venezuela");
        searchEngines.put("google.co.vi", "Google US Virgin Islands");
        searchEngines.put("google.co.za", "Google South Africa ");
        searchEngines.put("google.co.zm", "Google Zambia");
        searchEngines.put("google.co.zw", "Google Zimbabwe");
        searchEngines.put("google.com", "Google");
        searchEngines.put("google.com.af", "Google Afghanistan");
        searchEngines.put("google.com.ag", "Google Antiqua and Barbuda");
        searchEngines.put("google.com.ai", "Google Anguilla");
        searchEngines.put("google.com.ar", "Google Argentina");
        searchEngines.put("google.com.au", "Google Australia");
        searchEngines.put("google.com.az", "Google Azerbaijan ");
        searchEngines.put("google.com.bd", "Google Bangladesh");
        searchEngines.put("google.com.bh", "Google Bahrain");
        searchEngines.put("google.com.bi", "Google Burundi");
        searchEngines.put("google.com.bn", "Google Brunei Darussalam");
        searchEngines.put("google.com.bo", "Google Bolivia ");
        searchEngines.put("google.com.br", "Google Brazil");
        searchEngines.put("google.com.bs", "Google Bahamas");
        searchEngines.put("google.com.bz", "Google Belize");
        searchEngines.put("google.com.cn", "Google China");
        searchEngines.put("google.com.co", "Google ");
        searchEngines.put("google.com.cu", "Google Cuba");
        searchEngines.put("google.com.do", "Google Dominican Rep");
        searchEngines.put("google.com.ec", "Google Ecuador");
        searchEngines.put("google.com.eg", "Google Egypt");
        searchEngines.put("google.com.et", "Google Ethiopia");
        searchEngines.put("google.com.fj", "Google Fiji");
        searchEngines.put("google.com.ge", "Google Georgia");
        searchEngines.put("google.com.gh", "Google Ghana");
        searchEngines.put("google.com.gi", "Google Gibraltar");
        searchEngines.put("google.com.gl", "Google Greenland");
        searchEngines.put("google.com.gp", "Google Guadeloupe");
        searchEngines.put("google.com.gr", "Google Greece");
        searchEngines.put("google.com.gt", "Google Guatemala");
        searchEngines.put("google.com.gy", "Google Guyana");
        searchEngines.put("google.com.hk", "Google Hong Kong");
        searchEngines.put("google.com.hn", "Google Honduras");
        searchEngines.put("google.com.hr", "Google Croatia");
        searchEngines.put("google.com.jm", "Google Jamaica");
        searchEngines.put("google.com.jo", "Google Jordan");
        searchEngines.put("google.com.kg", "Google Kyrgyzstan");
        searchEngines.put("google.com.kh", "Google Cambodia");
        searchEngines.put("google.com.ki", "Google Kiribati");
        searchEngines.put("google.com.kz", "Google Kazakhstan");
        searchEngines.put("google.com.lk", "Google Sri Lanka");
        searchEngines.put("google.com.lv", "Google Latvia");
        searchEngines.put("google.com.ly", "Google Libya");
        searchEngines.put("google.com.mt", "Google Malta");
        searchEngines.put("google.com.mu", "Google Mauritius");
        searchEngines.put("google.com.mw", "Google Malawi");
        searchEngines.put("google.com.mx", "Google Mexico");
        searchEngines.put("google.com.my", "Google Malaysia");
        searchEngines.put("google.com.na", "Google Namibia");
        searchEngines.put("google.com.nf", "Google Norfolk Island");
        searchEngines.put("google.com.ng", "Google Nigeria");
        searchEngines.put("google.com.ni", "Google Nicaragua");
        searchEngines.put("google.com.np", "Google Nepal");
        searchEngines.put("google.com.nr", "Google Nauru");
        searchEngines.put("google.com.om", "Google Oman");
        searchEngines.put("google.com.pa", "Google Panama");
        searchEngines.put("google.com.pe", "Google Peru");
        searchEngines.put("google.com.ph", "Google Philipines");
        searchEngines.put("google.com.pk", "Google Pakistan");
        searchEngines.put("google.com.pl", "Google Poland");
        searchEngines.put("google.com.pr", "Google Puerto Rico");
        searchEngines.put("google.com.pt", "Google Portugal");
        searchEngines.put("google.com.py", "Google Paraguay");
        searchEngines.put("google.com.qa", "Google ");
        searchEngines.put("google.com.ru", "Google Russia");
        searchEngines.put("google.com.sa", "Google Saudi Arabia");
        searchEngines.put("google.com.sb", "Google Solomon Islands");
        searchEngines.put("google.com.sc", "Google Seychelles");
        searchEngines.put("google.com.sg", "Google Singapore");
        searchEngines.put("google.com.sv", "Google El Savador");
        searchEngines.put("google.com.tj", "Google Tajikistan");
        searchEngines.put("google.com.tr", "Google Turkey");
        searchEngines.put("google.com.tt", "Google Trinidad and Tobago");
        searchEngines.put("google.com.tw", "Google Taiwan");
        searchEngines.put("google.com.uy", "Google Uruguay");
        searchEngines.put("google.com.uz", "Google Uzbekistan ");
        searchEngines.put("google.com.ve", "Google Venezuela");
        searchEngines.put("google.com.vi", "Google US Virgin Islands");
        searchEngines.put("google.com.vn", "Google Vietnam");
        searchEngines.put("google.com.ws", "Google Samoa");
        searchEngines.put("google.cz", "Google Czech Rep");
        searchEngines.put("google.de", "Google Germany");
        searchEngines.put("google.dj", "Google Djubouti");
        // The next two keys previously carried a trailing space and therefore never matched.
        searchEngines.put("google.dk", "Google Denmark");
        searchEngines.put("google.dm", "Google Dominica");
        searchEngines.put("google.ec", "Google Ecuador");
        searchEngines.put("google.ee", "Google Estonia");
        searchEngines.put("google.es", "Google Spain");
        searchEngines.put("google.fi", "Google Finland");
        searchEngines.put("google.fm", "Google Micronesia");
        searchEngines.put("google.fr", "Google France");
        searchEngines.put("google.gd", "Google Grenada");
        searchEngines.put("google.ge", "Google Georgia");
        searchEngines.put("google.gf", "Google French Guiana");
        searchEngines.put("google.gg", "Google Guernsey");
        searchEngines.put("google.gl", "Google Greenland");
        searchEngines.put("google.gm", "Google Gambia");
        searchEngines.put("google.gp", "Google Guadeloupe");
        searchEngines.put("google.gr", "Google Greece");
        searchEngines.put("google.gy", "Google Guyana");
        searchEngines.put("google.hk", "Google Hong Kong");
        searchEngines.put("google.hn", "Google Honduras");
        searchEngines.put("google.hr", "Google Croatia");
        searchEngines.put("google.ht", "Google Haiti");
        searchEngines.put("google.hu", "Google Hungary");
        searchEngines.put("google.ie", "Google Ireland");
        searchEngines.put("google.im", "Google Isle of Man");
        searchEngines.put("google.in", "Google India");
        searchEngines.put("google.info", "Google dot info");
        searchEngines.put("google.is", "Google Iceland");
        searchEngines.put("google.it", "Google Italy");
        searchEngines.put("google.je", "Google Jersey");
        searchEngines.put("google.jo", "Google Jordan");
        searchEngines.put("google.jobs", "Google dot jobs");
        searchEngines.put("google.jp", "Google Japan");
        searchEngines.put("google.kg", "Google Kyrgyzstan");
        searchEngines.put("google.ki", "Google Kiribati");
        searchEngines.put("google.kz", "Google Kazakhstan");
        searchEngines.put("google.la", "Google Laos");
        searchEngines.put("google.li", "Google Liechtenstein");
        searchEngines.put("google.lk", "Google Sri Lanka");
        searchEngines.put("google.lt", "Google Lithuania");
        searchEngines.put("google.lu", "Google Luxembourg");
        searchEngines.put("google.lv", "Google Latvia");
        searchEngines.put("google.ma", "Google Morocco");
        searchEngines.put("google.md", "Google Moldova");
        searchEngines.put("google.mn", "Google Mongolia");
        searchEngines.put("google.mobi", "Google dot mobi");
        searchEngines.put("google.ms", "Google Montserrat");
        searchEngines.put("google.mu", "Google Mauritius");
        searchEngines.put("google.mv", "Google Maldives");
        searchEngines.put("google.mw", "Google Malawi");
        searchEngines.put("google.net", "Google dot net");
        searchEngines.put("google.nf", "Google Norfolk Island");
        searchEngines.put("google.nl", "Google Netherlands");
        searchEngines.put("google.no", "Google Norway");
        searchEngines.put("google.nr", "Google Nauru");
        searchEngines.put("google.nu", "Google Niue");
        searchEngines.put("google.off.ai", "Google Anguilla");
        searchEngines.put("google.ph", "Google Philipines");
        searchEngines.put("google.pk", "Google Pakistan");
        searchEngines.put("google.pl", "Google Poland");
        searchEngines.put("google.pn", "Google Pitcairn Islands");
        searchEngines.put("google.pr", "Google Puerto Rico");
        searchEngines.put("google.pt", "Google Portugal");
        searchEngines.put("google.ro", "Google Romania");
        searchEngines.put("google.ru", "Google Russia");
        searchEngines.put("google.rw", "Google Rwanda");
        searchEngines.put("google.sc", "Google Seychelles");
        searchEngines.put("google.se", "Google Sweden");
        searchEngines.put("google.sg", "Google Singapore");
        searchEngines.put("google.sh", "Google Saint Helena");
        searchEngines.put("google.si", "Google Slovenia");
        searchEngines.put("google.sk", "Google Slovakia");
        searchEngines.put("google.sm", "Google San Marino");
        searchEngines.put("google.sn", "Google Senegal");
        searchEngines.put("google.sr", "Google Suriname");
        searchEngines.put("google.st", "Google Sao Tome ");
        searchEngines.put("google.tk", "Google Tokelau");
        searchEngines.put("google.tm", "Google Turkmenistan");
        searchEngines.put("google.to", "Google Tonga");
        searchEngines.put("google.tp", "Google East Timor");
        searchEngines.put("google.tt", "Google Trinidad and Tobago");
        searchEngines.put("google.tv", "Google Tuvalu");
        searchEngines.put("google.tw", "Google Taiwan");
        searchEngines.put("google.ug", "Google Uganda");
        searchEngines.put("google.us", "Google US");
        searchEngines.put("google.uz", "Google Uzbekistan");
        searchEngines.put("google.vg", "Google British Virgin Islands");
        searchEngines.put("google.vn", "Google Vietnam");
        searchEngines.put("google.vu", "Google Vanuatu");
        searchEngines.put("google.ws", "Google Samoa");
        searchEngines.put("gps.virgin.net", "Virgin Search");
        searchEngines.put("hotbot.com", "HotBot");
        searchEngines.put("ilmotore.com", "ilMotore");
        searchEngines.put("ithaki.net", "Ithaki");
        searchEngines.put("kataweb.it", "Kataweb IT");
        searchEngines.put("libero.it", "Libero IT");
        searchEngines.put("lycos.it", "Lycos IT");
        searchEngines.put("mamma.com", "Mamma");
        searchEngines.put("megasearching.net", "Megasearching");
        searchEngines.put("mirago.co.uk", "Mirago UK");
        searchEngines.put("netscape.com", "Netscape");
        searchEngines.put("search.aol.co.uk", "AOL UK");
        searchEngines.put("search.arabia.msn.com", "MSN Arabia");
        searchEngines.put("search.bbc.co.uk", "BBC Search");
        searchEngines.put("search.conduit.com", "Conduit");
        searchEngines.put("search.icq.com", "ICQ dot com");
        searchEngines.put("search.live.com", "Live.com");
        searchEngines.put("search.lycos.co.uk", "Lycos UK");
        searchEngines.put("search.lycos.com", "Lycos");
        searchEngines.put("search.msn.co.uk", "MSN UK");
        searchEngines.put("search.msn.com", "MSN");
        searchEngines.put("search.myway.com", "MyWay");
        searchEngines.put("search.mywebsearch.com", "My Web Search");
        searchEngines.put("search.ntlworld.com", "NTLWorld");
        searchEngines.put("search.orange.co.uk", "Orange Search");
        searchEngines.put("search.prodigy.msn.com", "MSN Prodigy");
        searchEngines.put("search.sweetim.com", "Sweetim");
        searchEngines.put("search.virginmedia.com", "VirginMedia");
        searchEngines.put("search.yahoo.co.jp", "Yahoo Japan");
        searchEngines.put("search.yahoo.com", "Yahoo!");
        searchEngines.put("search.yahoo.jp", "Yahoo! Japan");
        searchEngines.put("simpatico.ws", "Simpatico IT");
        searchEngines.put("soso.com", "Soso");
        searchEngines.put("suche.fireball.de", "Fireball DE");
        searchEngines.put("suche.t-online.de", "T-Online");
        searchEngines.put("suche.web.de", "Suche DE");
        searchEngines.put("technorati.com", "Technorati");
        searchEngines.put("tesco.net", "Tesco Search");
        searchEngines.put("thespider.it", "TheSpider IT");
        searchEngines.put("tiscali.co.uk", "Tiscali UK");
        searchEngines.put("uk.altavista.com", "Altavista UK");
        searchEngines.put("uk.ask.com", "Ask UK");
        searchEngines.put("uk.search.yahoo.com", "Yahoo! UK");
    }

    /**
     * Looks up the search engine that a referer URL belongs to.
     *
     * The first field of the tuple is parsed as a URL; its host is lower-cased, a leading "www." is
     * removed, and the result is looked up in the table of known search engine hosts.
     *
     * @param input tuple whose first field is the referer URL as a String
     * @return the display name of the search engine, or null when the tuple is null or empty, the
     *         referer field is null, or the host is not a known search engine
     * @throws IOException if the field cannot be read, is not a String, or is not a parseable URL;
     *         the underlying exception is attached as the cause
     */
    @Override
    public String exec(Tuple input) throws IOException {
        if (input == null || input.size() == 0) {
            return null;
        }
        try {
            String referer = (String) input.get(0);
            if (referer == null) {
                // A null field carries no referer; answer null rather than failing the row.
                return null;
            }
            // Locale.ROOT keeps the lower-casing independent of the JVM default locale
            // (for example, the Turkish dotless i would otherwise break ".it" hosts).
            String host = new URL(referer).getHost().toLowerCase(Locale.ROOT);
            // Strip a literal "www." prefix; a regex with an unescaped '.' would match any character.
            if (host.startsWith(WWW_PREFIX)) {
                host = host.substring(WWW_PREFIX.length());
            }
            return searchEngines.get(host);
        } catch (Exception e) {
            throw new IOException("Caught exception processing input row ", e);
        }
    }

    /**
     * Declares the argument signature this function accepts: a single chararray field.
     *
     * @return a one-element list mapping this class to a schema with one unnamed chararray field
     * @throws FrontendException declared by the overridden method
     */
    @Override
    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
        List<FuncSpec> funcList = new ArrayList<FuncSpec>();
        Schema chararrayArg = new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY));
        funcList.add(new FuncSpec(this.getClass().getName(), chararrayArg));
        return funcList;
    }
}