/*
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
* NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
package org.apache.pig.piggybank.evaluation.util.apachelogparser;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pig.EvalFunc;
import org.apache.pig.FuncSpec;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
* SearchTermExtractor takes a url string and extracts the search terms. For example, given
*
* http://www.google.com/search?hl=en&safe=active&rls=GGLG,GGLG:2005-24,GGLG:en&q=purpose+of+life&btnG=Search
*
* then
*
* purpose of life
*
* would be extracted.
*
* From pig latin, usage looks something like
*
* searchTerm = FOREACH row GENERATE
* org.apache.pig.piggybank.evaluation.util.apachelogparser.SearchTermExtractor(referer);
*
* Supported search engines include alltheweb.com, altavista.com, aolsearch.aol.com, arianna.libero.it,
* as.starware.com, ask.com, blogs.icerocket.com, blueyonder.co.uk, busca.orange.es, buscador.lycos.es,
* buscador.terra.es, buscar.ozu.es, categorico.it, cerca.lycos.it, cuil.com, excite.it, godado.com,
* godado.it, gps.virgin.net, hotbot.com, ilmotore.com, it.altavista.com, ithaki.net, libero.it, lycos.es,
* lycos.it, mamma.com, megasearching.net, mirago.co.uk, netscape.com, ozu.es, ricerca.alice.it,
* search.aol.co.uk, search.bbc.co.uk, search.conduit.com, search.icq.com, search.live.com,
* search.lycos.co.uk, search.lycos.com, search.msn.co.uk, search.msn.com, search.myway.com,
* search.mywebsearch.com, search.ntlworld.com, search.orange.co.uk, search.sweetim.com,
* search.virginmedia.com, simpatico.ws, soso.com, suche.fireball.de, suche.web.de, terra.es, tesco.net,
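* thespider.it, tiscali.co.uk, uk.altavista.com, uk.ask.com
*
* In addition, any host containing "google.co" (e.g. google.com, google.co.uk) or "search.yahoo" is
* recognized, and for hosts ending in feedster.com or technorati.com the terms are taken from the URL
* path (with a leading /search/ removed) rather than from the query string.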
*
* Thanks to Spiros Denaxas for his URI::ParseSearchString, which is the basis for the lookups.
*/
public class SearchTermExtractor extends EvalFunc<String> {

    // Patterns are immutable and safe to share; a Matcher is not, so a fresh Matcher is created per
    // call instead of resetting a shared static instance.

    /** Query parameters that carry the search terms on most of the recognized engines. */
    private static final Pattern TERM_PATTERN =
            Pattern.compile("\\b(?:q|buscar|key|qry|qs|query|s|searchfor|su|w)=([^&]+)");

    /**
     * Fallback parameter. It is only consulted when {@link #TERM_PATTERN} finds nothing because, at
     * least once, a p= came before a q= when p= was not tied to the search terms.
     */
    private static final Pattern P_TERM_PATTERN = Pattern.compile("\\bp=([^&]+)");

    /** Leading "www." of an already lower-cased host name. */
    private static final Pattern WWW_PREFIX = Pattern.compile("^www\\.");

    /** Leading "/search/" segment of a feedster/technorati path. */
    private static final Pattern SEARCH_PATH_PREFIX = Pattern.compile("^/search/");

    /** Lower-case host names (without a leading "www.") whose query string is inspected. */
    private static final Set<String> HOSTS = Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(
            "alltheweb.com",
            "altavista.com",
            "aolsearch.aol.com",
            "arianna.libero.it",
            "as.starware.com",
            "ask.com",
            "blogs.icerocket.com",
            "blueyonder.co.uk",
            "busca.orange.es",
            "buscador.lycos.es",
            "buscador.terra.es",
            "buscar.ozu.es",
            "categorico.it",
            "cerca.lycos.it",
            "cuil.com",
            "excite.it",
            "godado.com",
            "godado.it",
            "gps.virgin.net",
            "hotbot.com",
            "ilmotore.com",
            "it.altavista.com",
            "ithaki.net",
            "libero.it",
            "lycos.es",
            "lycos.it",
            "mamma.com",
            "megasearching.net",
            "mirago.co.uk",
            "netscape.com",
            "ozu.es",
            "ricerca.alice.it",
            "search.aol.co.uk",
            "search.bbc.co.uk",
            "search.conduit.com",
            "search.icq.com",
            "search.live.com",
            "search.lycos.co.uk",
            "search.lycos.com",
            "search.msn.co.uk",
            "search.msn.com",
            "search.myway.com",
            "search.mywebsearch.com",
            "search.ntlworld.com",
            "search.orange.co.uk",
            "search.sweetim.com",
            "search.virginmedia.com",
            "simpatico.ws",
            "soso.com",
            "suche.fireball.de",
            "suche.web.de",
            "terra.es",
            "tesco.net",
            "thespider.it",
            "tiscali.co.uk",
            "uk.altavista.com",
            "uk.ask.com")));

    /**
     * URL-decodes {@code string} as UTF-8.
     *
     * @param string the raw, still-encoded text taken from the URL
     * @return the decoded text, or {@code string} unchanged when it cannot be decoded
     */
    private static String myDecode(String string) {
        try {
            return URLDecoder.decode(string, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // Not expected for "UTF-8"; fall back to the undecoded text as before.
            return string;
        } catch (IllegalArgumentException e) {
            // Malformed percent-escape (e.g. a trailing '%'). Real-world referers contain these, and
            // one dirty row should not fail the whole job, so hand back the undecoded text.
            return string;
        }
    }

    /**
     * Looks for the search terms in a query string.
     *
     * @param queryString the non-null query part of the URL
     * @return the still-encoded terms, or {@code null} when no known parameter is present
     */
    private static String findEncodedTerms(String queryString) {
        Matcher termMatcher = TERM_PATTERN.matcher(queryString);
        if (termMatcher.find()) {
            return termMatcher.group(1);
        }
        Matcher pTermMatcher = P_TERM_PATTERN.matcher(queryString);
        if (pTermMatcher.find()) {
            return pTermMatcher.group(1);
        }
        return null;
    }

    /**
     * Extracts the search terms from the URL held in the first field of {@code input}.
     *
     * @param input tuple whose first field is the URL (typically a referer) as a string
     * @return the decoded search terms, or {@code null} when the input is null or empty, the URL is null
     *         or malformed, the host is not a recognized search engine, or no terms are found
     * @throws IOException if any other exception is raised while processing the row
     */
    @Override
    public String exec(Tuple input) throws IOException {
        if (input == null || input.size() == 0) {
            return null;
        }
        try {
            String url = (String) input.get(0);
            if (url == null) {
                return null;
            }
            URL urlObject = new URL(url);
            String host = urlObject.getHost();
            if (host == null) {
                return null;
            }
            // Lower-case BEFORE stripping "www." so that e.g. "WWW.ask.com" is normalized to "ask.com";
            // Locale.ROOT keeps the result independent of the JVM's default locale.
            host = WWW_PREFIX.matcher(host.toLowerCase(Locale.ROOT)).replaceFirst("");

            if (HOSTS.contains(host) || host.contains("google.co") || host.contains("search.yahoo")) {
                String queryString = urlObject.getQuery();
                if (queryString == null) {
                    return null;
                }
                String encodedTerms = findEncodedTerms(queryString);
                if (encodedTerms != null) {
                    return myDecode(encodedTerms);
                }
            }

            // These two sites put the terms in the path instead of the query string.
            if (host.endsWith("feedster.com") || host.endsWith("technorati.com")) {
                String path = urlObject.getPath();
                if (path == null) {
                    return null;
                }
                return myDecode(SEARCH_PATH_PREFIX.matcher(path).replaceFirst(""));
            }
            return null;
        } catch (MalformedURLException e) {
            // Not a parseable URL: treated as "no search terms", not as an error.
            return null;
        } catch (Exception e) {
            throw new IOException("Caught exception processing input row ", e);
        }
    }

    /**
     * Declares the single supported argument signature: one chararray field.
     *
     * @return a one-element list mapping a chararray argument to this class
     * @throws FrontendException declared by the overridden method
     */
    @Override
    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
        List<FuncSpec> funcList = new ArrayList<FuncSpec>();
        funcList.add(new FuncSpec(this.getClass().getName(),
                new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY))));
        return funcList;
    }
}