/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.udf.generic; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; import org.apache.hadoop.io.Text; /** * GenericUDTFParseUrlTuple: this * */ @Description(name = "parse_url_tuple", value = "_FUNC_(url, partname1, partname2, ..., partnameN) - extracts N (N>=1) parts from a URL.\n" + "It takes a URL and one or multiple partnames, and returns a tuple. " + "All the input parameters and output column types are string.", extended = "Partname: HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, USERINFO, QUERY:<KEY_NAME>\n" + "Note: Partnames are case-sensitive, and should not contain unnecessary white spaces.\n" + "Example:\n" + " > SELECT b.* FROM src LATERAL VIEW _FUNC_(fullurl, 'HOST', 'PATH', 'QUERY', 'QUERY:id') " + "b as host, path, query, query_id LIMIT 1;\n" + " > SELECT _FUNC_(a.fullurl, 'HOST', 'PATH', 'QUERY', 'REF', 'PROTOCOL', 'FILE', " + " 'AUTHORITY', 'USERINFO', 'QUERY:k1') as (ho, pa, qu, re, pr, fi, au, us, qk1) from src a;") public class GenericUDTFParseUrlTuple extends GenericUDTF { enum PARTNAME { HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, USERINFO, QUERY_WITH_KEY, NULLNAME }; private static final Logger LOG = LoggerFactory.getLogger(GenericUDTFParseUrlTuple.class.getName()); int numCols; // number of output columns String[] paths; // array of pathnames, each of which corresponds to a column PARTNAME[] partnames; // mapping from pathnames to enum PARTNAME Text[] retCols; // array of returned column values Text[] cols; // object pool of non-null Text, avoid creating objects all the time private transient Object[] nullCols; // array of null column values private transient ObjectInspector[] inputOIs; // input ObjectInspectors boolean pathParsed = false; boolean seenErrors = false; private transient URL url = null; private transient Pattern p = null; private transient String lastKey = null; @Override public void close() throws HiveException { } @Override public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException { inputOIs = args; numCols = args.length - 1; if (numCols < 1) { throw new UDFArgumentException("parse_url_tuple() takes at least two arguments: " + "the url string and a part name"); } for (int i = 0; i < args.length; ++i) { if (args[i].getCategory() != ObjectInspector.Category.PRIMITIVE || !args[i].getTypeName().equals(serdeConstants.STRING_TYPE_NAME)) { throw new UDFArgumentException("parse_url_tuple()'s arguments have to be string type"); } } seenErrors = false; pathParsed = false; url = null; p = null; lastKey = null; paths = new String[numCols]; partnames = new PARTNAME[numCols]; cols = new Text[numCols]; retCols = new Text[numCols]; nullCols = new Object[numCols]; for (int i = 0; i < numCols; ++i) { cols[i] = new Text(); retCols[i] = cols[i]; nullCols[i] = null; } // construct output object inspector ArrayList<String> fieldNames = new ArrayList<String>(numCols); ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>(numCols); for (int i = 0; i < numCols; ++i) { // column name can be anything since it will be named by UDTF as clause fieldNames.add("c" + i); // all returned type will be Text fieldOIs.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); } return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); } @Override public void process(Object[] o) throws HiveException { if (o[0] == null) { forward(nullCols); return; } // get the path names for the 1st row only if (!pathParsed) { for (int i = 0;i < numCols; ++i) { paths[i] = ((StringObjectInspector) inputOIs[i+1]).getPrimitiveJavaObject(o[i+1]); if (paths[i] == null) { partnames[i] = PARTNAME.NULLNAME; } else if (paths[i].equals("HOST")) { partnames[i] = PARTNAME.HOST; } else if (paths[i].equals("PATH")) { partnames[i] = PARTNAME.PATH; } else if (paths[i].equals("QUERY")) { partnames[i] = PARTNAME.QUERY; } else if (paths[i].equals("REF")) { partnames[i] = PARTNAME.REF; } else if (paths[i].equals("PROTOCOL")) { partnames[i] = PARTNAME.PROTOCOL; } else if (paths[i].equals("FILE")) { partnames[i] = PARTNAME.FILE; } else if (paths[i].equals("AUTHORITY")) { partnames[i] = PARTNAME.AUTHORITY; } else if (paths[i].equals("USERINFO")) { partnames[i] = PARTNAME.USERINFO; } else if (paths[i].startsWith("QUERY:")) { partnames[i] = PARTNAME.QUERY_WITH_KEY; paths[i] = paths[i].substring(6); // update paths[i], e.g., from "QUERY:id" to "id" } else { partnames[i] = PARTNAME.NULLNAME; } } pathParsed = true; } String urlStr = ((StringObjectInspector) inputOIs[0]).getPrimitiveJavaObject(o[0]); if (urlStr == null) { forward(nullCols); return; } try { String ret = null; url = new URL(urlStr); for (int i = 0; i < numCols; ++i) { ret = evaluate(url, i); if (ret == null) { retCols[i] = null; } else { if (retCols[i] == null) { retCols[i] = cols[i]; // use the object pool rather than creating a new object } retCols[i].set(ret); } } forward(retCols); return; } catch (MalformedURLException e) { // parsing error, invalid url string if (!seenErrors) { LOG.error("The input is not a valid url string: " + urlStr + ". Skipping such error messages in the future."); seenErrors = true; } forward(nullCols); return; } } @Override public String toString() { return "parse_url_tuple"; } private String evaluate(URL url, int index) { if (url == null || index < 0 || index >= partnames.length) { return null; } switch (partnames[index]) { case HOST : return url.getHost(); case PATH : return url.getPath(); case QUERY : return url.getQuery(); case REF : return url.getRef(); case PROTOCOL : return url.getProtocol(); case FILE : return url.getFile(); case AUTHORITY : return url.getAuthority(); case USERINFO : return url.getUserInfo(); case QUERY_WITH_KEY: return evaluateQuery(url.getQuery(), paths[index]); case NULLNAME: default : return null; } } private String evaluateQuery(String query, String key) { if (query == null || key == null) { return null; } if (!key.equals(lastKey)) { p = Pattern.compile("(&|^)" + key + "=([^&]*)"); } lastKey = key; Matcher m = p.matcher(query); if (m.find()) { return m.group(2); } return null; } }