package com.alimama.mdrill.adhoc; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; public class TransHigo_udf extends UDF { public Text evaluate(final Text d,String type) { if (d == null) { return new Text("_"); } return new Text(transformSolrMetacharactor(d.toString(),type)); } public static String transformSolrMetacharactor(String input,String tp){ // boolean isString=tp.indexOf("string")>=0; if(input!=null) { input=input.trim(); } // if(input==null||input.isEmpty()||input.equals("\\N")||input.equals("\\n")) // { // return isString?"_":"0"; // } if(tp.toLowerCase().indexOf("date")>=0) { return ensureTdate(input); } return input; // // if(!isString) // { // // return input; // } // // StringBuffer sb = new StringBuffer(); // String regex = "[+\\-&|!(){}\\[\\]^\"~*?:(\\) #/]"; // Pattern pattern = Pattern.compile(regex); // Matcher matcher = pattern.matcher(input); // while(matcher.find()){ // matcher.appendReplacement(sb, "_"); // } // matcher.appendTail(sb); // return sb.toString(); } public static final String yyyymmdd_regex = "(\\d{4})(\\d{2})(\\d{2})"; public static final Pattern yyyymmdd_pattern = Pattern.compile(yyyymmdd_regex); public static final Matcher yyyymmdd_matcher = yyyymmdd_pattern.matcher(""); public static final String yyyy_mm_dd_regex = "(\\d{4})-(\\d{2})-(\\d{2})"; public static final Pattern yyyy_mm_dd_pattern = Pattern.compile(yyyy_mm_dd_regex); public static final Matcher yyyy_mm_dd_matcher = yyyy_mm_dd_pattern.matcher(""); public static final String yyyy_mm_dd_2_regex = "(\\d{4})/(\\d{2})/(\\d{2})"; public static final Pattern yyyy_mm_dd_2_pattern = Pattern.compile(yyyy_mm_dd_2_regex); public static final Matcher yyyy_mm_dd_2_matcher = yyyy_mm_dd_2_pattern.matcher(""); public static final String yyyymmddhhsshh_regex = "(\\d{4})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})"; public static final Pattern yyyymmddhhsshh_pattern = Pattern.compile(yyyymmddhhsshh_regex); public static final Matcher yyyymmddhhsshh_matcher = yyyymmddhhsshh_pattern.matcher(""); public static final String yyyy_mm_dd_hh_ss_hh_regex = "(\\d{4}-\\d{2}-\\d{2}) (\\d{2}:\\d{2}:\\d{2})"; public static final Pattern yyyy_mm_dd_hh_ss_hh_pattern = Pattern.compile(yyyy_mm_dd_hh_ss_hh_regex); public static final Matcher yyyy_mm_dd_hh_ss_hh_matcher = yyyy_mm_dd_hh_ss_hh_pattern.matcher(""); public static final String valid_regex = "\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z"; public static final Pattern valid_pattern = Pattern.compile(valid_regex); public static final Matcher valid_matcher = valid_pattern.matcher(""); /** * * @param string * @return */ public static String ensureTdate(String string) { try{ yyyymmdd_matcher.reset(string); int len=string.length(); if(len==8&&yyyymmdd_matcher.find()){ return yyyymmdd_matcher.group(1)+"-"+yyyymmdd_matcher.group(2)+"-"+yyyymmdd_matcher.group(3)+"T00:00:00Z"; } yyyy_mm_dd_matcher.reset(string); if(len==10&&yyyy_mm_dd_matcher.find()){ return yyyy_mm_dd_matcher.group(1)+"-"+yyyy_mm_dd_matcher.group(2)+"-"+yyyy_mm_dd_matcher.group(3)+"T00:00:00Z"; } yyyy_mm_dd_2_matcher.reset(string); if(len==10&&yyyy_mm_dd_2_matcher.find()){ return yyyy_mm_dd_2_matcher.group(1)+"-"+yyyy_mm_dd_2_matcher.group(2)+"-"+yyyy_mm_dd_2_matcher.group(3)+"T00:00:00Z"; } yyyymmddhhsshh_matcher.reset(string); if(len==14&&yyyymmddhhsshh_matcher.find()){ return yyyymmddhhsshh_matcher.group(1)+"-"+yyyymmddhhsshh_matcher.group(2)+"-"+yyyymmddhhsshh_matcher.group(3)+"T"+yyyymmddhhsshh_matcher.group(4)+":"+yyyymmddhhsshh_matcher.group(5)+":"+yyyymmddhhsshh_matcher.group(6)+"Z"; } yyyy_mm_dd_hh_ss_hh_matcher.reset(string); if(len==19&&yyyy_mm_dd_hh_ss_hh_matcher.find()){ return yyyy_mm_dd_hh_ss_hh_matcher.group(1)+"T"+yyyy_mm_dd_hh_ss_hh_matcher.group(2)+"Z"; } valid_matcher.reset(string); if(valid_matcher.find()){ return valid_matcher.group(); } return "2099-09-09T00:00:00Z"; }catch(Exception e){ } return "2099-09-09T00:00:00Z"; } }