package com.alimama.quanjingmonitor.mdrillImport.parse; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Map; import org.apache.log4j.Logger; import com.alimama.mdrillImport.InvalidEntryException; public class aplus_text extends com.alimama.mdrillImport.DataParser{ private static final long serialVersionUID = 1L; public volatile long groupCreateerror=0; /** * create stream table overwrite aplus_target_table ( 0 version string, 1 ip string, 2 time string, 3 url string, 4 user_agent string, 5 linezing_session string, 6 cna string, 7 adid string, 8 amid string, 9 cmid string, 10 pmid string, 11 uid string, 12 sid string, 13 pre string, 14 cache_ string, 15 scr string, 16 nick string, 17 at_autype string, 18 bbid string, 19 at_isb string, 20 at_mall_pro_re string, 21 at_mall_re string, 22 at_shoptype string, 23 b2c_auction string, 24 b2c_brand string, 25 b2c_orid string, 26 at_type string, 27 category string, 28 marketinfo string, 29 atp_isdpp string, 30 at_bucketid string, 31 at_insid string, 32 at_jporid string, 33 upi_bi string, 34 rpi_bi string, 35 wm_pageid string, 36 wm_prototypeid string, 37 wm_sid string, 38 spm_cnt string, 39 title string, 40 url_type string, 41 ref_type string, 42 ref_shopid string, 43 parse_ip string, 44 parse_time string, 45 logkey string, 46 gmkey string, 47 gokey string, 48 logtype string, 49 atp_sid string, 50 userid string, 51 isbeta string, 52 spm_url string, 53 spm_pre string ) with ( input.type='tt', galaxy.semantic.source.timetunnel.logname='aplus_text', galaxy.semantic.source.timetunnel.accesskey='accesskey', galaxy.semantic.source.timetunnel.subid='subid', galaxy.semantic.source.timetunnel.checkpoint.name='ds_rt_pv_10', galaxy.semantic.source.timetunnel.start.time='2013-10-23 00:00:00', galaxy.semantic.source.max.batch.size=100, galaxy.semantic.source.input.field.delimiter='\u0001', galaxy.semantic.source.debug=false ); galaxy.semantic.source.parser.classpath=com.alibaba.galaxy.semantic.source.component.DefaultParser */ private static Logger LOG = Logger.getLogger(aplus_text.class); private volatile long lines=0; private volatile long lines_sb=0; private volatile long lines_sb2=0; private static long TS_MAX=3600l*24*31; private volatile long laststartts=System.currentTimeMillis()/1000-TS_MAX; private volatile long lastendts=System.currentTimeMillis()/1000+TS_MAX; private volatile long timediff=System.currentTimeMillis(); private volatile long timediff2=System.currentTimeMillis(); public String formatRows(String[] clicklog) { StringBuilder b = new StringBuilder(); for (int i = 0; i < clicklog.length; i++) { b.append(i); b.append("="); b.append(String.valueOf(clicklog[i])); b.append(","); } return b.toString(); } @Override public DataIter parseLine(String line) throws InvalidEntryException { try { if(line==null) { return null; } this.lines++; if(this.lines>100000) { this.laststartts=(System.currentTimeMillis()/1000)-TS_MAX; this.lastendts=(System.currentTimeMillis()/1000)+TS_MAX; this.lines=0; } String[] log =line.split("\001",-1); if(log==null||log.length<46) { return null; } long ts = Long.parseLong(log[2]); this.lines_sb++; if(this.lines_sb>5000) { this.lines_sb=0; long nowts=System.currentTimeMillis(); if(nowts-timediff>30000) { timediff=nowts; LOG.info("parseLine_sb_"+formatDayMin.format(new Date(ts*1000))+" "+formatRows(log)); } } if(ts<laststartts||ts>lastendts) { return null; } // 0 version string, // 1 ip string, // 2 time string, // 3 url string, // 4 user_agent string, // 5 linezing_session string, // 6 cna string, // 7 adid string, // 8 amid string, // 9 cmid string, // 10 pmid string, // 11 uid string, // 12 sid string, // 13 pre string, // 14 cache_ string, // 15 scr string, // 16 nick string, // 17 at_autype string, // 18 bbid string, // 19 at_isb string, // 20 at_mall_pro_re string, // 21 at_mall_re string, // 22 at_shoptype string, // 23 b2c_auction string, // 24 b2c_brand string, // 25 b2c_orid string, // 26 at_type string, // 27 category string, // 28 marketinfo string, // 29 atp_isdpp string, // 30 at_bucketid string, // 31 at_insid string, // 32 at_jporid string, // 33 upi_bi string, // 34 rpi_bi string, // 35 wm_pageid string, // 36 wm_prototypeid string, // 37 wm_sid string, // 38 spm_cnt string, // 39 title string, // 40 url_type string, // 41 ref_type string, // 42 ref_shopid string, // 43 parse_ip string, // 44 parse_time string, // 45 logkey string, // 46 gmkey string, // 47 gokey string, // 48 logtype string, // 49 atp_sid string, // 50 userid string, // 51 isbeta string, // 52 spm_url string, // 53 spm_pre string String ad_id=log[7]; String url_ad_id=getName(log[3], "ad_id"); if((ad_id==null||ad_id.isEmpty())&&(url_ad_id==null||url_ad_id.isEmpty())) { return null; } DataIterParse rtn= new DataIterParse(ts,log,ad_id,url_ad_id); if(!rtn.isvalidate()) { this.lines_sb2++; if(this.lines_sb2>5000) { this.lines_sb2=0; long nowts=System.currentTimeMillis(); if(nowts-timediff2>30000) { timediff2=nowts; LOG.info("parseLine_sb2_"+formatDayMin.format(new Date(ts*1000))+" "+formatRows(log)); } } return null; } return rtn; } catch (Throwable nfe) { if(groupCreateerror<100) { LOG.error("InvalidEntryException:"+line,nfe); groupCreateerror++; } throw new InvalidEntryException("Invalid log `" + line + "'\n" , nfe); } } public static class DataIterParse implements DataIter{ private long ts; private String ad_id; private String logkey=""; private boolean ispv2=false; private boolean isclick_1=false; private boolean isclick_2=false; private String url_ad_id; private String pid=null; private String pid_Url=null; private Map<String, String> map=FetchAdid2Pid.fetch(); private String fetch(String adid,String strday) { if(adid==null||adid.isEmpty()) { return null; } String ad_id_cut=adid.substring(0, Math.min(10,adid.length())); return map.get(strday+"@"+String.valueOf(ad_id_cut)); } int index=0; public DataIterParse(long ts,String[] pvlog,String ad_id,String url_ad_id) { this.ts=ts; this.ad_id=ad_id; this.url_ad_id=url_ad_id; String Url=decodeString(pvlog[3]); String pre=decodeString(pvlog[13]); this.logkey=pvlog[45]; String strday=formatDay.format(new Date(ts*1000)); this.pid=this.fetch(this.ad_id, strday); this.pid_Url=this.fetch(this.url_ad_id, strday); String jlogid=getName(Url, "jlogid"); String jlogid_pre=getName(pre, "jlogid"); boolean isMathchUrl=Url.indexOf("38.tmall.com")>=0&&jlogid!=null&&!jlogid.isEmpty(); boolean isMathchPre=pre.indexOf("38.tmall.com")>=0&&jlogid_pre!=null&&!jlogid_pre.isEmpty(); this.ispv2=this.pid!=null&&(!this.logkey.equals("/"))&&this.ad_id.startsWith("10")&&isMathchUrl; this.isclick_2=this.pid!=null&&(!this.logkey.equals("/"))&&this.ad_id.startsWith("10")&&isMathchPre; this.isclick_1=this.pid_Url!=null&&this.logkey.equals("/")&&this.url_ad_id.length()>10&&Url.indexOf("ju.mmstat.com")>=0;//from url } public boolean isvalidate() { return this.ispv2||this.isclick_2||this.isclick_1; } @Override public boolean next() { index++; if(this.ispv2||this.isclick_2) { this.ispv2=false; this.isclick_2=false; return this.isclick_1; } return false; } @Override public Number[] getSum() { if(this.ispv2||this.isclick_2) { return new Number[]{ this.ispv2?1:0 ,0 ,this.isclick_2?1:0 ,0 ,0 ,0 ,0 ,0 }; } return new Number[]{ 0 ,this.isclick_1?1:0 ,0 ,0 ,0 ,0 ,0 ,0 }; } @Override public long getTs() { return (ts/10)*10000; } @Override public Object[] getGroup() { long ts300=(this.ts/300)*300000; Date d= new Date(ts300); if(this.ispv2||this.isclick_2) { return new String[] { String.valueOf(formatDay.format(d)), String.valueOf(formatMin.format(d)), "pc", "aplus_text", "pc", this.pid , "" ,DebugVersion.version+","+index// String.valueOf(actname) } ; } return new String[] { String.valueOf(formatDay.format(d)), String.valueOf(formatMin.format(d)), "pc", "aplus_text", "pc", this.pid_Url , "" ,DebugVersion.version+","+index// String.valueOf(actname) } ; } } private static String[] colSumName={ "pv_2" ,"click_1" ,"click_2" ,"promise_click" ,"pc_2_wap" ,"weakup" ,"backup_1" ,"backup_2" }; private static String[] colname={ "thedate" ,"miniute_5" ,"source" ,"sub_source" ,"media_name" ,"media_pid" ,"channel" ,"o2o" }; private static String decodeString(String args) { try { return new String(java.net.URLDecoder.decode(args,"UTF-8") .getBytes("UTF-8"), "UTF-8"); } catch (Throwable e) { try { return new String(java.net.URLDecoder.decode(args,"GBK") .getBytes("UTF-8"), "UTF-8"); } catch (Throwable e2) { return args; } } } public static String getName(String url,String keyname) { if(url==null) { return null; } try{ String[] tem = decodeString(url).split("\\?", 2); String params=tem[0]; if (tem.length >= 2){ params=tem[1]; } for (String s: params.split("&", -1)) { String[] tem1 = s.split("=", -1); String key = decodeString(tem1[0]); if(key.equals(keyname)) { String value = (tem1.length < 2 ? "" : decodeString(tem1[1])); return value; } } }catch(Throwable e){} return null; } @Override public String[] getSumName() { return colSumName; } @Override public String getTableName() { return "rpt_adpmp_3_8_online"; } private static SimpleDateFormat formatDay = new SimpleDateFormat("yyyyMMdd"); private static SimpleDateFormat formatMin = new SimpleDateFormat("HHmm"); private static SimpleDateFormat formatDayMin = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); @Override public String[] getGroupName() { return colname; } }