package com.taobao.top.analysis.node.io; import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import java.net.URLConnection; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.Date; import java.util.zip.GZIPInputStream; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import com.taobao.top.analysis.node.job.JobTask; import com.taobao.top.analysis.node.job.JobTaskExecuteInfo; /** * HubInputAdaptor.java * @author yunzhan.jtq * * @since 2012-5-16 下午02:11:23 */ public class HubInputAdaptor implements IInputAdaptor { private static final Log logger = LogFactory.getLog(HubInputAdaptor.class); private SimpleDateFormat simple = new SimpleDateFormat("yyyy-MM-dd"); /* (non-Javadoc) * @see com.taobao.top.analysis.node.io.IInputAdaptor#getInputFormJob(com.taobao.top.analysis.node.job.JobTask) * hub获取数据会有一段逻辑处理,目的在于游标管理、跨天数据处理 * 目前这种方式的hub拉取,有一个不可避免掉的问题————单行错误数据,该问题通过目前的方式不太容易解决 */ @Override public InputStream getInputFormJob(JobTask jobtask, JobTaskExecuteInfo taskExecuteInfo) { try { String input = jobtask.getInput(); String uri = input.replaceAll("hub:", "http:"); Long begin = Long.parseLong(jobtask.getInput().substring(jobtask.getInput().indexOf("&begin=") + 7, jobtask.getInput().indexOf("&end="))); Long end = Long.parseLong(jobtask.getInput().substring(jobtask.getInput().indexOf("&end=") + 5)); URL url; URLConnection conn; //首先获取今日日志的大小 String temp = uri.substring(0, uri.indexOf('?')); temp = temp.replaceAll("/get/", "/size/"); url = new URL(temp); conn = url.openConnection(); conn.setConnectTimeout(30000); conn.setReadTimeout(30000); BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream())); String record = reader.readLine(); if(!StringUtils.isNumeric(record)) { logger.error(input + " size is " + record); return null; } Long size = Long.parseLong(record); //重置游标处理 if (jobtask.getTailCursor().get()) { taskExecuteInfo.setFileBegin(0); taskExecuteInfo.setFileLength(size); taskExecuteInfo.setTimestamp(System.currentTimeMillis()); return conn.getInputStream(); } //如果不是重置游标,则先检查游标是否超过日志大小 //超过日志大小,对比前一日日志的大小以及当前时间,如果是已经跨天,则直接读新的日志 //否则不读任何数据 //目前此处跨天的处理逻辑 if (size < (begin+2)) { logger.error(input + " size is " + record + " and begin is " + begin); if(!temp.endsWith(".log")) { taskExecuteInfo.setFileBegin(begin); taskExecuteInfo.setFileLength(0L); taskExecuteInfo.setTimestamp(System.currentTimeMillis()); return conn.getInputStream(); } temp = temp + "." + getDate(); url = new URL(temp); conn = url.openConnection(); conn.setConnectTimeout(30000); reader = new BufferedReader(new InputStreamReader(conn.getInputStream())); record = reader.readLine(); long time = System.currentTimeMillis(); long lastTime = jobtask.getJobSourceTimeStamp(); long last = (lastTime + 28800000L) / 86400000; long now = (time + 28800000L) / 86400000; taskExecuteInfo.setFileBegin(0L); taskExecuteInfo.setFileLength(0L); taskExecuteInfo.setTimestamp(time); logger.error(input + " size is " + record + " and begin is " + begin + ",now is " + now + ", last is " + last); if(StringUtils.isNumeric(record)) { Long lastSize = Long.parseLong(record); if(lastSize < begin) { return conn.getInputStream(); } if((time+ 28800000L) % 86400000 > 300000) { return conn.getInputStream(); } if((lastTime == 0 && (time+ 28800000L) % 86400000 < 300000) || now > last) { temp = temp.replaceAll("/size/", "/get/"); if(temp.contains("?")) temp += "&"; else temp += "?"; temp = temp + "begin=" + begin + "&end=" + end; url = new URL(temp); conn = url.openConnection(); conn.setConnectTimeout(30000); return conn.getInputStream(); } else { taskExecuteInfo.setFileBegin(begin); return conn.getInputStream(); } } else { taskExecuteInfo.setFileBegin(begin); return conn.getInputStream(); } } url = new URL(uri); conn = url.openConnection(); conn.setConnectTimeout(30000); conn.setReadTimeout(30000); taskExecuteInfo.setFileBegin(Long.parseLong(conn.getHeaderField("file-begin"))); taskExecuteInfo.setFileLength(Long.parseLong(conn.getHeaderField("file-length"))); taskExecuteInfo.setTimestamp(System.currentTimeMillis()); if (!uri.contains("encode=text")) { GZIPInputStream gzipin = new GZIPInputStream(conn.getInputStream()); return gzipin; } return conn.getInputStream(); } catch (Throwable e) { logger.error("job get input error:" + jobtask.getJobName() + "," + jobtask.getInput(), e); } return null; } private String getDate() { Calendar c = Calendar.getInstance(); c.add(Calendar.DATE, -1); Date date = c.getTime(); return simple.format(date); } /* (non-Javadoc) * @see com.taobao.top.analysis.node.io.IInputAdaptor#ignore(java.lang.String) */ @Override public boolean ignore(String input) { return input.indexOf("hub:") < 0; } }