package com.get_proxy_demo; import com.jackson.bean.ContextSrc; import com.jackson.common.control.ProxyController; import com.jackson.db.po.Proxy; import com.jackson.db.po.Url; import com.jackson.db.service.UrlService; import com.jackson.task.parser.IParser; import com.jackson.utils.ProxyUtil; import com.jackson.utils.RegexUtil; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpRequestBase; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; /** * Created by Jackson on 2016/11/7. */ public class YouDaiLiPage2 implements IParser { private static Logger logger = LogManager.getLogger(YouDaiLiPage2.class.getName()); @Override public boolean responseHandle(Proxy proxy, ProxyController proxyController, Url url, UrlService urlService, ContextSrc contextSrc, HttpRequestBase request, CloseableHttpResponse response, String content) { List<String> matechIp = RegexUtil.matechIp(content); Source.getProxyController().insert(ProxyUtil.getHttpProxy(matechIp)); if(!url.getUrl().contains("_")) startChildUrls(url,content,urlService); logger.info("matechIp size{} url:{}",matechIp.size(),url.getUrl()); if(matechIp.size()==0){ logger.info("parserFailureTime:{} url:{}",url.getParserFailureTime(),url.getUrl()); if(url.getParserFailureTime()<10){ url.setParserFailureTime(url.getParserFailureTime()+1); urlService.add(url); } } if(matechIp.size()>0){ if(proxy!=null) logger.info("成功url:{} proxy{}:{} 时间:{}",url.getUrl(),proxy.getHost(),proxy.getPort(),System.currentTimeMillis()); else logger.info("成功url:{}",url.getUrl()); } return matechIp.size()>0; } private void startChildUrls(Url url, String html, UrlService urlService){ String rgx = "共\\d+页"; List<String> arr = RegexUtil.match(Pattern.compile(rgx), html); ArrayList<Url> urls = new ArrayList<>(); int page=0; if(arr.size()!=0) { String pageStr = arr.get(0); pageStr = pageStr.substring(1, pageStr.length()-1); page = Integer.parseInt(pageStr); } for(int i=page;i>1;i--){ String newRequestDate = url.getUrl().replace(".html", "_"+i+".html"); Url url1 = Url.newHttpGetUrl(newRequestDate, YouDaiLiPage2.class); url1.setPriority(url.getPriority()); urls.add(url1); } urlService.insert(urls); } }