package com.hao.parser; import static com.hao.common.Commons.*; import com.hao.model.Price; import com.hao.model.StopSale; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import org.jsoup.nodes.Document; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Created by user on 2016/2/18. */ public class ParserHomePage { public static final Pattern pattern = Pattern.compile("(\\d+)"); public static final Pattern patternContent = Pattern.compile("\\((.*)\\)"); public static final Logger logger = LoggerFactory.getLogger(ParserHomePage.class); /** * 用来解析首页 获取车系首页的数据 * 车系的数据需要 * 新车指导价:6.27-7.47万 (4款车型) * 二手车价格:1.00-7.20万 (48个车源) * 用户评分 * 同时提取出参数配置的链接 * 还需要停售的数据 * http://www.autohome.com.cn/2097/ * @param url * @throws IOException */ public static Optional<Map<String, Price>> parseHomePage(String url,String path) throws IOException { Document document = getDocument(url); Map<String,Price> homeData = Maps.newLinkedHashMap(); //处理车系首页需要的数据 转换为price的实体类 List<String> oldList = getJsonp(url); if(oldList == null) return null; Price price = homePageData(document,oldList); Elements liElems = document.select(".nav-item"); if(liElems.isEmpty()) return null; Elements aElem = liElems.get(1).select("a"); if(!aElem.isEmpty()){ String href = aElem.attr("href"); homeData.put(href,price); if(logger.isInfoEnabled()){ logger.info("href is:{}" + href); } } Optional<Map<String, Price>> optHomeData = Optional.of(homeData); return optHomeData; } public static Map<StopSale,List<List<Object>>> parseGrayPage(String url) throws IOException { Map<StopSale,List<List<Object>>> map = Maps.newLinkedHashMap(); String link = stopSaleLink(url, ""); if(link == null) return null; List<StopSale> stopSales = ParserStopSalePage.parseStopSaleData(link, ""); if(stopSales == null) return null; for (StopSale stopSale : stopSales) { String configLink = stopSale.getLink(); List<List<Object>> lists = ParserSpecificPage.parseSpecificPage(configLink, ""); if (lists == null){ map.put(stopSale,null); continue; } map.put(stopSale,lists); } return map; } /** * 处理汽车首页需要的数据 将其组装成price model * @param document * @param oldList 这是处理回调函数返回的数据 * @return */ private static Price homePageData(Document document,List<String> oldList){ Elements dtElems = document.select(".autoseries-info > dl > dt"); Price price = new Price(); String newPrice = dtElems.get(0).select("a").get(0).text(); String carType = null; if(document.select(".koubei-score > a").size() > 1){ carType = dtElems.get(0).select("a").get(1).text(); }else{ carType = "暂无"; } String oldPrice = oldList.get(0) + "-" + oldList.get(1); String carSource = oldList.get(2); String SId = oldList.get(3); String engine = document.select(".autoseries-info > dl > dd").get(1).text(); String specData = document.select(".autoseries-info > dl > dd").get(2).text(); String score = null; if(document.select(".koubei-score > a").size() > 1){ score = document.select(".koubei-score > a").get(1).text(); }else{ score = "暂无口碑"; } String carName = document.select(".subnav-title-name > a").get(0).text(); String bigImg = document.select(".autoseries-pic-img1 > a > img").attr("src"); String factImg = document.select(".autoseries-pic-img2").get(0).select("a > img").attr("src"); String videoImg = document.select(".autoseries-pic-img2").get(1).select("a > img").attr("src"); price.setCarName(carName); price.setCarSource(carSource); price.setCarType(carType); price.setNewPrice(newPrice); price.setOldPrice(oldPrice); price.setScore(score); price.setBigImg(bigImg); price.setFactImg(factImg); price.setVideoImg(videoImg); price.setEngine(engine); price.setSpecData(specData); price.setSId(SId); return price; } /** * 解析车系页面 获取停售页面链接 * @param url 车系页面链接 * @param path 保存的文件路径 * @throws IOException */ private static String stopSaleLink(String url,String path) throws IOException { try { Document document = getDocument(url); String href = document.select(".other-car > .link-sale").attr("href"); if (href.isEmpty()) return null; String link = "http://www.autohome.com.cn" + href; if (logger.isDebugEnabled()) { logger.debug("stop sale link is:{}", link); } return link; } catch (Exception e){ writeStringtoFile(path,url + "\n",true); return null; } } private static List<String> getJsonp(String url) throws IOException { Matcher matcher = pattern.matcher(url); String sid = null; if(matcher.find()){ sid = matcher.group(1); } String jsonp = "http://api.che168.com/auto/ForAutoCarPCInterval.ashx?callback=che168CallBack&_appid=cms&sid=" + sid + "&yid=0&pid=110000"; Document document = null; try { document = getDocument(jsonp); } catch (Exception e) { if(e instanceof IllegalArgumentException){ writeStringtoFile("error_url.txt",url + "\n",true); }else{ throw e; } } if(document == null) return null; return parseJsonp(document); } /** * 处理jsonp {"returncode":0,"message":"","result":{"SId":3343,"YId":0,"minPrice":"4.90","maxPrice":"7.80","url":"http://www.che168.com/china/baojun/baojun610/4_8/?pvareaid=100383","count":8}} * * @param document * @return */ private static List<String> parseJsonp(Document document) { List<String> list = Lists.newArrayListWithCapacity(4); String content = document.html(); Matcher matcher = patternContent.matcher(content); if(matcher.find()){ content = matcher.group(1); } content = filter(content); if(content.contains("不存在车源信息")){ list.add("暂无"); list.add("暂无"); list.add("暂无"); list.add("暂无"); return list; } String SId = content.substring(content.indexOf("SId:")+4, content.indexOf(",YId")); String minPrice = content.substring(content.indexOf("minPrice:")+9, content.indexOf(",maxPrice")); String maxPrice = content.substring(content.indexOf("maxPrice:")+9, content.indexOf(",url")); String count = content.substring(content.indexOf("count:")+6, content.indexOf("}")); list.add(minPrice); list.add(maxPrice); list.add(count); list.add(SId); return list; } private static String filter(String content) { String cont = content.replaceAll("&", "").replaceAll("quot;","").replace("\"",""); return cont; } }