package com.hao.fetcher;
import com.hao.model.Price;
import com.hao.model.StopSale;
import com.hao.parser.ParserHomePage;
import com.hao.parser.ParserSpecificPage;
import com.hao.util.HttpClientUtil;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import static com.hao.common.Commons.writeStringtoFile;
/**
* Created by user on 2016/2/17.
*/
public class FetcherAutohomeData {
public static final Logger logger = LoggerFactory.getLogger(FetcherAutohomeData.class);
public static final Pattern pattern = Pattern.compile("(\\d+)");
private static Document getDocument(String url) throws IOException {
String content = HttpClientUtil.sendHttpsGet(url);
Document document = Jsoup.parse(content);
return document;
}
/**
* 得到所有的页面
* @return
*/
private static List<String> pages(){
List<String> urls = Lists.newArrayList();
for(char i = 65; i < 91;i++){
String url = "http://www.autohome.com.cn/grade/carhtml/" + i + ".html";
urls.add(url);
}
urls.remove("http://www.autohome.com.cn/grade/carhtml/E.html");
urls.remove("http://www.autohome.com.cn/grade/carhtml/I.html");
urls.remove("http://www.autohome.com.cn/grade/carhtml/U.html");
urls.remove("http://www.autohome.com.cn/grade/carhtml/V.html");
return urls;
}
/**
* 解析http://www.autohome.com.cn/grade/carhtml/A.html这种格式的页面 按照三层目录层级排放
* http://car1.autoimg.cn/logo/brand/50/129472203719848750.jpg,奥迪,奥迪(进口),Crosslane Coupe,http://www.autohome.com.cn/2908/#levelsource=000000000_0&pvareaid=101594
*http://car1.autoimg.cn/logo/brand/50/129472203719848750.jpg,奥迪,奥迪(进口),奥迪TT offroad,http://www.autohome.com.cn/3479/#levelsource=000000000_0&pvareaid=101594
*http://car1.autoimg.cn/logo/brand/50/129472203719848750.jpg,奥迪,奥迪(进口),e-tron quattro,http://www.autohome.com.cn/3894/#levelsource=000000000_0&pvareaid=101594
*http://car1.autoimg.cn/logo/brand/50/129472203719848750.jpg,奥迪,奥迪(进口),Nanuk,http://www.autohome.com.cn/3210/#levelsource=000000000_0&pvareaid=101594
*http://car1.autoimg.cn/logo/brand/50/129472203719848750.jpg,奥迪,奥迪(进口),quattro,http://www.autohome.com.cn/2218/#levelsource=000000000_0&pvareaid=101594
* @param url
* @return
* @throws IOException
*/
private static List<Map<String, String>> fetchSinglePageLink(String url) throws IOException {
Map<String,String> singlePageMap = Maps.newLinkedHashMap();
Map<String,String> grayPageMap = Maps.newLinkedHashMap();
List<Map<String, String>> lists = Lists.newArrayListWithCapacity(2);
Document document = getDocument(url);
Elements dlElems = document.select("dl");
for (int i = 0; i < dlElems.size(); i++) {
String img = dlElems.get(i).select("dt > a > img").attr("src");
String firstBrand = dlElems.get(i).select("dt > div > a").text();
String href = dlElems.get(i).select("dt > div > a").attr("href");
String firstBrandId = null;
Matcher matcher = pattern.matcher(href);
if(matcher.find()){
firstBrandId = matcher.group(1);
}
Elements divElems = dlElems.get(i).select("dd > div.h3-tit");
for (int j = 0; j < divElems.size(); j++) {
String secondBrand = divElems.get(j).text();
Element ulElem = dlElems.get(i).select("dd > ul").get(j);
Elements liElems = ulElem.select("li");
for (Element liElem : liElems) {
Elements aElems = liElem.select("h4 > a");
if(aElems.text().isEmpty()){
continue;
}
if(aElems.hasClass("greylink")) {
String thirdBrand = aElems.text();
grayPageMap.put(aElems.attr("href").replace("#levelsource=000000000_0&pvareaid=101594",""),"\"" + firstBrand + "\"" + "," + "\"" + firstBrandId + "\"" + "," + "\"" + img + "\"" + "," + "\"" + secondBrand + "\"" + "," + "\"" +thirdBrand + "\"");
}else {
String thirdBrand = aElems.text();
singlePageMap.put(aElems.attr("href").replace("#levelsource=000000000_0&pvareaid=101594",""),"\"" + firstBrand + "\"" + "," + "\"" + firstBrandId + "\"" + "," + "\"" + img + "\"" + "," + "\"" + secondBrand + "\"" + "," + "\"" +thirdBrand + "\"");
}
}
}
}
lists.add(singlePageMap);
lists.add(grayPageMap);
return lists;
}
/**
* 解析程序的入口 只要调用这个函数就可以获取全部数据
* @param salePath 写入在售车型数据
* @param stopSalePath 写入停售车型数据
* @return
* @throws IOException
*/
public static void get(String salePath,String stopSalePath,String errorPath){
List<String> pages = pages();
pages.parallelStream().parallel().forEach(pageUrl -> {
try {
List<Map<String, String>> mapPair = fetchSinglePageLink(pageUrl);
Map<String, String> singleMap = mapPair.get(0);
Map<String, String> grayMap = mapPair.get(1);
//处理带有配置参数的页面
singleMap.forEach((homeUrl, data) -> {
try {
Map<StopSale, List<List<Object>>> stopSaleListMap = ParserHomePage.parseGrayPage(homeUrl);
parseMap(stopSaleListMap, data, stopSalePath);
Optional<Map<String, Price>> priceMap = ParserHomePage.parseHomePage(homeUrl, "");
if (priceMap.isPresent()) {
priceMap.get().forEach((configUrl, homeData) -> {
try {
List<List<Object>> lists = ParserSpecificPage.parseSpecificPage(configUrl, errorPath);
if (lists != null) {
lists.forEach(list -> {
list = list.stream().map(obj -> "\"" + obj + "\"" + ",").collect(Collectors.toList());
String lst = "";
for (Object o : list) {
lst += o;
}
String s = "finally value:" + data + "," + homeData + "," + lst;
System.out.println(s);
String write = data + "," + homeData + "," + lst + "\n";
try {
writeStringtoFile(salePath, write, true);
} catch (IOException e) {
}
});
}
} catch (IOException e) {
}
});
}
} catch (IOException e) {
}
});
//解析灰色链接的数据
grayMap.forEach((homeUrl, data) -> {
try {
Map<StopSale, List<List<Object>>> map = ParserHomePage.parseGrayPage(homeUrl);
parseMap(map, data, stopSalePath);
} catch (IOException e) {
}
});
} catch (Exception e) {
try {
writeStringtoFile("error_url.txt",pageUrl + "\n",true);
} catch (IOException e1) {}
}
});
}
private static void parseMap(Map<StopSale, List<List<Object>>> map,String data,String stopSalePath) throws IOException {
if(map == null){
String s = "finally value:(only data)" + data;
System.out.println(s);
String write = data + "\n";
writeStringtoFile(stopSalePath,write,true);
}else{
map.forEach((stopSale, lists) -> {
if (lists == null){
String s = "finally value:(data and stopSale)" + data + "," + stopSale;
System.out.println(s);
String write = data + "," + stopSale + "\n";
try {
writeStringtoFile(stopSalePath,write,true);
} catch (IOException e) {}
}else{
lists.forEach(list ->{
list = list.stream().map(obj -> "\"" + obj + "\"" + ",").collect(Collectors.toList());
String lst = "";
for (Object o : list) {
lst += o;
}
String s = "finally value:(all)" + data + "," + stopSale + "," + lst;
System.out.println(s);
String write = data + "," + stopSale + "," + lst + "\n";
try {
writeStringtoFile(stopSalePath,write,true);
} catch (IOException e) {}
});
}
});
}
}
public static void main(String[] args) throws IOException {
Scanner scanner = new Scanner(System.in);
System.out.println("在售数据的存储路径:");
String salePath = scanner.next();
System.out.println("停售数据的存储路径:");
String stopSalePath = scanner.next();
System.out.println("错误的URL存储路径");
String errorPath = scanner.next();
FetcherAutohomeData.get(salePath,stopSalePath,errorPath);
}
}