package com.hao.parser;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Table;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.selector.Html;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import static com.hao.common.Commons.*;
/**
* Created by user on 2016/2/18.
*/
public class ParserSpecificPage {
public static final Logger logger = LoggerFactory.getLogger(ParserSpecificPage.class);
/**
* 取出一页详情页的数据
* @param url
* @return
* @throws IOException
*/
public static List<List<Object>> parseSpecificPage(String url,String errorPath) throws IOException {
// System.out.println("正在解析的url:" + url);
String content = null;
try {
content = getDocument(url).toString();
} catch (Exception e) {
parseSpecificPage(url,errorPath);
}
if (content.contains("抱歉,暂无相关数据。") || content.contains("您访问的页面出错了")) {
return null;
}
Html html = Html.create(content);
//取出id 解析其中的js,非贪婪匹配
try {
String idList = html.regex("specIDs =\\[(.*?)\\];", 1).get();
String keyLink = html.regex("keyLink = (.*?);", 1).get();
String config = html.regex("var config = (.*?);", 1).get();
String option = html.regex("var option = (.*)var bag", 1).get().replace(";", "");
Preconditions.checkNotNull(keyLink, "keyLink can not be null");
Preconditions.checkNotNull(config, "config can not be null");
Preconditions.checkNotNull(option, "option can not be null");
//用于将来的扩展
// String color = html.regex("var color = (.*?);",1).get();
// String innerColor = html.regex("var innerColor=(.*?);",1).get();
String[] ids = split(idList);
List<String> nameList = parseStandardField(keyLink);
Table<String, String, String> configList = parseJson(config, "paramtypeitems", "paramitems", "value", ids);
Table<String, String, String> optionList = parseJson(option, "configtypeitems", "configitems", "value", ids);
//// List<Map<String, List<String>>> colorList = parseJsonForColor(color, ids);
//// List<Map<String, List<String>>> innerColorList = parseJsonForColor(innerColor, ids);
List<List<Object>> fields = parseListNoColor(ids, configList, optionList, nameList);
// System.out.println(fields);
return fields;
}catch (Exception e){
writeStringtoFile(errorPath,url + "\n",true);
return null;
}
}
/**
* 用来处理没有颜色的数据
* @param ids
* @param configList
* @param optionList
* @return
*/
public static List<List<Object>> parseListNoColor(String[] ids,Table<String,String,String> configList,Table<String,String,String> optionList,
List<String> nameList){
List<List<Object>> contentLists = Lists.newArrayList();
List<Object> contentList = Lists.newArrayList();
for (String id : ids) {
parseSpecList(id,configList,optionList,contentList,nameList);
contentLists.add(contentList);
contentList = Lists.newArrayList();
}
return contentLists;
}
/**
* /**
* [[{车型名称={1002785=宝斯通 2014款 3.0TVIP版NGD3.0-C3HA}}, {车型名称={1001002=宝斯通 2010款 2.2L高级版HFC4GA2-1B}}, {车型名称={1001001=宝斯通 2010款 3.0T高级版NGD3.0-C3HA}}]
* [车型名称, 厂商指导价(元), 厂商, 级别, 发动机, 变速箱, 长*宽*高(mm), 车身结构, .....]
* @param id
* @param
* @param contentList
* @param nameList
* 从list里找到name,如果没有name就赋值为""
*/
private static void parseSpecList(String id,Table<String,String,String> configTable,Table<String,String,String> optionTable, List<Object> contentList
,List<String> nameList) {
for (String name : nameList) {
String value = configTable.get(name, id);
if(value == null) {
value = optionTable.get(name, id);
}
contentList.add(value);
}
contentList.add("id:" + id);
contentList.add("http://www.autohome.com.cn/spec/" + id);
}
/**
* 用于将来的扩展
* @param id
* @param list
* @param contentList
*/
private static void parseSpecListForColor(String id,List<Map<String,List<String>>> list,List<Object> contentList){
for (Map<String, List<String>> map : list) {
List<String> ss = map.get(id);
if(ss == null) continue;
contentList.add(ss);
}
}
/**
* 批量更新
* @param links
* @return
* @throws IOException
*/
public static void updateData(List<String> links,String errorPath) throws IOException {
for (String link : links) {
parseSpecificPage(link,errorPath);
}
}
public static void main(String[] args) throws IOException {
parseSpecificPage("http://car.autohome.com.cn/config/series/2097.html","");
}
}