/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package org.apdplat.search.util.baidu; import org.apache.commons.lang3.StringUtils; import org.apdplat.search.SearchResult; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * @author JONE * @mail 858305351@qq.com * @time 2013-11-11 * @description 通过Jsoup 获取百度搜索结果的基本信息 */ public class JsoupBaiduInfoUtil { private static final Logger LOG = LoggerFactory.getLogger(JsoupBaiduInfoUtil.class); private Document document = null; private SearchResult baiduModels = new SearchResult(); private String url = "http://www.baidu.com/s"; /** * 百度搜索结果:百度为您找到相关结果约13,100个 */ private static final String cssQuery = "div.nums"; /** * 解析标题 */ String titleCssQuery = "html body div#out div#in div#wrapper div#container div#content_left table#" + "tableNum" + ".result tbody tr td.c-default h3.t a"; /** * 解析简介 */ String summaryCssQuery = "html body div#out div#in div#wrapper div#container div#content_left table#" + "tableNum" + ".result tbody tr td.c-default div.c-abstract"; /** * @author JONE * @param name 需要查询的字段 * @param page * @throws java.io.IOException * @time 2013-11-11 * @description 构造器 */ public JsoupBaiduInfoUtil( String name,int page) throws IOException{ if(StringUtils.isEmpty(StringUtils.trim(name)) || 0 >= page){ throw new NullPointerException(); } this.document = Jsoup.connect(url).data("wd", name).data("pn", String.valueOf((page-1)*10)).get(); } /** * @author JONE * @return String * @time 2013-11-11 * @description 获取百度搜索结果:13100 */ public String getResultsCount(){ String resultsCountText = this.getResultsCountText(); if(StringUtils.isEmpty(StringUtils.trim(resultsCountText))){ return ""; } String regEx="[^0-9]"; Pattern p = Pattern.compile(regEx); Matcher m = p.matcher(resultsCountText); String totalCount = m.replaceAll("").trim(); baiduModels.setTotal(Integer.parseInt(totalCount)); return totalCount; } /** * @author JONE * @return String * @time 2013-11-11 * @description 获取百度搜索结果:百度为您找到相关结果约13,100个 */ public String getResultsCountText(){ if(null == document){ return ""; } LOG.debug("total cssQuery: " + cssQuery); Element totalElement = document.select(cssQuery).first(); String totalText = totalElement.text(); LOG.info("搜索结果:" + totalText); return totalText; } }