/**
*
* APDPlat - Application Product Development Platform
* Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.seo.rank.list.impl;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.seo.rank.list.Parser;
import org.seo.rank.list.UrlTools;
import org.seo.rank.model.Article;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Default {@link Parser} implementation: downloads a list page with jsoup, extracts
 * the article links selected by a CSS query, and follows the "next page" link until
 * no further page is found.
 *
 * <p>The static helpers ({@link #iteyeBlog()}, {@link #csdnBlog()}, ...) are ready-made
 * configurations (start URL plus CSS queries) for a few specific sites.
 *
 * @author 杨尚川
 */
public class DefaultParser implements Parser{
    private static final Logger LOGGER = LoggerFactory.getLogger(DefaultParser.class);
    private static final String ACCEPT = "text/html, */*; q=0.01";
    private static final String ENCODING = "gzip, deflate";
    private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    private static final String CONNECTION = "keep-alive";
    private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0";
    /** Marker that is stripped from ITeye titles before they are compared with OSChina titles. */
    private static final String ITEYE_PINNED_MARKER = "[置顶]";

    /**
     * Collects the articles of a paginated list, starting at {@code url} and following
     * the "next page" link page by page.
     *
     * <p>Pages are walked iteratively and every visited URL is remembered, so a next-page
     * link that points back to an already visited page ends the crawl instead of looping
     * forever. If a page fails to load or parse, the error is logged and the articles
     * collected so far are returned.
     *
     * @param url              address of the first list page
     * @param nextPageCssQuery CSS query selecting the candidate "next page" anchors
     * @param nextPageText     text an anchor must have (after trimming) to be the next-page link
     * @param titleCssQuery    CSS query selecting the article title anchors
     * @return the articles found on all visited pages, in page order; never {@code null}
     */
    @Override
    public List<Article> parse(String url, String nextPageCssQuery, String nextPageText, String titleCssQuery) {
        List<Article> articles = new ArrayList<>();
        Set<String> visited = new HashSet<>();
        String pageUrl = url;
        while (pageUrl != null) {
            if (!visited.add(pageUrl)) {
                // The next-page link led back to a page we already parsed: stop instead of looping.
                LOGGER.warn("检测到重复的列表页,停止翻页:{}", pageUrl);
                break;
            }
            try {
                Document document = fetch(pageUrl);
                collectArticles(document, pageUrl, titleCssQuery, articles);
                // Look up the link to the next page
                String nextPageUrl = getNextPageUrl(document, nextPageCssQuery, nextPageText);
                LOGGER.debug("下一页链接:{}", nextPageUrl);
                if (nextPageUrl == null) {
                    LOGGER.info("列表页解析完毕,最后一页:{}", pageUrl);
                    break;
                }
                nextPageUrl = UrlTools.normalizeUrl(pageUrl, nextPageUrl);
                LOGGER.debug("规范化后的下一页链接:{}", nextPageUrl);
                // Continue with the next page
                pageUrl = nextPageUrl;
            } catch (Exception e) {
                // Best effort: keep what has been collected so far and stop paging.
                LOGGER.error("解析列表页出错:" + pageUrl, e);
                break;
            }
        }
        return articles;
    }

    /**
     * Downloads and parses one page, sending the browser-like request headers defined
     * by this class.
     *
     * @param url address of the page
     * @return the parsed document
     * @throws java.io.IOException if the page cannot be fetched
     */
    private static Document fetch(String url) throws java.io.IOException {
        return Jsoup.connect(url)
                .header("Accept", ACCEPT)
                .header("Accept-Encoding", ENCODING)
                .header("Accept-Language", LANGUAGE)
                .header("Connection", CONNECTION)
                .header("User-Agent", USER_AGENT)
                .get();
    }

    /**
     * Appends one {@link Article} to {@code articles} for every element matched by
     * {@code titleCssQuery} that has both a non-blank text and a non-blank {@code href}.
     * Elements missing either one are logged and skipped.
     *
     * @param document      the parsed list page
     * @param pageUrl       address of the page, used to normalize the article links
     * @param titleCssQuery CSS query selecting the article title anchors
     * @param articles      list the articles are appended to
     */
    private static void collectArticles(Document document, String pageUrl, String titleCssQuery, List<Article> articles) {
        for (Element element : document.select(titleCssQuery)) {
            String title = element.text();
            String href = element.attr("href");
            if (StringUtils.isBlank(title) || StringUtils.isBlank(href)) {
                LOGGER.info("解析列表页出错:{} title:{}, href:{}", pageUrl, title, href);
                continue;
            }
            Article article = new Article();
            article.setTitle(title);
            article.setUrl(UrlTools.normalizeUrl(pageUrl, href));
            articles.add(article);
        }
    }

    /**
     * Finds the address of the next page.
     *
     * @param document         document of the current page
     * @param nextPageCssQuery CSS query selecting the candidate "next page" elements
     * @param nextPageText     text the matching element must have (compared after trimming)
     * @return the raw {@code href} of the first matching element with a non-blank
     *         {@code href}, or {@code null} if there is none or if no pagination
     *         criteria were supplied
     */
    private String getNextPageUrl(Document document, String nextPageCssQuery, String nextPageText){
        if (StringUtils.isBlank(nextPageCssQuery) || nextPageText == null) {
            // No pagination criteria: treat the list as a single page.
            return null;
        }
        String expectedText = nextPageText.trim();
        for (Element element : document.select(nextPageCssQuery)) {
            String text = element.text();
            LOGGER.debug(text);
            if (text == null || !expectedText.equals(text.trim())) {
                continue;
            }
            String href = element.attr("href");
            if (!StringUtils.isBlank(href)) {
                return href;
            }
            // A matching element without a usable href cannot be followed; keep looking.
        }
        return null;
    }

    /**
     * Parses a paginated list with a fresh {@link DefaultParser}, then logs every
     * article found and the elapsed time.
     *
     * @param url              address of the first list page
     * @param nextPageCssQuery CSS query selecting the candidate "next page" anchors
     * @param nextPageText     text of the "next page" anchor
     * @param titleCssQuery    CSS query selecting the article title anchors
     * @return the articles returned by {@link #parse(String, String, String, String)}
     */
    public static List<Article> run(String url, String nextPageCssQuery, String nextPageText, String titleCssQuery){
        Parser parser = new DefaultParser();
        long start = System.currentTimeMillis();
        List<Article> articles = parser.parse(url, nextPageCssQuery, nextPageText, titleCssQuery);
        long cost = System.currentTimeMillis() - start;
        int i = 1;
        for (Article article : articles) {
            LOGGER.info("{}、{} : {}", i++, article.getTitle(), article.getUrl());
        }
        LOGGER.info("采集文章 {} 篇耗时:{} 秒", articles.size(), cost / 1000.0);
        return articles;
    }

    /**
     * Runs the parser against the blog at {@code http://yangshangchuan.iteye.com/}.
     *
     * @return the articles found
     */
    public static List<Article> iteyeBlog(){
        String url = "http://yangshangchuan.iteye.com/";
        String nextPageCssQuery = "html body div#page div#content.clearfix div#main div.pagination a.next_page";
        String nextPageText = "下一页 »";
        String titleCssQuery = "html body div#page div#content.clearfix div#main div.blog_main div.blog_title h3 a";
        return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
    }

    /**
     * Runs the parser against {@code http://www.iteye.com/news}.
     *
     * @return the articles found
     */
    public static List<Article> iteyeNews(){
        String url = "http://www.iteye.com/news";
        String nextPageCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.pagination a.next_page";
        String nextPageText = "下一页 »";
        // "h3 > a" selects only anchors that are direct children of h3,
        // so "h3 span.category a" is not matched.
        String titleCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.news.clearfix div.content h3 > a";
        return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
    }

    /**
     * Runs the parser against {@code http://www.iteye.com/magazines}.
     *
     * @return the articles found
     */
    public static List<Article> iteyeMagazines(){
        String url = "http://www.iteye.com/magazines";
        String nextPageCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.pagination a.next_page";
        String nextPageText = "下一页 »";
        String titleCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.news.clearfix div.content h3 a";
        return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
    }

    /**
     * Runs the parser against the blog at {@code http://blog.csdn.net/iispring}.
     *
     * @return the articles found
     */
    public static List<Article> csdnBlog(){
        String url = "http://blog.csdn.net/iispring";
        String nextPageCssQuery = "html body div#container div#body div#main div.main div#papelist.pagelist a";
        String titleCssQuery = "html body div#container div#body div#main div.main div#article_list.list div.list_item.article_item div.article_title h1 span.link_title a";
        String nextPageText = "下一页";
        return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
    }

    /**
     * Runs the parser against {@code http://www.oschina.net/news}.
     *
     * @return the articles found
     */
    public static List<Article> oschinaNews(){
        String url = "http://www.oschina.net/news";
        String nextPageCssQuery = "html body div#OSC_Screen div#OSC_Content.CenterDiv div#NewsChannel.Channel div#NewsList.ListPanel div#RecentNewsList.panel ul.pager li.page.next a";
        String titleCssQuery = "html body div#OSC_Screen div#OSC_Content.CenterDiv div#NewsChannel.Channel div#NewsList.ListPanel div#RecentNewsList.panel ul.List li h2 a";
        String nextPageText = ">";
        return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
    }

    /**
     * Runs the parser against the blog at {@code http://my.oschina.net/apdplat/blog}.
     *
     * @return the articles found
     */
    public static List<Article> oschinaBlog(){
        String url = "http://my.oschina.net/apdplat/blog";
        String nextPageCssQuery = "html body div#OSC_Screen div#OSC_Content div.blog-user div.container div.flex-item div#search_list.flex-item-9.flex-item-md-9.content div#list.list.blog-list div.pages.sm-hide ul li a";
        String titleCssQuery = "html body div#OSC_Screen div#OSC_Content div.blog-user div.container div.flex-item div#search_list.flex-item-9.flex-item-md-9.content div#list.list.blog-list div.list-item div.layout div.layout-column div.title a";
        String nextPageText = "下一页";
        return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
    }

    /**
     * Runs the parser against the Baidu search result pages for {@code query}.
     *
     * @param query the search terms, not yet URL-encoded
     * @return the result links found, or an empty list if {@code query} is blank
     *         (including {@code null}) or cannot be encoded
     */
    public static List<Article> baidu(String query){
        // Check before encoding: once encoded, a whitespace-only query becomes "+"
        // and would no longer be recognized as blank.
        if (StringUtils.isBlank(query)) {
            return Collections.emptyList();
        }
        String encodedQuery;
        try {
            // URL-encode the search terms
            encodedQuery = URLEncoder.encode(query, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            LOGGER.error("url构造失败", e);
            return Collections.emptyList();
        }
        String url = "http://www.baidu.com/s?wd=" + encodedQuery;
        String nextPageCssQuery = "html body div div div p#page a.n";
        String titleCssQuery = "html body div div div div div h3.t a";
        String nextPageText = "下一页>";
        return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
    }

    /**
     * Compares the OSChina blog with the ITeye blog by article title and logs, as HTML
     * fragments, the titles present on both sites, only on OSChina, and only on ITeye.
     *
     * <p>ITeye titles are compared after removing the "[置顶]" marker and trimming; the
     * ITeye URL lookup uses the same normalized title so that such posts resolve to
     * their URL.
     */
    public static void blogCompare(){
        List<Article> ob = oschinaBlog();
        List<Article> ib = iteyeBlog();
        Map<String, String> om = new HashMap<>();
        Map<String, String> im = new HashMap<>();
        ob.forEach(b -> om.put(b.getTitle(), b.getUrl()));
        // Key by the normalized title: the lookups below use normalized titles.
        ib.forEach(b -> im.put(normalizeIteyeTitle(b.getTitle()), b.getUrl()));
        List<String> iteyeBlog = ib.stream().map(b -> normalizeIteyeTitle(b.getTitle())).sorted().collect(Collectors.toList());
        List<String> oschinaBlog = ob.stream().map(Article::getTitle).sorted().collect(Collectors.toList());
        // Sets give constant-time membership tests for the three filters below.
        Set<String> iteyeTitles = new HashSet<>(iteyeBlog);
        Set<String> oschinaTitles = new HashSet<>(oschinaBlog);

        List<String> commons = oschinaBlog.stream().filter(iteyeTitles::contains).collect(Collectors.toList());
        LOGGER.info("<h4>oschina和iteye都有({}):</h4>", commons.size());
        int index = 0;
        for (String title : commons) {
            LOGGER.info("{}、{} <a target=\"_blank\" href=\"{}\">oschina</a> <a target=\"_blank\" href=\"{}\">iteye</a><br/>",
                    ++index, title, om.get(title), im.get(title));
        }

        List<String> oschina = oschinaBlog.stream().filter(t -> !iteyeTitles.contains(t)).collect(Collectors.toList());
        LOGGER.info("<h4>oschina独有({}):</h4>", oschina.size());
        logLinkedTitles(oschina, om);

        List<String> iteye = iteyeBlog.stream().filter(t -> !oschinaTitles.contains(t)).collect(Collectors.toList());
        LOGGER.info("<h4>iteye独有({}):</h4>", iteye.size());
        logLinkedTitles(iteye, im);
    }

    /**
     * Removes the "[置顶]" marker from an ITeye title and trims the result.
     *
     * @param title the title as scraped from the page
     * @return the title used for comparison and URL lookup
     */
    private static String normalizeIteyeTitle(String title) {
        return title.replace(ITEYE_PINNED_MARKER, "").trim();
    }

    /**
     * Logs each title as a numbered HTML link, numbering from 1.
     *
     * @param titles the titles to log, in output order
     * @param urls   map from title to the URL used as the link target
     */
    private static void logLinkedTitles(List<String> titles, Map<String, String> urls) {
        int index = 0;
        for (String title : titles) {
            LOGGER.info("{}、<a target=\"_blank\" href=\"{}\">{}</a><br/>", ++index, urls.get(title), title);
        }
    }

    /**
     * Command-line entry point; runs {@link #blogCompare()}.
     *
     * @param args ignored
     */
    public static void main(String[] args){
        // Other ready-made runs: iteyeBlog(), iteyeNews(), iteyeMagazines(), csdnBlog(),
        // oschinaNews(), oschinaBlog(), baidu("...").
        blogCompare();
    }
}