package com.fpcms.common.util;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fpcms.common.webcrawler.htmlparser.HtmlPage.Anchor;
import com.fpcms.common.webcrawler.htmlparser.SinglePageCrawler;
/**
 * Static helpers for collecting article links from a blog page and pinging them.
 */
public class BlogUtil {

    private static final Logger logger = LoggerFactory.getLogger(BlogUtil.class);

    /** Minimum {@code expectedLinkChineseCount} used by {@link #pingAllBlog(String)}. */
    private static final int PING_MIN_LINK_CHINESE_COUNT = 8;

    /**
     * Collects the blog article links found for {@code blogUrl} that look valid.
     *
     * <p>A link is kept when {@code TextLangUtil.chineseCount} of its anchor text is
     * at least {@code expectedLinkChineseCount} (presumably the number of Chinese
     * characters in the text; confirm against {@code TextLangUtil}).
     *
     * @param blogUrl URL of the blog page handed to the crawler
     * @param expectedLinkChineseCount minimum count the link text must reach to be kept
     * @return a new list holding the anchors that passed the filter, in crawler order
     */
    public static List<Anchor> getValidBlogLinks(String blogUrl, int expectedLinkChineseCount) {
        SinglePageCrawler singlePageCrawler = new SinglePageCrawler();
        // NOTE: restricting accepted URLs to those containing blogUrl is currently disabled:
        // singlePageCrawler.setAcceptUrlRegexList(".*"+blogUrl+".*");
        List<Anchor> filtered = new ArrayList<Anchor>();
        List<Anchor> anchorList = singlePageCrawler.getShoudVisitAnchorList(blogUrl);
        for (Anchor a : anchorList) {
            if (TextLangUtil.chineseCount(a.getText()) >= expectedLinkChineseCount) {
                filtered.add(a);
            }
        }
        return filtered;
    }

    /**
     * Pings every valid link of {@code blogUrl} through {@code BlogPingUtil.baiduPing}.
     *
     * <p>Links are selected with {@link #getValidBlogLinks(String, int)} using a
     * threshold of {@value #PING_MIN_LINK_CHINESE_COUNT}. A failure on one link is
     * logged (including its cause) and does not stop the remaining links from
     * being processed.
     *
     * @param blogUrl URL of the blog page whose links should be pinged
     * @return the anchors for which the ping call completed without throwing
     */
    public static List<Anchor> pingAllBlog(String blogUrl) {
        List<Anchor> successList = new ArrayList<Anchor>();
        List<Anchor> list = BlogUtil.getValidBlogLinks(blogUrl, PING_MIN_LINK_CHINESE_COUNT);
        for (Anchor a : list) {
            try {
                BlogPingUtil.baiduPing(blogUrl, blogUrl, a.getHref(), "");
                successList.add(a);
            } catch (Exception e) {
                // Best effort per link: keep going, but pass the exception along so
                // the stack trace and cause end up in the log.
                logger.error("error_on_ping_blog_url:" + a, e);
            }
        }
        return successList;
    }
}