package com.fpcms.service.article_crawl;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.DateUtils;
import org.apache.shiro.util.CollectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.util.Assert;
import com.github.rapid.common.util.DateConvertUtil;
import com.github.rapid.common.util.DateRange;
import com.github.rapid.common.util.page.Page;
import com.github.rapid.common.util.page.PageQuery;
import com.fpcms.common.random_gen_article.BaiduTopBuzzUtil;
import com.fpcms.common.random_gen_article.NaipanArticleGeneratorUtil;
import com.fpcms.common.util.ApplicationContextUtil;
import com.fpcms.common.util.Constants;
import com.fpcms.common.util.GoogleTranslateUtil;
import com.fpcms.common.util.HtmlFormatUtil;
import com.fpcms.common.util.KeywordUtil;
import com.fpcms.common.util.RegexUtil;
import com.fpcms.common.util.SearchEngineUtil;
import com.fpcms.common.util.TextLangUtil;
import com.fpcms.common.util.URLEncoderUtil;
import com.fpcms.common.webcrawler.htmlparser.HtmlPage;
import com.fpcms.common.webcrawler.htmlparser.HtmlPage.Anchor;
import com.fpcms.common.webcrawler.htmlparser.HtmlPageCrawler;
import com.fpcms.common.webcrawler.htmlparser.SinglePageCrawler;
import com.fpcms.model.CmsContent;
import com.fpcms.model.CmsKeyValue;
import com.fpcms.model.CmsSite;
import com.fpcms.service.CmsContentService;
import com.fpcms.service.CmsKeyValueService;
import com.fpcms.service.CmsSiteService;
/**
 * Service that crawls (collects) articles from other websites.
 *
 * @author badqiu
 *
 */
public class ArticleCrawlService implements ApplicationContextAware,InitializingBean{
/** Class-wide SLF4J logger; declared final because it is never reassigned. */
private static final Logger logger = LoggerFactory.getLogger(ArticleCrawlService.class);
/** Crawlers executed by crawlAllSite(); replaced wholesale by loadSinglePageCrawlerList(). */
private List<SinglePageCrawler> singlePageCrawlerList = new ArrayList<SinglePageCrawler>();
/** Default page handler handed to every crawler in crawlAllSite(); never reassigned. */
private final HtmlPageCrawler htmlPageCrawler = new HtmlPageCrawlerImpl();
/** Persists crawled articles and answers duplicate-source-URL queries; injected via setter. */
private CmsContentService cmsContentService;
/** Supplies the sites whose keywords are crawled by crawlAllSiteKeyword(); injected via setter. */
private CmsSiteService cmsSiteService;
/** Spring context used to look up SinglePageCrawler beans; injected via setApplicationContext(). */
private ApplicationContext applicationContext;
/** Records which keywords have already been searched; injected via setter. */
private CmsKeyValueService cmsKeyValueService;
/**
 * Replaces the crawler list with the {@link SinglePageCrawler} beans that
 * {@code ApplicationContextUtil.getBeans} returns for the injected application context.
 */
public void loadSinglePageCrawlerList() {
    List<SinglePageCrawler> crawlerBeans =
            ApplicationContextUtil.getBeans(applicationContext, SinglePageCrawler.class);
    this.singlePageCrawlerList = crawlerBeans;
}
/**
 * Injects the content service used to store crawled articles.
 *
 * @param cmsContentService the service to use
 */
public void setCmsContentService(CmsContentService cmsContentService) {
    this.cmsContentService = cmsContentService;
}
/**
 * Injects the key/value service used to remember already-searched keywords.
 *
 * @param cmsKeyValueService the service to use
 */
public void setCmsKeyValueService(CmsKeyValueService cmsKeyValueService) {
    this.cmsKeyValueService = cmsKeyValueService;
}
/**
 * Injects the site service whose sites are iterated by {@code crawlAllSiteKeyword()}.
 *
 * @param cmsSiteService the service to use
 */
public void setCmsSiteService(CmsSiteService cmsSiteService) {
    this.cmsSiteService = cmsSiteService;
}
/**
 * Stores the Spring application context so crawler beans can be looked up later.
 *
 * @param applicationContext the context this bean runs in
 * @throws BeansException declared by {@link ApplicationContextAware}; not thrown here
 */
@Override
public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
    this.applicationContext = applicationContext;
}
/**
 * Crawls all sites: walks the loaded crawler list in order, gives each crawler the
 * shared default page handler and executes it.
 */
public synchronized void crawlAllSite() {
    for (SinglePageCrawler siteCrawler : singlePageCrawlerList) {
        // Every crawler shares the same default handler instance.
        siteCrawler.setHtmlPageCrawler(htmlPageCrawler);
        siteCrawler.execute();
    }
}
/**
 * Crawls the keyword of every site: for each site returned by the site service, the
 * first token of its keyword is searched, and articles found are assigned to that
 * site's domain. Sites whose keyword yields no tokens are skipped.
 */
public synchronized void crawlAllSiteKeyword() {
    for (final CmsSite site : cmsSiteService.findAll()) {
        List<String> tokens = KeywordUtil.toTokenizerList(site.getKeyword());
        if (!tokens.isEmpty()) {
            // Only the first token is used, both as search term and as tag.
            String primaryKeyword = tokens.get(0);
            HtmlPageCrawler siteBoundHandler = new HtmlPageCrawlerImpl() {
                @Override
                protected void prepareCmsContent(CmsContent c) {
                    super.prepareCmsContent(c);
                    c.setSite(site.getSiteDomain());
                }
            };
            crawByKeyword(primaryKeyword, primaryKeyword, siteBoundHandler);
        }
    }
}
/**
 * Crawls trending keywords: every entry returned by
 * {@code BaiduTopBuzzUtil.getBaiduBuzzs()} is searched with the tag {@code "buzz"},
 * and each resulting article title is prefixed before it is stored.
 */
public synchronized void crawlAllBuzzKeyword() {
    Set<String> hotKeywords = BaiduTopBuzzUtil.getBaiduBuzzs();
    for (final String hotKeyword : hotKeywords) {
        HtmlPageCrawler titlePrefixingHandler = new HtmlPageCrawlerImpl() {
            @Override
            public void prepareCmsContent(CmsContent c) {
                c.setTitle("图片故事-" + c.getTitle());
            }
        };
        crawByKeyword(hotKeyword, "buzz", titlePrefixingHandler);
    }
}
/**
 * Crawls an invoice-related keyword. Delegates to
 * {@code crawlByKeyword} with the tag {@code "zh_fapiao"}, {@code "zh-CN"} as both
 * source language and interface language, and the keyword as its own replacement.
 *
 * @param keyword the keyword to search for
 * @return the articles created by the crawl
 */
public synchronized List<CmsContent> crawlKeyword(String keyword) {
    final String language = "zh-CN";
    return crawlByKeyword("zh_fapiao", language, keyword, keyword, language);
}
/**
 * Searches for {@code searchKeyword}, rewrites every occurrence of it in each visited
 * page's title and content to {@code replaceKeyword}, converts the page with a
 * {@link NaipanTransformer} and stores the result.
 *
 * @param tags           tags assigned to the crawler
 * @param sourceLang     source language assigned to the crawler
 * @param searchKeyword  keyword to search for and to replace in the page text
 * @param replaceKeyword text that replaces the keyword
 * @param hl             value of the {@code hl} parameter of the search URL
 * @return every article that was created during this crawl, in creation order
 */
public synchronized List<CmsContent> crawlByKeyword(String tags,String sourceLang,final String searchKeyword,final String replaceKeyword,String hl) {
    final List<CmsContent> created = new ArrayList<CmsContent>();
    List<String> searchUrls = buildSearchUrl(searchKeyword, 10, hl, true);
    HtmlPageCrawler rewritingHandler = new HtmlPageCrawlerImpl() {
        @Override
        public void visit(HtmlPage page) {
            String rewrittenTitle = replaceWithCaseInsentisive(page.getTitle(), searchKeyword, replaceKeyword);
            page.setTitle(rewrittenTitle);
            String rewrittenContent = replaceWithCaseInsentisive(page.getContent(), searchKeyword, replaceKeyword);
            page.setContent(rewrittenContent);

            CmsContent article = buildCmsContent(page, new NaipanTransformer());
            if (article == null) {
                return;
            }
            cmsContentService.create(article);
            created.add(article);
        }
    };
    String[] urlArray = searchUrls.toArray(new String[0]);
    SinglePageCrawler searchCrawler = newGoogleSinglePageCrawler(tags, sourceLang, rewritingHandler, urlArray);
    searchCrawler.execute();
    return created;
}
/**
 * Replaces every case-insensitive occurrence of {@code searchKeyword} in {@code string}
 * with {@code replaceKeyword}.
 *
 * <p>Both keyword and replacement are treated as literal text: regex metacharacters in
 * the keyword (for example {@code "C++"}) and {@code $} or {@code \} in the replacement
 * no longer throw or get misinterpreted. Case folding is ASCII-only, as with the
 * {@code (?i)} flag used previously. (The misspelled method name is kept for callers.)
 *
 * @param string         text to process; may be {@code null}
 * @param searchKeyword  literal text to look for
 * @param replaceKeyword literal text to insert
 * @return the text with all matches replaced; {@code string} itself, unchanged, when
 *         it is {@code null}, when the keyword is {@code null} or empty, or when the
 *         replacement is {@code null}
 */
static String replaceWithCaseInsentisive(String string,final String searchKeyword,
        final String replaceKeyword) {
    // Nothing sensible to replace: an empty keyword would match between every character.
    if (string == null || searchKeyword == null || searchKeyword.length() == 0 || replaceKeyword == null) {
        return string;
    }
    Pattern literalIgnoringCase = Pattern.compile(Pattern.quote(searchKeyword), Pattern.CASE_INSENSITIVE);
    return literalIgnoringCase.matcher(string).replaceAll(Matcher.quoteReplacement(replaceKeyword));
}
/**
 * Builds one Google news-search URL per result page for the given keyword.
 *
 * <p>Each page requests 100 results and page {@code i} starts at offset
 * {@code i * 100}. The offset was previously always 0, which produced
 * {@code pageCount} identical URLs.
 *
 * @param keyword           the search keyword
 * @param pageCount         number of result pages (URLs) to generate
 * @param hl                value of the {@code hl} URL parameter
 * @param keywordAllintitle when {@code true} the query is prefixed with {@code allintitle:}
 * @return the search URLs in page order; empty when {@code pageCount} is not positive
 */
private List<String> buildSearchUrl(String keyword,int pageCount,String hl,boolean keywordAllintitle) {
    final int resultsPerPage = 100;
    // The encoded query does not depend on the page, so build it once.
    String query = keywordAllintitle ? "allintitle:" + keyword : keyword;
    String encodedQuery = URLEncoderUtil.encode(query);

    List<String> urls = new ArrayList<String>();
    for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
        int start = pageIndex * resultsPerPage;
        urls.add("https://www.google.com.hk/search?q=" + encodedQuery
                + "&num=" + resultsPerPage
                + "&hl=" + hl
                + "&biw=1440&bih=702&tbm=nws&start=" + start
                + "&tbs=qdr:d");
    }
    return urls;
}
/**
 * Searches Google for a keyword once: if the keyword is already recorded in the
 * {@code KEY_VALUE_GROUP_SEARCH_BUZZ} group the call is a no-op (apart from a log line);
 * otherwise the keyword is recorded first and then a single search page is crawled.
 * The query is the keyword followed by the current date formatted as {@code yyyy年MM月}.
 *
 * @param buzz            the keyword to search for
 * @param tags            tags assigned to the crawler
 * @param htmlPageCrawler handler that receives the visited pages
 */
private void crawByKeyword(final String buzz,String tags,HtmlPageCrawler htmlPageCrawler) {
    CmsKeyValue searchedMarker = new CmsKeyValue(Constants.KEY_VALUE_GROUP_SEARCH_BUZZ, buzz);
    boolean alreadySearched = cmsKeyValueService.exist(searchedMarker);
    if (alreadySearched) {
        logger.info("ignore search,already_search_buzz:"+buzz);
        return;
    }
    // The marker is written before crawling, so the keyword is not searched again.
    cmsKeyValueService.create(searchedMarker);

    String datedQuery = buzz + " " + DateConvertUtil.format(new Date(), "yyyy年MM月");
    final String encodedQuery = URLEncoderUtil.encode(datedQuery);
    String googleUrl = "https://www.google.com.hk/search?num=10&hl=zh-CN&safe=strict&tbs=qdr:d&q=" + encodedQuery;
    newGoogleSinglePageCrawler(tags, "zh-CN", htmlPageCrawler, googleUrl).execute();
}
/**
 * Creates a {@link SinglePageCrawler} configured for Google result pages: it is given
 * the search URLs, source language, tags and page handler, and excludes links whose
 * URI matches google, youtube or blogger.
 *
 * @param tags            tags assigned to the crawler
 * @param sourceLang      source language assigned to the crawler
 * @param htmlPageCrawler handler that receives the visited pages
 * @param searchUrl       one or more search-result URLs to start from
 * @return the configured, not yet executed crawler
 */
private SinglePageCrawler newGoogleSinglePageCrawler(String tags,String sourceLang,HtmlPageCrawler htmlPageCrawler, String... searchUrl) {
    SinglePageCrawler googleCrawler = new SinglePageCrawler();
    googleCrawler.setUrlList(searchUrl);
    googleCrawler.setSourceLang(sourceLang);
    googleCrawler.setTags(tags);
    googleCrawler.setExcludeUriRegexList(
            ".*google.*",
            ".*youtube.*",
            ".*blogger.*");
    googleCrawler.setHtmlPageCrawler(htmlPageCrawler);
    return googleCrawler;
}
/**
 * Merges articles that are too short. Crawled articles created within the last five
 * days (at most the first page of 1000) are taken in consecutive pairs; when either
 * article of a pair has fewer than 450 characters of content, the second is folded
 * into the first (titles joined with {@code ;}, contents wrapped in headings) and then
 * removed. A trailing unpaired article is left untouched.
 */
public synchronized void mergeSmallArticle() {
    final int MERGE_MIN_LENGTH = 450;

    Date rangeStart = DateUtils.addDays(new Date(), -5);
    Date rangeEnd = new Date();
    DateRange createdRange = new DateRange(rangeStart, rangeEnd);
    Page<CmsContent> firstPage = cmsContentService.findPage(new PageQuery(1000), Constants.CRAWL_SITE, Constants.CRAWL_CHANNEL_CODE, createdRange);
    List<CmsContent> articles = firstPage.getItemList();

    for (int idx = 0; idx + 1 < articles.size(); idx += 2) {
        CmsContent first = articles.get(idx);
        CmsContent second = articles.get(idx + 1);

        boolean bothLongEnough = first.getContent().length() >= MERGE_MIN_LENGTH
                && second.getContent().length() >= MERGE_MIN_LENGTH;
        if (bothLongEnough) {
            continue;
        }

        logger.info("mrege small cms_content,id:"+first.getId()+" with id:"+second.getId()+", one title:"+first.getTitle()+", two title:"+second.getTitle());
        String joinedTitle = first.getTitle() + ";" + second.getTitle();
        String joinedContent = "<h1>" + first.getTitle() + "</h1><p>" + first.getContent() + "</p>"
                + "<h1>" + second.getTitle() + "</h1><p>" + second.getContent() + "</p>";
        first.setTitle(joinedTitle);
        first.setContent(joinedContent);
        cmsContentService.update(first);
        cmsContentService.removeById(second.getId());
    }
}
/**
 * Checks every start URL of every loaded crawler and reports the ones that look broken.
 * A URL is reported when the crawler finds no anchors to visit for it, or when
 * resolving its anchors throws.
 *
 * <p>Failures are still treated as "invalid" (best effort), but they are now logged
 * with their cause instead of being silently discarded.
 *
 * @return the invalid URLs in encounter order; never {@code null}
 */
public List<String> getInvalidUrlList() {
    List<String> invalidUrlList = new ArrayList<String>();
    for (SinglePageCrawler crawler : getSinglePageCrawlerList()) {
        for (String url : crawler.getUrlList()) {
            try {
                List<Anchor> anchors = crawler.getShoudVisitAnchorList(url);
                if (CollectionUtils.isEmpty(anchors)) {
                    invalidUrlList.add(url);
                }
            } catch (Exception e) {
                // Keep going with the remaining URLs, but leave a trace of why this one failed.
                logger.warn("failed to resolve anchors, treating url as invalid:" + url, e);
                invalidUrlList.add(url);
            }
        }
    }
    return invalidUrlList;
}
/**
 * Returns the crawler list currently held by this service.
 *
 * @return the internal list itself, not a copy
 */
public List<SinglePageCrawler> getSinglePageCrawlerList() {
    return this.singlePageCrawlerList;
}
/**
 * Default page handler: skips links already stored as a source URL within the last
 * 160 days and turns each visited page into a stored {@link CmsContent}.
 */
private class HtmlPageCrawlerImpl implements HtmlPageCrawler {

    /**
     * @return {@code true} unless content with the anchor's href as source URL was
     *         counted in the last 160 days
     */
    @Override
    public boolean shoudVisitPage(Anchor a) {
        Date windowStart = DateUtils.addDays(new Date(), -160);
        Date windowEnd = new Date();
        int alreadyCrawled = cmsContentService.countBySourceUrl(windowStart, windowEnd, a.getHref());
        return alreadyCrawled <= 0;
    }

    /**
     * Builds an article from the page; when one is produced, lets subclasses adjust it
     * through {@link #prepareCmsContent(CmsContent)} and then stores it.
     */
    @Override
    public void visit(HtmlPage page) {
        CmsContent article = buildCmsContent(page);
        if (article == null) {
            return;
        }
        prepareCmsContent(article);
        cmsContentService.create(article);
    }

    /** Hook invoked just before an article is stored; does nothing by default. */
    protected void prepareCmsContent(CmsContent c) {
    }
}
/**
 * Builds an article from a page using a {@link GoogleTranslateTransformer}.
 *
 * @param page the crawled page
 * @return the article, or {@code null} when the two-argument overload rejects the page
 */
private CmsContent buildCmsContent(HtmlPage page) {
    Transformer defaultTransformer = new GoogleTranslateTransformer();
    return buildCmsContent(page, defaultTransformer);
}
/**
 * Strategy that rewrites a piece of crawled text (title or content) before it is stored.
 */
public static interface Transformer {
/**
 * @param sourceLang language code of the crawled page (compared against values such
 *                   as "zh-cn" / "zh-tw" by the implementations in this class)
 * @param content    the text to rewrite
 * @return the rewritten text
 */
public String transform(String sourceLang,String content);
}
public static class GoogleTranslateTransformer implements Transformer {
public String transform(String sourceLang,String content) {
String transformedContent = null;
if("zh-cn".equalsIgnoreCase(sourceLang) || "zh-tw".equalsIgnoreCase(sourceLang)) {
transformedContent = GoogleTranslateUtil.reverseTwoWayTranslate(content,"zh-CN","en");
}else {
transformedContent = GoogleTranslateUtil.translate(content,sourceLang,"zh-CN");
}
return transformedContent;
}
}
public static class NaipanTransformer implements Transformer {
public String transform(String sourceLang,String content) {
String transformedContent = null;
if("zh-cn".equalsIgnoreCase(sourceLang) || "zh-tw".equalsIgnoreCase(sourceLang)) {
transformedContent = NaipanArticleGeneratorUtil.transformArticle(content);
}else {
transformedContent = GoogleTranslateUtil.translate(content,sourceLang,"zh-CN");
}
return transformedContent;
}
}
/**
 * Converts a crawled page into an article ready to be stored.
 *
 * <p>Steps, in order: reject pages whose raw title/content hit the keyword filter;
 * transform content and title; reject again when the transformed text hits the filter
 * or the content is blank; finally require that
 * {@code SearchEngineUtil.baiduKeywordsNotExist} accepts the title.
 *
 * @param page        the crawled page
 * @param transformer rewrites the page's content and title
 * @return the populated article, or {@code null} when the page is filtered out or the
 *         transformed content is blank
 * @throws RuntimeException when the Baidu check reports that the title already exists
 */
private CmsContent buildCmsContent(HtmlPage page,Transformer transformer) {
    boolean rawTextFiltered = hasFilterKeyword(page.getTitle(), page.getContent());
    if (rawTextFiltered) {
        return null;
    }

    CmsContent article = new CmsContent();
    String transformedContent = transformer.transform(page.getSourceLang(), page.getContent());
    String transformedTitle = transformer.transform(page.getSourceLang(), page.getTitle());
    article.setContent(HtmlFormatUtil.htmlBeauty(transformedContent));
    article.setTitle(transformedTitle);
    article.setTags(page.getTags());

    // The transformed text may contain filtered words or end up empty.
    if (hasFilterKeyword(article.getTitle(), article.getContent())
            || StringUtils.isBlank(article.getContent())) {
        return null;
    }

    if (!SearchEngineUtil.baiduKeywordsNotExist(article.getTitle())) {
        throw new RuntimeException("百度已经存在该文章,cmsContent.title:"+article.getTitle()+" htmlPage.title:"+page.getTitle()+" Transformer:"+transformer.getClass()+" page.sourceLang:"+page.getSourceLang());
    }

    logger.info("baidu_not_exist article:"+article.getTitle());
    article.setSourceUrl(page.getAnchor().getHref());
    article.setSite(Constants.CRAWL_SITE);
    article.setChannelCode(Constants.CRAWL_CHANNEL_CODE);
    article.setAuthor(Constants.CRAWL_AUTHOR);
    return article;
}
// Literal substrings that make hasFilterKeyword() reject a text: a backslash followed
// by 'u', URL fragments, and Chinese phrases about issuing/buying/selling invoices
// (simplified and traditional spellings).
static List<String> filterWords = Arrays.asList("\\u","http://","www.","代开","开发票","买发票","卖发票","销售发票","代開");
// Patterns that make hasFilterKeyword() reject a text; each entry is passed to
// RegexUtil.findByRegexGroup and is also checked as a plain substring.
static List<String> filterRegex = Arrays.asList("开.*发票","发票.*代开","发票.*开","假.*发票","開.*發票");
/**
 * Tells whether any of the given texts contains filtered material.
 *
 * @param contents texts to check; {@code null} and blank entries are ignored
 * @return {@code true} as soon as one text contains a word from {@code filterWords} or
 *         matches an entry of {@code filterRegex}; {@code false} otherwise, including
 *         when {@code contents} itself is {@code null}
 */
static boolean hasFilterKeyword(String... contents) {
    if (contents == null) {
        return false;
    }
    for (String text : contents) {
        if (!StringUtils.isBlank(text) && (containsFilterWord(text) || matchesFilterRegex(text))) {
            return true;
        }
    }
    return false;
}

/** Returns true when the text contains any entry of filterWords as a substring. */
private static boolean containsFilterWord(String text) {
    for (String word : filterWords) {
        if (text.contains(word)) {
            return true;
        }
    }
    return false;
}

/** Returns true when the text contains any filterRegex entry literally or matches it as a regex. */
private static boolean matchesFilterRegex(String text) {
    for (String regex : filterRegex) {
        if (text.contains(regex) || RegexUtil.findByRegexGroup(text, regex, 0) != null) {
            return true;
        }
    }
    return false;
}
/**
 * Validates the injected collaborators and loads the crawler beans once all properties
 * are set.
 *
 * <p>{@code cmsSiteService} is only needed by {@code crawlAllSiteKeyword()}, so a
 * missing one is reported with a warning rather than failing startup; the other
 * collaborators remain mandatory.
 *
 * @throws Exception declared by {@link InitializingBean}
 * @throws IllegalArgumentException when a mandatory collaborator is missing or no
 *         {@link SinglePageCrawler} bean could be loaded
 */
@Override
public void afterPropertiesSet() throws Exception {
    Assert.notNull(applicationContext,"applicationContext must be not null");
    Assert.notNull(cmsContentService,"cmsContentService must be not null");
    Assert.notNull(cmsKeyValueService,"cmsKeyValueService must be not null");
    if (cmsSiteService == null) {
        // Previously unnoticed until crawlAllSiteKeyword() failed with a NullPointerException.
        logger.warn("cmsSiteService is not set, crawlAllSiteKeyword() will not work");
    }
    loadSinglePageCrawlerList();
    Assert.notEmpty(singlePageCrawlerList,"singlePageCrawlerList must be not empty");
}
}