package com.fpcms.common.random_gen_article;

import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.github.rapid.common.util.Profiler;
import com.fpcms.common.cache.Cache;
import com.fpcms.common.cache.CacheManager;
import com.fpcms.common.cache.ValueCallback;
import com.fpcms.common.util.Constants;
import com.fpcms.common.util.KeywordUtil;
import com.fpcms.common.util.NetUtil;
import com.fpcms.common.util.RegexUtil;

/**
 * Queries Baidu's related hot ("buzz") keywords.
 *
 * <p>Each URL in {@link Constants#BAIDU_BUZZ_URLS} is downloaded, scanned with a
 * small list of regular expressions, and the extracted keywords are cached per URL.
 *
 * @author badqiu
 */
public class BaiduTopBuzzUtil {

    static Logger logger = LoggerFactory.getLogger(BaiduTopBuzzUtil.class);

    static String CACHE_KEYWORD_BUZZS = "KEYWORD_BUZZS";

    // NOTE(review): the second argument is presumably the cache capacity
    // (one slot per buzz URL plus one spare) -- confirm against CacheManager.createCache.
    static Cache cache = CacheManager.createCache(BaiduTopBuzzUtil.class, Constants.BAIDU_BUZZ_URLS.length + 1);

    /**
     * Candidate patterns, tried in order; group 1 of each is the keyword text.
     *
     * <p>{@code (?s)} lets {@code .} match line terminators. The capture group uses
     * {@code \W+}, which in Java's default (ASCII) mode is {@code [^a-zA-Z_0-9]+} and
     * therefore matches CJK text. Quotes inside the second pattern are written as
     * {@code .} rather than escaped.
     */
    private static final String[] KEYWORD_REGEXES = {
        "(?s)<a class=\"list-title\" target=\"_blank\" href=\"./detail.{1,80}>(\\W+)</a>", // baidu
        // The whitespace between the href and target attributes is matched with \s+
        // instead of a literal space followed by a raw line break: a Java string
        // literal cannot contain a line break, and \s+ accepts both a single space
        // and a space followed by a line break.
        "(?s)<a title=..{1,70}. href=..{1,120}.\\s+target=._blank.>(\\W+)</a></li>", // sougou
        "(?s)<a href=\".*?\" title=\"\\W+\" onclick=\".*?\" target=\"_blank\">(\\W+)</a>"
    };

    /**
     * Returns the buzz keywords of all configured URLs, each re-formatted by
     * {@link #format(Set)}.
     *
     * @return the formatted keywords; empty when nothing could be read
     */
    public static Set<String> getBaiduBuzzs() {
        return format(getBaiduBuzzs0());
    }

    /**
     * Re-joins the tokens of every keyword with a single space.
     *
     * <p>Tokens come from {@link KeywordUtil#toTokenizerList}. The result keeps the
     * iteration order of the input; keywords that format to the same string collapse
     * into one entry.
     *
     * @param baiduBuzzs0 raw keywords
     * @return a new set holding the formatted keywords
     */
    static Set<String> format(Set<String> baiduBuzzs0) {
        LinkedHashSet<String> set = new LinkedHashSet<String>();
        for (String keyword : baiduBuzzs0) {
            String formatedKeyword = StringUtils.join(KeywordUtil.toTokenizerList(keyword), " ");
            set.add(formatedKeyword);
        }
        return set;
    }

    /**
     * Collects the keywords of every URL in {@link Constants#BAIDU_BUZZ_URLS},
     * going through the cache and calling {@link #findBaiduBuzzs(String)} on a miss.
     *
     * @return the union of the keywords of all URLs that could be read
     */
    private static Set<String> getBaiduBuzzs0() {
        Set<String> result = new HashSet<String>();
        for (String url : Constants.BAIDU_BUZZ_URLS) {
            try {
                // NOTE(review): 3600 * 6 is presumably an expiry in seconds (6 hours)
                // -- confirm against Cache.get.
                Set<String> keywords = cache.get(url, 3600 * 6, new ValueCallback<Set<String>>() {
                    public Set<String> create(String key) {
                        return findBaiduBuzzs(key);
                    }
                });
                result.addAll(keywords);
            } catch (Exception e) {
                // Best effort: a failing URL is logged and skipped so the others still contribute.
                logger.error("read url for buzz error:" + url, e);
            }
        }
        return result;
    }

    /**
     * Downloads {@code url} and extracts keywords with the first pattern of
     * {@link #KEYWORD_REGEXES} that yields at least one match.
     *
     * @param url page to download
     * @return the keywords found by the first matching pattern, or a new empty set
     *         when no pattern matches
     */
    public static Set<String> findBaiduBuzzs(String url) {
        Profiler.enter("findBaiduBuzzs");
        try {
            String topKeyword = NetUtil.httpGet(url);
            for (String pattern : KEYWORD_REGEXES) {
                Set<String> keyword = RegexUtil.findAllByRegexGroup(topKeyword, pattern, 1);
                if (!keyword.isEmpty()) {
                    logger.info("getBaiduKeywords,url={} result:{}", url, keyword);
                    return keyword;
                }
            }
            return new HashSet<String>();
        } finally {
            // Always close the profiler frame opened above, even when the download fails.
            Profiler.release();
        }
    }
}