package com.fpcms.common.util;
import java.util.Map;
import java.util.TreeMap;
import org.apache.commons.lang.StringUtils;
import org.springframework.util.Assert;
import com.github.rapid.common.util.Profiler;
import com.fpcms.common.cache.Cache;
import com.fpcms.common.cache.CacheManager;
import com.fpcms.common.cache.ValueCallback;
/**
 * Static helpers that query web search engines (soso, google, baidu, sogou) through
 * {@link NetUtil#httpGet} and extract simple facts (rank, indexed page count) from the
 * returned page text.
 *
 * <p>NOTE(review): keywords are placed into the query string as-is. Whether they get
 * URL-encoded depends on {@code NetUtil.httpGet}, which is not visible here - confirm.
 */
public class SearchEngineUtil {

    /**
     * Cache used by {@link #baiduKeywordsRank(String, String)}.
     * NOTE(review): the second argument (500) is presumably the maximum entry count - confirm
     * against {@code CacheManager.createCache}.
     */
    private static final Cache cache = CacheManager.createCache(SearchEngineUtil.class, 500);

    /**
     * Phrases that mark a response page as "no usable result"; a response containing any of
     * them is treated as invalid by {@link #isInvalidSearchResult(String)}.
     * Kept package-private and non-final to preserve the original declaration.
     */
    static String[] invalidSearchs = new String[]{"检查输入是否正确","请检查您输入的关键词是否有错误","请检查输入字词有无错误","请检查输入的关键词是否有误"};

    /**
     * Searches soso.com with a fixed page size of 10.
     *
     * @param keywords   search keywords, inserted into the {@code w} query parameter
     * @param pageNumber value of the {@code pg} query parameter
     * @return the response text
     * @throws EmptySearchResultException if the response is blank or contains a known
     *         "invalid search" phrase
     */
    public static String sosoSearch(String keywords,int pageNumber) {
        // sd=5&min=730&max=3287 are the time-range parameters (currently not sent).
        int pageSize = 10;
        String query = String.format("num=%s&pg=%s&w=%s", pageSize, pageNumber, keywords);
        return executeSearch("sosoSearch", "http://www.soso.com/q", query, keywords, pageSize, pageNumber);
    }

    /**
     * Searches google.com.hk.
     *
     * @param keywords   search keywords, inserted into the {@code q} query parameter
     * @param pageSize   results per page ({@code num})
     * @param pageNumber 1-based page number; converted to {@code start = (pageNumber - 1) * pageSize}
     * @return the response text
     * @throws EmptySearchResultException if the response is blank or contains a known
     *         "invalid search" phrase
     */
    public static String googleSearch(String keywords, int pageSize,int pageNumber) {
        int start = (pageNumber - 1) * pageSize;
        String query = String.format("hl=zh-CN&start=%s&num=%s&q=%s", start, pageSize, keywords);
        return executeSearch("googleSearch", "https://www.google.com.hk/search", query, keywords, pageSize, pageNumber);
    }

    /**
     * Checks whether baidu has no result for the longest token of {@code keywords}
     * (truncated to at most 36 characters and wrapped in double quotes).
     *
     * @param keywords text to probe; must contain non-whitespace text
     * @return {@code true} if the search raised {@link EmptySearchResultException},
     *         {@code false} otherwise
     * @throws IllegalArgumentException if {@code keywords} has no text (via Spring {@code Assert})
     */
    public static boolean baiduKeywordsNotExist(String keywords) throws EmptySearchResultException{
        Assert.hasText(keywords,"keywords must be not empty");
        String longestToken = KeywordUtil.getMaxLengthToken(keywords);
        String probe = longestToken.substring(0, Math.min(36, longestToken.length()));
        try {
            // NOTE(review): arguments are (pageSize=1, pageNumber=100), i.e. pn=99&rn=1.
            // This may be a swapped (pageSize=100, pageNumber=1); left unchanged - confirm intent.
            baiduSearch("\"" + probe + "\"", 1, 100);
            return false;
        } catch (EmptySearchResultException e) {
            // An empty result is the expected signal that the phrase is not indexed.
            return true;
        }
    }

    /**
     * Searches baidu.com.
     *
     * @param keywords   search keywords, inserted into the {@code wd} query parameter
     * @param pageSize   results per page ({@code rn})
     * @param pageNumber 1-based page number; converted to {@code pn = (pageNumber - 1) * pageSize}
     * @return the response text
     * @throws EmptySearchResultException if the response is blank or contains a known
     *         "invalid search" phrase
     */
    public static String baiduSearch(String keywords, int pageSize,int pageNumber) throws EmptySearchResultException{
        int start = (pageNumber - 1) * pageSize;
        String query = String.format("pn=%s&rn=%s&wd=%s", start, pageSize, keywords);
        return executeSearch("baiduSearch", "http://www.baidu.com/s", query, keywords, pageSize, pageNumber);
    }

    /**
     * Searches sogou.com.
     *
     * @param keywords   search keywords, inserted into the {@code query} query parameter
     * @param pageSize   value of the {@code num} query parameter
     * @param pageNumber value of the {@code page} query parameter
     * @return the response text
     * @throws EmptySearchResultException if the response is blank or contains a known
     *         "invalid search" phrase
     */
    public static String sogouSearch(String keywords, int pageSize,int pageNumber) {
        String query = String.format("num=%s&page=%s&query=%s", pageSize, pageNumber, keywords);
        return executeSearch("sogouSearch", "http://www.sogou.com/web", query, keywords, pageSize, pageNumber);
    }

    /**
     * Shared request/validate routine for the public search methods: opens a profiler
     * section named after the calling method, performs the GET, and rejects invalid pages.
     * The profiler section is always released, also when an exception is thrown.
     */
    private static String executeSearch(String methodName, String url, String query,
            String keywords, int pageSize, int pageNumber) {
        Profiler.enter("SearchEngineUtil." + methodName);
        try {
            String result = NetUtil.httpGet(url, query);
            if (isInvalidSearchResult(result)) {
                throw new EmptySearchResultException(methodName + " return empty result,keywords:" + keywords
                        + " pageSize:" + pageSize + " pageNumber:" + pageNumber);
            }
            return result;
        } finally {
            Profiler.release();
        }
    }

    /**
     * Looks up the rank of {@code site} in baidu's results for {@code keyword}.
     *
     * @param keyword keyword to search for
     * @param site    text identifying the site in the result page
     * @return the number captured from {@code class="result" id="N"} in the result entry
     *         preceding the first occurrence of {@code site}, or 0 if it cannot be determined
     */
    public static int baiduKeywordRank(String keyword,String site) {
        String siteRecord = getBaiduSiteContentByRegex(keyword, site);
        if (siteRecord == null) {
            return 0;
        }
        String rank = RegexUtil.findByRegexGroup(siteRecord, "class=\"result\" id=\"(\\d+)\"", 1);
        return rank == null ? 0 : Integer.parseInt(rank);
    }

    /**
     * Computes baidu ranks for several keywords at once, going through {@link #cache}.
     *
     * @param keywords keywords separated by any of comma, underscore, pipe or space
     * @param site     text identifying the site in the result page
     * @return keyword-to-rank map containing only keywords with a rank greater than 0,
     *         in the order produced by {@code MapUtil.sortByValue}
     */
    public static Map<String,Integer> baiduKeywordsRank(final String keywords,final String site) {
        // NOTE(review): 3600 is presumably the expiry in seconds - confirm against Cache.get.
        return cache.get("baiduKeywordsRank:"+keywords+"_"+site, 3600, new ValueCallback<Map<String,Integer>>() {
            @Override
            public Map<String, Integer> create(String key) {
                String[] keywordsArray = org.springframework.util.StringUtils.tokenizeToStringArray(keywords, ",_| ");
                TreeMap<String,Integer> rankMap = new TreeMap<String,Integer>();
                for (String keyword : keywordsArray) {
                    int rank = SearchEngineUtil.baiduKeywordRank(keyword, site);
                    if (rank > 0) {
                        rankMap.put(keyword, rank);
                    }
                }
                return MapUtil.sortByValue(rankMap);
            }
        });
    }

    /**
     * Fetches up to 100 baidu results for {@code keyword} and returns the page fragment that
     * starts at the last {@code <table } tag before the first occurrence of {@code site}
     * and ends right before that occurrence.
     *
     * @return the fragment, or {@code null} if the response is null, {@code site} does not
     *         occur in it, or no {@code <table } tag precedes it
     */
    private static String getBaiduSiteContentByRegex(String keyword, String site) {
        String content = NetUtil.httpGet("http://www.baidu.com/s", "wd=" + keyword + "&rn=100");
        if (content == null) {
            // No response text: treat the same as "site not found" instead of failing with an NPE.
            return null;
        }
        int siteIndex = content.indexOf(site);
        if (siteIndex < 0) {
            return null;
        }
        String beforeSite = content.substring(0, siteIndex);
        // A stricter marker would be: <table cellpadding="0" cellspacing="0" class="result"
        int tableIndex = beforeSite.lastIndexOf("<table ");
        return tableIndex >= 0 ? beforeSite.substring(tableIndex) : null;
    }

    /**
     * Returns the result count baidu reports for a {@code site:domain} query.
     *
     * @param domain domain to query
     * @return the parsed count, or 0 if the count text is not found in the page
     */
    public static int baiduSiteCount(String domain) {
        return baiduSiteCount0("http://www.baidu.com/s?ie=utf-8&wd=site:" + domain);
    }

    /**
     * Same as {@link #baiduSiteCount(String)} but with {@code lm=1} appended to the query.
     * NOTE(review): {@code lm=1} presumably restricts results to a recent time window - confirm.
     *
     * @param domain domain to query
     * @return the parsed count, or 0 if the count text is not found in the page
     */
    public static int baiduRecentlySiteCount(String domain) {
        return baiduSiteCount0("http://www.baidu.com/s?ie=utf-8&wd=site:" + domain + "&lm=1");
    }

    /**
     * Fetches {@code url} and parses the number in baidu's "找到相关结果数N个" text,
     * ignoring thousands separators.
     */
    private static int baiduSiteCount0(String url) {
        String content = NetUtil.httpGet(url);
        if (content == null) {
            // No response text: nothing to parse.
            return 0;
        }
        String num = RegexUtil.findByRegexGroup(content, "找到相关结果数([\\d,]+)个", 1);
        if (StringUtils.isNotBlank(num)) {
            return Integer.parseInt(num.replace(",", ""));
        }
        return 0;
    }

    /**
     * Tells whether a response should be treated as "no result": it is blank, or it contains
     * one of the phrases in {@link #invalidSearchs}.
     */
    private static boolean isInvalidSearchResult(String result) {
        if (StringUtils.isBlank(result)) {
            return true;
        }
        for (String invalid : invalidSearchs) {
            if (result.contains(invalid)) {
                return true;
            }
        }
        return false;
    }
}