package com.fpcms.common.webcrawler.htmlparser; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.lang.StringUtils; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.util.Assert; import com.fpcms.common.util.CollectionHelper; import com.fpcms.common.util.JsoupSelectorUtil; import com.fpcms.common.util.JsoupSelectorUtil.JsoupElementParentsSizeComparator; import com.fpcms.common.util.KeywordUtil; import com.fpcms.common.util.NetUtil; import com.fpcms.common.webcrawler.htmlparser.HtmlPage.Anchor; public class SinglePageCrawler { private static Logger logger = LoggerFactory.getLogger(SinglePageCrawler.class); private String[] urlList; private String[] acceptUrlRegexList = new String[]{".*"}; private String[] excludeUriRegexList; private String sourceLang; //TODO 自动识别语言 private String[] mainContentSelector; private int minContentLength = 300; private boolean deleteUrlQueryString = true; /** * 为文章打些标签 */ private String tags; private HtmlPageCrawler htmlPageCrawler = new HtmlPageCrawler() { public boolean shoudVisitPage(Anchor a) { return true; } public void visit(HtmlPage page) { } }; public SinglePageCrawler() { } public SinglePageCrawler(String... url) { super(); setUrlList(url); } public void setHtmlPageCrawler(HtmlPageCrawler htmlPageCrawler) { Assert.notNull(htmlPageCrawler,"htmlPageCrawler must be not null"); this.htmlPageCrawler = htmlPageCrawler; } public void setSourceLang(String sourceLang) { this.sourceLang = sourceLang; } public void setAcceptUrlRegexList(String... acceptUrlRegex) { this.acceptUrlRegexList = acceptUrlRegex; } public void setExcludeUriRegexList(String... excludeUriRegexList) { this.excludeUriRegexList = excludeUriRegexList; } public void setUrlList(String... url) { this.urlList = url; } public String[] getUrlList() { return urlList; } public String getTags() { return tags; } public void setTags(String tags) { this.tags = tags; } public void setMainContentSelector(String... mainContentSelector) { this.mainContentSelector = mainContentSelector; } public void setMinContentLength(int minContentLength) { this.minContentLength = minContentLength; } public boolean isDeleteUrlQueryString() { return deleteUrlQueryString; } public void setDeleteUrlQueryString(boolean deleteUrlQueryString) { this.deleteUrlQueryString = deleteUrlQueryString; } public void execute() { logger.info("start_execute_craw,sourceLang:"+sourceLang+" tags:"+tags+" minContentLength:"+minContentLength+" acceptUrlRegexList:"+StringUtils.join(acceptUrlRegexList,",")); Set<Anchor> anchorSet = new HashSet<Anchor>(); for(String url : urlList) { try { List<Anchor> shoudVisitAnchorList = getShoudVisitAnchorList(url); anchorSet.addAll(shoudVisitAnchorList); }catch(Exception e) { logger.error("error_on_crlaw_url:"+url,e); } } visitAnchorList(anchorSet); } public List<HtmlPage> crlawUrl(String url) { List<Anchor> shoudVisitAnchorList = getShoudVisitAnchorList(url); return visitAnchorList(shoudVisitAnchorList); } List<HtmlPage> visitAnchorList(Collection<Anchor> shoudVisitAnchorList) { List<HtmlPage> visitedPage = new ArrayList<HtmlPage>(); for(Anchor a : shoudVisitAnchorList) { try { HtmlPage page = extractArticleByJsoup(a); if(page != null) { htmlPageCrawler.visit(page); visitedPage.add(page); } }catch(Exception e) { logger.warn("extractArticleByJsoup error",e); } } return visitedPage; } public List<Anchor> getShoudVisitAnchorList(String url) { String content = NetUtil.httpGet(url); Document doc = Jsoup.parse(content); Collection<Anchor> shoudVisitAnchorList = getShoudVisitAnchorList(url, doc); return new ArrayList<Anchor>(shoudVisitAnchorList); } private List<Anchor> getShoudVisitAnchorList(String url, Document doc) { LinkedHashSet<Anchor> allAnchorList = getAllAnchors(url, doc); return filterAnchorList(allAnchorList); } private List<Anchor> filterAnchorList( LinkedHashSet<Anchor> shoudVisitAnchorSet) { List<Anchor> result = new ArrayList<Anchor>(); for(Anchor a : shoudVisitAnchorSet) { if(isAcceptUrl(a.getHref()) && htmlPageCrawler.shoudVisitPage(a)) { result.add(a); }else { logger.info("ignore_by_not_accept_url:{}",a.getHref()); } } return result; } private LinkedHashSet<Anchor> getAllAnchors(String url, Document doc) { Elements elements = doc.getElementsByTag("a"); LinkedHashSet<Anchor> result = new LinkedHashSet<Anchor>(); for(Element anchor : elements) { String href = anchor.attr("href"); String text = StringUtils.trim(anchor.text()); String title = anchor.attr("title"); Anchor a = new Anchor(); String fullHref = Anchor.toFullUrl(url,href); fullHref = deleteUrlQueryString ? Anchor.removeQueryString(fullHref) : fullHref; a.setHref(fullHref); a.setText(text); a.setTitle(title); result.add(a); } return result; } HtmlPage extractArticleByJsoup(Anchor anchor) throws IOException { try { Connection conn = Jsoup.connect(anchor.getHref()); conn.userAgent("Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"); conn.timeout(1000 * 6); Document doc = conn.get(); logger.info("doc.baseUri:"+doc.baseUri() ); String title = HtmlPageTitleUtil.smartGetTitle(anchor,doc.title()); String keywords = JsoupSelectorUtil.select(doc.head(),"[name=keywords]").attr("content"); String description = JsoupSelectorUtil.select(doc.head(),"[name=description]").attr("content"); String mainContentSelectorContent = JsoupSelectorUtil.select(doc.body(),mainContentSelector).text(); Element smartMainContent = smartGetMainContent(doc); HtmlPage page = new HtmlPage(); page.setAnchor(anchor); page.setContent(StringUtils.defaultIfBlank(mainContentSelectorContent,smartMainContent == null ? null : smartMainContent.text())); page.setDescription(description); page.setKeywords(keywords); page.setTitle(title); page.setSourceLang(sourceLang); page.setTags(tags); //TODO 增加anchor.text 与 page.title的比较或者是替换 logger.info("------------------- url:"+page.getAnchor().getHref()+" ---------------------------"); logger.info("smartMainContent.text:" + (smartMainContent == null ? "NOT_FOUND" : smartMainContent.text())); logger.info("title:"+page.getTitle()); logger.info("keywords:"+page.getKeywords()); logger.info("description:"+page.getDescription()); logger.info("content,size:"+ StringUtils.length(page.getContent()) +" "+page.getContent()); logger.info("content.deepLevel:"+JsoupSelectorUtil.select(doc,mainContentSelector).parents().size()); if(smartMainContent != null && StringUtils.isNotBlank(mainContentSelectorContent)) { if(!smartMainContent.text().equals(page.getContent())) { logger.warn("-------------------error: smart max length text != selector["+StringUtils.join(mainContentSelector,",")+"] text----------------------"); } } if(StringUtils.length(page.getContent()) < minContentLength) { return null; } return page; }catch(Exception e) { throw new RuntimeException("error on extractArticleByJsoup anchor:"+anchor,e); } } private Element smartGetMainContent(Document doc) { List<Element> allDiv = JsoupSelectorUtil.selectList(doc,"div"); Collections.sort(allDiv,new JsoupElementParentsSizeComparator()); Map<Element,Float> elementScores = new HashMap<Element,Float>(); for(Element element : allDiv) { float score = getPageElementScore(element); if(score >= 25) { elementScores.put(element, score); } } Element element = CollectionHelper.getMaxKeyByValue(elementScores); if(element != null) { logger.info("success_found_valid_content:"+element.tagName()+ " class:" + element.className() + " id:"+ element.id() +" score:"+getPageElementScore(element)); } return element; } private float getPageElementScore(Element element) { int conditionSymbolesCount = minContentLength / 50; int commonSymbolesCount = KeywordUtil.getCommonSymbolsCount(element.text()); int divCount = element.getElementsByTag("div").size(); int parentsSize = element.parents().size(); /* * TODO 增加判断如果出现空格数过多的文字也属于垃圾特征,如: 首页 产品列表 关于我们 * TODO 包含垃圾子段的父亲,也是垃圾 * TODO */ int textLength = element.text().length(); int anchorSize = element.getElementsByTag("a").size(); int paragraphSize = element.getElementsByTag("p").size(); float score = getPageElementScore(textLength,parentsSize,commonSymbolesCount,conditionSymbolesCount,divCount,anchorSize,paragraphSize); return score; } public float getPageElementScore(int textLength,int parentsSize,int commonSymbolesCount,int conditionSymbolesCount,int divCount,int anchorSize,int paragraphSize) { float score = 0; if(textLength >= minContentLength) { score += 10; } if(parentsSize >= 4) { score += 10; } if(commonSymbolesCount > conditionSymbolesCount) { score += 10; score += paragraphSize * 2.5; } score += parentsSize * 1.5; score += textLength / 700; score -= anchorSize; score -= divCount * 2; return score; } boolean isAcceptUrl(String href) { if(StringUtils.isBlank(href)) { return false; } try { new URL(href); } catch (MalformedURLException e) { return false; } if(excludeUriRegexList != null) { for(String exclude : excludeUriRegexList) { if(StringUtils.isNotBlank(exclude)) { if(href.matches(exclude)) { return false; } } } } if(acceptUrlRegexList != null) { for(String accept : acceptUrlRegexList) { if(StringUtils.isNotBlank(accept)) { if(href.matches(accept)) { return true; } } } } return false; } }