/**
 * Copyright (c)2010-2011 Enterprise Website Content Management System(EWCMS), All rights reserved.
 * EWCMS PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 * http://www.ewcms.com
 */
package com.ewcms.plugin.crawler.generate;

import static com.ewcms.common.lang.EmptyUtil.*;

import java.io.File;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.List;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import com.ewcms.content.document.service.ArticleMainServiceable;
import com.ewcms.content.resource.ResourceFacable;
import com.ewcms.core.site.model.Site;
import com.ewcms.plugin.BaseException;
import com.ewcms.plugin.crawler.generate.crawler.CrawlConfig;
import com.ewcms.plugin.crawler.generate.crawler.CrawlController;
import com.ewcms.plugin.crawler.generate.fetcher.PageFetcher;
import com.ewcms.plugin.crawler.generate.robotstxt.RobotstxtConfig;
import com.ewcms.plugin.crawler.generate.robotstxt.RobotstxtServer;
import com.ewcms.plugin.crawler.generate.util.IO;
import com.ewcms.plugin.crawler.manager.service.GatherServiceable;
import com.ewcms.plugin.crawler.model.Domain;
import com.ewcms.plugin.crawler.model.FilterBlock;
import com.ewcms.plugin.crawler.model.Gather;
import com.ewcms.plugin.crawler.model.MatchBlock;
import com.ewcms.plugin.crawler.util.CrawlerUtil;
import com.ewcms.web.util.EwcmsContextUtil;

/**
 * Crawl controller service: validates a Gather record, builds the crawler
 * configuration, starts the crawl and supports interrupting a running crawl.
 *
 * @author wu_zhijun
 */
@Service
public class EwcmsController implements EwcmsControllerable {

    private static final Logger logger = LoggerFactory.getLogger(EwcmsController.class);

    @Autowired
    private GatherServiceable gatherService;
    @Autowired
    private ArticleMainServiceable articleMainService;
    @Autowired
    private ResourceFacable resourceFac;

    private CrawlController controller = null;

    @Override
    public void startCrawl(Long gatherId) throws BaseException {
        Gather gather = gatherService.findGather(gatherId);
        if (isNull(gather)) {
            logger.warn("采集的记录不存在!");
            throw new BaseException("采集的记录不存在!", "采集的记录不存在!");
        }
        if (!gather.getStatus()) {
            logger.warn("采集的为停用状态,不能执行!");
            throw new BaseException("采集的为停用状态,不能执行!", "采集的为停用状态,不能执行!");
        }
        if (isNull(gather.getBaseURI()) || gather.getBaseURI().trim().length() == 0) {
            logger.warn("采集的网站地址未设定!");
            throw new BaseException("采集的网站地址未设定!", "采集的网站地址未设定!");
        }
        if (gather.getDomains() == null || gather.getDomains().size() == 0) {
            logger.warn("采集的地址匹配未设定!");
            throw new BaseException("采集的地址匹配未设定!", "采集的地址匹配未设定!");
        }
        if (!gather.getIsLocal() && gather.getType() == Gather.Type.CONTENT && isNull(gather.getChannelId())) {
            logger.warn("收集的频道未设定!");
            throw new BaseException("收集的频道未设定!", "收集的频道未设定!");
        }

        String gatherFolderPath = CrawlerUtil.ROOT_FOLDER + gatherId + "-" + Calendar.getInstance().getTimeInMillis();
        try {
            File gatherFolder = new File(gatherFolderPath);
            if (gatherFolder.exists()) {
                boolean delete = IO.deleteFolderContents(gatherFolder);
                if (!delete) {
                    logger.info("采集器正在运行中,无须再运行!");
                    throw new BaseException("采集器正在运行中,无须再运行!", "采集器正在运行中,无须再运行!");
                }
            }
        } catch (Exception e) {
            logger.error("目录删除失败!");
            throw new BaseException("采集器正在运行中,无须再运行!", "采集器正在运行中,无须再运行!");
        }

        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(gatherFolderPath);

        HashMap<String, Object> passingParameters = new HashMap<String, Object>();
        if (gather.getType() == Gather.Type.RESOURCE) {
            Site site = EwcmsContextUtil.getCurrentSite();
            config.setIncludeBinaryContentInCrawling(true);
            passingParameters.put("resourceFac", resourceFac);
            passingParameters.put("storageFolderName", gatherFolderPath + "/resource");
            passingParameters.put("site", site);
            passingParameters.put("isImage", gather.getIsImage());
            passingParameters.put("isFlash", gather.getIsFlash());
            passingParameters.put("isVideo", gather.getIsVideo());
            passingParameters.put("isAnnex", gather.getIsAnnex());
            passingParameters.put("annexType", gather.getAnnexType());
        } else {
            config.setIncludeBinaryContentInCrawling(gather.getDownloadFile());
            passingParameters.put("gatherService", gatherService);
            passingParameters.put("articleMainService", articleMainService);
            passingParameters.put("gather", gather);
            passingParameters.put("matchRegex", initMatchBlock(gatherId));
            passingParameters.put("filterRegex", initFilterBlock(gatherId));
        }

        config.setPolitenessDelay(2000);
        // Maximum number of pages to fetch; the default of -1 means no limit.
        config.setMaxPagesToFetch(gather.getMaxPage().intValue());
        if (gather.getProxy()) {
            config.setProxyHost(gather.getProxyHost());
            config.setProxyPort(gather.getProxyPort());
            config.setProxyUsername(gather.getProxyUserName());
            config.setProxyPassword(gather.getProxyPassWord());
        }
        config.setMaxDepthOfCrawling(gather.getDepth().intValue());
        config.setResumableCrawling(true);

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        try {
            controller = new CrawlController(config, pageFetcher, robotstxtServer, passingParameters);
            String[] crawlerDomains = conversionDomain(gather.getDomains());
            if (crawlerDomains != null) {
                controller.setCustomData(crawlerDomains);
            }
            controller.addSeed(gather.getBaseURI());
            // Number of concurrent crawler threads
            int numberOfCrawlers = gather.getThreadCount();
            if (gather.getType() == Gather.Type.RESOURCE) {
                controller.startNonBlocking(EwcmsResourceCrawler.class, numberOfCrawlers);
            } else {
                controller.startNonBlocking(EwcmsContentCrawler.class, numberOfCrawlers);
            }
            controller.waitUntilFinish();
        } catch (Exception e) {
            logger.error("网络采集器运行失败!- {}", e.getLocalizedMessage());
        } finally {
            passingParameters.clear();
            passingParameters = null;
        }
    }

    @Override
    public void interruptCrawl() throws BaseException {
        if (controller != null) {
            controller.Shutdown();
            controller.waitUntilFinish();
        }
    }

    /**
     * Builds the match-block expression for the given gather. Each root block
     * is expanded along its child chain ("parent > child"), and the resulting
     * expressions are joined with ", ". Returns "*" when no match blocks are
     * configured.
     */
    private String initMatchBlock(Long gatherId) {
        List<String> matchRegexs = new ArrayList<String>();
        List<MatchBlock> parents = gatherService.findParentMatchBlockByGatherId(gatherId);
        for (MatchBlock parent : parents) {
            String regex = "";
            String matchRegex = parent.getRegex();
            if (matchRegex != null && matchRegex.length() > 0) {
                regex = matchRegex;
            }
            childrenMatchBlock(gatherId, regex, parent, matchRegexs);
        }
        if (matchRegexs.isEmpty()) {
            return "*";
        }
        String regex = "";
        for (String matchRegex : matchRegexs) {
            if (matchRegex == null || matchRegex.trim().length() == 0) {
                continue;
            }
            regex += matchRegex + ", ";
        }
        if (regex.length() > 0) {
            regex = regex.substring(0, regex.length() - 2);
        }
        return regex;
    }

    /**
     * Recursively appends each child match-block expression to its parent's
     * expression; leaf nodes are added to the result list.
     */
    private void childrenMatchBlock(Long gatherId, String regex, MatchBlock parent, List<String> matchRegexs) {
        List<MatchBlock> childrens = gatherService.findChildMatchBlockByParentId(gatherId, parent.getId());
        if (childrens.isEmpty()) {
            matchRegexs.add(regex);
        } else {
            for (MatchBlock children : childrens) {
                String childrenRegex = regex;
                String matchRegex = children.getRegex();
                if (matchRegex != null && matchRegex.length() > 0) {
                    childrenRegex += " > " + matchRegex;
                }
                childrenMatchBlock(gatherId, childrenRegex, children, matchRegexs);
            }
        }
    }

    /**
     * Builds the filter-block expression for the given gather, using the same
     * "parent > child" expansion as the match blocks. Returns "" when no
     * filter blocks are configured.
     */
    private String initFilterBlock(Long gatherId) {
        List<String> filterRegexs = new ArrayList<String>();
        List<FilterBlock> parents = gatherService.findParentFilterBlockByGatherId(gatherId);
        for (FilterBlock parent : parents) {
            String regex = "";
            String filterRegex = parent.getRegex();
            if (filterRegex != null && filterRegex.length() > 0) {
                regex = filterRegex;
            }
            childrenFilterBlock(gatherId, regex, parent, filterRegexs);
        }
        if (filterRegexs.isEmpty()) {
            return "";
        }
        String regex = "";
        for (String filterRegex : filterRegexs) {
            if (filterRegex == null || filterRegex.trim().length() == 0) {
                continue;
            }
            regex += filterRegex + ", ";
        }
        if (regex.length() > 0) {
            regex = regex.substring(0, regex.length() - 2);
        }
        return regex;
    }

    /**
     * Recursively appends each child filter-block expression to its parent's
     * expression; leaf nodes are added to the result list.
     */
    private void childrenFilterBlock(Long gatherId, String regex, FilterBlock parent, List<String> filterRegexs) {
        List<FilterBlock> childrens = gatherService.findChildFilterBlockByParentId(gatherId, parent.getId());
        if (childrens.isEmpty()) {
            filterRegexs.add(regex);
        } else {
            for (FilterBlock children : childrens) {
                String childrenRegex = regex;
                String filterRegex = children.getRegex();
                if (filterRegex != null && filterRegex.length() > 0) {
                    childrenRegex += " > " + filterRegex;
                }
                childrenFilterBlock(gatherId, childrenRegex, children, filterRegexs);
            }
        }
    }

    /**
     * Converts the configured Domain set into the array of URL patterns passed
     * to the crawl controller; returns null when no domains are configured.
     */
    private String[] conversionDomain(Set<Domain> domains) {
        if (domains != null && !domains.isEmpty()) {
            String[] crawlerDomain = new String[domains.size()];
            int i = 0;
            for (Domain domain : domains) {
                crawlerDomain[i] = domain.getUrl();
                i++;
            }
            return crawlerDomain;
        }
        return null;
    }
}
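/*
 * Usage sketch (an assumption, not part of the original source): callers are
 * expected to obtain this bean through Spring and drive a crawl by gather id,
 * for example from a scheduled task or a management action. The GatherTask
 * class and execute method below are hypothetical names used only to
 * illustrate the EwcmsControllerable contract.
 *
 *   @Service
 *   public class GatherTask {
 *
 *       @Autowired
 *       private EwcmsControllerable crawlerController;
 *
 *       public void execute(Long gatherId) {
 *           try {
 *               crawlerController.startCrawl(gatherId);
 *           } catch (BaseException e) {
 *               // Report the validation or startup failure to the caller.
 *           }
 *       }
 *   }
 */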