package com.geccocrawler.gecco.demo.dynamic; import com.geccocrawler.gecco.GeccoEngine; import com.geccocrawler.gecco.dynamic.DynamicGecco; import com.geccocrawler.gecco.request.HttpGetRequest; /** * 本demo是一个在线修改抓取规则的例子,DyncmicGecco支持规则类的重新加载,不需要重启应用 * * @author huchengyi * */ public class DynamicRuleTest { public static void main(String[] args) throws Exception { //初始化爬虫引擎,此时由于没有初始请求,爬虫引擎会阻塞初始队列,直到获取到初始请求 GeccoEngine ge = GeccoEngine.create("com.geccocrawler.gecco.demo.dynamic") .interval(5000) .loop(true) .engineStart(); //定义爬取规则 Class<?> rule = DynamicGecco .html() .gecco("https://github.com/xtuhcy/gecco", "consolePipeline") .stringField("title").csspath(".repository-meta-content").text(false).build() .intField("star").csspath(".pagehead-actions li:nth-child(2) .social-count").text(false).build() .intField("fork").csspath(".pagehead-actions li:nth-child(3) .social-count").text().build() .loadClass(); //注册规则 ge.register(rule); //加入初始请求,爬虫引擎开始工作 ge.getScheduler().into(new HttpGetRequest("https://github.com/xtuhcy/gecco")); Thread.sleep(5000); System.out.println("修改规则"); try { //开始更新规则 ge.beginUpdateRule(); //修改规则 Class<?> newRule = DynamicGecco .html(rule.getName()) .gecco("https://github.com/xtuhcy/gecco", "consolePipeline") .intField("fork").csspath(".pagehead-actions li:nth-child(3) .social-count").text().build() .removeField("star") .loadClass(); //注册新规则 ge.register(newRule); } catch(Exception ex) { ex.printStackTrace(); } finally { //规则更新完毕 ge.endUpdateRule(); } Thread.sleep(5000); System.out.println("下线规则"); try { //开始更新规则 ge.beginUpdateRule(); //下线之前的规则(也支持不下线规则,直接修改) ge.unregister(rule); } catch(Exception ex) { ex.printStackTrace(); } finally { //规则更新完毕 ge.endUpdateRule(); } } }