package com.brucezee.jspider.test;

import com.brucezee.jspider.*;
import com.brucezee.jspider.berkeley.BdbPersistentScheduler;
import com.brucezee.jspider.common.utils.SpiderUrlUtils;
import com.brucezee.jspider.monitor.SpiderMonitor;
import com.brucezee.jspider.pipeline.Pipeline;
import com.brucezee.jspider.processor.PageProcessor;
import com.brucezee.jspider.scheduler.Scheduler;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Created by brucezee on 2017/1/13.
 *
 * <p>Manual demo that runs a {@link Spider} with a {@link BdbPersistentScheduler}.
 * It starts from a single Baidu search URL, follows the result links found on the
 * first processed page(s), and prints one line per persisted page.
 */
public class BdbSpiderTest {
    /** Scheduler database directory used when no command-line argument is given. */
    private static final String DEFAULT_DB_DIR = "F:\\tmp\\db";

    /**
     * Number of results persisted so far. Written by {@link BaiduPipeline} and read by
     * {@link BaiduPageProcessor}; an atomic counter is used because those callbacks
     * may run on different spider threads (NOTE(review): threading model of the
     * spider is not visible here - confirm).
     */
    private static final AtomicInteger persistedCount = new AtomicInteger();

    /**
     * Configures and starts the spider, then keeps the main thread alive for a while.
     *
     * @param args optional; {@code args[0]} overrides the scheduler database directory
     *             (defaults to {@code F:\tmp\db})
     */
    public static void main(String[] args) {
        String dbDir = (args != null && args.length > 0) ? args[0] : DEFAULT_DB_DIR;

        // NOTE(review): the second argument is presumably the worker thread count - confirm.
        SpiderConfig spiderConfig = SpiderConfig.create("baidu", 5)
                .setEmptySleepMillis(1000)
                .setExitWhenComplete(true);

        SiteConfig siteConfig = SiteConfig.create()
                .setMaxConnTotal(200)
                .setMaxConnPerRoute(100);

        Scheduler scheduler = new BdbPersistentScheduler(dbDir, "hello");

        Spider spider = Spider.create(spiderConfig, siteConfig, new BaiduPageProcessor())
                .setPipeline(new BaiduPipeline())
                .setScheduler(scheduler)
                .addStartRequests("https://www.baidu.com/s?wd=ip");

        SpiderMonitor.register(spider);

        spider.start();

        System.out.println();

        // Park the main thread after start() so the demo has time to run.
        try {
            Thread.sleep(1000000);
        } catch (InterruptedException e) {
            // Restore the interrupt flag instead of swallowing it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    /**
     * Extracts the page title and, while nothing has been persisted yet, queues the
     * absolute links found under {@code .c-container a} as new target requests.
     */
    public static class BaiduPageProcessor implements PageProcessor {
        /**
         * @param request the request that produced {@code page}
         * @param page    the fetched page
         * @return a result holding the document title under the key {@code "title"}
         */
        @Override
        public Result process(Request request, Page page) {
            Result result = new Result();

            Elements elements = page.document().select(".c-container a");
            // Links are only followed until the first result has been persisted,
            // which keeps the crawl from expanding beyond the initial result pages.
            if (elements != null && elements.size() > 0 && persistedCount.get() == 0) {
                List<String> links = new ArrayList<String>(elements.size());
                for (Element element : elements) {
                    String href = element.absUrl("href");
                    // Skip anchors without a usable absolute URL or host.
                    if (StringUtils.isNotBlank(href)
                            && StringUtils.isNotBlank(SpiderUrlUtils.getUrlHost(href))) {
                        links.add(href);
                    }
                }
                page.addTargetRequests(links);
            }

            result.put("title", page.document().title());
            return result;
        }
    }

    /**
     * Prints a running counter, the page title and the request URL for every result,
     * then pauses for one second.
     */
    public static class BaiduPipeline implements Pipeline {
        /**
         * @param request the request whose result is being persisted
         * @param result  the result produced by the page processor
         */
        @Override
        public void persist(Request request, Result result) {
            System.out.println(persistedCount.incrementAndGet()
                    + "\t" + (String) result.get("title")
                    + "\t" + request.getUrl());
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Restore the interrupt flag so the calling thread can observe it.
                Thread.currentThread().interrupt();
                e.printStackTrace();
            }
        }
    }
}