package org.jggug.hudson.plugins.gcrawler.crawlers;

import static java.lang.String.format;
import static java.util.regex.Pattern.compile;
import static org.jggug.hudson.plugins.gcrawler.util.HttpUtils.getFile;

import java.io.FileNotFoundException;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jggug.hudson.plugins.gcrawler.CrawlContext;
import org.jggug.hudson.plugins.gcrawler.GrailsProjectInfo;
import org.jggug.hudson.plugins.gcrawler.scm.RepositoryException;
import org.jggug.hudson.plugins.gcrawler.scm.SubversionRepository;
import org.jggug.hudson.plugins.gcrawler.util.JobTemplate;

/**
 * Crawler that pages through the Google Code project search for
 * {@code label:Grails} and submits one {@code GoogleCodeCrawlTask} per
 * project name found in the result pages.
 */
public class GoogleCodeCrawler extends CrawlerBase {

    /** Search endpoint; the single {@code %s} placeholder receives the raw query string. */
    private static final String SEARCH_URL = "http://code.google.com/hosting/search?%s";

    /** Captures the project name out of quoted {@code "/p/<name>/"} links in a result page. */
    private static final Pattern PROJECT_NAME = compile("\"/p/([-a-zA-Z0-9]+)/\"");

    /**
     * Captures the query string of the "Next" pagination link.
     * NOTE(review): the pattern expects a literal '›' character inside the link text; if the
     * served HTML encodes it as an entity instead, this will never match -- confirm against a
     * real result page.
     */
    private static final Pattern NEXT_URL = compile("<a href=\".*search\\?(.*?)\">Next <b>›</b></a>");

    /** Names matched by {@link #PROJECT_NAME} that must not be crawled. */
    private static final List<String> IGNORE_PROJECT_NAMES = Arrays.asList("support");

    /** Job description template handed to every crawl task. */
    private static final JobTemplate JOB_DESCRIPTION =
            JobTemplate.createTemplate("google_grails_description.txt");

    /** Task service collecting the submitted crawl tasks; recreated on every {@link #crawl()}. */
    private GrailsCrawlerTaskService service;

    /**
     * Creates a crawler bound to the given context.
     *
     * @param context crawl context, forwarded to the base class
     */
    public GoogleCodeCrawler(CrawlContext context) {
        super(context);
    }

    /**
     * Runs the crawl starting from the {@code q=label:Grails} search query.
     *
     * <p>A {@link FileNotFoundException} raised while fetching a result page is logged as a
     * warning and ends the paging; results gathered so far are still returned.
     *
     * @return whatever the task service reports via {@code getResults()}
     * @throws Exception declared for callers; this method itself only lets through what the
     *         task service or unchecked failures raise
     */
    public List<GrailsProjectInfo> crawl() throws Exception {
        service = new GrailsCrawlerTaskService();
        try {
            crawl("q=label:Grails");
        } catch (FileNotFoundException e1) {
            logger.warn(e1);
        }
        return service.getResults();
    }

    /**
     * Fetches the result page for {@code query}, submits its projects, and keeps following the
     * "Next" link until a page has none.
     *
     * <p>Implemented as a loop rather than one recursive call per page so that stack depth does
     * not grow with the number of result pages.
     *
     * @param query query string appended to {@link #SEARCH_URL} for the first page
     * @throws FileNotFoundException propagated from fetching a page
     */
    private void crawl(String query) throws FileNotFoundException {
        String currentQuery = query;
        while (true) {
            String url = format(SEARCH_URL, currentQuery);
            logger.info(url);
            String html = getFile(url).getText();

            submitProjects(html);

            Matcher nextMatcher = NEXT_URL.matcher(html);
            if (!nextMatcher.find()) {
                return;
            }
            // The query is lifted out of an HTML attribute, where '&' is written as "&amp;".
            // Decode it so the next request URL carries real parameter separators.
            String nextQuery = nextMatcher.group(1).replace("&amp;", "&");
            if (nextQuery.equals(currentQuery)) {
                // A "Next" link pointing back at the current page would never terminate.
                return;
            }
            currentQuery = nextQuery;
        }
    }

    /**
     * Submits a crawl task for every non-ignored project name found in {@code html}.
     *
     * <p>A {@link RepositoryException} for one project is logged as a warning and does not stop
     * the remaining projects from being processed.
     *
     * @param html body of one search result page
     */
    private void submitProjects(String html) {
        Matcher nameMatcher = PROJECT_NAME.matcher(html);
        while (nameMatcher.find()) {
            String name = nameMatcher.group(1);
            if (IGNORE_PROJECT_NAMES.contains(name)) {
                continue;
            }
            try {
                SubversionRepository repository =
                        new SubversionRepository(format("http://%s.googlecode.com/svn/", name), true);
                GoogleCodeCrawlTask crawlTask =
                        new GoogleCodeCrawlTask(name, context, JOB_DESCRIPTION, repository);
                service.submit(crawlTask);
            } catch (RepositoryException e) {
                logger.warn(e);
            }
        }
    }
}