/*
 *  Copyright 2014 Eric F. Savage, code@efsavage.com
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package com.ajah.scrape;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;

import lombok.extern.java.Log;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.ajah.http.cache.DiskCache;
import com.ajah.http.err.HttpException;
import com.ajah.util.StringUtils;

/**
 * Fetches pages through {@link DiskCache} and scrapes links and tables out of
 * the returned HTML with Jsoup.
 * 
 * @author <a href="http://efsavage.com">Eric F. Savage</a>, <a
 *         href="mailto:code@efsavage.com">code@efsavage.com</a>.
 */
@Log
public class AjahScraper {

	/** Maximum age of a cached copy, in milliseconds (24 hours). */
	private final long timeout = 86_400_000;

	/**
	 * Fetches and parses the document that a previously scraped link points to,
	 * using the link's href as the base URI.
	 */
	public Document getDocument(final ScrapedLink link) throws IOException, URISyntaxException, HttpException {
		log.fine("Fetching " + link.getText() + " at " + link.getHref());
		final String html = DiskCache.get(new URI(link.getHref()), this.timeout);
		return Jsoup.parse(html, link.getHref());
	}

	/**
	 * Fetches and parses the document at a URI.
	 */
	public Document getDocument(final URI uri) throws IOException, HttpException {
		log.fine("Fetching " + uri);
		final String html = DiskCache.get(uri, this.timeout);
		return Jsoup.parse(html, uri.toASCIIString());
	}

	/**
	 * Scrapes the links matching a CSS selector from the page at a URI.
	 * Root-relative hrefs are resolved against the page's scheme and host.
	 * Regex patterns are not supported.
	 */
	public List<ScrapedLink> scrapeLinks(final URI uri, final String linkPattern, final boolean regex, final boolean ignoreEmptyText) throws IOException, HttpException {
		log.finest("Scraping links from: " + uri.toString());
		final String html = DiskCache.get(uri, this.timeout);
		final Document doc = Jsoup.parse(html, uri.toASCIIString());
		if (StringUtils.isBlank(linkPattern)) {
			throw new IllegalArgumentException("A link pattern is required");
		}
		if (regex) {
			throw new IllegalArgumentException("Regex patterns are not supported");
		}
		final Elements links = doc.select(linkPattern);
		final List<ScrapedLink> scrapedLinks = new ArrayList<>();
		for (final Element link : links) {
			if (StringUtils.isBlank(link.attr("href"))) {
				continue;
			}
			if (ignoreEmptyText && StringUtils.isBlank(link.text())) {
				continue;
			}
			String href = link.attr("href");
			if (href.startsWith("/")) {
				// Resolve root-relative hrefs against the page's scheme and host.
				href = uri.getScheme() + "://" + uri.getHost() + href;
			}
			scrapedLinks.add(new ScrapedLink(href, link.text()));
		}
		return scrapedLinks;
	}

	/**
	 * Scrapes a table from the page at a URI. The table may be selected by a
	 * CSS selector or, if regex is true, by a regular expression against the
	 * table's ID (the pattern must start with "#").
	 * 
	 * @return The scraped table, or null if no matching table was found.
	 */
	public ScrapedTable scrapeTable(final URI uri, final String tablePattern, final boolean regex) throws IOException, HttpException {
		final String html = DiskCache.get(uri, this.timeout);
		final Document doc = Jsoup.parse(html);
		Element table = null;
		if (StringUtils.isBlank(tablePattern) || regex) {
			final Elements tables = doc.select("table");
			if (regex) {
				if (tablePattern.startsWith("#")) {
					// Match the pattern (minus the leading "#") against each table's ID.
					for (final Element candidate : tables) {
						if (candidate.id().matches(tablePattern.substring(1))) {
							table = candidate;
							break;
						}
					}
				} else {
					throw new IllegalArgumentException("Unsupported regex pattern: " + tablePattern);
				}
			}
		} else {
			// first() returns null if the selector matched nothing.
			table = doc.select(tablePattern).first();
		}
		if (table == null) {
			log.warning("No table found");
			log.warning(html);
			return null;
		}
		log.finest("Found table");
		final ScrapedTable scrapedTable = new ScrapedTable();
		final Elements rows = table.select("tr");
		for (final Element row : rows) {
			final ScrapedRow scrapedRow = new ScrapedRow();
			scrapedRow.setCssClass(row.className());
			final Elements cells = row.select("td");
			for (final Element cell : cells) {
				// If the cell contains exactly one link, capture its href.
				final Elements links = cell.select("a");
				String href = null;
				if (links.size() == 1) {
					href = links.get(0).attr("href");
				}
				final ScrapedCell scrapedCell = new ScrapedCell(cell.text(), href, cell.html());
				scrapedRow.add(scrapedCell);
			}
			scrapedTable.add(scrapedRow);
		}
		return scrapedTable;
	}

}
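
/*
 * Usage sketch: the URIs and selectors below are hypothetical, and the checked
 * exceptions (IOException, URISyntaxException, HttpException) would need to be
 * handled by the caller.
 *
 *   AjahScraper scraper = new AjahScraper();
 *
 *   // Anchors matching a CSS selector, skipping links with empty text.
 *   List<ScrapedLink> links = scraper.scrapeLinks(new URI("https://example.com/"), "a.story", false, true);
 *
 *   // A table matched by a CSS selector (pass regex=true and a "#id-regex" pattern to match by ID instead).
 *   ScrapedTable table = scraper.scrapeTable(new URI("https://example.com/stats"), "table#results", false);
 */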