package focusedCrawler.link.backlink; import java.io.IOException; import java.net.URL; import java.net.URLConnection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import focusedCrawler.util.parser.BackLinkNeighborhood; public class GoogleBacklinkApi implements BacklinkApi { final String userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11"; public BackLinkNeighborhood[] downloadBacklinks(String host) throws IOException { // 21 -> max number allowed by google... decreases after String backlink = "https://www.google.com/search?q=link:" + host + "&num=21"; try { URLConnection connection = new URL(backlink).openConnection(); connection.setRequestProperty("User-Agent", userAgent); connection.connect(); Document doc = Jsoup.parse(connection.getInputStream(), "UTF-8", host); Elements searchItems = doc.select("div#search"); Elements linkHeaders = searchItems.select(".r"); Elements linksUrl = linkHeaders.select("a[href]"); int resultSize = linksUrl.size(); BackLinkNeighborhood[] backlinks = new BackLinkNeighborhood[resultSize]; int i = 0; for (Element link : linksUrl) { backlinks[i] = new BackLinkNeighborhood(); backlinks[i].setLink(link.attr("href")); backlinks[i].setTitle(link.text()); i++; } return backlinks; } catch (IOException e) { throw new IOException("Failed to download backlinks from Google.", e); } } }