/* * Copyright 2011 Marek Pilecky * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.github.mefi.jkuuza.parser; import java.net.URI; import java.net.URISyntaxException; import java.util.HashSet; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * * @author Marek Pilecky */ public class LinksExtractor { private Document doc; /** * Needs non empty Jsoup Document instance * * @param doc Jsoup Document */ public LinksExtractor(Document doc) { this.doc = doc; } /** * Extracts all links from document and returns links from specified domain * * @param host String: "foo.example.com" * @return set with with links pointing to the host. */ public Set<String> getInternalLinks(String host) { Elements allLinks = doc.select("a[href]"); Set<String> internalLinks = new HashSet<String>(); host = canonizeHost(host); for (Element link : allLinks) { String linkUrl = createLinkUrl(link); if (isInternal(linkUrl, host)) { internalLinks.add(linkUrl); } } return internalLinks; } /** * Creates normalized url from link in org.jsoup.nodes.Element * * @param link Element * @return String with url */ public String createLinkUrl(Element link) { String linkUrl = link.attr("abs:href").toString(); return normalizeUrl(linkUrl); } /** * Check if link points back to the domain specified by host * * @param linkUrl * @param host * @return true if host is part od link url */ public boolean isInternal(String linkUrl, String host) { Pattern pattern = Pattern.compile("^(http|https|ftp)://(www.)?[a-zA-Z0-9.]*" + host + "*"); Matcher matcher = pattern.matcher(linkUrl); if (!linkUrl.isEmpty() && matcher.find()) { return true; } else { return false; } } /** * Transforms String host to required form * * @param host * @return host in form example.com; foo.example.com; ... */ public String canonizeHost(String host) { host = host.trim(); host = host.replace("https://", ""); host = host.replace("http://", ""); host = host.replace("www.", ""); return host; } /** * Removes php session from url * * @param host * @return host in form example.com; foo.example.com; ... */ protected String removePhpsessid(String string) { if (string.contains("PHPSESSID")) { Pattern pattern = Pattern.compile("PHPSESSID=[a-z0-9]*[&]?"); Matcher matcher = pattern.matcher(string); String output = matcher.replaceAll(""); if (output.endsWith("?")) { output = output.substring(0, output.lastIndexOf("?")); } if (output.endsWith("&")) { output = output.substring(0, output.lastIndexOf("&")); } return output; } return string; } public String normalizeUrl(String url) { URI uri = null; // remove anchor if (url.contains("#")) { url = url.substring(0, url.indexOf("#")); } // remove /../ if (url.contains("/../")) { url = url.replace("/../", "/"); } // remove PHPSESSID url = removePhpsessid(url); // remove . // remove .. // remove : try { uri = new URI(url); uri = uri.normalize(); } catch (URISyntaxException ex) { return ""; } return uri.toString(); } /** * Returns Document * * @return Doc */ public Document getDoc() { return doc; } /** * Sets document * * @param doc Document */ public void setDoc(Document doc) { this.doc = doc; } }