/* ############################################################################ ## ## Copyright (C) 2006-2009 University of Utah. All rights reserved. ## ## This file is part of DeepPeep. ## ## This file may be used under the terms of the GNU General Public ## License version 2.0 as published by the Free Software Foundation ## and appearing in the file LICENSE.GPL included in the packaging of ## this file. Please review the following to ensure GNU General Public ## Licensing requirements will be met: ## http://www.opensource.org/licenses/gpl-license.php ## ## If you are unsure which license is appropriate for your use (for ## instance, you are interested in developing a commercial derivative ## of DeepPeep), please contact us at deeppeep@sci.utah.edu. ## ## This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE ## WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. ## ############################################################################ */ package focusedCrawler.link.frontier; import java.io.IOException; import java.io.Serializable; import java.net.MalformedURLException; import java.net.URL; import java.util.Comparator; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.DeserializationContext; import com.fasterxml.jackson.databind.JsonDeserializer; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.annotation.JsonDeserialize; import com.google.common.net.InetAddresses; import com.google.common.net.InternetDomainName; @SuppressWarnings("serial") public class LinkRelevance implements Serializable { public static double DEFAULT_RELEVANCE = 299; public static double DEFAULT_HUB_RELEVANCE = 100; public static double DEFAULT_AUTH_RELEVANCE = 200; public enum Type { FORWARD, ROBOTS, SITEMAP } public static Comparator<LinkRelevance> DESC_ORDER_COMPARATOR = new Comparator<LinkRelevance>() { @Override public int compare(LinkRelevance o1, LinkRelevance o2) { return Double.compare(o2.getRelevance(), o1.getRelevance()); } }; @JsonDeserialize(using = UrlDeseralizer.class) private URL url; private double relevance; private Type type; public LinkRelevance() { // required for JSON serialization } public LinkRelevance(String string, double relevance) throws MalformedURLException { this(new URL(string), relevance); } public LinkRelevance(URL url, double relevance) { this(url, relevance, Type.FORWARD); } public LinkRelevance(String url, double relevance, Type type) throws MalformedURLException { this(new URL(url), relevance, type); } public LinkRelevance(URL url, double relevance, Type type) { this.url = url; this.relevance = relevance; this.type = type; } public URL getURL() { return url; } public double getRelevance() { return relevance; } public Type getType() { return type; } @JsonIgnore private InternetDomainName getDomainName(String host) { InternetDomainName domain = InternetDomainName.from(host); if(host.startsWith("www.")) { return InternetDomainName.from(host.substring(4)); } else { return domain; } } @JsonIgnore public String getTopLevelDomainName() { String host = url.getHost(); InternetDomainName domain = null; try { domain = this.getDomainName(host); if(domain.isUnderPublicSuffix()) { return domain.topPrivateDomain().toString(); } else { // if the domain is a public suffix, just use it as top level domain return domain.toString(); } } catch (IllegalArgumentException e) { // when host is an IP address, use it as TLD if(InetAddresses.isInetAddress(host)) { return host; } throw new IllegalStateException("Invalid top private domain name=["+domain+"] in URL=["+url+"]", e); } } public static LinkRelevance create(String url) throws MalformedURLException { return new LinkRelevance(new URL(url), LinkRelevance.DEFAULT_RELEVANCE); } @Override public String toString() { return "LinkRelevance[url=" + url + ", relevance=" + relevance + "]"; } public static class UrlDeseralizer extends JsonDeserializer<URL> { @Override public URL deserialize(JsonParser parser, DeserializationContext ctxt) throws IOException, JsonProcessingException { JsonNode node = parser.getCodec().readTree(parser); return new URL(node.asText()); } } }