/****************************************************** * Web crawler * * * Copyright (C) 2012 by Peter Hedenskog (http://peterhedenskog.com) * ****************************************************** * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express * or implied. See the License for the specific language governing permissions and limitations under * the License. * ******************************************************* */ package com.soulgalore.crawler.core; import java.net.URI; import java.net.URL; import java.net.URLDecoder; /** * A page url. * */ public class CrawlerURL { private final String url; private final URI uri; private final String host; private final String referer; private final boolean isWrongSyntax; /** * Create a page url with a blank referer. * * @param theUrl to the asset */ public CrawlerURL(String theUrl) { this(theUrl, ""); } /** * Create a page url with an referer. * * @param theUrl to the asset. * @param theUrlReferer the url to the referer. */ public CrawlerURL(String theUrl, String theUrlReferer) { url = theUrl; referer = theUrlReferer; URI tmpURI = null; try { URL u = null; // sometimes the urls are encoded // but how do we handle the problem with url:s that are // encoded but not the + sign? // better to check if it contains faulty characters // if (url.matches("@^[a-zA-Z0-9%+-_]*$@")) if (url.contains("%")) u = new URL(URLDecoder.decode(theUrl, "UTF-8")); else u = new URL(theUrl); // skipping the segment part, since the # is only for the browser tmpURI = new URI(u.getProtocol(), u.getUserInfo(), u.getHost(), u.getPort(), u.getPath(), u.getQuery(), null); } catch (Exception e) { // an ugly catch all, we should act on it somehow } uri = tmpURI; isWrongSyntax = (uri == null); host = (uri == null) ? null : uri.getHost(); } public boolean isWrongSyntax() { return isWrongSyntax; } public URI getUri() { return uri; } public String getHost() { return host; } public String getReferer() { return referer; } public String getUrl() { return url; } @Override public String toString() { return this.getClass().getSimpleName() + " url:" + url; } @Override public int hashCode() { // here's a hack for saying http://example.com is the same as http://example.com/ if (uri == null) return 0; final int prime = 31; int result = 1; String hash = uri.toString(); if (hash.endsWith("/")) hash = hash.substring(0, hash.length() - 1); result = prime * result + hash.hashCode(); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; final CrawlerURL other = (CrawlerURL) obj; if (uri == null) { if (other.uri != null) return false; } else if (uri.equals(other.uri)) return true; // here's a hack for saying http://example.com is the same as // http://example.com/ else if (uri.toString().endsWith("/")) { String withoutEndingSlash = uri.toString().substring(0, uri.toString().length() - 1); if (withoutEndingSlash.equals(other.uri.toString())) return true; } return false; } }