// Copyright 2009 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package com.google.enterprise.connector.util; import java.io.IOException; import java.net.HttpURLConnection; import java.net.URL; import java.net.URLConnection; import java.security.GeneralSecurityException; import java.util.logging.Level; import java.util.logging.Logger; import javax.net.ssl.HttpsURLConnection; /** * Validates URLs by making an HTTP request. * * @since 2.6.6 */ /* * TODO: We might want to merge XmlFeed#validateSearchUrl into this class. * * TODO: With an URLStreamHandler, we might be able to do some unit * testing of this class. */ public class UrlValidator { /** The logger for this class. */ private static final Logger LOGGER = Logger.getLogger(UrlValidator.class.getName()); /** The connect timeout. */ private volatile int connectTimeout = 60 * 1000; /** The read timeout. */ private volatile int readTimeout = 60 * 1000; /** The HTTP request method. */ private volatile String requestMethod = "HEAD"; /** Whether redirects should be followed or returned as the response. */ private volatile boolean followRedirects = false; /** Whether fully qualified host names must be specified. */ private volatile boolean requireFullyQualifiedHostNames = false; /** Constructs an instance using the default parameter values. */ public UrlValidator() { } /** * Sets the HTTP request method. The default value is "HEAD". * * @param requestMethod should be either "GET" or "HEAD" * @see HttpURLConnection#setRequestMethod */ public void setRequestMethod(String requestMethod) { this.requestMethod = requestMethod; } /** * Sets whether to follow HTTP redirects, or return them as the * response. The default is {@code false}, which returns the * redirect as the HTTP response. * * @param followRedirects {@code true} to follow HTTP * redirects, or {@code false} to return them as the HTTP * response * @see HttpURLConnection#setInstanceFollowRedirects */ public void setFollowRedirects(boolean followRedirects) { this.followRedirects = followRedirects; } /** * Sets whether fully qualified host names are required in the URL. * IP addresses are still OK, but host names must be fully qualified. * The default is {@code false}, which allows non-fully qualified * host names, even thought the GSA requires one in most cases. * * @param requireFullyQualifiedHostNames {@code true} if host * names must be fully qualified, {@code false} if not */ public void setRequireFullyQualifiedHostNames( boolean requireFullyQualifiedHostNames) { this.requireFullyQualifiedHostNames = requireFullyQualifiedHostNames; } /** * Sets the connect timeout. The default value is 60000 milliseconds. * * @param connectTimeout the connect timeout in milliseconds * @see URLConnection#setConnectTimeout */ public void setConnectTimeout(int connectTimeout) { this.connectTimeout = connectTimeout; } /** * Sets the read timeout. The default value is 60000 milliseconds. * * @param readTimeout the read timeout in milliseconds * @see URLConnection#setReadTimeout */ public void setReadTimeout(int readTimeout) { this.readTimeout = readTimeout; } /** * Attempts to validate the given URL by making an HTTP request. In * this case, we're mostly trying to catch typos, so "valid" means: * <ol> * <li>The URL syntax is valid.</li> * <li>If fully qualified host names are required, check that the * host name looks fully qualified (contains a '.').</li> * <li>If the URL uses HTTP or HTTPS: * <ol> * <li>A connection can be made and the response read.</li> * <li>The response code was not 404, * or any of the following related but less common errors: * 400, 405, 410, or 414.</li> * </ol> * </li> * </ol> * <p> * The 405 (Method Not Allowed) is related because the Sun Java * System Web Server, and possibly Apache, return this code rather * than a 404 if you attempt to access a CGI program in an unknown * directory. * <p> * When testing an HTTPS URL, we override server certificate * validation to skip trying to verify the server's certificate, * and we accept hostname mismatches. In this case, all we care * about is that the configured URL can be reached; it's up to the * connector administrator to enter the right URL. * * @param urlString the URL to test * @throws GeneralSecurityException if there is an error configuring * the HTTPS connection * @throws IOException if the URL is malformed, or if there is an * error connecting or reading the response * @throws UrlValidatorException if the HTTP status code was invalid */ /* * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4912484 * The above Sun bug report documents that openConnection * doesn't try to connect. * * This method returns the HTTP response code so that it can be * unit tested. A return value of 0 is arbitrary and unused by the * tests. */ public int validate(String urlString) throws GeneralSecurityException, IOException, UrlValidatorException { if (urlString == null || urlString.trim().length() == 0) { return 0; } URL url = new URL(urlString); if (requireFullyQualifiedHostNames) { // The GSA requires fully qualified host names for most hosts. // This non-rigorous test simply looks for '.' in hostname. // Conveniently, IPv4 addresses also pass this test (but not // IPv6 addresses). String host = url.getHost(); if ((host.charAt(0) != '[') && (host.indexOf('.') < 0)) { // FIXME: This string should be translated, either locally, // which might be troubling, or by throwing a more specific // exception so that the connector can provide a localized // message. LOGGER.severe("Fully qualified host name is required: " + host); throw new UrlValidatorException(HttpURLConnection.HTTP_PRECON_FAILED, "Fully qualified host name is required: " + host); } } URLConnection conn = url.openConnection(); if (!(conn instanceof HttpURLConnection)) { // If the URL is not an HTTP or HTTPS URL, which is // incredibly unlikely, we don't check anything beyond // the URL syntax. return 0; } HttpURLConnection httpConn = (HttpURLConnection) conn; if (httpConn instanceof HttpsURLConnection) { SslUtil.setTrustingHttpsOptions((HttpsURLConnection) httpConn); } setTimeouts(conn); httpConn.setRequestMethod(requestMethod); httpConn.setInstanceFollowRedirects(followRedirects); httpConn.connect(); try { int responseCode = httpConn.getResponseCode(); String responseMessage = httpConn.getResponseMessage(); switch (responseCode) { case HttpURLConnection.HTTP_BAD_REQUEST: case HttpURLConnection.HTTP_NOT_FOUND: case HttpURLConnection.HTTP_BAD_METHOD: case HttpURLConnection.HTTP_GONE: case HttpURLConnection.HTTP_REQ_TOO_LONG: if (LOGGER.isLoggable(Level.SEVERE)) { LOGGER.severe("Validate URL HTTP response: " + responseCode + " " + responseMessage); } throw new UrlValidatorException(responseCode, responseMessage); default: if (LOGGER.isLoggable(Level.CONFIG)) { LOGGER.config("Validate URL HTTP response: " + responseCode + " " + responseMessage); } break; } return responseCode; } catch (IOException e) { if ("Authentication failure".equals(e.getMessage())) { // This exception can be thrown during NTLM negotiation. // TODO(jlacey): Write a test to reproduce this exception. LOGGER.log(Level.CONFIG, "Validate URL HTTP response: returning 401 for {0}", e.toString()); return HttpURLConnection.HTTP_UNAUTHORIZED; } else { throw e; } } finally { httpConn.disconnect(); } } /** * Sets the connect and read timeouts of the given {@code URLConnection}. * * @param conn the URL connection */ private void setTimeouts(URLConnection conn) { conn.setConnectTimeout(connectTimeout); conn.setReadTimeout(readTimeout); } }