/* $Id$ */ /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.manifoldcf.crawler.connectors.webcrawler; import org.apache.manifoldcf.core.interfaces.*; /** This class is used to discover links in a session login context */ public class FindHandler implements IDiscoveredLinkHandler { protected String parentURI; protected String targetURI = null; public FindHandler(String parentURI) { this.parentURI = parentURI; } /** Inform the world of a discovered link. *@param rawURL is the raw discovered url. This may be relative, malformed, or otherwise unsuitable for use until final form is acheived. */ @Override public void noteDiscoveredLink(String rawURL) throws ManifoldCFException { // Build a complete url, but don't filter or anything try { java.net.URI url; if (parentURI != null) { java.net.URI parentURL = new java.net.URI(parentURI); url = parentURL.resolve(rawURL); } else url = new java.net.URI(rawURL); String protocol = url.getScheme(); String host = url.getHost(); // The new URL better darn well have a host and a protocol, and we only know how to deal with // http and https. if (protocol == null || host == null) { return; } if (!WebcrawlerConnector.understoodProtocols.contains(protocol)) { return; } String id = url.toASCIIString(); if (id == null) return; // As a last basic legality check, go through looking for illegal characters. int i = 0; while (i < id.length()) { char x = id.charAt(i++); // Only 7-bit ascii is allowed in URLs - and that has limits too (no control characters) if (x < ' ' || x > 127) { return; } } // Set the target. targetURI = id; } catch (java.net.URISyntaxException e) { return; } catch (java.lang.IllegalArgumentException e) { return; } catch (java.lang.NullPointerException e) { // This gets tossed by url.toAsciiString() for reasons I don't understand, but which have to do with a malformed URL. return; } } public String getTargetURI() { return targetURI; } }