/******************************************************************************* * CogTool Copyright Notice and Distribution Terms * CogTool 1.3, Copyright (c) 2005-2013 Carnegie Mellon University * This software is distributed under the terms of the FSF Lesser * Gnu Public License (see LGPL.txt). * * CogTool is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * (at your option) any later version. * * CogTool is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with CogTool; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * * CogTool makes use of several third-party components, with the * following notices: * * Eclipse SWT version 3.448 * Eclipse GEF Draw2D version 3.2.1 * * Unless otherwise indicated, all Content made available by the Eclipse * Foundation is provided to you under the terms and conditions of the Eclipse * Public License Version 1.0 ("EPL"). A copy of the EPL is provided with this * Content and is also available at http://www.eclipse.org/legal/epl-v10.html. * * CLISP version 2.38 * * Copyright (c) Sam Steingold, Bruno Haible 2001-2006 * This software is distributed under the terms of the FSF Gnu Public License. * See COPYRIGHT file in clisp installation folder for more information. * * ACT-R 6.0 * * Copyright (c) 1998-2007 Dan Bothell, Mike Byrne, Christian Lebiere & * John R Anderson. * This software is distributed under the terms of the FSF Lesser * Gnu Public License (see LGPL.txt). 
* * Apache Jakarta Commons-Lang 2.1 * * This product contains software developed by the Apache Software Foundation * (http://www.apache.org/) * * jopt-simple version 1.0 * * Copyright (c) 2004-2013 Paul R. Holser, Jr. * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * Mozilla XULRunner 1.9.0.5 * * The contents of this file are subject to the Mozilla Public License * Version 1.1 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/. * Software distributed under the License is distributed on an "AS IS" * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the * License for the specific language governing rights and limitations * under the License. * * The J2SE(TM) Java Runtime Environment version 5.0 * * Copyright 2009 Sun Microsystems, Inc., 4150 * Network Circle, Santa Clara, California 95054, U.S.A. All * rights reserved. U.S. 
* See the LICENSE file in the jre folder for more information.
 ******************************************************************************/

package edu.cmu.cs.hcii.cogtool.model;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Pattern;

import edu.cmu.cs.hcii.cogtool.util.ObjectLoader;
import edu.cmu.cs.hcii.cogtool.util.ObjectSaver;

/**
 * Basic type for specifying a URL to crawl (i.e., visit and parse)
 * in a web spider (see controller/WebCrawler, for instance).
 * <p>
 * An entry holds the URL string, the depth to which it should be crawled,
 * and a domain string. Instances are persisted through the
 * {@link ObjectSaver}/{@link ObjectLoader} pair registered by
 * {@link #registerSaver()} and {@link #registerLoader()}.
 */
public class URLCrawlEntry
{
    /**
     * Each URL may have an associated depth; if not, the default depth
     * specified in the crawlWeb call is used (if none specified, then
     * INFINITE_DEPTH is used).
     */
    public static final int USE_DEFAULT_DEPTH = -1;

    /**
     * Whenever no default depth is specified, (effective) infinity is used.
     */
    public static final int INFINITE_DEPTH = Integer.MAX_VALUE;

    /** Persistence format version reported by the saver and loader. */
    public static final int edu_cmu_cs_hcii_cogtool_model_URLCrawlEntry_version = 0;

    // Variable names under which the fields are written and read back.
    protected static final String urlVAR = "url";
    protected static final String toDepthVAR = "toDepth";
    protected static final String domainVAR = "domain";

    /** Domain value used when the caller does not supply one. */
    private static final String DEFAULT_DOMAIN = " ";

    /** Writes the url, toDepth and domain fields of an entry. */
    private static final ObjectSaver.IDataSaver<URLCrawlEntry> SAVER =
        new ObjectSaver.ADataSaver<URLCrawlEntry>() {
            @Override
            public int getVersion()
            {
                return edu_cmu_cs_hcii_cogtool_model_URLCrawlEntry_version;
            }

            @Override
            public void saveData(URLCrawlEntry v, ObjectSaver saver)
                throws IOException
            {
                saver.saveString(v.url, urlVAR);
                saver.saveInt(v.toDepth, toDepthVAR);
                saver.saveString(v.domain, domainVAR);
            }
        };

    /**
     * Registers the saver for this class with {@link ObjectSaver}.
     */
    public static void registerSaver()
    {
        ObjectSaver.registerSaver(URLCrawlEntry.class.getName(), SAVER);
    }

    /**
     * Restores every field that the saver writes: toDepth through the
     * int-valued setter, url and domain through the Object-valued setter.
     */
    private static final ObjectLoader.IObjectLoader<URLCrawlEntry> LOADER =
        new ObjectLoader.AObjectLoader<URLCrawlEntry>() {
            @Override
            public URLCrawlEntry createObject()
            {
                return new URLCrawlEntry();
            }

            @Override
            public void set(URLCrawlEntry target, String variable, int value)
            {
                // Constant-first equals also covers a null variable name.
                if (toDepthVAR.equals(variable)) {
                    target.toDepth = value;
                }
            }

            @Override
            public void set(URLCrawlEntry target, String variable, Object value)
            {
                if (urlVAR.equals(variable)) {
                    target.url = (String) value;
                }
                else if (domainVAR.equals(variable)) {
                    // The saver writes the domain, so it must be read back;
                    // otherwise a loaded entry silently reverts to the
                    // default domain.
                    // NOTE(review): presumes saved strings arrive through
                    // this overload, as the url evidently does -- confirm
                    // against ObjectLoader.
                    target.domain = (String) value;
                }
            }
        };

    /**
     * Registers the loader for this class and format version with
     * {@link ObjectLoader}.
     */
    public static void registerLoader()
    {
        ObjectLoader.registerLoader(URLCrawlEntry.class.getName(),
                                    edu_cmu_cs_hcii_cogtool_model_URLCrawlEntry_version,
                                    LOADER);
    }

    // The URL must be compliant!
    protected String url = null;
    protected int toDepth = 1;
    protected String domain = DEFAULT_DOMAIN;

    /**
     * Creates an entry with a null URL, a depth of 1 and the default domain.
     */
    protected URLCrawlEntry()
    {
        // For subclasses and loading
    }

    /**
     * Creates an entry with the default domain.
     *
     * @param urlToCrawl the URL to crawl
     * @param depth the depth to crawl to
     */
    public URLCrawlEntry(String urlToCrawl, int depth)
    {
        this(urlToCrawl, depth, DEFAULT_DOMAIN);
    }

    /**
     * Creates an entry with an explicit domain.
     *
     * @param urlToCrawl the URL to crawl
     * @param depth the depth to crawl to
     * @param domain the domain string to associate with the URL
     */
    public URLCrawlEntry(String urlToCrawl, int depth, String domain)
    {
        url = urlToCrawl;
        toDepth = depth;
        this.domain = domain;
    }

    /**
     * @return the URL string currently held (may be null)
     */
    public String getURL()
    {
        return url;
    }

    /**
     * @return the domain string currently held
     */
    public String getDomain()
    {
        return domain;
    }

    /**
     * Need not be an absolute URL (yet).
     *
     * @param newURL the URL string to store
     */
    public void setURL(String newURL)
    {
        url = newURL;
    }

    /**
     * @param newDomain the domain string to store
     */
    public void setDomain(String newDomain)
    {
        domain = newDomain;
    }

    /**
     * @return true if the URL is null or the empty string
     */
    public boolean isEmpty()
    {
        return (url == null) || (url.length() == 0);
    }

    /**
     * @return the depth to which this URL should be crawled
     */
    public int getToDepth()
    {
        return toDepth;
    }

    /**
     * @param newToDepth the depth to which this URL should be crawled
     */
    public void setToDepth(int newToDepth)
    {
        toDepth = newToDepth;
    }

    /**
     * Pattern that matches a protocol followed by a colon at the start
     * of a given URL.
     * <p>
     * Note that a scheme-less "host:port" string whose host consists only
     * of scheme characters (e.g. "localhost:8080") also matches.
     */
    protected static final Pattern SCHEME_PATTERN =
        Pattern.compile("^\\p{L}[\\p{L}0-9+.-]*:.*$");

    /**
     * Determine whether the given URL string is absolute or needs its
     * parent's protocol/host/port prefix prepended.
     *
     * @param url the URL string to test; must not be null
     * @return true if the string starts with a scheme followed by a colon
     */
    public static boolean isAbsolute(String url)
    {
        return SCHEME_PATTERN.matcher(url).matches();
    }

    /**
     * Returns an absolute form of the given URL string, prefixing
     * "http://" when the string does not already start with a scheme.
     *
     * @param url the URL string to make absolute; must not be null
     * @return the string form of the resulting {@link URL}
     * @throws MalformedURLException if the resulting string cannot be
     *         parsed as a URL
     */
    public static String ensureAbsolute(String url)
        throws MalformedURLException
    {
        String candidate = isAbsolute(url) ? url : ("http://" + url);

        return new URL(candidate).toString();
    }

    /**
     * @return true if this entry's URL starts with a scheme
     */
    public boolean isAbsolute()
    {
        return isAbsolute(url);
    }

    /**
     * The internal URL string is guaranteed to be absolute after this call.
     *
     * @throws MalformedURLException if the resulting string cannot be
     *         parsed as a URL
     */
    public void ensureAbsolute() throws MalformedURLException
    {
        setURL(ensureAbsolute(url));
    }

    /**
     * Removes the #... portion of the URL.
     * <p>
     * The fragment starts at the first '#' and runs to the end of the
     * string, so everything from the first '#' onwards is dropped.
     *
     * @param url the URL string to strip; must not be null
     * @return the URL without its fragment, or the same string if it
     *         contains no '#'
     */
    public static String stripFragment(String url)
    {
        int fragmentStart = url.indexOf('#');

        return (fragmentStart == -1) ? url : url.substring(0, fragmentStart);
    }

    /**
     * Removes the #... portion of this entry's URL.
     */
    public void stripFragment()
    {
        setURL(stripFragment(url));
    }
}