/*
* Leech - crawling capabilities for Apache Tika
*
* Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free
* Software Foundation, either version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Contact us by mail: christian.reuschling@dfki.de
*/
package de.dfki.km.leech.util;
import java.io.File;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.TreeSet;
import java.util.regex.Pattern;
import javax.mail.URLName;
/**
 * Offers utility methods related to URLs, network connections, etc.
 * <p>
 * All methods are static; the class itself holds no mutable state.
 */
public class UrlUtil
{
    /** Matches a path segment that is directly followed by a reference to its parent directory, e.g. <code>/foo/../</code>. */
    private static final Pattern PARENT_REFERENCE = Pattern.compile("/[^/]+/\\.\\./");

    /** Marker that introduces the UID value inside an URL String. It is searched case-insensitively. */
    private static final String UID_MARKER = "uid=";



    /**
     * Extracts the folder from the file part of the given URL. The folder is everything in front of the first ';' (if there is one), with all
     * trailing slashes removed.
     *
     * @param url the URL to read the folder from
     *
     * @return the folder; the empty String in the case the file part starts with ';' or consists of slashes only
     */
    public static String extractFolder(URLName url)
    {
        // NOTE(review): if url.getFile() can return null for URLs without a file part, this method throws a NullPointerException - confirm with callers
        String strFolder = url.getFile();

        // cut off everything that follows a ';' - also when the ';' is the very first character
        int iParamStart = strFolder.indexOf(';');
        if(iParamStart >= 0) strFolder = strFolder.substring(0, iParamStart);

        // strip all trailing slashes
        int iEnd = strFolder.length();
        while (iEnd > 0 && strFolder.charAt(iEnd - 1) == '/')
            iEnd--;

        return strFolder.substring(0, iEnd);
    }



    /**
     * Extracts the value that follows the first (case-insensitive) occurence of "uid=" inside the String representation of the given URL. The
     * value ends at the next ';' or at the end of the String.
     *
     * @param url the URL to read the UID from
     *
     * @return the UID value (possibly the empty String), or null in the case the URL contains no "uid=" marker
     */
    public static String extractUID(URLName url)
    {
        String strUrl = url.toString();

        // locale-independent search, performed on the original String so that the found index is always valid for it
        int iMarker = indexOfIgnoreCase(strUrl, UID_MARKER);
        if(iMarker == -1) return null;

        String strUID = strUrl.substring(iMarker + UID_MARKER.length());

        int iEnd = strUID.indexOf(';');
        if(iEnd >= 0) strUID = strUID.substring(0, iEnd);

        return strUID;
    }



    /**
     * Creates a copy of the given URL with the same protocol, host, port, file and username, but with an empty String as password.
     *
     * @param urlNameWithPassword the URL whose password should be removed
     *
     * @return a new URLName instance that carries an empty password
     */
    public static URLName urlNameWithoutPassword(URLName urlNameWithPassword)
    {
        return new URLName(urlNameWithPassword.getProtocol(), urlNameWithPassword.getHost(), urlNameWithPassword.getPort(),
                urlNameWithPassword.getFile(), urlNameWithPassword.getUsername(), "");
    }



    /**
     * If the given String can be parsed as an URL, the method will return another URL String without a possible password. In the case the String
     * can not be parsed as URL, the method will return the given original String.
     *
     * @param strPossibleUrlNameWithPassword a String that possibly is an URL String with password
     *
     * @return the original String in the case it could not be parsed as an Url, an Url String with removed password otherwise.
     */
    public static String urlNameWithoutPassword(String strPossibleUrlNameWithPassword)
    {
        try
        {
            return urlNameWithoutPassword(new URLName(strPossibleUrlNameWithPassword)).toString();
        }
        catch (Exception e)
        {
            // deliberate best effort: everything that is not parseable is handed back untouched
            return strPossibleUrlNameWithPassword;
        }
    }



    /**
     * Remove relative references and "mistakes" like double slashes from the path. The following replacements are repeated until the path does
     * not change anymore, so that also runs like "///" or chains like "/a/b/../../" are resolved completely:
     *
     * <ul>
     * <li>"//" becomes "/"
     * <li>"/./" becomes "/"
     * <li>"/segment/../" becomes "/"
     * </ul>
     *
     * Only references that are enclosed in slashes are recognized, e.g. a path starting with "./" or ending with "/.." keeps these parts.
     *
     * @param path The path to normalize.
     * @return The normalized path, null in the case the given path was null.
     */
    public static String normalizePath(String path)
    {
        if(path == null) return null;

        String result = path;
        String previous;
        do
        {
            previous = result;

            // replace all double slashes with a single slash
            result = replace("//", "/", result);
            // replace all references to the current directory with nothing
            result = replace("/./", "/", result);
            // replace all references to the parent directory with nothing
            result = PARENT_REFERENCE.matcher(result).replaceAll("/");
        }
        // a single pass can leave or uncover further matches - every replacement shortens the String, so this terminates
        while (!result.equals(previous));

        return result;
    }



    /**
     * Normalizes a query string by sorting the query parameters alphabetically. The query is split at '&amp;'; empty parameters are skipped and
     * parameters that occur several times in exactly the same form are kept only once.
     *
     * @param query The query string to normalize. Must not be null.
     * @return The normalized query string.
     */
    public static String normalizeQuery(String query)
    {
        // the sorted set performs both the ordering and the removal of exact duplicates
        TreeSet<String> sortedParams = new TreeSet<String>();

        StringTokenizer tokenizer = new StringTokenizer(query, "&");
        while (tokenizer.hasMoreTokens())
            sortedParams.add(tokenizer.nextToken());

        // reconstruct the query string
        StringBuilder result = new StringBuilder(query.length());
        for (Iterator<String> itParams = sortedParams.iterator(); itParams.hasNext();)
        {
            result.append(itParams.next());
            if(itParams.hasNext()) result.append('&');
        }

        return result.toString();
    }



    /**
     * Normalizes a URL. The following steps are taken to normalize a URL:
     *
     * <ul>
     * <li>The protocol is made lower-case.
     * <li>The host is made lower-case.
     * <li>The path - the file part in front of the first '?' - is normalized with {@link #normalizePath(String)}.
     * <li>Any query parameters - the file part behind the first '?' - are sorted alphabetically with {@link #normalizeQuery(String)}.
     * <li>Any anchor information is removed: the value of {@link URLName#getRef()} is not carried over into the result.
     * </ul>
     *
     * Port, username and password are taken over unchanged. Note that a port that equals the default port of the protocol is currently NOT
     * removed.
     *
     * @param url the url that should be normalized
     *
     * @return the normalized url
     *
     * @throws RuntimeException in the case the url could not be normalized, e.g. because it has no protocol. The original exception is attached
     *             as cause.
     */
    public static URLName normalizeURL(URLName url)
    {
        try
        {
            // locale-independent lower-casing, otherwise e.g. 'I' would not become 'i' under a turkish default locale
            String protocol = url.getProtocol().toLowerCase(Locale.ROOT);

            String host = url.getHost();
            if(host != null) host = host.toLowerCase(Locale.ROOT);

            String file = "";
            String rawFile = url.getFile();
            if(rawFile != null)
            {
                // the query is part of the file - split it off, so that the path normalization can not damage it and its parameters can be sorted
                int iQueryStart = rawFile.indexOf('?');
                if(iQueryStart < 0)
                    file = normalizePath(rawFile);
                else
                    file = normalizePath(rawFile.substring(0, iQueryStart)) + "?" + normalizeQuery(rawFile.substring(iQueryStart + 1));
            }

            // create the normalized URL
            return new URLName(protocol, host, url.getPort(), file, url.getUsername(), url.getPassword());
        }
        catch (Exception e)
        {
            throw new RuntimeException("Error while normalizing the url " + url + ". Is it a well formed URL? ", e);
        }
    }



    /**
     * Substitute String "old" by String "new" in String "text" everywhere. The text is scanned once from left to right; text that was inserted
     * as replacement is not scanned again.
     *
     * @param olds The String to be substituted.
     * @param news The String containing the new content.
     * @param text The String in which the substitution is done.
     * @return The result String containing the substitutions; if no substitutions were made, the specified 'text' instance is returned.
     */
    protected static String replace(String olds, String news, String text)
    {
        // nothing to substitute
        if(olds == null || olds.length() == 0) return text;
        if(text == null) return null;

        int iMatch = text.indexOf(olds);
        if(iMatch == -1) return text;

        StringBuilder buffer = new StringBuilder(text.length());
        int iCopyFrom = 0;
        do
        {
            // the untouched text between the previous and the current occurence, followed by the substitution
            buffer.append(text, iCopyFrom, iMatch).append(news);

            iCopyFrom = iMatch + olds.length();
            iMatch = text.indexOf(olds, iCopyFrom);
        }
        while (iMatch >= 0);

        // the part after the last occurence
        buffer.append(text, iCopyFrom, text.length());

        return buffer.toString();
    }



    /**
     * Converts a source String - an URL String or a file path - into an URLName. The String is interpreted as file path and converted into a file
     * URL in the case it has no protocol, in the case its 'protocol' is a single character while running under Windows (a drive letter as in
     * "C:\dir"), or in the case parsing it as URL fails.
     *
     * @param strSourceString the URL String or file path
     *
     * @return the according URLName
     *
     * @throws MalformedURLException in the case the String had to be interpreted as file path and could not be converted into an URL
     */
    public static URLName sourceString2URL(String strSourceString) throws MalformedURLException
    {
        URLName url = null;
        boolean bTreatAsFile;

        try
        {
            url = new URLName(strSourceString);
            // if there is no protocol specified, we check whether it is a file
            bTreatAsFile = isMissingProtocol(url.getProtocol());
        }
        catch (RuntimeException e)
        {
            // if the URL can not be parsed, we also try whether it is a file
            bTreatAsFile = true;
        }

        if(bTreatAsFile) url = new URLName(new File(strSourceString).toURI().toURL());

        return url;
    }



    /**
     * Checks whether a parsed protocol is absent, or is a single character under an operating system whose name contains "win".
     *
     * @param strProtocol the protocol as parsed from the source String, possibly null
     *
     * @return true in the case the source String should be interpreted as file path
     */
    private static boolean isMissingProtocol(String strProtocol)
    {
        if(strProtocol == null) return true;

        String strOs = System.getProperty("os.name").toLowerCase(Locale.ROOT);

        return strOs.contains("win") && strProtocol.length() == 1;
    }



    /**
     * Finds the first occurence of a String inside a text, ignoring case differences, without depending on the default locale.
     *
     * @param text the text to search in
     * @param needle the String to search for
     *
     * @return the index of the first occurence inside text, -1 if there is none
     */
    private static int indexOfIgnoreCase(String text, String needle)
    {
        int iLastStart = text.length() - needle.length();
        for (int i = 0; i <= iLastStart; i++)
        {
            if(text.regionMatches(true, i, needle, 0, needle.length())) return i;
        }

        return -1;
    }
}