UrlUtil.java example

Explorer

leech-master
- src
  - main
    - java
      - de
        dfki
        km
        leech
        Leech.java
        SubDataEntityContentHandler.java
        config
        CrawlerContext.java
        DirectoryCrawlerContext.java
        HtmlCrawlerContext.java
        ImapCrawlerContext.java
        LeechConfig.java
        detect
        DatasourceMediaTypes.java
        DirectoryDatasourceDetector.java
        ImapDatasourceDetector.java
        LeechDefaultDetector.java
        io
        FileURLStreamProvider.java
        HttpURLStreamProvider.java
        ImapURLStreamProvider.java
        ShiftInitInputStream.java
        URLStreamProvider.java
        lucene
        LeechDefaultFieldConfig.java
        ToLuceneContentHandler.java
        metadata
        LeechMetadata.java
        parser
        CrawlerParser.java
        DirectoryCrawlerParser.java
        HtmlCrawlerParser.java
        ImapCrawlerParser.java
        NonRecursiveCrawlerParser.java
        SambaCrawlerParser.java
        UrlListCrawlerParser.java
        filter
        RegExpPattern.java
        SubstringPattern.java
        URLFilter.java
        URLFilterPattern.java
        URLFilteringParser.java
        incremental
        IncrementalCrawlingHistory.java
        IncrementalCrawlingParser.java
        rss
        FeedParser2.java
        wikipedia
        WikipediaDumpParser.java
        sax
        CrawlReportContentHandler.java
        DataSinkContentHandler.java
        DataSinkContentHandlerAdapter.java
        DataSinkContentHandlerDecorator.java
        PrintlnContentHandler.java
        solr
        ToSolrContentHandler.java
        util
        CookieManager.java
        ExceptionUtils.java
        IndexPostprocessor.java
        LeechException.java
        LuceneIndexCreator.java
        OSUtils.java
        SolrIndexCreator.java
        TikaUtils.java
        UrlUtil.java
        ValueHolder.java
        certificates
        CertificateIgnoringSocketFactory.java
        CertificateStore.java
        Decision.java
        PersistentCertificateStore.java
        RootCertificateStore.java
        SessionCertificateStore.java
        StandardTrustManager.java
        TrustDecider.java

/*
 * Leech - crawling capabilities for Apache Tika
 * 
 * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
 * 
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * Contact us by mail: christian.reuschling@dfki.de
 */

package de.dfki.km.leech.util;



import java.io.File;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.TreeSet;

import javax.mail.URLName;




/**
 * Offers utility methods related to URLs, network connections, etc.
 */
public class UrlUtil
{

    /**
     * Extracts the folder part of the given URL: the file part up to (not including) the first ';', with all trailing slashes removed.
     * 
     * @param url the URL to take the folder from
     * 
     * @return the folder, or an empty String in the case the URL has no file part or the file part has no folder in front of the first ';'
     */
    public static String extractFolder(URLName url)
    {
        String strFolder = url.getFile();
        if(strFolder == null) return "";

        // everything from the first ';' on are parameters, not part of the folder
        int iIndex = strFolder.indexOf(';');
        if(iIndex != -1) strFolder = strFolder.substring(0, iIndex);

        while (strFolder.endsWith("/"))
            strFolder = strFolder.substring(0, strFolder.length() - 1);

        return strFolder;
    }



    /**
     * Extracts the value of the 'uid=' parameter from the String representation of the given URL. The parameter name is matched case-insensitively,
     * the value ends at the next ';' or at the end of the URL.
     * 
     * @param url the URL to take the uid from
     * 
     * @return the uid value, or null in the case the URL has no 'uid=' parameter
     */
    public static String extractUID(URLName url)
    {
        String strUrl = url.toString();

        int iIndex = indexOfIgnoreCase(strUrl, "uid=");
        if(iIndex == -1) return null;

        String strUID = strUrl.substring(iIndex + 4);

        int iEnd = strUID.indexOf(';');
        if(iEnd != -1) strUID = strUID.substring(0, iEnd);

        return strUID;
    }



    /**
     * Finds the first occurrence of a String inside a text, ignoring case. The comparison is done on the original characters, thus it neither depends
     * on the default locale nor shifts indices like searching inside a lower-cased copy could do.
     * 
     * @param text the text to search in
     * @param search the String to search for
     * 
     * @return the index of the first occurrence inside 'text', or -1 if there is none
     */
    private static int indexOfIgnoreCase(String text, String search)
    {
        int iLastStart = text.length() - search.length();
        for (int i = 0; i <= iLastStart; i++)
        {
            if(text.regionMatches(true, i, search, 0, search.length())) return i;
        }

        return -1;
    }



    /**
     * Creates a copy of the given URL with the same protocol, host, port, file and username, but with an empty password.
     * 
     * @param urlNameWithPassword the URL that possibly carries a password
     * 
     * @return a new URL with an empty password
     */
    public static URLName urlNameWithoutPassword(URLName urlNameWithPassword)
    {
        return new URLName(urlNameWithPassword.getProtocol(), urlNameWithPassword.getHost(), urlNameWithPassword.getPort(),
                urlNameWithPassword.getFile(), urlNameWithPassword.getUsername(), "");
    }



    /**
     * If the given String can be parsed as an URL, the method will return another URL String without a possible password. In the case the String can
     * not be parsed as URL, the method will return the given original String.
     * 
     * @param strPossibleUrlNameWithPassword a String that possibly is an URL String with password
     * 
     * @return the original String in the case it could not be parsed as an Url, an Url String with removed password otherwise.
     */
    public static String urlNameWithoutPassword(String strPossibleUrlNameWithPassword)
    {
        try
        {
            URLName urlNameWithPassword = new URLName(strPossibleUrlNameWithPassword);

            return urlNameWithoutPassword(urlNameWithPassword).toString();
        }
        catch (Exception e)
        {
            // by contract everything that is not processable as an URL is handed back untouched
            return strPossibleUrlNameWithPassword;
        }
    }



    /**
     * Remove relative references and "mistakes" like double slashes from the path. The replacements are applied exhaustively, thus also runs like
     * "///", "/././" or nested parent references like "/a/b/../../" are resolved completely. Only segments that are preceded by a slash are
     * collapsed with a following "..", and a ".." segment is never taken as a directory name itself - parent references that point above the
     * beginning of the path are left untouched.
     * 
     * @param path The path to normalize.
     * @return The normalized path, null in the case the given path was null.
     */
    public static String normalizePath(String path)
    {
        if(path == null) return null;

        // replace every run of slashes with a single slash
        String result = path.replaceAll("/{2,}", "/");

        // replace all references to the current directory with nothing
        result = result.replaceAll("/(?:\\./)+", "/");

        // remove every directory together with the parent reference that follows it. Resolving one pair can bring the next directory and the next
        // ".." together, so we repeat this until nothing changes anymore. The lookahead keeps ".." from being matched as the directory.
        String previous;
        do
        {
            previous = result;
            result = result.replaceAll("/(?!\\.\\./)[^/]+/\\.\\./", "/");
        }
        while (!result.equals(previous));

        return result;
    }



    /**
     * Normalizes a query string by sorting the query parameters alphabetically. Parameters are separated at '&amp;'; empty parameters and exact
     * duplicates of a parameter are dropped.
     * 
     * @param query The query string to normalize.
     * @return The normalized query string, null in the case the given query was null.
     */
    public static String normalizeQuery(String query)
    {
        if(query == null) return null;

        // the sorted set gives us the alphabetical order
        TreeSet<String> sortedSet = new TreeSet<String>();

        // extract key-value pairs from the query string
        StringTokenizer tokenizer = new StringTokenizer(query, "&");
        while (tokenizer.hasMoreTokens())
        {
            sortedSet.add(tokenizer.nextToken());
        }

        // reconstruct query string
        StringBuilder result = new StringBuilder(query.length());

        Iterator<String> iterator = sortedSet.iterator();
        while (iterator.hasNext())
        {
            result.append(iterator.next());

            if(iterator.hasNext())
            {
                result.append('&');
            }
        }

        return result.toString();
    }



    /**
     * Normalizes a URL. The following steps are taken to normalize a URL:
     * 
     * <ul>
     * <li>The protocol is made lower-case.
     * <li>The host is made lower-case.
     * <li>The path is normalized with {@link #normalizePath(String)}.
     * <li>Any query parameters are sorted alphabetically.
     * <li>Any anchor information is removed.
     * </ul>
     * 
     * Port, username and password are taken over unchanged.
     * 
     * @param url the url that should be normalized
     * 
     * @return the normalized url
     * 
     * @throws RuntimeException in the case the url could not be normalized, e.g. because it has no protocol. The original exception is attached as
     *             cause.
     */
    public static URLName normalizeURL(URLName url)
    {
        try
        {
            // case folding must not depend on the default locale (in a Turkish locale 'I' would become a dotless 'i')
            String protocol = url.getProtocol().toLowerCase(Locale.ROOT);

            String host = url.getHost();
            if(host != null) host = host.toLowerCase(Locale.ROOT);

            // URLName keeps everything behind the host inside the file part, including the query string. Only the anchor is held separately as
            // reference - we simply don't take it over, which removes it.
            String file = "";
            String rawFile = url.getFile();
            if(rawFile != null)
            {
                int queryStart = rawFile.indexOf('?');
                if(queryStart == -1)
                {
                    file = normalizePath(rawFile);
                }
                else
                {
                    // path and query are normalized separately, otherwise slashes inside parameter values would be touched
                    String path = rawFile.substring(0, queryStart);
                    String query = rawFile.substring(queryStart + 1);

                    file = normalizePath(path) + "?" + normalizeQuery(query);
                }
            }

            // create the normalized URL
            return new URLName(protocol, host, url.getPort(), file, url.getUsername(), url.getPassword());
        }
        catch (Exception e)
        {
            throw new RuntimeException("Error while normalizing the url " + url + ". Is it a well formed URL? ", e);
        }
    }



    /**
     * Substitute String "old" by String "new" in String "text" everywhere. The text is scanned once from left to right, text that was inserted by a
     * substitution is not examined again.
     * 
     * @param olds The String to be substituted.
     * @param news The String containing the new content.
     * @param text The String in which the substitution is done.
     * @return The result String containing the substitutions; if no substitutions were made, the specified 'text' instance is returned.
     */
    protected static String replace(String olds, String news, String text)
    {
        if(olds == null || olds.length() == 0)
        {
            // nothing to substitute.
            return text;
        }
        if(text == null)
        {
            return null;
        }

        // search for any occurrences of 'olds'.
        int oldsIndex = text.indexOf(olds);
        if(oldsIndex == -1)
        {
            // Nothing to substitute.
            return text;
        }

        // we're going to do some substitutions.
        StringBuilder buffer = new StringBuilder(text.length());
        int prevIndex = 0;

        while (oldsIndex >= 0)
        {
            // first, add the text between the previous and the current occurrence
            buffer.append(text, prevIndex, oldsIndex);

            // then add the substitution pattern
            buffer.append(news);

            // remember the index for the next loop
            prevIndex = oldsIndex + olds.length();

            // search for the next occurrence
            oldsIndex = text.indexOf(olds, prevIndex);
        }

        // add the part after the last occurrence
        buffer.append(text, prevIndex, text.length());

        return buffer.toString();
    }



    /**
     * Converts a source String into an URL. In the case the String has no protocol - or the 'protocol' is just a single letter under Windows, which
     * is the drive letter of a path - or can not be parsed as URL at all, the String is interpreted as a path into the local file system and
     * converted into a file URL.
     * 
     * @param strSourceString an URL String or a path into the local file system
     * 
     * @return the according URL
     * 
     * @throws MalformedURLException in the case the file fallback could not be converted into an URL
     */
    public static URLName sourceString2URL(String strSourceString) throws MalformedURLException
    {
        URLName url;

        try
        {
            url = new URLName(strSourceString);

            // if there is no protocol specified, we check whether it is a file
            String strProtocol = url.getProtocol();
            boolean bNoProtocol = (strProtocol == null);

            if(!bNoProtocol && strProtocol.length() == 1)
            {
                // under Windows, a single letter in front of the colon is a drive letter and not a protocol
                String strOs = System.getProperty("os.name").toLowerCase(Locale.ROOT);
                bNoProtocol = strOs.contains("win");
            }

            if(bNoProtocol) url = new URLName(new File(strSourceString).toURI().toURL());
        }
        catch (Exception e)
        {
            // if the URL can not be parsed, we also try whether it is a file
            url = new URLName(new File(strSourceString).toURI().toURL());
        }

        return url;
    }
}