HttpURLStreamProvider.java example

Explorer

leech-master
- src
  - main
    - java
      - de
        dfki
        km
        leech
        Leech.java
        SubDataEntityContentHandler.java
        config
        CrawlerContext.java
        DirectoryCrawlerContext.java
        HtmlCrawlerContext.java
        ImapCrawlerContext.java
        LeechConfig.java
        detect
        DatasourceMediaTypes.java
        DirectoryDatasourceDetector.java
        ImapDatasourceDetector.java
        LeechDefaultDetector.java
        io
        FileURLStreamProvider.java
        HttpURLStreamProvider.java
        ImapURLStreamProvider.java
        ShiftInitInputStream.java
        URLStreamProvider.java
        lucene
        LeechDefaultFieldConfig.java
        ToLuceneContentHandler.java
        metadata
        LeechMetadata.java
        parser
        CrawlerParser.java
        DirectoryCrawlerParser.java
        HtmlCrawlerParser.java
        ImapCrawlerParser.java
        NonRecursiveCrawlerParser.java
        SambaCrawlerParser.java
        UrlListCrawlerParser.java
        filter
        RegExpPattern.java
        SubstringPattern.java
        URLFilter.java
        URLFilterPattern.java
        URLFilteringParser.java
        incremental
        IncrementalCrawlingHistory.java
        IncrementalCrawlingParser.java
        rss
        FeedParser2.java
        wikipedia
        WikipediaDumpParser.java
        sax
        CrawlReportContentHandler.java
        DataSinkContentHandler.java
        DataSinkContentHandlerAdapter.java
        DataSinkContentHandlerDecorator.java
        PrintlnContentHandler.java
        solr
        ToSolrContentHandler.java
        util
        CookieManager.java
        ExceptionUtils.java
        IndexPostprocessor.java
        LeechException.java
        LuceneIndexCreator.java
        OSUtils.java
        SolrIndexCreator.java
        TikaUtils.java
        UrlUtil.java
        ValueHolder.java
        certificates
        CertificateIgnoringSocketFactory.java
        CertificateStore.java
        Decision.java
        PersistentCertificateStore.java
        RootCertificateStore.java
        SessionCertificateStore.java
        StandardTrustManager.java
        TrustDecider.java

/*
 * Leech - crawling capabilities for Apache Tika
 * 
 * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
 * 
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * Contact us by mail: christian.reuschling@dfki.de
 */

package de.dfki.km.leech.io;



import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.Date;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.zip.GZIPInputStream;

import javax.mail.URLName;

import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;

import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.util.CookieManager;
import de.dfki.km.leech.util.LeechException;
import de.dfki.km.leech.util.UrlUtil;



/**
 * {@link URLStreamProvider} implementation for the {@code http} and {@code https} protocols.
 * <p>
 * {@link #addFirstMetadata(URLName, Metadata, ParseContext)} issues {@code HEAD} requests, follows redirects manually so the finally reached URL
 * is known, and fills the metadata entries used for parsing and incremental crawling. {@link #getStream(URLName, Metadata, ParseContext)} returns
 * a stream on the content behind an URL, transparently unpacking gzip-encoded responses.
 */
public class HttpURLStreamProvider extends URLStreamProvider
{

    /** Connect timeout in milliseconds, applied to every connection opened by this class. */
    protected static final int connectTimeout = 20000;

    /** Upper bound of redirects that are followed for one URL before giving up with an {@link IOException}. */
    protected static final int MAX_REDIRECTIONS = 20;

    /** Read timeout in milliseconds, applied to every connection opened by this class. */
    protected static final int readTimeout = 20000;

    /** HTTP status 307 (Temporary Redirect). {@link HttpURLConnection} declares no constant for it. */
    private static final int HTTP_TEMPORARY_REDIRECT = 307;

    /** HTTP status 308 (Permanent Redirect). {@link HttpURLConnection} declares no constant for it. */
    private static final int HTTP_PERMANENT_REDIRECT = 308;



    /**
     * Resolves the target of a redirect response.
     * 
     * @param url the URL that was requested; used as base to resolve a relative {@code Location} header
     * @param connection the connection holding the redirect response
     * 
     * @return the absolute redirect target as String
     * 
     * @throws IOException if the response has no {@code Location} header, or if the header value can not be resolved into a valid URL
     */
    protected static String getRedirectedUrl(URL url, URLConnection connection) throws IOException
    {
        String newLocation = connection.getHeaderField("Location");
        if(newLocation == null) throw new IOException("missing redirection location");

        return new URL(url, newLocation).toString();
    }





    /**
     * Checks whether a HTTP response code denotes a redirect that should be followed.
     * 
     * @param responseCode the HTTP status code of a response
     * 
     * @return true for 300, 301, 302, 303, 307 and 308, false otherwise
     */
    protected static boolean isRedirected(int responseCode)
    {
        switch (responseCode)
        {
            case HttpURLConnection.HTTP_MULT_CHOICE:
            case HttpURLConnection.HTTP_MOVED_PERM:
            case HttpURLConnection.HTTP_MOVED_TEMP:
            case HttpURLConnection.HTTP_SEE_OTHER:
            case HTTP_TEMPORARY_REDIRECT:
            case HTTP_PERMANENT_REDIRECT:
                return true;
            default:
                return false;
        }
    }




    /**
     * Applies the request settings both request types of this class share: cookies, timeouts, gzip acceptance, the user-defined headers and the
     * user agent. User-defined headers are set after the defaults, the user agent last.
     * 
     * @param connection the not yet connected connection to configure
     * @param crawlerContext the context delivering user headers and user agent
     * @param cookies the cookie manager whose cookies should be sent with the request
     * 
     * @throws Exception declared broadly because the checked exceptions of the cookie manager are not visible from here
     */
    private static void applyRequestSettings(URLConnection connection, CrawlerContext crawlerContext, CookieManager cookies) throws Exception
    {
        cookies.setCookies(connection);
        connection.setConnectTimeout(connectTimeout);
        connection.setReadTimeout(readTimeout);
        connection.setRequestProperty("Accept-Encoding", "gzip");

        Map<String, String> userHeaders = crawlerContext.getUserHeaders();
        if(userHeaders != null)
        {
            for (Map.Entry<String, String> entry : userHeaders.entrySet())
            {
                connection.setRequestProperty(entry.getKey(), entry.getValue());
            }
        }

        String userAgent = crawlerContext.getUserAgent();
        if(userAgent != null && !userAgent.isEmpty())
        {
            connection.setRequestProperty("User-Agent", userAgent);
        }
    }




    /**
     * Interprets a content fingerprint out of the crawling history as 'if modified since' date.
     * 
     * @param fingerprint the fingerprint stored for an URL, can be null
     * 
     * @return the date if the fingerprint consists of digits only and fits into a long (milliseconds), null otherwise
     */
    private static Date parseIfModifiedSince(String fingerprint)
    {
        if(fingerprint == null || !fingerprint.matches("\\d+")) return null;

        try
        {
            return new Date(Long.parseLong(fingerprint));
        }
        catch (NumberFormatException ignored)
        {
            // only digits, but too many for a long - this can not be a timestamp, so we send no conditional request
            return null;
        }
    }




    /**
     * Determines the content length of a response.
     * 
     * @param connection the connection to ask
     * 
     * @return the content length, or a negative value if it is unknown
     */
    private static long resolveContentLength(URLConnection connection)
    {
        int length = connection.getContentLength();
        if(length >= 0) return length;

        // getContentLength() also yields -1 for values beyond Integer.MAX_VALUE, thus we have a second look at the raw header
        String strLength = connection.getHeaderField("Content-Length");
        if(strLength == null) return -1;

        try
        {
            return Long.parseLong(strLength.trim());
        }
        catch (NumberFormatException ignored)
        {
            return -1;
        }
    }




    /**
     * Adds first metadata and metadata relevant for incremental indexing to the given metadata object
     * 
     * @param url2getMetadata the url for which metadata should be extracted
     * @param metadata2fill the metadata object, can be null, in this case a new one will be created. The method will put several entries, as
     *            Metadata.SOURCE, Metadata.RESOURCE_NAME_KEY, Metadata.CONTENT_ENCODING, Metadata.CONTENT_TYPE, Metadata.CONTENT_LENGTH,
     *            Metadata.CONTENT_LOCATION and, last but not least, the {@link IncrementalCrawlingHistory#dataEntityId} and
     *            {@link IncrementalCrawlingHistory#dataEntityContentFingerprint} to determine whether the content behind the url was modified
     *            since the last crawl or not. The URL path entry for Metadata.SOURCE is the last URL behind potential previous redirects (in the
     *            case it is an http connection). The origin URL will be written into an attribute "originalsource" in the case it does not
     *            contain the one written into Metadata.SOURCE. To determine whether an url was modified or not, the method needs a configured
     *            crawling history. A content type containing "text/xml" is not written, as it is too general to select the right parser.
     * @param parseContext the parsing context to specify a crawling history. Can be null, in this case no history will be used (of course ;) )
     * 
     * @return the metadata object, enriched with new metadata (in the case this metadata was not set yet)
     * 
     * @throws IOException on too many redirections, a missing redirect location, a connection problem, or a response code that is neither a
     *             redirect, nor 200, 304 or 404
     * @throws LeechException if the url redirects to itself, was not found (404), or if connecting failed with a non-IO exception
     */
    @Override
    public Metadata addFirstMetadata(URLName url2getMetadata, Metadata metadata2fill, ParseContext parseContext) throws Exception
    {

        if(metadata2fill == null) metadata2fill = new Metadata();


        // if everything is filled already, there is nothing to do
        boolean allEntriesSet =
                metadata2fill.get(Metadata.SOURCE) != null && metadata2fill.get(Metadata.RESOURCE_NAME_KEY) != null
                        && metadata2fill.get(Metadata.CONTENT_ENCODING) != null && metadata2fill.get(Metadata.CONTENT_TYPE) != null
                        && metadata2fill.get(Metadata.CONTENT_LOCATION) != null
                        && metadata2fill.get(IncrementalCrawlingHistory.dataEntityContentFingerprint) != null
                        && metadata2fill.get(IncrementalCrawlingHistory.dataEntityId) != null;
        if(allEntriesSet) return metadata2fill;


        if(parseContext == null) parseContext = new ParseContext();

        CrawlerContext crawlerContext = parseContext.get(CrawlerContext.class, new CrawlerContext());

        IncrementalCrawlingHistory crawlingHistory = crawlerContext.getIncrementalCrawlingHistory();

        // The history is opened here but not closed again. Note of the original author: there is a shutdown hook, and it will be closed after
        // the crawl anyway, so this is considered safe.
        if(crawlingHistory != null) crawlingHistory.openLuceneStuff();


        // keep a backup of the originally passed url
        String strOriginalUrlString = url2getMetadata.toString();
        metadata2fill.set(Metadata.SOURCE, strOriginalUrlString);

        URLConnection connection = null;
        int nrRedirections = 0;

        String strCurrentUrl = strOriginalUrlString;

        CookieManager cookies = crawlerContext.getCookieManager();

        // We're going to loop, accessing urls until we arrive at a url that is not redirected. The
        // redirection is followed manually rather than automatically, which is HttpURLConnection's
        // default behaviour, so that we know the actual url we arrive at.
        while (true)
        {
            // check if we haven't been redirected too often
            if(nrRedirections > MAX_REDIRECTIONS)
            {
                throw new IOException("too many redirections, max = " + MAX_REDIRECTIONS + ", url = " + strOriginalUrlString);
            }

            // normalize the URL
            URL currentUrl = new URL(strCurrentUrl);
            currentUrl = new URL(UrlUtil.normalizeURL(new URLName(currentUrl)).toString());
            strCurrentUrl = currentUrl.toExternalForm();

            // see if a date was registered for this url
            Date ifModifiedSinceDate = null;
            if(crawlingHistory != null)
            {
                ifModifiedSinceDate = parseIfModifiedSince(crawlingHistory.getDataEntityContentFingerprint(strCurrentUrl));
            }

            HttpURLConnection httpConnection = null;
            try
            {
                // maybe there exist other connections than http - in this case we want to fall back to standard Tika behaviour
                connection = currentUrl.openConnection();

                if(!(connection instanceof HttpURLConnection)) break;

                httpConnection = (HttpURLConnection) connection;

                // we only need the headers here, the content will be fetched in getStream(..)
                httpConnection.setRequestMethod("HEAD");
                applyRequestSettings(httpConnection, crawlerContext, cookies);

                httpConnection.setInstanceFollowRedirects(false);
                if(ifModifiedSinceDate != null)
                {
                    httpConnection.setIfModifiedSince(ifModifiedSinceDate.getTime());
                }

                // send the request to the server
                httpConnection.connect();
                cookies.storeCookies(httpConnection);
            }
            catch (Exception e)
            {
                // I've seen IllegalArgumentExceptions in the sun.net classes here because of some freaky URLs
                // that did not generate MalformedUrlExceptions, so therefore a "catch "Exception" to be sure
                if(e instanceof IOException) throw (IOException) e;

                throw new LeechException("connection to " + strOriginalUrlString + " resulted in an exception", e);
            }

            // check for http-specific response codes
            int responseCode = httpConnection.getResponseCode();

            if(isRedirected(responseCode))
            {
                // follow the redirected url
                String lastUrl = strCurrentUrl;
                strCurrentUrl = getRedirectedUrl(currentUrl, httpConnection);
                nrRedirections++;

                // check for urls that redirect to themselves
                if(strCurrentUrl.equals(lastUrl))
                {
                    throw new LeechException("url redirects to itself: " + strCurrentUrl);
                }
            }
            else if(responseCode == HttpURLConnection.HTTP_NOT_FOUND)
            {
                throw new LeechException(strCurrentUrl + " not found");
            }
            else if(responseCode == HttpURLConnection.HTTP_NOT_MODIFIED)
            {
                // not modified since the last crawl - we give back the ('modification') time of the last crawl, so that the entity will be
                // recognized as unmodified.
                if(crawlingHistory != null && ifModifiedSinceDate != null)
                    metadata2fill.set(IncrementalCrawlingHistory.dataEntityContentFingerprint, String.valueOf(ifModifiedSinceDate.getTime()));

                break;
            }
            else if(responseCode != HttpURLConnection.HTTP_OK)
            {
                // this is a communication error, quit with an exception
                throw new IOException("Http connection error, response code = " + responseCode + ", url = " + currentUrl);
            }
            else
            {
                // we're done
                break;
            }
        }


        // no fingerprint from a 'not modified' response - the current time marks the content as new or modified
        if(metadata2fill.get(IncrementalCrawlingHistory.dataEntityContentFingerprint) == null)
            metadata2fill.set(IncrementalCrawlingHistory.dataEntityContentFingerprint, String.valueOf(System.currentTimeMillis()));

        // the entries Tika also puts into the metadata, plus some more

        metadata2fill.set(Metadata.RESOURCE_NAME_KEY, strCurrentUrl);

        metadata2fill.set(Metadata.SOURCE, strCurrentUrl);
        metadata2fill.set(IncrementalCrawlingHistory.dataEntityId, strCurrentUrl);

        if(strOriginalUrlString.indexOf(strCurrentUrl) == -1) metadata2fill.set("originalsource", strOriginalUrlString);



        String type = connection.getContentType();
        // text/xml is far too general to select the right parser
        if(type != null && !type.contains("text/xml")) metadata2fill.set(Metadata.CONTENT_TYPE, type);


        String encoding = connection.getContentEncoding();
        if(encoding != null) metadata2fill.set(Metadata.CONTENT_ENCODING, encoding);


        long length = resolveContentLength(connection);
        if(length >= 0) metadata2fill.set(Metadata.CONTENT_LENGTH, Long.toString(length));

        // we still need this to resolve relative links
        metadata2fill.set(Metadata.CONTENT_LOCATION, strCurrentUrl);



        return metadata2fill;
    }




    /**
     * Creates a stream on the content behind the given URL. The connection is opened inside the
     * {@link ShiftInitInputStream#initBeforeFirstStreamDataAccess()} hook (presumably deferred until the first data access, as the hook name
     * suggests - see {@link ShiftInitInputStream}). Responses with the content encoding gzip (or its alias x-gzip) are unpacked transparently.
     * 
     * @param url2getStream the URL to read from
     * @param metadata not used by this implementation
     * @param parseContext the parsing context delivering the {@link CrawlerContext} with cookie manager, user headers and user agent. Can be
     *            null, in this case a default {@link CrawlerContext} will be used
     * 
     * @return the stream on the content behind the URL
     * 
     * @throws Exception if the URL is malformed
     */
    @Override
    public TikaInputStream getStream(URLName url2getStream, Metadata metadata, ParseContext parseContext) throws Exception
    {
        final URL asUrl = new URL(url2getStream.toString());

        // null is accepted like in addFirstMetadata(..)
        final ParseContext effectiveContext = parseContext != null ? parseContext : new ParseContext();
        final CrawlerContext crawlerContext = effectiveContext.get(CrawlerContext.class, new CrawlerContext());

        return TikaInputStream.get(new ShiftInitInputStream()
        {
            @Override
            protected InputStream initBeforeFirstStreamDataAccess() throws Exception
            {
                CookieManager cookies = crawlerContext.getCookieManager();

                URLConnection connection = asUrl.openConnection();
                applyRequestSettings(connection, crawlerContext, cookies);

                connection.connect();
                cookies.storeCookies(connection);
                InputStream ourStream = connection.getInputStream();

                String strContentEncoding = connection.getHeaderField("Content-Encoding");
                if(strContentEncoding != null) strContentEncoding = strContentEncoding.trim();

                // equalsIgnoreCase is independent of the default locale, other than toLowerCase() (e.g. "GZIP" under a turkish locale)
                boolean isGzipped = "gzip".equalsIgnoreCase(strContentEncoding) || "x-gzip".equalsIgnoreCase(strContentEncoding);

                if(isGzipped) return new BufferedInputStream(new GZIPInputStream(ourStream));

                return new BufferedInputStream(ourStream);
            }
        });
    }







    /**
     * Gets the protocols this provider is responsible for.
     * 
     * @return a new, modifiable set with the entries "http" and "https"
     */
    @Override
    public Set<String> getSupportedProtocols()
    {
        Set<String> hsProtocols = new HashSet<String>();

        hsProtocols.add("http");
        hsProtocols.add("https");

        return hsProtocols;
    }




}