/*
* Leech - crawling capabilities for Apache Tika
*
* Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free
* Software Foundation, either version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Contact us by mail: christian.reuschling@dfki.de
*/
package de.dfki.km.leech.io;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.Date;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import javax.mail.URLName;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.util.CookieManager;
import de.dfki.km.leech.util.LeechException;
import de.dfki.km.leech.util.UrlUtil;
public class HttpURLStreamProvider extends URLStreamProvider
{
protected static final int connectTimeout = 20000;
protected static final int MAX_REDIRECTIONS = 20;
protected static final int readTimeout = 20000;
protected static String getRedirectedUrl(URL url, URLConnection connection) throws IOException
{
String newLocation = connection.getHeaderField("Location");
if(newLocation == null)
throw new IOException("missing redirection location");
else
return new URL(url, newLocation).toString();
}
protected static boolean isRedirected(int responseCode)
{
return responseCode == HttpURLConnection.HTTP_MULT_CHOICE || responseCode == HttpURLConnection.HTTP_MOVED_PERM
|| responseCode == HttpURLConnection.HTTP_MOVED_TEMP || responseCode == HttpURLConnection.HTTP_SEE_OTHER;
}
/**
* Adds first metadata and metadata relevant for incremental indexing to the given metadata object
*
* @param url2getMetadata the url for which metadata should be extracte
* @param metadata2fill the metadata object. The method will put several entries, as Metadata.SOURCE, Metadata.RESOURCE_NAME_KEY,
* Metadata.CONTENT_ENCODING, Metadata.CONTENT_TYPE, Metadata.CONTENT_LOCATION and, last but not least, the
* {@link IncrementalCrawlingHistory#dataEntityId} and {@link IncrementalCrawlingHistory#dataEntityContentFingerprint} to
* determine whether the content behind the url was modified since the last crawl or not. The URL path entry for Metadata.SOURCE is
* the last URL behind potential previous redirects (in the case its an http connection). The origin URL will be written into an
* attribute "originalsource" in the case it differs from the one into Metadata.SOURCE. To determine whether an url was modified or
* not, the method needs a configured crawling history.
* @param parseContext the parsing context to specify a crawling history. Can be null, in this case no history will be used (of course ;) )
*
* @return the metadata object, enriched with new metadata (in the case this metadata was not set yet)
*/
@Override
public Metadata addFirstMetadata(URLName url2getMetadata, Metadata metadata2fill, ParseContext parseContext) throws Exception
{
if(metadata2fill == null) metadata2fill = new Metadata();
// wenn das Teil schon gefüllt ist, dann machen wir gar nix
if(!(metadata2fill.get(Metadata.SOURCE) == null || metadata2fill.get(Metadata.RESOURCE_NAME_KEY) == null
|| metadata2fill.get(Metadata.CONTENT_ENCODING) == null || metadata2fill.get(Metadata.CONTENT_TYPE) == null
|| metadata2fill.get(Metadata.CONTENT_LOCATION) == null
|| metadata2fill.get(IncrementalCrawlingHistory.dataEntityContentFingerprint) == null || metadata2fill
.get(IncrementalCrawlingHistory.dataEntityId) == null))
{
// alle sind bereits gesetzt
return metadata2fill;
}
IncrementalCrawlingHistory crawlingHistory = null;
if(parseContext == null) parseContext = new ParseContext();
CrawlerContext crawlerContext = parseContext.get(CrawlerContext.class, new CrawlerContext());
crawlingHistory = crawlerContext.getIncrementalCrawlingHistory();
// müssen wir hier evtl. die Lucene-Teile auch wieder closen? das ist immerhin eine utility-Methode^^ och - wir haben ja auch noch nen
// shutdown hook, und nach dem crawl wirds eh geschlossen. Klingt safe
if(crawlingHistory != null) crawlingHistory.openLuceneStuff();
// keep a backup of the originally passed url
String strOriginalUrlString = url2getMetadata.toString();
metadata2fill.set(Metadata.SOURCE, strOriginalUrlString);
URLConnection connection = null;
int nrRedirections = 0;
String strCurrentUrl = url2getMetadata.toString();
CookieManager cookies = crawlerContext.getCookieManager();
// We're going to loop, accessing urls until we arrive at a url that is not redirected. The
// redirection is followed manually rather than automatically, which is HttpURLConnection's
// default behaviour, so that we know the actual url we arrive at.
while (true)
{
// check if we haven't been redirected too often
if(nrRedirections > MAX_REDIRECTIONS)
{
throw new IOException("too many redirections, max = " + MAX_REDIRECTIONS + ", url = " + strOriginalUrlString);
}
// normalize the URL
URL currentUrl = new URL(strCurrentUrl);
currentUrl = new URL(UrlUtil.normalizeURL(new URLName(currentUrl)).toString());
strCurrentUrl = currentUrl.toExternalForm();
// see if a date was registered for this url
Date ifModifiedSinceDate = null;
if(crawlingHistory != null)
{
String lastIfModifiedSinceDate = crawlingHistory.getDataEntityContentFingerprint(strCurrentUrl);
if(lastIfModifiedSinceDate != null && lastIfModifiedSinceDate.matches("\\d+")) ifModifiedSinceDate = new Date(Long.valueOf(lastIfModifiedSinceDate));
}
try
{
// maybe there exists other connections as http - in this case we want to fall back zu standard Tika behaviour
connection = currentUrl.openConnection();
if(!(connection instanceof HttpURLConnection)) break;
((HttpURLConnection) connection).setRequestMethod("HEAD");
cookies.setCookies(connection);
connection.setConnectTimeout(connectTimeout);
connection.setReadTimeout(readTimeout);
connection.setRequestProperty("Accept-Encoding", "gzip");
Map<String, String> userHeaders = crawlerContext.getUserHeaders();
if (userHeaders != null) {
for (Map.Entry<String, String> entry : userHeaders.entrySet()) {
connection.setRequestProperty(entry.getKey(), entry.getValue());
}
}
String userAgent = crawlerContext.getUserAgent();
if (userAgent != null && !userAgent.isEmpty())
{
connection.setRequestProperty("User-Agent", userAgent);
}
((HttpURLConnection) connection).setInstanceFollowRedirects(false);
if(ifModifiedSinceDate != null)
{
connection.setIfModifiedSince(ifModifiedSinceDate.getTime());
}
// send the request to the server
connection.connect();
cookies.storeCookies(connection);
}
catch (Exception e)
{
// I've seen IllegalArgumentExceptions in the sun.net classes here because of some freaky URLs
// that did not generate MalformedUrlExceptions, so therefore a "catch "Exception" to be sure
if(e instanceof IOException)
{
throw (IOException) e;
}
else
{
throw new LeechException("connection to " + strOriginalUrlString + " resulted in an exception", e);
}
}
// check for http-specific response codes
int responseCode = ((HttpURLConnection) connection).getResponseCode();
if(isRedirected(responseCode))
{
// follow the redirected url
String lastUrl = strCurrentUrl;
strCurrentUrl = getRedirectedUrl(currentUrl, connection);
nrRedirections++;
// check for urls that redirect to themselves
if(strCurrentUrl.equals(lastUrl))
{
throw new LeechException("url redirects to itself: " + strCurrentUrl);
}
}
else if(responseCode == HttpURLConnection.HTTP_NOT_FOUND)
{
throw new LeechException(strCurrentUrl + " not found");
}
else if(responseCode == HttpURLConnection.HTTP_NOT_MODIFIED)
{
// des isch nicht modifiziert seit dem letzten crawl - wir geben die ('modification') time des letzten crawls zurück, damit des teil
// als unmodifiziert erkannt wird.
if(crawlingHistory != null && ifModifiedSinceDate != null)
metadata2fill.set(IncrementalCrawlingHistory.dataEntityContentFingerprint, String.valueOf(ifModifiedSinceDate.getTime()));
break;
}
else if(responseCode != HttpURLConnection.HTTP_OK)
{
// this is a communication error, quit with an exception
throw new IOException("Http connection error, response code = " + responseCode + ", url = " + currentUrl);
}
else
{
// we're done
break;
}
}
if(metadata2fill.get(IncrementalCrawlingHistory.dataEntityContentFingerprint) == null)
metadata2fill.set(IncrementalCrawlingHistory.dataEntityContentFingerprint, String.valueOf(System.currentTimeMillis()));
// die Einträge, die Tika auch in das metadata einträgt, und noch etwas dazu
metadata2fill.set(Metadata.RESOURCE_NAME_KEY, strCurrentUrl);
metadata2fill.set(Metadata.SOURCE, strCurrentUrl);
metadata2fill.set(IncrementalCrawlingHistory.dataEntityId, strCurrentUrl);
if(strOriginalUrlString.indexOf(strCurrentUrl) == -1) metadata2fill.set("originalsource", strOriginalUrlString);
String type = connection.getContentType();
//text/xml is far too general to select the right parser
if(type != null && !type.contains("text/xml")) metadata2fill.set(Metadata.CONTENT_TYPE, type);
String encoding = connection.getContentEncoding();
if(encoding != null) metadata2fill.set(Metadata.CONTENT_ENCODING, encoding);
int length = connection.getContentLength();
if(length >= 0) metadata2fill.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
// das brauchen wir noch, um relative links aufzulösen
metadata2fill.set(Metadata.CONTENT_LOCATION, strCurrentUrl);
return metadata2fill;
}
@Override
public TikaInputStream getStream(URLName url2getStream, Metadata metadata, ParseContext parseContext) throws Exception
{
final URL asUrl = new URL(url2getStream.toString());
final CrawlerContext crawlerContext = parseContext.get(CrawlerContext.class, new CrawlerContext());
return TikaInputStream.get(new ShiftInitInputStream()
{
@Override
protected InputStream initBeforeFirstStreamDataAccess() throws Exception
{
CookieManager cookies = crawlerContext.getCookieManager();
URLConnection connection = asUrl.openConnection();
cookies.setCookies(connection);
connection.setConnectTimeout(connectTimeout);
connection.setReadTimeout(readTimeout);
connection.setRequestProperty("Accept-Encoding", "gzip");
Map<String, String> userHeaders = crawlerContext.getUserHeaders();
if (userHeaders != null) {
for (Map.Entry<String, String> entry : userHeaders.entrySet()) {
connection.setRequestProperty(entry.getKey(), entry.getValue());
}
}
String userAgent = crawlerContext.getUserAgent();
if (userAgent != null && !userAgent.isEmpty())
{
connection.setRequestProperty("User-Agent", userAgent);
}
connection.connect();
cookies.storeCookies(connection);
InputStream ourStream = connection.getInputStream();
String strContentEncoding = connection.getHeaderField("Content-Encoding");
if(strContentEncoding != null) strContentEncoding = strContentEncoding.toLowerCase().trim();
if("gzip".equals(strContentEncoding))
ourStream = new BufferedInputStream(new GZIPInputStream(ourStream));
else
ourStream = new BufferedInputStream(ourStream);
return ourStream;
}
});
}
@Override
public Set<String> getSupportedProtocols()
{
HashSet<String> hsProtocols = new HashSet<String>();
hsProtocols.add("http");
hsProtocols.add("https");
return hsProtocols;
}
}