/*
* Leech - crawling capabilities for Apache Tika
*
* Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free
* Software Foundation, either version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Contact us by mail: christian.reuschling@dfki.de
*/
package de.dfki.km.leech.io;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.ServiceLoader;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.mail.URLName;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import ucar.nc2.util.net.URLStreamHandlerFactory;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
/**
 * Base class for protocol-specific providers that create streams out of a URL reference, together with crawling-relevant preliminary metadata.
 * Implementing a subclass is the way to support a new type of protocol.<br>
 * <br>
 * How to implement and register your own URL protocol:
 * <ul>
 * <li>Implement a subclass of {@link URLStreamProvider} (as e.g. de.dfki.km.leech.io.HttpURLStreamProvider).</li>
 * <li>Create inside your jar, under the folder '/META-INF/services', a text file named 'de.dfki.km.leech.io.URLStreamProvider'. Each line inside
 * this file names one {@link URLStreamProvider} implementation class. The classes named there are discovered with a {@link ServiceLoader}.</li>
 * </ul>
 * All providers found this way are registered once during class initialization, see {@link #registerProtocols()}.
 *
 * @author Christian Reuschling, Dipl.Ing.(BA)
 */
public abstract class URLStreamProvider
{

    /** True as soon as {@link #theOneAndOnlyURLStreamHandlerFactory} was successfully set at the {@link URL} class by {@link #registerProtocols()}. */
    protected static boolean bFactoryRegistered = false;

    /** Lookup table from a protocol name (e.g. 'imap', 'http', 'file') to the provider that was registered for it. */
    protected static final HashMap<String, URLStreamProvider> m_protocol2StreamProvider = new HashMap<String, URLStreamProvider>();

    /** Discovers the {@link URLStreamProvider} implementations that are announced under '/META-INF/services'. */
    protected static final ServiceLoader<URLStreamProvider> m_serviceLoader = ServiceLoader.load(URLStreamProvider.class);

    /** The single factory instance this class tries to install with {@link URL#setURLStreamHandlerFactory}. */
    public static final URLStreamHandlerFactory theOneAndOnlyURLStreamHandlerFactory;

    static
    {
        theOneAndOnlyURLStreamHandlerFactory = new URLStreamHandlerFactory();

        registerProtocols();
    }



    /**
     * Gets the configured URLStreamProvider for a given URL.
     *
     * @param strUrl the URL (as String) with the protocol you want to have streams and preliminary metadata for
     *
     * @return the configured StreamProvider for the protocol of the given URL, or <code>null</code> if no provider is registered for it
     *
     * @throws MalformedURLException part of the signature for backward compatibility - the statements of this method do not throw it themselves
     */
    public static URLStreamProvider getURLStreamProvider(String strUrl) throws MalformedURLException
    {
        return m_protocol2StreamProvider.get(new URLName(strUrl).getProtocol());
    }



    /**
     * Gets the configured URLStreamProvider for a given URL.
     *
     * @param url the URL with the protocol you want to have streams and preliminary metadata for
     *
     * @return the configured StreamProvider for the protocol of the given URL, or <code>null</code> if no provider is registered for it
     */
    public static URLStreamProvider getURLStreamProvider(URLName url)
    {
        return m_protocol2StreamProvider.get(url.getProtocol());
    }



    /**
     * Gets the configured URLStreamProvider for a given URL.
     *
     * @param url the URL with the protocol you want to have streams and preliminary metadata for
     *
     * @return the configured StreamProvider for the protocol of the given URL, or <code>null</code> if no provider is registered for it
     */
    public static URLStreamProvider getURLStreamProvider(URL url)
    {
        return m_protocol2StreamProvider.get(url.getProtocol());
    }



    /**
     * Gets the configured URLStreamProvider for a given protocol (e.g. 'imap', 'imaps', 'http', 'file', etc.). The protocol is looked up exactly
     * as given.
     *
     * @param strProtocol the protocol you want to have streams and preliminary metadata for
     *
     * @return the configured StreamProvider for the given protocol, or <code>null</code> if no provider is registered for it
     */
    public static URLStreamProvider getURLStreamProvider4Protocol(String strProtocol)
    {
        return m_protocol2StreamProvider.get(strProtocol);
    }



    /**
     * Tries to set {@link #theOneAndOnlyURLStreamHandlerFactory} at the {@link URL} class and registers all {@link URLStreamProvider}
     * implementations found by the {@link ServiceLoader} under the protocols they support. This method is invoked once during class
     * initialization.<br>
     * <br>
     * {@link URL#setURLStreamHandlerFactory} fails with an {@link Error} in the case a factory is set yet. This is handled as follows:
     * <ul>
     * <li>If our own factory was set by a former invocation, the providers are registered yet and the method simply returns.</li>
     * <li>Otherwise (e.g. someone else has set a factory before) the problem is logged with level SEVERE, and the providers are registered
     * nevertheless, in order to make them available with the getURLStreamProvider methods.</li>
     * </ul>
     * Fatal JVM errors ({@link VirtualMachineError}, e.g. OutOfMemoryError) are never swallowed, they will be rethrown. Errors raised by the
     * {@link ServiceLoader} during provider discovery are not caught either.<br>
     * <br>
     * If several providers support the same protocol, the one that is delivered last by the {@link ServiceLoader} wins.
     */
    public static void registerProtocols()
    {
        // make sure that new protocols can be registered at the URL class
        try
        {
            URL.setURLStreamHandlerFactory(theOneAndOnlyURLStreamHandlerFactory);
            bFactoryRegistered = true;
        }
        catch (Error e)
        {
            // 'factory already defined' is signaled with a plain Error, thus we have to catch Error here. Nevertheless, fatal JVM conditions
            // must not disappear inside this handler
            if(e instanceof VirtualMachineError)
            {
                throw e;
            }

            // our own factory was set by a former invocation - in this case the providers are registered yet, nothing left to do
            if(bFactoryRegistered)
            {
                return;
            }

            Logger.getLogger(URLStreamProvider.class.getName())
                    .log(Level.SEVERE,
                            "The URLStreamHandlerFactory could not registered to the URL class. Ignore this message in the case you take care yet for stream creation on new protocols with the URL class.",
                            e);
        }


        // now we load all available URLStreamProviders with the java service provider mechanism (a services entry inside the jar, the same
        // way Tika does it), and remember the mapping from each protocol to its URLStreamProvider.
        // NOTE(review): the original comment also states that the factory gets fed with the handlers - there is no such invocation inside this
        // method, confirm whether this happens somewhere else
        for (URLStreamProvider streamProvider : m_serviceLoader)
        {
            for (String strProtocol : streamProvider.getSupportedProtocols())
            {
                m_protocol2StreamProvider.put(strProtocol, streamProvider);
            }
        }
    }



    /**
     * Adds first metadata for the data entity behind a URL - data that is quickly available. This method is NOT to extract the content of the
     * data entity, this will be the job of the according Tika Parser, later in the crawling process. Here, some preliminary data such as
     * modification time or file name in a file-URL can be offered. Write inside what is there and what you want or need. The crawlers will forward
     * everything to the data handler afterwards.<br>
     * <br>
     * <b>IMPORTANT</b><br>
     * Leech and Tika need some metadata entries in order to work - they have to be there in order to perform crawling and extracting content.
     * These are:<br>
     * {@link Metadata#RESOURCE_NAME_KEY}: this entry is used by Tika for giving a stream a name. Will be used by some parsers. e.g.
     * 'myFileName'<br>
     * {@link DublinCore#SOURCE}: this is the URL as String, needed by Leech for referencing the data entity that should be crawled. e.g.
     * 'file:///home/dir/myFileName'<br>
     * {@link IncrementalCrawlingHistory#dataEntityId}: an identifier for a data entity that is independent from the content of this entity. It
     * is only for identifying the occurrence, not to check whether it has changed (e.g. a filename). Needed by Leech in order to perform
     * incremental crawling.<br>
     * {@link IncrementalCrawlingHistory#dataEntityContentFingerprint}: some fingerprint/identifier that gives the hint whether the content of the
     * data entity has changed, e.g. the modified date of a file. Needed by Leech in order to perform incremental crawling.<br>
     * <br>
     * <b>IMPORTANT 2</b><br>
     * It is very good style to check whether there exists some metadata inside the parameter object, and to generate only those entries that are
     * NOT INSIDE YET. Some crawlers can quickly fetch metadata information for a bag of data entities (during a recursive call), and are able to
     * prefill the metadata with this information even before this method invocation. You can save MUCH PERFORMANCE in such situations if you
     * don't generate them again in this method, for a single data entity.
     *
     * @param url2getMetadata the url you want to get metadata from and add it to the given metadata object
     * @param metadata2fill the metadata object you potentially want to fill with first metadata. Can be null by convention, in this case the
     *            method returns a newly generated Metadata Object. Can be prefilled with known metadata, so don't generate it unnecessarily again
     *            in this method.
     * @param parseContext the parse context configuration for the current crawl. May contain necessary, useful context data
     *
     * @return the parameter Metadata Object, potentially filled with some new metadata entries. A new Object in the case the parameter was null.
     *         Don't forget that most metadata should be extracted by the parser implementations. Fill only what is necessary, what is not
     *         offered yet in the parameter Object - and low hanging fruits!
     *
     * @throws Exception if the implementation fails to determine the metadata
     */
    public abstract Metadata addFirstMetadata(URLName url2getMetadata, Metadata metadata2fill, ParseContext parseContext) throws Exception;



    /**
     * Gets the stream to read out the content behind the given URL. Additional information to perform the connection can be specified in the
     * parse context.<br>
     * <br>
     * <b>IMPORTANT</b>
     * <ul>
     * <li>Sometimes it is a very good idea to wrap a stream inside {@link TikaInputStream} also with a {@link ShiftInitInputStream}. This will
     * prevent performance losses for initialization in the case the stream won't be used at all.</li>
     * <li>Note that a stream will be initialized for sure for determining its mimetype (for reading the magic numbers). Thus the performance win
     * with {@link ShiftInitInputStream} is lost. Mimetype detection with the stream can be prevented by setting the correct Content-Type entry
     * into the metadata object, if you can. Examples: <code>metadata.set("Content-Type",
     * DatasourceMediaTypes.IMAPFOLDER.toString()) or metadata2fill.set("Content-Type", "message/rfc822");</code></li>
     * </ul>
     *
     * @param url2getStream the URL you want to have a stream from
     * @param metadata some preliminary metadata, as given from {@link URLStreamProvider#addFirstMetadata(URLName, Metadata, ParseContext)}
     * @param parseContext the parse context configuration for the current crawl. May contain necessary, useful context data
     *
     * @return the stream for the data under this URL. This one is wrapped inside a {@link ShiftInitInputStream} Object to make sure that stream
     *         initialization will be only performed in the case the stream will be needed for extraction. It could be the case that Leech
     *         recognizes because of the metadata that the data entity is crawled yet. In this case we don't want to spend anything (internet
     *         connection inits, etc.) for stream object construction.
     *
     * @throws Exception if the implementation fails to create the stream
     */
    public abstract TikaInputStream getStream(URLName url2getStream, Metadata metadata, ParseContext parseContext) throws Exception;



    /**
     * Gets the URL protocols supported by this URLStreamProvider. e.g. 'imap' and 'imaps' for an imap URL
     * (imap://uname:pwd@hostname:667/folder;uid=20). {@link #registerProtocols()} registers this provider under each of the returned names.
     *
     * @return the URL protocols supported by this URLStreamProvider
     */
    public abstract Set<String> getSupportedProtocols();

}