/*
* Leech - crawling capabilities for Apache Tika
*
* Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free
* Software Foundation, either version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Contact us by mail: christian.reuschling@dfki.de
*/
package de.dfki.km.leech.io;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.mail.Folder;
import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.Store;
import javax.mail.URLName;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import com.sun.mail.imap.IMAPFolder;
import com.sun.mail.imap.IMAPMessage;
import de.dfki.km.leech.detect.DatasourceMediaTypes;
import de.dfki.km.leech.parser.ImapCrawlerParser;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.util.UrlUtil;
/**
 * {@link URLStreamProvider} implementation for the {@code imap} and {@code imaps} protocols.
 * <p>
 * An IMAP URL is treated either as a folder (when a folder with the URL's file part exists on the server) or as a single message that is addressed
 * by a folder and a message UID, both extracted from the URL via {@link UrlUtil}.
 */
public class ImapURLStreamProvider extends URLStreamProvider
{
    /**
     * Mail stores that were connected by {@link #getStream(URLName, Metadata, ParseContext)}, keyed by host and user name (see
     * {@link #storeKey(URLName)}), so that subsequent streams for the same account reuse the connection.
     * <p>
     * NOTE(review): this is a plain, unsynchronized {@link HashMap} shared by all instances - confirm that streams are only created and closed
     * from a single thread.
     */
    static HashMap<String, Store> m_hsHost2Store = new HashMap<String, Store>();

    /**
     * Builds the key under which the store for the account of the given URL is cached in {@link #m_hsHost2Store}.
     *
     * @param url the IMAP URL whose host and user name identify the store
     *
     * @return the cache key, consisting of the host and the user name of the URL
     */
    private static String storeKey(URLName url)
    {
        return url.getHost() + " usr: " + url.getUsername();
    }

    /**
     * Returns the content fingerprint for a data entity, which currently is the entity id itself.
     *
     * @param strEntityId the entity id, as created by {@link #getEntityId(String, String)}
     *
     * @return the given entity id, unchanged
     */
    public static String getDataEntityContentFingerprint(String strEntityId)
    {
        // XXX we assume here that a message can not be modified - however, it is well possible to e.g. delete an attachment.
        // The attachment list could additionally be included here (it is just not clear yet how)
        return strEntityId;
    }

    /**
     * Creates the entity id for a folder or for a message inside a folder.
     *
     * @param strFolderOfMessage the folder, or the folder that contains the message
     * @param strMessageId the value of the message's Message-ID header, or <code>null</code> to get the id of the folder itself
     *
     * @return the folder if no message id was given, otherwise the folder combined with the message id
     */
    public static String getEntityId(String strFolderOfMessage, String strMessageId)
    {
        if(strMessageId == null) return strFolderOfMessage;

        return strFolderOfMessage + " MessageId " + strMessageId;
    }

    /**
     * Fills the metadata entries for the source, the incremental crawling history (entity id and content fingerprint), the resource name and the
     * content type for the folder or message behind the given URL. If all of these entries are already present, the metadata object is returned
     * untouched and no server connection is established.
     * <p>
     * The store that is connected for this lookup and the folder that is possibly opened are closed again before the method returns, also in the
     * case of an error.
     *
     * @param url2getMetadata the IMAP URL of the folder or message
     * @param metadata2fill the metadata object to fill. In the case it is <code>null</code>, a new one is created
     * @param parseContext the parse context, handed over to {@link ImapCrawlerParser#connect2Server(URLName, ParseContext)}
     *
     * @return the filled metadata object
     *
     * @throws FileNotFoundException if the URL denotes no existing folder and no message uid can be extracted from it
     * @throws IllegalStateException if there is no message for the uid, or the Message-ID header of the message can not be read
     * @throws Exception on any other error during server communication
     */
    @SuppressWarnings("null")
    @Override
    public Metadata addFirstMetadata(URLName url2getMetadata, Metadata metadata2fill, ParseContext parseContext) throws Exception
    {
        if(metadata2fill == null) metadata2fill = new Metadata();

        // if all entries are already set, there is nothing to do
        if(metadata2fill.get(Metadata.SOURCE) != null && metadata2fill.get(IncrementalCrawlingHistory.dataEntityId) != null
                && metadata2fill.get(IncrementalCrawlingHistory.dataEntityContentFingerprint) != null
                && metadata2fill.get(Metadata.RESOURCE_NAME_KEY) != null && metadata2fill.get("Content-Type") != null)
        {
            return metadata2fill;
        }

        Store mailStore = null;
        IMAPFolder folder = null;
        try
        {
            // connecting happens inside the try block, thus the store is also released in the case the folder lookup fails
            mailStore = ImapCrawlerParser.connect2Server(url2getMetadata, parseContext);
            folder = (IMAPFolder) mailStore.getFolder(url2getMetadata.getFile());

            // for Leech
            metadata2fill.set(Metadata.SOURCE, url2getMetadata.toString());

            // for incremental indexing: folder + Message-ID, so that the uid is allowed to change between two crawls
            String strEntityId = null;
            String strDataEntityContentFingerprint = null;

            if(folder != null && folder.exists())
            {
                // the URL denotes a folder
                strEntityId = url2getMetadata.getFile();
                strDataEntityContentFingerprint = strEntityId;

                metadata2fill.set("Content-Type", DatasourceMediaTypes.IMAPFOLDER.toString());
            }
            else if(folder != null)
            {
                // is it a message? We extract the folder and the uid out of the URL
                if(folder.isOpen()) folder.close(false);

                String strFolder = UrlUtil.extractFolder(url2getMetadata);
                folder = (IMAPFolder) mailStore.getFolder(strFolder);
                if(!folder.isOpen()) folder.open(Folder.READ_ONLY);

                String strUID = UrlUtil.extractUID(url2getMetadata);
                if(strUID == null) throw new FileNotFoundException("no message uid found");

                Message message = folder.getMessageByUID(Long.parseLong(strUID));
                // a missing message is reported as such, not as a missing header. The exception type stays the one of the former behaviour
                if(message == null) throw new IllegalStateException("no imap message found for uid " + strUID + " in folder " + strFolder);

                String strMessageId;
                try
                {
                    // the message object fetches its content on the fly and caches it after the first load
                    strMessageId = message.getHeader("Message-ID")[0];
                }
                catch (Exception e)
                {
                    // the cause is kept in order to distinguish a missing header from a communication error
                    throw new IllegalStateException("imap message has no Message-ID header entry", e);
                }

                strEntityId = getEntityId(strFolder, strMessageId);
                strDataEntityContentFingerprint = getDataEntityContentFingerprint(strEntityId);

                metadata2fill.set("Content-Type", "message/rfc822");

                // XXX should we additionally offer a lookup by Message-ID and search (even though the Message-ID is not unique)?
                // Maybe folder + Message-ID? Or is this simply wrong...on the other hand, folder + Message-ID is also what we use for the
                // content fingerprint
            }

            metadata2fill.set(IncrementalCrawlingHistory.dataEntityId, strEntityId);
            metadata2fill.set(IncrementalCrawlingHistory.dataEntityContentFingerprint, strDataEntityContentFingerprint);

            // for Tika
            URLName urlNameWithoutPassword = UrlUtil.urlNameWithoutPassword(url2getMetadata);
            metadata2fill.set(Metadata.RESOURCE_NAME_KEY, urlNameWithoutPassword.toString());

            return metadata2fill;
        }
        finally
        {
            // nested, so that a failing folder close can not prevent the store from being closed
            try
            {
                if(folder != null && folder.isOpen()) folder.close(false);
            }
            finally
            {
                if(mailStore != null && mailStore.isConnected()) mailStore.close();
            }
        }
    }

    /**
     * Creates a stream for the folder or message behind the given URL. The server is not contacted by this method itself, this happens inside
     * <code>initBeforeFirstStreamDataAccess()</code> of the returned stream (presumably invoked by {@link ShiftInitInputStream} before the first
     * data access, as the method name suggests).
     * <p>
     * For a folder, the stream delivers a short placeholder text. For a message, it delivers the MIME stream of the message. The connected store
     * is cached in {@link #m_hsHost2Store}. The stream that established the connection closes the store and removes it from the cache on
     * {@link InputStream#close()}, in the case the metadata has no <code>currentCrawlingDepth</code> entry or the entry is <code>0</code>.
     *
     * @param url2getStream the IMAP URL of the folder or message
     * @param metadata the metadata of the entity, read for the <code>currentCrawlingDepth</code> entry
     * @param parseContext the parse context, handed over to {@link ImapCrawlerParser#connect2Server(URLName, ParseContext)}
     *
     * @return the stream for the entity
     */
    @Override
    public TikaInputStream getStream(final URLName url2getStream, final Metadata metadata, final ParseContext parseContext) throws Exception
    {
        return TikaInputStream.get(new ShiftInitInputStream()
        {
            /** the folder that was opened for reading a message, remembered in order to close it together with the stream */
            IMAPFolder m_folderOfMessage;

            /** whether this stream is responsible for closing the cached store */
            boolean m_bCloseStore = false;

            @Override
            protected InputStream initBeforeFirstStreamDataAccess() throws Exception
            {
                String strStoreKey = ImapURLStreamProvider.storeKey(url2getStream);

                Store mailStore = m_hsHost2Store.get(strStoreKey);
                if(mailStore == null)
                {
                    mailStore = ImapCrawlerParser.connect2Server(url2getStream, parseContext);
                    m_hsHost2Store.put(strStoreKey, mailStore);

                    String strCrawlingDepth = metadata.get("currentCrawlingDepth");
                    if(strCrawlingDepth == null || "0".equals(strCrawlingDepth)) m_bCloseStore = true;
                }

                Folder folder2Check = mailStore.getFolder(url2getStream.getFile());
                try
                {
                    if(folder2Check != null && folder2Check.exists())
                    {
                        // the URL denotes a folder
                        return TikaInputStream.get("leech sucks - hopefully :)".getBytes("UTF-8"));
                    }
                    else if(folder2Check != null)
                    {
                        // is it a message? We extract the folder and the uid out of the URL
                        String strFolder = UrlUtil.extractFolder(url2getStream);
                        m_folderOfMessage = (IMAPFolder) mailStore.getFolder(strFolder);
                        if(!m_folderOfMessage.isOpen()) m_folderOfMessage.open(Folder.READ_ONLY);

                        String strUID = UrlUtil.extractUID(url2getStream);
                        if(strUID == null) throw new FileNotFoundException("no message uid found");

                        final IMAPMessage message = (IMAPMessage) m_folderOfMessage.getMessageByUID(Long.parseLong(strUID));
                        // report an unknown uid explicitly instead of failing with a NullPointerException
                        if(message == null) throw new FileNotFoundException("no message found for uid " + strUID + " in folder " + strFolder);

                        return message.getMimeStream();
                    }

                    return null;
                }
                finally
                {
                    // null-safe, as the code above also deals with a missing folder object
                    if(folder2Check != null && folder2Check.isOpen()) folder2Check.close(false);
                }
            }

            @Override
            public void close() throws IOException
            {
                try
                {
                    super.close();
                }
                finally
                {
                    // the mail resources are released even if closing the underlying stream failed, each one independently of the other
                    try
                    {
                        if(m_folderOfMessage != null && m_folderOfMessage.isOpen()) m_folderOfMessage.close(false);
                    }
                    catch (MessagingException e)
                    {
                        Logger.getLogger(ImapURLStreamProvider.class.getName()).log(Level.SEVERE, "Error", e);
                    }

                    if(m_bCloseStore)
                    {
                        // closing is done only once, further close() invocations must not touch a store that was cached by someone else
                        m_bCloseStore = false;

                        // the entry is removed first, thus no dead store remains cached in the case it is disconnected or closing fails
                        Store mailStore = m_hsHost2Store.remove(ImapURLStreamProvider.storeKey(url2getStream));
                        try
                        {
                            if(mailStore != null && mailStore.isConnected()) mailStore.close();
                        }
                        catch (MessagingException e)
                        {
                            Logger.getLogger(ImapURLStreamProvider.class.getName()).log(Level.SEVERE, "Error", e);
                        }
                    }
                }
            }
        });
    }

    /**
     * Gets the protocols this stream provider is responsible for.
     *
     * @return a new set with the entries <code>imap</code> and <code>imaps</code>
     */
    @Override
    public Set<String> getSupportedProtocols()
    {
        HashSet<String> hsProtocols = new HashSet<String>();

        hsProtocols.add("imap");
        hsProtocols.add("imaps");

        return hsProtocols;
    }
}