/* Leech - crawling capabilities for Apache Tika Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. Contact us by mail: christian.reuschling@dfki.de */ package de.dfki.km.leech.parser; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.Properties; import java.util.Set; import java.util.logging.Logger; import javax.mail.FetchProfile; import javax.mail.Flags; import javax.mail.Folder; import javax.mail.Message; import javax.mail.MessagingException; import javax.mail.Session; import javax.mail.Store; import javax.mail.UIDFolder; import javax.mail.URLName; import javax.mail.internet.MimeMessage; import javax.mail.search.FlagTerm; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import de.dfki.inquisition.collections.MultiValueHashMap; import de.dfki.inquisition.text.StringUtils; import de.dfki.km.leech.Leech; import de.dfki.km.leech.config.CrawlerContext; import de.dfki.km.leech.config.ImapCrawlerContext; import de.dfki.km.leech.detect.DatasourceMediaTypes; import de.dfki.km.leech.io.ImapURLStreamProvider; import de.dfki.km.leech.io.URLStreamProvider; import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory; import de.dfki.km.leech.util.ExceptionUtils; import de.dfki.km.leech.util.certificates.CertificateIgnoringSocketFactory; /** * CrawlerParser implementation for crawling imap servers. The class deals with Metadata.source url of the following form:<br> * <br> * imap[s]://username:password@hostname:port/folder2crawl<br> * <br> * In the case no folder is specified but only the server credentials, the server default directory plus the INBOX folder will be crawled * * @author Christian Reuschling, Dipl.Ing.(BA) */ public class ImapCrawlerParser extends CrawlerParser { private static final long serialVersionUID = 6062546853256504993L; static public Store connect2Server(URLName url, ParseContext context) throws MessagingException { ImapCrawlerContext imapCrawlerContext = context.get(ImapCrawlerContext.class, new ImapCrawlerContext()); Properties properties = System.getProperties(); properties.setProperty("mail.store.protocol", url.getProtocol()); if(imapCrawlerContext.getIgnoreSSLCertificates()) { properties.setProperty("mail.imaps.socketFactory.class", CertificateIgnoringSocketFactory.class.getName()); properties.setProperty("mail.imaps.socketFactory.fallback", "false"); } if(!StringUtils.nullOrWhitespace(imapCrawlerContext.getSSLCertificateFilePath()) && "imaps".equalsIgnoreCase(url.getProtocol())) { properties.setProperty("javax.net.ssl.trustStore", imapCrawlerContext.getSSLCertificateFilePath()); properties.setProperty("javax.net.ssl.trustStorePassword", imapCrawlerContext.getSSLCertificateFilePassword()); } Session session = Session.getDefaultInstance(properties); Store mailStore = session.getStore(url.getProtocol()); String strUserName = imapCrawlerContext.getUserName(); if(strUserName == null) strUserName = url.getUsername(); String strPassword = imapCrawlerContext.getPassword(); if(strPassword == null) strPassword = url.getPassword(); if(!mailStore.isConnected()) mailStore.connect(url.getHost(), url.getPort(), strUserName, strPassword); return mailStore; } /** * Does this folder hold any subfolders? * * @param folder the folder to be checked * @return true if this folder has any subfolders, false otherwise * @throws MessagingException if it prooves impossible to find out */ public static boolean holdsFolders(Folder folder) throws MessagingException { // this if has been added during the work on issue 2005759 // gmail returns wrong type, it is necessary to call list() to determine // if a folder actually contains subfolders if((folder.getType() & Folder.HOLDS_FOLDERS) == Folder.HOLDS_FOLDERS) { return folder.list().length > 0; } else { // this means that the folder can't have any subfolders "by definition" return false; } } /** * Does this folder hold any messages? * * @param folder the folder to be checked * @return true if this folder has any messages, false otherwise * @throws MessagingException if it prooves impossible to find out */ public static boolean holdsMessages(Folder folder) throws MessagingException { return (folder.getType() & Folder.HOLDS_MESSAGES) == Folder.HOLDS_MESSAGES; } protected HashMap<Folder, Boolean> m_hsImapFolder2Stickyness = new HashMap<Folder, Boolean>(); protected Leech m_leech; protected Store m_mailStore; protected boolean checkIfInConstraints(String strURL2Check, MimeMessage message, ParseContext context) throws MessagingException { CrawlerContext crawlerContext = context.get(CrawlerContext.class, new CrawlerContext()); if(!crawlerContext.getURLFilter().accept(strURL2Check)) { String strType = "IMAP directory "; if(message != null) strType = "IMAP message "; if(crawlerContext.getVerbose()) Logger.getLogger(CrawlerParser.class.getName()).info( strType + strURL2Check + " is outside the URL constraints for this data source. Skipping."); return false; } return true; } protected URLName getMessageUrl(Folder folderOfmessage, MimeMessage message) throws MessagingException { String strUrlName4Folder = folderOfmessage.getURLName().toString(); if(!strUrlName4Folder.endsWith("/")) strUrlName4Folder += "/"; // hier ist es bums, ob die id sticky ist oder nicht. Die URL ist der Pointer auf diese, momentane message, mit dem ich die erreichen und // downloaden kann. Fur das inkrementelle indexieren ist dann die dataExistsId relevant, die darf dann NICHT diese Url sein, wenn diese nicht // sticky ist. in diesem Fall (oder vielleicht sogar immer, mal schauen) nehme ich irgendwelche Daten aus dem header (wie wärs mit // folder+messageid) return new URLName(strUrlName4Folder + ";UID=" + ((UIDFolder) folderOfmessage).getUID(message)); // if(uidsAreSticky(folderOfmessage)) // { // // } // else // { // if(useHeadersHash) // { // return strUrlName4Folder + MailUtil.getMessageIdWithHeadersHash(message); // } // else // { // return strUrlName4Folder + MailUtil.getMessageId((MimeMessage) message); // } // } } @Override protected Iterator<MultiValueHashMap<String, Object>> getSubDataEntitiesInformation(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws Exception { // imap url schema: imap[s]://uname@hostname:port/folder;uidvalidity=385759045/;uid=20. Examples (incl. message-referenzierung) // http://xml.resource.org/public/rfc/html/rfc2192.html#anchor10 // allerdings nimmt der Java ImapStore auch URLs mit Passwörtern an. Dann geht auch // imap[s]://uname:pwd@hostname:port/folder;uidvalidity=385759045/;uid=20 CrawlerContext crawlerContext = context.get(CrawlerContext.class, new CrawlerContext()); String strContainerURL = metadata.get(Metadata.SOURCE); URLName containerURLName = new URLName(strContainerURL); if(m_mailStore == null) m_mailStore = connect2Server(containerURLName, context); // wenn kein directory angegeben wird, dann crawlen wir einfach den default folder und die inbox LinkedList<Folder> llFolderz2Crawl = new LinkedList<Folder>(); if(containerURLName.getFile() != null) { Folder folder = m_mailStore.getFolder(containerURLName.getFile()); if(folder != null && folder.exists()) llFolderz2Crawl.add(folder); else throw new FileNotFoundException("Can't find imap folder '" + folder.getFullName() + "'"); } else { Folder folder = m_mailStore.getDefaultFolder(); if(folder != null && folder.exists()) llFolderz2Crawl.add(folder); folder = m_mailStore.getFolder("INBOX"); if(folder != null && folder.exists()) llFolderz2Crawl.add(folder); } LinkedList<MultiValueHashMap<String, Object>> llEntityInfo = new LinkedList<MultiValueHashMap<String, Object>>(); for (Folder folder2crawl : llFolderz2Crawl) { // Jetzt haben wir die Containerobjekte - nun geben wir die Daten zu den SubEntities zurück // die subfolder boolean bFolderCanHaveSubFolders = (folder2crawl.getType() & Folder.HOLDS_FOLDERS) == Folder.HOLDS_FOLDERS; if(bFolderCanHaveSubFolders) { folder2crawl.open(Folder.READ_ONLY); Folder[] subFolders = folder2crawl.list(); for (Folder subFolder : subFolders) { URLName urlName = subFolder.getURLName(); URLName urlNameWithPassword = new URLName(containerURLName.getProtocol(), urlName.getHost(), urlName.getPort(), urlName.getFile(), urlName.getUsername(), containerURLName.getPassword()); if(!checkIfInConstraints(urlName.toString(), null, context)) continue; MultiValueHashMap<String, Object> hsEntityInformation = new MultiValueHashMap<String, Object>(); hsEntityInformation.add(CrawlerParser.SOURCEID, urlName); hsEntityInformation.add("urlNameWithPassword", urlNameWithPassword); hsEntityInformation.add("folder", subFolder.getFullName()); llEntityInfo.add(hsEntityInformation); } } // die messages boolean bFolderCanHaveMessages = (folder2crawl.getType() & Folder.HOLDS_MESSAGES) == Folder.HOLDS_MESSAGES; if(bFolderCanHaveMessages) { if(!folder2crawl.isOpen()) folder2crawl.open(Folder.READ_ONLY); // wir holen uns alle nicht-deleted messages, und werfen noch die raus, die 'expunged' sind Message[] relevantMessagesOfFolder = folder2crawl.search(new FlagTerm(new Flags(Flags.Flag.DELETED), false)); ArrayList<Message> nonDelNonExpungedMessages = new ArrayList<Message>(); for (Message message : relevantMessagesOfFolder) if(!message.isExpunged()) nonDelNonExpungedMessages.add(message); relevantMessagesOfFolder = nonDelNonExpungedMessages.toArray(new Message[0]); // die Daten die wir später benötigen holen wir uns effizient in einem Rutsch - deswegen benötigen wir auch keinen Thread mit dem // OneAfterOneIterator, um Speicher zu sparen (siehe DirectoryCrawlerParser). Das Array haben wir hier eh. Entweder oder. FetchProfile profile = new FetchProfile(); profile.add(UIDFolder.FetchProfileItem.UID); profile.add("Message-ID"); folder2crawl.fetch(relevantMessagesOfFolder, profile); for (int i = 0; i < relevantMessagesOfFolder.length && !crawlerContext.stopRequested(); i++) { MimeMessage message = (MimeMessage) relevantMessagesOfFolder[i]; // hier brauchen wir noch eine URL mit und eine ohne Passwort URLName urlName = getMessageUrl(folder2crawl, message); URLName urlNameWithPassword = new URLName(containerURLName.getProtocol(), urlName.getHost(), urlName.getPort(), urlName.getFile(), urlName.getUsername(), containerURLName.getPassword()); if(!checkIfInConstraints(urlName.toString(), message, context)) continue; MultiValueHashMap<String, Object> hsEntityInformation = new MultiValueHashMap<String, Object>(); hsEntityInformation.add(CrawlerParser.SOURCEID, urlName); hsEntityInformation.add("urlNameWithPassword", urlNameWithPassword); hsEntityInformation.add("Message-ID", message.getHeader("Message-ID")[0]); hsEntityInformation.add("folder", folder2crawl.getFullName()); llEntityInfo.add(hsEntityInformation); } } // wir haben die folder abgearbeitet, dann können wir diesen Speicher wieder frei geben m_hsImapFolder2Stickyness.clear(); if(folder2crawl.isOpen()) folder2crawl.close(false); } return llEntityInfo.iterator(); } @Override public Set<MediaType> getSupportedTypes(ParseContext context) { return Collections.singleton(DatasourceMediaTypes.IMAPFOLDER); } @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { super.parse(stream, handler, metadata, context); // Wenn ein completter crawl fertig ist, dann schliessen wir auch wieder unseren MailStore int iCurrentCrawlingDepth = 0; String strDepth = metadata.get(CrawlerParser.CURRENT_CRAWLING_DEPTH); if(strDepth != null) iCurrentCrawlingDepth = Integer.valueOf(strDepth); if(iCurrentCrawlingDepth != 0) return; try { m_mailStore.close(); m_mailStore = null; } catch (MessagingException e) { String strSourceID = metadata.get(Metadata.SOURCE); ExceptionUtils.handleException(e, strSourceID, metadata, context.get(CrawlerContext.class, new CrawlerContext()), context, iCurrentCrawlingDepth, handler); } } @Override protected void processCurrentDataEntity(InputStream stream, Metadata metadata, ContentHandler handler, ParseContext context) throws Exception { // NOP - wie don't process directories - we only process the files inside } @Override protected void processSubDataEntity(MultiValueHashMap<String, Object> subDataEntityInformation, Metadata metadata, ContentHandler handler2use4recursiveCall, ParseContext context) throws Exception { URLName urlNameWithPassword = (URLName) subDataEntityInformation.getFirst("urlNameWithPassword"); String strMessageId = (String) subDataEntityInformation.getFirst("Message-ID"); String strMessageFolder = (String) subDataEntityInformation.getFirst("folder"); String strEntityId = ImapURLStreamProvider.getEntityId(strMessageFolder, strMessageId); // wir setzten die hier schon mal - die Daten haben wir in einem prefetching-Schritt schon effizient geladen. Wenn diese hier schon im // Metadata-Objekt stehen, werden sie von der addFirstMetadata nicht nochmal geladen metadata.set(Metadata.SOURCE, urlNameWithPassword.toString()); metadata.set(IncrementalCrawlingHistory.dataEntityId, strEntityId); metadata.set(IncrementalCrawlingHistory.dataEntityContentFingerprint, ImapURLStreamProvider.getDataEntityContentFingerprint(strEntityId)); URLName urlNameWithoutPassword = new URLName(urlNameWithPassword.getProtocol(), urlNameWithPassword.getHost(), urlNameWithPassword.getPort(), urlNameWithPassword.getFile(), urlNameWithPassword.getUsername(), ""); metadata.set(Metadata.RESOURCE_NAME_KEY, urlNameWithoutPassword.toString()); if(strMessageId == null) metadata.set("Content-Type", DatasourceMediaTypes.IMAPFOLDER.toString()); else metadata.set("Content-Type", "message/rfc822"); metadata = URLStreamProvider.getURLStreamProvider4Protocol(urlNameWithPassword.getProtocol()).addFirstMetadata(urlNameWithPassword, metadata, context); InputStream stream = URLStreamProvider.getURLStreamProvider(urlNameWithPassword).getStream(urlNameWithPassword, metadata, context); try { if(m_leech == null) m_leech = new Leech(); // hier nimmt der dann bei einer message hoffentlich den Tika RFC822Parser Parser parser = m_leech.getParser(); parser.parse(stream, handler2use4recursiveCall, metadata, context); } finally { if(stream != null) stream.close(); } } }