/* * Copyright (C) 2000 - 2011 TagServlet Ltd * * This file is part of Open BlueDragon (OpenBD) CFML Server Engine. * * OpenBD is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * Free Software Foundation,version 3. * * OpenBD is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with OpenBD. If not, see http://www.gnu.org/licenses/ * * Additional permission under GNU GPL version 3 section 7 * * If you modify this Program, or any covered work, by linking or combining * it with any of the JARS listed in the README.txt (or a modified version of * (that library), containing parts covered by the terms of that JAR, the * licensors of this Program grant you additional permission to convey the * resulting work. * README.txt @ http://www.openbluedragon.org/license/README.txt * * http://www.openbluedragon.org/ * * $Id: CrawlFactory.java 2374 2013-06-10 22:14:24Z alan $ */ package com.bluedragon.search.index.crawl; import java.io.File; import java.io.FileFilter; import java.io.IOException; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import com.bluedragon.search.DocumentWrap; import com.bluedragon.search.index.crawl.handler.AbstractFileHandler; import com.bluedragon.search.index.crawl.handler.CrawlException; import com.bluedragon.search.index.crawl.handler.FileHandlerHTMLImpl; import com.bluedragon.search.index.crawl.handler.FileHandlerJPGImpl; import com.bluedragon.search.index.crawl.handler.FileHandlerMP3Impl; import com.bluedragon.search.index.crawl.handler.FileHandlerMSOfficeImpl; import com.bluedragon.search.index.crawl.handler.FileHandlerOpenOfficeImpl; import com.bluedragon.search.index.crawl.handler.FileHandlerPDFImpl; import com.bluedragon.search.index.crawl.handler.FileHandlerTextImpl; import com.nary.util.string; public class CrawlFactory extends Object { protected Map<String, AbstractFileHandler> extHandlers; public CrawlFactory(boolean bStoreBody){ extHandlers = new HashMap<String, AbstractFileHandler>(); addHandler( new FileHandlerMSOfficeImpl(bStoreBody) ); addHandler( new FileHandlerOpenOfficeImpl(bStoreBody) ); addHandler( new FileHandlerTextImpl(bStoreBody) ); addHandler( new FileHandlerPDFImpl(bStoreBody) ); addHandler( new FileHandlerHTMLImpl(bStoreBody) ); addHandler( new FileHandlerMP3Impl(false) ); addHandler( new FileHandlerJPGImpl(false) ); } private void addHandler(AbstractFileHandler fH){ Iterator<String> it = fH.getExtensions().iterator(); while ( it.hasNext() ) extHandlers.put( it.next(), fH); it = fH.getMimeTypes().iterator(); while ( it.hasNext() ) extHandlers.put( it.next(), fH); } /** * Crawls the file given. If the file cannot be handled then a null is returned * * @param urlroot * @param file * @return * @throws CrawlException */ public DocumentWrap crawlFile( String urlroot, File file ) { if ( !file.exists() || !file.isFile() ) return null; String ext = org.apache.commons.io.FilenameUtils.getExtension( file.getName().toLowerCase() ); if ( extHandlers.containsKey(ext) ){ try { return extHandlers.get(ext).crawl( urlroot, file); } catch (CrawlException e) {} } return null; } /** * Gets the list of files to crawl * * @param dir * @param exts * @param bRecurse * @return * @throws IOException */ public Set<String> getFilesToCrawl(File dir, String exts, boolean bRecurse ) throws IOException { ConfigurableFileFilter filter = new ConfigurableFileFilter( getExtensions(exts), bRecurse ); return recursePath( dir, filter ); } private Set<String> recursePath(File dir, FileFilter filter) throws IOException { Set<String> set = new HashSet<String>(); File[] files = dir.listFiles(filter); for (int i = 0; i < files.length; i++) { if (files[i].isDirectory()) set.addAll(recursePath(files[i], filter)); else set.add(files[i].getCanonicalPath()); } return set; } private Set<String> getExtensions(String exts) { Set<String> extensions = new HashSet<String>(); if ( exts != null ) { List<String> tokens = string.split( exts, " ,:;"); String token = null; for (int i = 0; i < tokens.size(); i++) { token = tokens.get(i).toLowerCase(); if (token.indexOf(".") == -1) token = "." + token; extensions.add(token); } } else { extensions.add(".htm"); extensions.add(".html"); extensions.add(".cfm"); extensions.add(".cfml"); extensions.add(".dbm"); extensions.add(".dbml"); } return extensions; } public void close() {} }