/*
 *  Copyright (C) 2000 - 2011 TagServlet Ltd
 *
 *  This file is part of Open BlueDragon (OpenBD) CFML Server Engine.
 *
 *  OpenBD is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, version 3.
 *
 *  OpenBD is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenBD.  If not, see http://www.gnu.org/licenses/
 *
 *  Additional permission under GNU GPL version 3 section 7
 *
 *  If you modify this Program, or any covered work, by linking or combining
 *  it with any of the JARS listed in the README.txt (or a modified version of
 *  that library), containing parts covered by the terms of that JAR, the
 *  licensors of this Program grant you additional permission to convey the
 *  resulting work.
 *  README.txt @ http://www.openbluedragon.org/license/README.txt
 *
 *  http://www.openbluedragon.org/
 *
 *  $Id: WebCrawlFactory.java 1638 2011-07-31 16:08:50Z alan $
 */
package com.bluedragon.search.index.crawl;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import com.bluedragon.search.DocumentWrap;
import com.bluedragon.search.index.DocumentWriter;
import com.bluedragon.search.index.crawl.handler.AbstractFileHandler;
import com.bluedragon.search.index.crawl.handler.FileHandlerHTMLImpl;
import com.naryx.tagfusion.cfm.engine.cfArrayData;
import com.naryx.tagfusion.cfm.engine.cfData;
import com.naryx.tagfusion.cfm.engine.cfEngine;
import com.naryx.tagfusion.cfm.engine.cfNumberData;
import com.naryx.tagfusion.cfm.engine.cfStringData;
import com.naryx.tagfusion.cfm.engine.cfStructData;
import com.naryx.tagfusion.cfm.engine.cfmRunTimeException;

public class WebCrawlFactory extends CrawlFactory {

	/** Hard cap on the number of pages (indexed plus failed) processed per crawl */
	public static final int MAX_DOCS = 100;

	private DocumentWriter docWriter;
	private String categoryTree;
	private String[] category;
	private cfStructData custommap;
	private String originalHost;
	private cfArrayData badUrls;
	private Set<String> visitedUrls;
	private List<URL> urlsToCrawl;
	private AbstractFileHandler activeHandler;
	private int totalDocs = 0;


	public WebCrawlFactory(boolean bStoreBody, URL weburl, DocumentWriter docWriter, String categoryTree, String[] category, cfStructData custommap) {
		super(bStoreBody);
		originalHost = weburl.getHost();
		this.docWriter = docWriter;
		this.categoryTree = categoryTree;
		this.category = category;
		this.custommap = custommap;

		badUrls = cfArrayData.createArray(1);
		visitedUrls = new HashSet<String>();
		urlsToCrawl = new ArrayList<URL>();
		urlsToCrawl.add(weburl);
	}


	public void close() {
		try {
			docWriter.commit();
		} catch (Exception e) {
			// a failed commit is deliberately swallowed; the crawl result is best-effort
		}
	}
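
	/**
	 * Runs a breadth-first crawl from the seed URL: URLs are taken from the head
	 * of the queue, downloaded, indexed (HTML only), and any same-host anchors
	 * not seen before are appended to the queue.  The crawl stops when the queue
	 * is empty or when MAX_DOCS pages (indexed plus failed) have been processed.
	 *
	 * @return a structure with the keys "inserted" (documents added), "invalid"
	 *         (count of failed URLs) and "badkeys" (the failed URLs themselves)
	 */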
	public cfData crawl() {
		while (!urlsToCrawl.isEmpty()) {
			// Stop once the combined count of indexed and failed pages reaches the cap
			if (totalDocs + badUrls.size() >= MAX_DOCS)
				break;

			URL toCrawlUrl = urlsToCrawl.remove(0);
			File pageFile = downloadPage(toCrawlUrl);
			if (pageFile == null)
				continue;

			try {
				// Only HTML pages are indexed and followed; anything else is discarded
				if (!(activeHandler instanceof FileHandlerHTMLImpl)) {
					pageFile.delete();
					continue;
				}

				DocumentWrap doc = activeHandler.crawl(toCrawlUrl.toString(), pageFile);
				doc.setCategories(category);
				doc.setCategoryTree(categoryTree);
				doc.deleteField(DocumentWrap.URL);
				doc.deleteField(DocumentWrap.ID);
				doc.setId(toCrawlUrl.toString());

				// Copy any custom field mappings onto the document
				if (custommap != null) {
					Iterator<String> it = custommap.keySet().iterator();
					while (it.hasNext()) {
						String k = it.next();
						String v = custommap.getData(k).getString();
						doc.setAttribute(k, v);
					}
				}

				// Add this to our collection
				totalDocs++;
				docWriter.add(doc);

				// Catch any other URLs; only add if inside the same domain and we haven't seen them before
				Set<String> anchors = (Set<String>) activeHandler.getExtra();
				if (anchors != null) {
					Iterator<String> is = anchors.iterator();
					while (is.hasNext()) {
						String urlS = is.next();
						if (urlS.length() > 0 && !visitedUrls.contains(urlS)) {
							URL url = new URL(urlS);
							if (url.getHost().equalsIgnoreCase(originalHost))
								urlsToCrawl.add(url);
						}
					}
				}
			} catch (Exception e) {
				// a page that fails to parse or index is simply skipped
			} finally {
				activeHandler = null;
				pageFile.delete();
			}
		}

		// Set the status
		cfStructData sd = new cfStructData();
		sd.setData("inserted", new cfNumberData(totalDocs));
		sd.setData("invalid", new cfNumberData(badUrls.size()));
		sd.setData("badkeys", badUrls);
		return sd;
	}


	private File downloadPage(URL url) {
		if (visitedUrls.contains(url.toString()))
			return null;

		activeHandler = null;
		visitedUrls.add(url.toString());

		File tmpFile = null;
		long startTime = System.currentTimeMillis();
		int size = 0;

		try {
			tmpFile = File.createTempFile("OpenBD-CollectionIndexWeb-", ".html");

			HttpURLConnection conn = (HttpURLConnection) url.openConnection();
			conn.setRequestMethod("GET");
			conn.setReadTimeout(30000);

			// Connect and read the response
			InputStream in = null;
			String detectedContentType = null;
			try {
				conn.connect();

				if (conn.getResponseCode() == HttpURLConnection.HTTP_FORBIDDEN || conn.getResponseCode() == HttpURLConnection.HTTP_UNAUTHORIZED)
					throw new Exception("Skipping url: " + url + ". It is protected.");

				if (conn.getResponseCode() != -1 && conn.getResponseCode() != HttpURLConnection.HTTP_OK)
					throw new Exception("Cannot read page: " + url + " response code is: " + conn.getResponseCode());

				in = conn.getInputStream();
				detectedContentType = URLConnection.guessContentTypeFromStream(in);
				size = readData(tmpFile, in);
			} finally {
				if (in != null)
					in.close();
				conn.disconnect();
			}

			// Log the fact we made a crawl
			cfEngine.log("CollectionIndexWeb: Time=" + (System.currentTimeMillis() - startTime) + "; Size=" + size + "; " + url);

			// Determine the content type: prefer the server-supplied header, fall back
			// to the type sniffed from the stream, then to text/plain
			String specifiedContentType = conn.getContentType();
			if (specifiedContentType == null || specifiedContentType.equals(""))
				specifiedContentType = detectedContentType;
			if (specifiedContentType == null || specifiedContentType.equals(""))
				specifiedContentType = "text/plain";

			specifiedContentType = validateMimeType(specifiedContentType);

			activeHandler = extHandlers.get(specifiedContentType);
			if (activeHandler == null) {
				badUrls.addElement(new cfStringData(url.toString()));
				tmpFile.delete();
				return null;
			} else
				return tmpFile;

		} catch (Exception e) {
			try {
				badUrls.addElement(new cfStringData(url.toString()));
			} catch (cfmRunTimeException e1) {
				// unable to record the failed URL; nothing more to do
			}
			if (tmpFile != null)
				tmpFile.delete();
			return null;
		}
	}
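
	/**
	 * Normalises a content-type header down to the bare mime type that the
	 * handler map is keyed on.  For example, "text/html; charset=UTF-8"
	 * becomes "text/html".
	 */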
	private String validateMimeType(String mt) {
		// The Java VM returns null when it can't determine the content-type, so use the default mime type
		if (mt == null)
			return "text/plain";

		// Strip any parameters (such as a charset) from the content-type
		if (mt.indexOf(";") != -1)
			mt = mt.substring(0, mt.indexOf(";")).trim();

		return mt;
	}


	private int readData(File tmpFile, InputStream in) throws IOException {
		OutputStream out = null;
		int size = 0;
		try {
			out = new BufferedOutputStream(cfEngine.thisPlatform.getFileIO().getFileOutputStream(tmpFile));
			int read = -1;
			byte[] buf = new byte[1024];

			// Copy the response body to the temporary file, counting the bytes as we go
			while ((read = in.read(buf, 0, buf.length)) != -1) {
				out.write(buf, 0, read);
				size += read;
			}

			out.flush();
		} finally {
			if (out != null)
				out.close();
		}
		return size;
	}
}
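
/*
 * Usage sketch (illustrative only): how the DocumentWriter is obtained depends
 * on the engine's collection indexing code and is assumed here.
 *
 *   DocumentWriter writer = ...;               // supplied by the collection index (assumption)
 *   WebCrawlFactory factory = new WebCrawlFactory(
 *       true,                                  // store document bodies
 *       new URL("http://www.example.org/"),    // seed URL; the crawl stays on this host
 *       writer,
 *       "tree",                                // category tree
 *       new String[]{ "category" },            // categories
 *       null );                                // no custom field map
 *
 *   cfData status = factory.crawl();           // struct: inserted / invalid / badkeys
 *   factory.close();                           // commits the writer
 */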