/*
* Copyright (C) 2000 - 2011 TagServlet Ltd
*
* This file is part of Open BlueDragon (OpenBD) CFML Server Engine.
*
* OpenBD is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3.
*
* OpenBD is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenBD. If not, see http://www.gnu.org/licenses/
*
* Additional permission under GNU GPL version 3 section 7
*
* If you modify this Program, or any covered work, by linking or combining
 * it with any of the JARS listed in the README.txt (or a modified version of
 * that library), containing parts covered by the terms of that JAR, the
* licensors of this Program grant you additional permission to convey the
* resulting work.
* README.txt @ http://www.openbluedragon.org/license/README.txt
*
* http://www.openbluedragon.org/
*
* $Id: WebCrawlFactory.java 1638 2011-07-31 16:08:50Z alan $
*/
package com.bluedragon.search.index.crawl;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import com.bluedragon.search.DocumentWrap;
import com.bluedragon.search.index.DocumentWriter;
import com.bluedragon.search.index.crawl.handler.AbstractFileHandler;
import com.bluedragon.search.index.crawl.handler.FileHandlerHTMLImpl;
import com.naryx.tagfusion.cfm.engine.cfArrayData;
import com.naryx.tagfusion.cfm.engine.cfData;
import com.naryx.tagfusion.cfm.engine.cfEngine;
import com.naryx.tagfusion.cfm.engine.cfNumberData;
import com.naryx.tagfusion.cfm.engine.cfStringData;
import com.naryx.tagfusion.cfm.engine.cfStructData;
import com.naryx.tagfusion.cfm.engine.cfmRunTimeException;
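
/**
 * Crawls a web site starting from a single URL, indexing every HTML page it
 * can reach via the supplied DocumentWriter and following only those links
 * that stay on the starting host.  The crawl stops once MAX_DOCS pages
 * (indexed plus failed) have been seen.
 *
 * A minimal usage sketch; the values passed here are illustrative and the
 * DocumentWriter is assumed to be configured by the caller:
 *
 * <pre>
 *   WebCrawlFactory factory = new WebCrawlFactory( true,
 *       new URL("http://www.openbluedragon.org/"), docWriter,
 *       "docs", new String[]{ "general" }, null );
 *   cfData status = factory.crawl();  // struct: "inserted", "invalid", "badkeys"
 *   factory.close();                  // commits the index
 * </pre>
 */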
public class WebCrawlFactory extends CrawlFactory {
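	// Hard cap on the crawl: indexed documents plus failed URLs combined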
public static final int MAX_DOCS = 100;
private DocumentWriter docWriter;
private String categoryTree;
private String[] category;
private cfStructData custommap;
private String originalHost;
private cfArrayData badUrls;
private Set<String> visitedUrls;
private List<URL> urlsToCrawl;
private AbstractFileHandler activeHandler;
private int totalDocs = 0;
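
	/**
	 * @param bStoreBody   whether the page body is stored in the index
	 * @param weburl       the starting URL; only links on this host are followed
	 * @param docWriter    the index writer documents are added to
	 * @param categoryTree category tree applied to every indexed document
	 * @param category     categories applied to every indexed document
	 * @param custommap    optional custom field/value mappings, may be null
	 */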
public WebCrawlFactory(boolean bStoreBody, URL weburl, DocumentWriter docWriter, String categoryTree, String[] category, cfStructData custommap ) {
super(bStoreBody);
originalHost = weburl.getHost();
this.docWriter = docWriter;
this.categoryTree = categoryTree;
this.category = category;
this.custommap = custommap;
badUrls = cfArrayData.createArray(1);
visitedUrls = new HashSet<String>();
urlsToCrawl = new ArrayList<URL>();
urlsToCrawl.add( weburl );
}
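
	/**
	 * Commits any pending documents to the index.
	 */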
public void close() {
try {
docWriter.commit();
		} catch (Exception e) {
			// Ignore commit failures on close; there is nothing the caller can do here
		}
}
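
	/**
	 * Runs the crawl until the URL queue is empty or MAX_DOCS pages have been
	 * seen.  Returns a struct with "inserted" (the number of documents indexed),
	 * "invalid" (the number of failed URLs) and "badkeys" (the failed URLs).
	 */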
public cfData crawl() {
while ( !urlsToCrawl.isEmpty() ){
			if ( totalDocs + badUrls.size() >= MAX_DOCS )
break;
URL toCrawlUrl = urlsToCrawl.remove(0);
File pageFile = downloadPage( toCrawlUrl );
if ( pageFile == null )
continue;
try{
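				// Only HTML pages are indexed and mined for further links; other handled types are skipped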
if ( !(activeHandler instanceof FileHandlerHTMLImpl) )
continue;
DocumentWrap doc = activeHandler.crawl( toCrawlUrl.toString(), pageFile );
doc.setCategories(category);
doc.setCategoryTree(categoryTree);
doc.deleteField( DocumentWrap.URL );
doc.deleteField( DocumentWrap.ID );
doc.setId( toCrawlUrl.toString() );
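				// Apply any caller-supplied custom attributes to the document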
if ( custommap != null ){
Iterator<String> it = custommap.keySet().iterator();
while ( it.hasNext() ){
String k = it.next();
String v = custommap.getData(k).getString();
doc.setAttribute(k, v);
}
}
// Add this to our collection
totalDocs++;
docWriter.add(doc);
				// Collect any other URLs; only follow links on the same host we started from and that we haven't seen before
Set<String> anchors = (Set<String>)activeHandler.getExtra();
if ( anchors != null ){
Iterator<String> is = anchors.iterator();
while ( is.hasNext() ){
String urlS = is.next();
if ( urlS.length() > 0 && !visitedUrls.contains(urlS) ){
						URL url = new URL( urlS );
if ( url.getHost().equalsIgnoreCase(originalHost) )
urlsToCrawl.add( url );
}
}
}
			}catch(Exception e){
				// Swallow per-page failures so a single bad page does not abort the whole crawl
			}finally{
activeHandler = null;
pageFile.delete();
}
}
// Set the status
cfStructData sd = new cfStructData();
sd.setData("inserted", new cfNumberData(totalDocs) );
sd.setData("invalid", new cfNumberData(badUrls.size()) );
sd.setData("badkeys", badUrls );
return sd;
}
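
	/**
	 * Downloads the given URL to a temporary file, marks it as visited and
	 * selects a file handler from the response's content-type.  Returns the
	 * temporary file, or null if the URL was already visited, could not be
	 * fetched, or has no matching handler (failures are recorded in badUrls).
	 */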
private File downloadPage( URL url ){
if ( visitedUrls.contains(url.toString()) )
return null;
activeHandler = null;
visitedUrls.add( url.toString() );
File tmpFile = null;
long startTime = System.currentTimeMillis();
int size = 0;
try{
tmpFile = File.createTempFile("OpenBD-CollectionIndexWeb-", ".html");
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
			conn.setRequestMethod("GET");
			conn.setConnectTimeout( 30000 );
			conn.setReadTimeout( 30000 );
// Connect and read the response
			InputStream in = null;
			String detectedContentType = null;
			String specifiedContentType = null;
try {
conn.connect();
if (conn.getResponseCode() == HttpURLConnection.HTTP_FORBIDDEN || conn.getResponseCode() == HttpURLConnection.HTTP_UNAUTHORIZED)
throw new Exception("Skipping url: " + url + ". It is protected.");
if (conn.getResponseCode() != -1 && conn.getResponseCode() != HttpURLConnection.HTTP_OK)
throw new Exception("Cannot read page: " + url + " response code is: " + conn.getResponseCode());
				in = new BufferedInputStream( conn.getInputStream() );	// buffered so guessContentTypeFromStream can mark/reset the stream
				detectedContentType = URLConnection.guessContentTypeFromStream(in);
				specifiedContentType = conn.getContentType();	// read the header before the connection is closed
				size = readData( tmpFile, in );
} finally {
if (in != null)
in.close();
conn.disconnect();
}
// Log the fact we made a crawl
cfEngine.log( "CollectionIndexWeb: Time=" + (System.currentTimeMillis()-startTime) + "; Size=" + size + "; " + url );
			// Determine the contentType, falling back to the type sniffed from the stream, then to text/plain
			if ( specifiedContentType == null || specifiedContentType.isEmpty() )
				specifiedContentType = detectedContentType;
			if ( specifiedContentType == null || specifiedContentType.isEmpty() )
				specifiedContentType = "text/plain";
specifiedContentType = validateMimeType( specifiedContentType );
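			// Pick the handler registered for this mime type; no handler means we cannot index this page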
activeHandler = extHandlers.get( specifiedContentType );
if ( activeHandler == null ){
badUrls.addElement( new cfStringData( url.toString() ) );
tmpFile.delete();
return null;
}else
return tmpFile;
}catch(Exception e){
try {
badUrls.addElement( new cfStringData( url.toString() ) );
} catch (cfmRunTimeException e1) {}
if ( tmpFile != null )
tmpFile.delete();
return null;
}
}
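
	/**
	 * Normalises a content-type header down to the bare mime type,
	 * defaulting to text/plain when none could be determined.
	 */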
private String validateMimeType(String mt) {
		// Defensive: a null content-type falls back to the default mime type
if (mt == null)
return "text/plain";
		// Strip any parameters (e.g. "; charset=utf-8") from the content-type
if (mt.indexOf(";") != -1)
mt = mt.substring(0, mt.indexOf(";")).trim();
return mt;
}
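
	/**
	 * Copies the response body to the temporary file, returning the number
	 * of bytes written.
	 */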
private int readData(File tmpFile, InputStream in) throws IOException {
OutputStream out = null;
int size = 0;
try {
out = new BufferedOutputStream(cfEngine.thisPlatform.getFileIO().getFileOutputStream(tmpFile));
int read = -1;
byte[] buf = new byte[1024];
while ((read = in.read(buf, 0, buf.length)) != -1) {
out.write(buf, 0, read);
size += read;
}
out.flush();
} finally {
if (out != null)
out.close();
}
return size;
}
}