package com.idega.block.websearch.business;
import java.io.File;
import java.net.HttpURLConnection;
import java.util.Collection;
import java.util.Iterator;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import com.idega.block.websearch.data.WebSearchIndex;
import com.idega.idegaweb.IWURL;
import com.idega.util.FileUtil;
import com.idega.util.text.TextSoap;
/**
* <p><code>Crawler</code> Web crawler.</p>
* This class is a part of the websearch webcrawler and search engine block. <br>
* It is based on the <a href="http://lucene.apache.org">Lucene</a> java search engine from the Apache group and loosely derived <br>
* from the work of David Duddleston of i2a.com.<br>
*
* @copyright Idega Software 2002
* @author <a href="mailto:eiki@idega.is">Eirikur Hrafnsson</a>
*/
public final class Crawler {

    /** Index configuration the seed URLs, scope prefixes and index path are read from. */
    private WebSearchIndex index;
    private IndexReader reader;
    /** Writer for the index being built; opened and closed by {@link #crawl()}. */
    private IndexWriter writer;
    /** URLs still waiting to be scanned (original case preserved). */
    private java.util.Stack linkQueue;
    /** Lower-cased form of every URL ever queued; used to suppress duplicates. */
    private java.util.TreeSet links;
    /** Handler that parsed the most recently scanned document. */
    private ContentHandler handler;
    /** Scheme and host part (no trailing slash) of the URL currently being crawled. */
    private String rootURL;
    private String seedURL[];
    private String scopeURL[];
    private String indexPath; // search index path
    private boolean created; // if search index has been created
    /** Cookie sent with every request; replaced whenever a response sets one. */
    private String cookie;
    /** Substrings that exclude a URL from the crawl when found in it; may be null. */
    private Collection ignoreParameters;
    // reporting
    private int reporting; // reporting level
    // Current URL data
    private java.net.URL currentURL;
    private String currentURLPath; // path of URL (no file, no trailing slash)
    private String contentType;
    private long lastModified;
    private ContentHandler htmlHandler = new HTMLHandler();
    private ContentHandler pdfHandler = new PDFHandler();

    /**
     * Not used; a crawler needs a {@link WebSearchIndex}.
     */
    private Crawler() {
    }

    /**
     * Creates a crawler for the given index with reporting switched off.
     *
     * @param index index configuration to crawl for
     */
    public Crawler(WebSearchIndex index) {
        this(index, 0);
    }

    /**
     * Creates a crawler for the given index and queues its seed URLs.
     * Any failure while reading the configuration is printed and leaves the
     * crawler without a link queue; {@link #crawl()} reports that case.
     *
     * @param index     index configuration to crawl for
     * @param reporting verbosity of console output; 0 is silent, higher values print more
     */
    public Crawler(WebSearchIndex index, int reporting) {
        try {
            this.index = index;
            this.seedURL = index.getSeed();
            this.scopeURL = index.getScope();
            this.indexPath = index.getIndexPath();
            // rootOf tolerates a seed without a trailing slash ("http://host")
            this.rootURL = rootOf(this.seedURL[0]);
            this.created = false;
            this.reporting = reporting;
            this.linkQueue = new java.util.Stack();
            this.links = new java.util.TreeSet();
            for (int i = 0; i < this.seedURL.length; i++) {
                this.links.add(this.seedURL[i].toLowerCase());
                this.linkQueue.push(this.seedURL[i]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Sets the collection of strings that exclude a URL from the crawl.
     * Despite the name this replaces any previously set collection.
     *
     * @param parameters collection of <code>String</code>s, or null to disable the check
     */
    public void addIgnoreParameters(Collection parameters) {
        this.ignoreParameters = parameters;
    }

    /**
     * Rebuilds the search index from scratch: creates the index folder if it is
     * missing, opens a new index writer, scans every queued URL (which in turn
     * queues the links found) and finally optimizes the index. Exceptions are
     * printed, not thrown; the writer is closed in every case.
     */
    public void crawl() {
        try {
            if (this.reporting > 0) {
                System.out.println("Websearch: START CRAWLING");
            }
            File file = new File(this.indexPath);
            if (!file.exists()) {
                //create directory structure
                System.out.println("Websearch: creating index folders...");
                System.out.println("Websearch: "+this.indexPath);
                FileUtil.createFileAndFolder(this.indexPath,"segments");
                if (this.reporting > 0) {
                    System.out.println("create new index");
                }
            }
            // Always create a new index; incremental indexing is not implemented.
            this.writer = new IndexWriter(this.indexPath, new StopAnalyzer(), true);
            if (this.linkQueue == null) {
                System.out.println("WebSearch crawler: linkQueue is null! check that a trailing / is in the seed url");
            }
            while (this.linkQueue != null && !this.linkQueue.empty()) {
                String url = (String) this.linkQueue.pop();
                // Recompute the root for every page: a plain prefix comparison
                // would treat http://host2 as being under http://host.
                this.rootURL = rootOf(url);
                if (this.reporting > 1) {
                    System.out.println();
                    System.out.print("SCANNING : " + url);
                }
                String result = scanPage(url);
                if ("good".equals(result)) {
                    reportScanned(result);
                } else {
                    if (this.reporting == 1) {
                        System.out.println();
                        System.out.println("SCANNED : " + url);
                    }
                    if (this.reporting > 0) {
                        System.out.println(" *status: " + result);
                    }
                }
            }
            if (this.reporting > 0) {
                System.out.println();
                System.out.println();
                System.out.println("DONE CRAWLING");
                System.out.println("links crawled");
                if (this.links != null) {
                    Iterator it = this.links.iterator();
                    while (it.hasNext()) {
                        System.out.println(it.next());
                    }
                }
                System.out.println();
            }
            // optimize the search speed
            this.writer.optimize();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // close even after a failure so the index is not left open
            closeWriter();
        }
    }

    /** Prints the details of a successfully scanned page according to the reporting level. */
    private void reportScanned(String result) {
        if (this.reporting > 1) {
            System.out.print(" status: " + result);
        }
        if (this.reporting > 2) {
            System.out.println(" lastModified : " + this.lastModified);
            System.out.println(" contentType : " + this.contentType);
            System.out.println(" robot rules: index=" + this.handler.getRobotIndex()
                    + " follow=" + this.handler.getRobotFollow());
            System.out.println(" HREF : " +this.handler.getHREF());
            System.out.println(" title : " +this.handler.getTitle());
            System.out.println(" author : " +this.handler.getAuthor());
            System.out.println(" published : " +this.handler.getPublished());
            System.out.println(" description : " +this.handler.getDescription());
            System.out.println(" keywords : " +this.handler.getKeywords());
            System.out.println(" links : " + this.handler.getLinks());
            if (this.reporting > 3) {
                System.out.println(" contents : " +this.handler.getContents());
            }
        }
    }

    /** Closes the index writer if one is open; a failure to close is printed. */
    private void closeWriter() {
        if (this.writer == null) {
            return;
        }
        try {
            this.writer.close();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            this.writer = null;
        }
    }

    /**
     * Parses an HTML response, queues its links if the page's robot rules allow
     * following, and indexes the page if they allow indexing.
     *
     * @param httpCon connected connection whose body is the HTML document
     * @throws Exception whatever reading or parsing the response throws
     */
    public final void handleHTML(HttpURLConnection httpCon) throws Exception {
        this.handler = this.htmlHandler;
        this.handler.parse(httpCon.getInputStream());
        if (this.handler.getRobotFollow()) {
            java.util.List found = this.handler.getLinks();
            for (int i = 0; i < found.size(); i++) {
                handleLink((String) found.get(i));
            }
        }
        if (this.handler.getRobotIndex()) {
            indexLucene();
        }
    }

    /**
     * Normalizes a link found on the current page, completes it if it is
     * relative, and queues it when it is in scope and has not been seen before.
     * Fragments (<code>#anchor</code>) are dropped so the same page is not
     * crawled once per anchor.
     *
     * @param url link as it appeared in the document
     */
    public void handleLink(String url) {
        String normalizedURL = normalizedURL(url);
        String lowerCaseURL = normalizedURL.toLowerCase();
        if (normalizedURL.startsWith("http://") || normalizedURL.startsWith("https://")) {
            // already a full URL; only the anchor has to go
            normalizedURL = stripFragment(normalizedURL);
        } else {
            // link needs to be evaluated, parsed and completed
            normalizedURL = parseHREF(normalizedURL, lowerCaseURL);
        }
        if (normalizedURL == null || !inScope(normalizedURL)) {
            return;
        }
        lowerCaseURL = normalizedURL.toLowerCase();
        // Set.add returns false for an already known URL
        if (this.links.add(lowerCaseURL)) {
            this.linkQueue.push(normalizedURL);
        }
    }

    /**
     * Rewrites <code>servlet/IBMainServlet/?</code> and
     * <code>servlet/IBMainServlet?</code> to <code>index.jsp?</code> and passes
     * the result through {@link IWURL}, which is presumably what puts the query
     * parameters into a canonical order so duplicates can be detected
     * (TODO confirm against IWURL).
     *
     * @param url link to normalize
     * @return the value of <code>IWURL.getFullURL()</code> for the rewritten link
     */
    protected String normalizedURL(String url) {
        //change servlet path to index.jsp if crawling an idegaweb application
        String normalized = TextSoap.findAndReplace(url,"servlet/IBMainServlet/?", "index.jsp?");
        // chain on the previous result; starting from url again would undo the first rewrite
        normalized = TextSoap.findAndReplace(normalized,"servlet/IBMainServlet?", "index.jsp?");
        IWURL temp = new IWURL(normalized);
        return temp.getFullURL();
    }

    /**
     * Parses a PDF response and indexes it.
     *
     * @param httpCon connected connection whose body is the PDF document
     * @throws Exception whatever reading or parsing the response throws
     */
    public final void handlePDF(HttpURLConnection httpCon) throws Exception {
        this.handler = this.pdfHandler;
        this.handler.parse(httpCon.getInputStream());
        indexLucene();
    }

    /*
     * Adds the document last parsed by the current handler to the index.
     * Failures are printed and the document is skipped.
     */
    private void indexLucene() {
        try {
            Document mydoc = new Document();
            mydoc.add(new Field("uid", this.currentURL.toString().toLowerCase(), false, true, false));
            mydoc.add(Field.Text("url", this.currentURL.toString()));
            mydoc.add(Field.Text("contentType", this.contentType));
            mydoc.add(Field.Keyword("lastModified",DateField.timeToString(this.lastModified)));
            String contents = this.handler.getContents();
            if (contents != null) {
                //clean more!
                contents = TextSoap.findAndCut(contents,">");
                contents = TextSoap.findAndCut(contents,"<");
                contents = TextSoap.findAndCut(contents,"�?");
                mydoc.add(Field.Text("contents", contents));
            }
            if (this.handler.getTitle() != null) {
                mydoc.add(Field.Text("title", this.handler.getTitle()));
            }
            if (this.handler.getKeywords() != null) {
                mydoc.add(Field.Text("keywords", this.handler.getKeywords()));
            }
            if (this.handler.getDescription() != null) {
                mydoc.add(Field.Text("description", this.handler.getDescription()));
            }
            if (this.handler.getCategories() != null) {
                mydoc.add(Field.Text("categories", this.handler.getCategories()));
            }
            // Exactly one "published" field: the meta tag value when the handler
            // has one (-1 means none), otherwise last-modified from the http header.
            long published = this.handler.getPublished();
            if (published == -1) {
                published = this.lastModified;
            }
            mydoc.add(Field.Keyword("published", DateField.timeToString(published)));
            String href = this.handler.getHREF();
            if (href != null) {
                // Replace $link with url; a template without the placeholder is stored as is.
                int pos = href.indexOf("$link");
                if (pos != -1) {
                    href = href.substring(0, pos) + this.currentURL.toString()
                            + href.substring(pos + 5);
                }
                mydoc.add(Field.UnIndexed("href", href));
            }
            this.writer.addDocument(mydoc);
        } catch (Exception e) {
            System.out.println(e.toString());
            e.printStackTrace();
        }
    }

    /**
     * Tells whether a URL may be crawled.
     *
     * @param url full URL
     * @return true if the URL contains none of the ignore strings and starts
     *         with one of the scope prefixes
     */
    public boolean inScope(String url) {
        if (containsIgnoreParameter(url)) {
            return false;
        }
        for (int i = 0; i < this.scopeURL.length; i++) {
            if (url.startsWith(this.scopeURL[i])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the URL contains any of the configured ignore strings.
     * The check is a plain substring match on the whole URL.
     *
     * @param url URL to test
     * @return true if at least one ignore string occurs in the URL
     */
    public boolean containsIgnoreParameter(String url) {
        if (this.ignoreParameters != null) {
            Iterator iter = this.ignoreParameters.iterator();
            while (iter.hasNext()) {
                String parameter = (String) iter.next();
                if (url.indexOf(parameter) != -1) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Completes a link that is not a full http(s) URL, relative to the page
     * currently being scanned.
     *
     * @param url        the link
     * @param urlLowCase the same link in lower case
     * @return the completed URL without any fragment, or null for links that
     *         are not crawled (<code>javascript:</code>, <code>mailto:</code>
     *         and pure anchors)
     */
    public String parseHREF(String url, String urlLowCase) {
        if (urlLowCase.startsWith("/")) {
            url = this.rootURL + url;
        } else if (urlLowCase.startsWith("./")) {
            url = this.currentURLPath + url.substring(1);
        } else if (urlLowCase.startsWith("../")) {
            // count only the leading "../" segments
            int back = 0;
            while (urlLowCase.startsWith("../", back * 3)) {
                back++;
            }
            // climb one directory per segment, but never above the host root
            String base = this.currentURLPath;
            int floor = rootOf(base).length();
            for (int i = 0; i < back; i++) {
                int slash = base.lastIndexOf('/');
                if (slash < floor) {
                    break;
                }
                base = base.substring(0, slash);
            }
            url = base + "/" + url.substring(3 * back);
        } else if (urlLowCase.startsWith("javascript:")
                || urlLowCase.startsWith("#")
                || urlLowCase.startsWith("mailto:")) {
            // scripts, internal anchors and mail links are not crawled
            url = null;
        } else {
            url = this.currentURLPath + "/" + url;
        }
        // strip anchor if exists otherwise crawler may index content multiple times
        // links to the same url but with unique anchors would be considered unique
        // by the crawler when they should not be
        return stripFragment(url);
    }

    /**
     * Fetches a URL and hands the response to the HTML or PDF handler
     * according to its content type. Also records the URL as the current page
     * and remembers a cookie set by the response.
     *
     * @param urlString full URL to fetch
     * @return <code>"good"</code> when the page was fetched and handled,
     *         <code>"bad"</code> for a non-200 response, otherwise a text
     *         describing the problem
     */
    public String scanPage(String urlString) {
        String status = "good";
        HttpURLConnection httpCon = null;
        try {
            this.currentURL = new java.net.URL(urlString);
            this.currentURLPath = directoryOf(urlString);
            httpCon = (HttpURLConnection) this.currentURL.openConnection();
            httpCon.setRequestProperty("User-Agent", "idegaWeb Web Search Engine Crawler http://www.idega.com");
            if (this.cookie != null) {
                httpCon.setRequestProperty("Cookie", this.cookie);
            }
            httpCon.connect();
            this.lastModified = httpCon.getLastModified();
            String setCookie = httpCon.getHeaderField("Set-Cookie");
            if (setCookie != null) {
                this.cookie = stripCookie(setCookie);
                if (this.reporting > 1) {
                    System.out.print(" got cookie : " + this.cookie);
                }
            }
            if (httpCon.getResponseCode() == HttpURLConnection.HTTP_OK) {
                // getContentType() returns null when the header is missing
                String type = httpCon.getContentType();
                this.contentType = type;
                if (type != null && type.indexOf("text/html") != -1) {
                    handleHTML(httpCon);
                } else if (type != null && type.indexOf("application/pdf") != -1) {
                    handlePDF(httpCon);
                } else {
                    status = "Not an excepted content type : " + type;
                }
            } else {
                status = "bad";
            }
        } catch (Exception e) {
            status = e.toString(); // Mark as a bad URL
        } finally {
            if (httpCon != null) {
                httpCon.disconnect();
            }
        }
        return status;
    }

    /**
     * Reduces a <code>Set-Cookie</code> header value to its leading
     * <code>name=value</code> pair.
     *
     * @param cookie header value, not null
     * @return the text before the first <code>;</code>, or the whole value if
     *         there is none or it is the first character
     */
    public static String stripCookie(String cookie) {
        int loc = cookie.indexOf(";");
        return (loc > 0) ? cookie.substring(0, loc) : cookie;
    }

    /** Returns scheme and host of a URL: everything before the first '/', '?' or '#' after "://". */
    private static String rootOf(String url) {
        int schemeEnd = url.indexOf("://");
        int hostStart = (schemeEnd == -1) ? 0 : schemeEnd + 3;
        int end = url.length();
        char[] stops = { '/', '?', '#' };
        for (int i = 0; i < stops.length; i++) {
            int at = url.indexOf(stops[i], hostStart);
            if (at != -1 && at < end) {
                end = at;
            }
        }
        return url.substring(0, end);
    }

    /** Returns the directory part of a URL (no file, query, fragment or trailing slash). */
    private static String directoryOf(String url) {
        String base = url;
        // slashes inside the query string or fragment must not count as path separators
        int cut = base.indexOf('?');
        if (cut != -1) {
            base = base.substring(0, cut);
        }
        cut = base.indexOf('#');
        if (cut != -1) {
            base = base.substring(0, cut);
        }
        int slash = base.lastIndexOf('/');
        if (slash < rootOf(base).length()) {
            // no path at all, e.g. "http://host"
            return base;
        }
        return base.substring(0, slash);
    }

    /** Returns the URL without its "#fragment" part; null stays null. */
    private static String stripFragment(String url) {
        if (url == null) {
            return null;
        }
        int hash = url.indexOf('#');
        return (hash == -1) ? url : url.substring(0, hash);
    }
}