package railo.runtime.search.lucene2.net;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import railo.commons.io.log.Log;
import railo.commons.io.log.LogAndSource;
import railo.commons.io.res.util.ResourceUtil;
import railo.commons.lang.HTMLUtil;
import railo.commons.lang.StringUtil;
import railo.commons.lang.SystemOut;
import railo.commons.net.HTTPUtil;
import railo.commons.net.http.HTTPEngine;
import railo.commons.net.http.HTTPResponse;
import railo.runtime.config.Config;
import railo.runtime.engine.ThreadLocalPageContext;
import railo.runtime.search.lucene2.DocumentUtil;
import railo.runtime.tag.Index;
import railo.runtime.type.util.ArrayUtil;
/**
*
*/
public final class WebCrawler {
private static HTMLUtil htmlUtil=new HTMLUtil();
private LogAndSource log;
public WebCrawler(LogAndSource log) {
this.log=log;
}
public void parse(IndexWriter writer, URL current, String[] extensions, boolean recurse, long timeout) throws IOException {
translateExtension(extensions);
if(ArrayUtil.isEmpty(extensions))extensions=Index.EXTENSIONS;
_parse(log,writer,null,current,new ArrayList(), extensions,recurse,0,timeout);
}
private static URL translateURL(URL url) throws MalformedURLException {
//print.out(url.toExternalForm());
String path=url.getPath();
int dotIndex = path.lastIndexOf('.');
// no dot
if(dotIndex==-1){
if(path.endsWith("/")) return HTTPUtil.removeRef(url);
return HTTPUtil.removeRef(new URL(
url.getProtocol(),
url.getHost(),
url.getPort(),
path+"/"+StringUtil.emptyIfNull(url.getQuery())));
}
//print.out("rem:"+HTTPUtil.removeRef(url));
return HTTPUtil.removeRef(url);
}
private void translateExtension(String[] extensions) {
for(int i=0;i<extensions.length;i++){
if(extensions[i].startsWith("*."))extensions[i]=extensions[i].substring(2);
else if(extensions[i].startsWith("."))extensions[i]=extensions[i].substring(1);
}
}
/**
* @param writer
* @param current
* @param content
* @throws IOException
*/
private static Document toDocument(StringBuffer content,IndexWriter writer, String root, URL current,long timeout) throws IOException {
HTTPResponse rsp = HTTPEngine.get(current, null, null, timeout,HTTPEngine.MAX_REDIRECT, null, "RailoBot", null, null);
Document doc = DocumentUtil.toDocument(content,root,current, rsp);
return doc;
}
protected static void _parse(Log log,IndexWriter writer, String root, URL current, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException {
StringBuffer content = _parseItem(log,writer, root, current, urlsDone, extensions, recurse, deep,timeout);
if(content!=null)_parseChildren(log,content,writer, root, current, urlsDone, extensions, recurse, deep,timeout);
}
public static StringBuffer _parseItem(Log log,IndexWriter writer, String root, URL url, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException{
try{
url=translateURL(url);
if(urlsDone.contains(url.toExternalForm())) return null;
urlsDone.add(url.toExternalForm());
StringBuffer content=new StringBuffer();
Document doc=toDocument(content,writer, root, url,timeout);
if(doc==null) return null;
if(writer!=null)writer.addDocument(doc);
// Test
/*Resource dir = ResourcesImpl.getFileResourceProvider().getResource("/Users/mic/Temp/leeway3/");
if(!dir.isDirectory())dir.mkdirs();
Resource file=dir.getRealResource(url.toExternalForm().replace("/", "_"));
IOUtil.write(file, content.toString(), "UTF-8", false);*/
info(log,url.toExternalForm());
return content;
}
catch(IOException ioe){
error(log,url.toExternalForm(),ioe);
throw ioe;
}
}
protected static void _parseChildren(Log log,StringBuffer content,IndexWriter writer, String root, URL base, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException {
if(recurse) {
List urls = htmlUtil.getURLS(content.toString(),base);
// loop through all children
int len=urls.size();
List childIndexer=len>1?new ArrayList():null;
ChildrenIndexer ci;
//print.out("getting content");
for(int i=0;i<len;i++) {
URL url=(URL) urls.get(i);
/*if(url.toExternalForm().indexOf("80")!=-1){
SystemOut.printDate("base:"+base);
SystemOut.printDate("url:"+url);
}*/
url=translateURL(url);
if(urlsDone.contains(url.toExternalForm())) continue;
//urlsDone.add(url.toExternalForm());
String protocol=url.getProtocol().toLowerCase();
String file=url.getPath();
if((protocol.equals("http") || protocol.equals("https")) && validExtension(extensions,file) &&
base.getHost().equalsIgnoreCase(url.getHost())) {
try {
ci=new ChildrenIndexer(log,writer,root,url,urlsDone,extensions,recurse,deep+1,timeout);
childIndexer.add(ci);
ci.start();
}
catch(Throwable t) {
//print.printST(t);
}
}
}
if(childIndexer!=null && !childIndexer.isEmpty()){
Iterator it = childIndexer.iterator();
while(it.hasNext()) {
ci=(ChildrenIndexer) it.next();
if(ci.isAlive()) {
try {
ci.join(timeout);
}
catch (InterruptedException e) {
//print.printST(e);
}
}
// timeout exceptionif(ci.isAlive()) throw new IOException("timeout occur while invoking page ["+ci.url+"]");
if(ci.isAlive()){
ci.interrupt();
Config config = ThreadLocalPageContext.getConfig();
SystemOut.printDate(config!=null?config.getErrWriter():new PrintWriter(System.err),"timeout ["+timeout+" ms] occur while invoking page ["+ci.url+"]");
}
}
//print.out("exe child");
it = childIndexer.iterator();
while(it.hasNext()) {
ci=(ChildrenIndexer) it.next();
//print.out("exec-child:"+ci.url);
//print.out(content);
if(ci.content!=null)_parseChildren(log,ci.content,writer, root, ci.url, urlsDone, extensions, recurse, deep,timeout);
}
}
urls.clear();
}
//print.out("end:"+base);
}
/*protected static void _sssparse(IndexWriter writer, String root, URL current, List urlsDone, String[] extensions, boolean recurse,int deep,long timeout) throws IOException {
current=translateURL(current);
print.out("start:"+current);
if(urlsDone.contains(current.toExternalForm())) return;
HttpMethod method = HTTPUtil.invoke(current, null, null, -1, null, "RailoBot", null, -1, null, null, null);
StringBuffer content=new StringBuffer();
Document doc = DocumentUtil.toDocument(content,root,current, method);
urlsDone.add(current.toExternalForm());
if(doc==null) return;
if(writer!=null)writer.addDocument(doc);
if(recurse) {
List urls = htmlUtil.getURLS(content.toString(),current);
// loop through all children
int len=urls.size();
List childIndexer=len>1?new ArrayList():null;
ChildrenIndexer ci;
for(int i=0;i<len;i++) {
URL url=(URL) urls.get(i);
String protocol=url.getProtocol().toLowerCase();
String file=url.getPath();
if((protocol.equals("http") || protocol.equals("https")) && validExtension(extensions,file) &&
current.getHost().equalsIgnoreCase(url.getHost())) {
//_parse(writer,root,url,urlsDone,extensions,recurse,deep+1);
try {
if(len==1 || true)_parse(writer,root,url,urlsDone,extensions,recurse,deep+1,timeout);
else {
ci=new ChildrenIndexer(writer,root,url,urlsDone,extensions,recurse,deep+1);
ci.start();
childIndexer.add(ci);
}
}
catch(Throwable t) {
}
}
}
if(!childIndexer.isEmpty()){
Iterator it = childIndexer.iterator();
while(it.hasNext()) {
ci=(ChildrenIndexer) it.next();
if(ci.isAlive()) {
try {
ci.join(20*1000);
}
catch (InterruptedException e) {}
}
}
}
urls.clear();
}
print.out("end:"+current);
}*/
private static boolean validExtension(String[] extensions, String file) {
String ext = ResourceUtil.getExtension(file,"");
ext=railo.runtime.type.util.ListUtil.first(ext,"/",true);
if(StringUtil.isEmpty(ext))return true;
for(int i=0;i<extensions.length;i++){
if(ext.equalsIgnoreCase(extensions[i]))return true;
}
return false;
}
private static void info(Log log,String doc) {
if(log==null) return;
log.info("Webcrawler", "invoke "+doc);
}
private static void error(Log log,String doc, Exception e) {
if(log==null) return;
log.error("Webcrawler", "invoke "+doc+":"+e.getMessage());
}
}
/**
 * Thread that fetches and indexes a single page by calling {@code WebCrawler._parseItem} and keeps
 * the returned buffer in {@link #content}, where the crawler picks it up after waiting for the
 * thread.
 */
class ChildrenIndexer extends Thread {

    protected IndexWriter writer;
    protected String root;
    /** page this thread fetches */
    protected URL url;
    /** list of already handled URLs, shared with the crawler and all other workers */
    protected List urlsDone;
    protected String[] extensions;
    protected boolean recurse;
    protected int deep;
    /**
     * Result of {@code WebCrawler._parseItem}; stays null until the page has been fetched, and for
     * good when fetching fails or the page was already handled. Volatile because the crawler
     * reads it from another thread, also after a wait that timed out while this thread is
     * still running.
     */
    protected volatile StringBuffer content;
    private long timeout;
    private Log log;

    /**
     * Stores the arguments for the later call to {@code WebCrawler._parseItem}; nothing is fetched
     * before the thread is started.
     */
    public ChildrenIndexer(Log log, IndexWriter writer, String root, URL url, List urlsDone, String[] extensions, boolean recurse, int deep, long timeout) {
        this.log = log;
        this.writer = writer;
        this.root = root;
        this.url = url;
        this.urlsDone = urlsDone;
        this.extensions = extensions;
        this.recurse = recurse;
        this.deep = deep;
        this.timeout = timeout;
    }

    /**
     * Fetches and indexes {@link #url}. The request is given one millisecond more than the
     * configured timeout.
     */
    public void run() {
        try {
            this.content = WebCrawler._parseItem(log, writer, root, url, urlsDone, extensions, recurse, deep, timeout + 1);
        }
        catch (IOException ignored) {
            // _parseItem has already logged the failure; content stays null, so the crawler does
            // not descend into this page.
        }
    }
}