/*
* Copyright (C) 2009 lichtflut Forschungs- und Entwicklungsgesellschaft mbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.lichtflut.infra.html.provider;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.net.URL;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import de.lichtflut.infra.html.MTIAccessor;
/**
* <p>
* This class provides the specified information for a set of defined targets.
* It can also be run as a thread.
* TODO: There may be problems with mutual exclusion.
* Have a look at the call order and behaviour of the LinkedList operations.
* </p>
*
* <p>
* Created 04.06.2009
* </p>
*
* @author Nils Bleisch
*/
public class MTICrawler implements Runnable{

    //-- MEMBER-FIELDS -------------------------------------

    /**
     * Remove-criterion for idling helpers: once at least this many accessors
     * are buffered AND at least this many helpers are idle, one idle helper
     * is terminated per dispatch round.
     */
    private static final int MAX_BUFFER_CNT = 4;

    /**
     * Accessors produced by the helper threads, waiting to be handed to the
     * extraction specification. Helpers append, the crawling thread removes.
     * The list is exposed through {@link #getRawDataAccessors()}, so accessors
     * can presumably also be added from outside (i.e. a HTTP request is not the
     * only possible source) -- confirm against callers.
     * Wrapped in a synchronized list because several threads modify it.
     */
    private final List<MTIAccessor> rawDataAccessors;

    /**
     * All helpers that have been started and not yet terminated.
     * Only modified by the thread that runs {@link #crawl(List)} /
     * {@link #stopAndRemoveThreads()}.
     */
    private final List<CrawlersLittleHelper> helperThreads;

    /**
     * Queue of helpers that finished their target and wait for a new one.
     * Helpers append themselves, the crawling thread removes.
     * Wrapped in a synchronized list because several threads modify it.
     */
    private final List<CrawlersLittleHelper> rdyQueue;

    /**
     * Errors that occurred while crawling. One entry consists of the
     * target's string representation (key) and the exception that occurred.
     * Wrapped in a synchronized map because helpers write to it concurrently.
     */
    private final Map<String, Exception> occurredErrors =
            Collections.synchronizedMap(new HashMap<String, Exception>());

    /** Targets used by {@link #run()}; set via {@link #setTargets(List)}. */
    private List<Object> targets=null;

    /** Maximum number of helper threads, zero means unlimited. */
    private final int maxThreadCnt;

    /** Callback that receives every accessor that has been produced. */
    private final MTICrawlerExtractionSpec extractionSpec;

    //-- CONSTRUCTORS -------------------------------------

    /**
     * Creates a crawler.
     * Created: 04.06.2009
     * @author Nils Bleisch
     * @param maxThreadCnt the maximum amount of crawler-threads, a zero means
     *        unlimited. The absolute value is used, so a negative number
     *        behaves like its positive counterpart.
     * @param extractionSpec specification that is called for each accessor
     */
    public MTICrawler(int maxThreadCnt,MTICrawlerExtractionSpec extractionSpec){
        // Keep the absolute value: a negative limit would never allow a
        // helper to be started and crawl() would spin forever.
        this.maxThreadCnt = Math.abs(maxThreadCnt);
        this.rawDataAccessors = Collections.synchronizedList(new LinkedList<MTIAccessor>());
        this.rdyQueue = Collections.synchronizedList(new LinkedList<CrawlersLittleHelper>());
        this.helperThreads = new LinkedList<CrawlersLittleHelper>();
        this.extractionSpec = extractionSpec;
    }//end of constructor

    //-- PROCESSING-METHODS -------------------------------------

    /**
     * Starts the crawl-process and blocks until all targets are processed
     * and all helpers have been terminated. Helpers are started and removed
     * on demand. Targets are removed from the passed list while crawling.
     * Created: 04.06.2009
     * @author Nils Bleisch
     * @param targets the targets to crawl; each element should be a
     *        java.io.Reader, a java.io.File or a java.net.URL.
     *        <code>null</code> is treated like an empty list.
     */
    public void crawl(List<Object> targets){
        // A missing list means "nothing to crawl" instead of a NullPointerException
        final List<Object> pending =
                (targets != null) ? targets : new LinkedList<Object>();

        while(!pending.isEmpty() || !helperThreads.isEmpty()){
            /* If there is neither an accessor nor an idle helper available,
             * start a new helper to accelerate the process (if allowed).
             */
            if(rawDataAccessors.isEmpty() && rdyQueue.isEmpty() && !pending.isEmpty()){
                if(maxThreadCnt==0 || maxThreadCnt>helperThreads.size()){
                    CrawlersLittleHelper helper =
                            new CrawlersLittleHelper(this,helperThreads.size());
                    helper.setTarget(pending.remove(0));
                    new Thread(helper).start();
                    helperThreads.add(helper);
                }else{
                    // Thread limit reached, all helpers are busy: give them CPU time
                    Thread.yield();
                }
                continue;
            }//end of if

            if(rdyQueue.isEmpty()){
                // Nothing to dispatch yet, helpers are still working
                Thread.yield();
                continue;
            }//end of if

            // Iterate over idling helpers and hand out new targets
            while(!rdyQueue.isEmpty()){
                boolean buffersFull = rawDataAccessors.size()>=MAX_BUFFER_CNT
                        && rdyQueue.size()>=MAX_BUFFER_CNT;
                if(buffersFull || pending.isEmpty()){
                    // Terminate one idle helper and remove it from the helper list
                    CrawlersLittleHelper helper = rdyQueue.remove(0);
                    helper.setTerminateCondition(true);
                    helperThreads.remove(helper);
                }//end of if
                // If there is no more target available
                if(pending.isEmpty()) break;
                // Only this thread removes from rdyQueue, so the queue cannot
                // have shrunk since the checks above.
                rdyQueue.remove(0).setTarget(pending.remove(0));
                // Process one buffered accessor per dispatched target
                if(!rawDataAccessors.isEmpty()){
                    extractionSpec.extractSpecifiedInformation(rawDataAccessors.remove(0));
                }
            }//end of while
        }//end of while

        stopAndRemoveThreads();

        // Drain the accessors that are still buffered
        while(!rawDataAccessors.isEmpty()){
            extractionSpec.extractSpecifiedInformation(rawDataAccessors.remove(0));
        }
    }//end of Method crawl()

    /**
     * Requests termination of all helpers and forgets them:
     * both the helper list and the ready queue are emptied.
     * Created: 09.06.2009
     * @author Nils Bleisch
     */
    public void stopAndRemoveThreads(){
        // Ask all helpers to terminate
        for(CrawlersLittleHelper helperThread: helperThreads){
            helperThread.setTerminateCondition(true);
        }//end of for
        // Drop all references to the (now terminating) helpers
        helperThreads.clear();
        rdyQueue.clear();
    }//end of Method stopThreads()

    /**
     * Crawls the targets that have been set via {@link #setTargets(List)}.
     */
    public void run() {
        crawl(getTargets());
    }//end of Method run()

    //-- GETTERS -------------------------------------

    private List<Object> getTargets() {
        return targets;
    }//end of Method getTargets()

    /** @return the live (synchronized) list of buffered accessors */
    public List<MTIAccessor> getRawDataAccessors() {
        return rawDataAccessors;
    }//end of Method getRawDataAccessors()

    /** @return the live (synchronized) queue of idle helpers */
    public List<CrawlersLittleHelper> getRdyQueue() {
        return rdyQueue;
    }//end of Method getRdyQueue

    /** @return the live list of all helpers that are not yet terminated */
    public List<CrawlersLittleHelper> getHelperThreads() {
        return helperThreads;
    }//end of Method getHelperThreads

    /** @return the live (synchronized) map of errors, keyed by target string */
    public Map<String, Exception> getOccurredErrors() {
        return occurredErrors;
    }//end of Method getOccuredErrors()

    //-- SETTERS -------------------------------------

    /**
     * @param targets the targets that {@link #run()} will crawl
     */
    public void setTargets(final List<Object> targets){
        this.targets = targets;
    }//end of Method setTargets

    //-- INNER-CLASSES -------------------------------------

    /**
     * Nested class, used as thread body.
     * Delivers MTIAccessor-Objects to the MTICrawler for the assigned targets.
     * Created: 04.06.2009
     * @author Nils Bleisch
     */
    private static class CrawlersLittleHelper implements Runnable{

        //-- FIELDS -------------------------------------

        private final MTICrawler crawlerManager;

        /* The currently assigned target, should be a
         * io.Reader, io.File or net.URL; null while idle.
         * volatile: written by the crawling thread, polled by the helper thread.
         */
        private volatile Object target;

        /* volatile: written by the crawling thread, polled by the helper thread */
        private volatile boolean terminateCondition=false;

        //-- CONSTRUCTOR -------------------------------------

        /**
         * @param manager the crawler that receives results and errors
         * @param helperID currently unused
         */
        public CrawlersLittleHelper(MTICrawler manager, int helperID){
            this.crawlerManager=manager;
        }//end of constructor

        //-- PROCESSING-METHODS -------------------------------------

        /**
         * Polls for an assigned target until termination is requested.
         * For each target an MTIAccessor is generated and stored in the
         * rawDataAccessors-collection of the MTICrawler; failures are
         * recorded in its error map. Afterwards the helper marks itself
         * as idle by entering the ready queue.
         * Created: 03.06.2009
         * @author Nils Bleisch
         */
        public void run() {
            while(!terminateCondition){
                // Work on a snapshot, the field may be changed by the manager
                final Object current = target;
                if(current==null){
                    // Waiting for a new assignment
                    Thread.yield();
                    continue;
                }
                try{
                    MTIAccessor accessor = createAccessor(current);
                    // Unsupported targets yield no accessor: never enqueue null
                    if(accessor!=null){
                        crawlerManager.getRawDataAccessors().add(accessor);
                    }
                }catch(IOException any){
                    // Also covers FileNotFoundException
                    crawlerManager.getOccurredErrors().put(current.toString(), any);
                }//end of catch
                // Set this helper to "idle": clear the target BEFORE queueing,
                // so a newly assigned target can not be overwritten here.
                target=null;
                crawlerManager.getRdyQueue().add(this);
            }//end of while
        }//end of Method run()

        /**
         * Generates an accessor that matches the type of the given target.
         * @param source the target, expected to be a Reader, File or URL
         * @return the accessor, or null if the type is not supported
         *         (an error is recorded in that case)
         * @throws IOException if the extractor could not be generated
         */
        private MTIAccessor createAccessor(Object source) throws IOException{
            if(source instanceof Reader){
                return new MTIAccessor(MTIAccessor.generateExtractor((Reader)source));
            }
            if(source instanceof File){
                return new MTIAccessor(MTIAccessor.generateExtractor((File)source));
            }
            if(source instanceof URL){
                //ToDo: URL-Encoding
                return new MTIAccessor(MTIAccessor.generateExtractor((URL)source));
            }
            crawlerManager.getOccurredErrors().put(source.toString(),
                    new Exception("Type of Target:" +
                            source.getClass().getName() +
                            " is not supported"));
            return null;
        }//end of Method createAccessor()

        //-- SETTERS -------------------------------------

        public void setTarget(Object target){
            this.target = target;
        }//end of Method setTarget()

        public void setTerminateCondition(boolean condition){
            terminateCondition = condition;
        }//end of Method setTerminateCondition

    }//end of inner class CrawlerLittleHelper
}//end of class MTICrawler