CrawlController.java example

Explorer

NewsStats-master
- Analyzer
  - src
    - main
      - java
        com
        cse10
        analyzer
        Analyzer.java
        AnalyzerController.java
        Predictor.java
        Uploader.java
    - test
      - com
        cse10
        analyzer
        AnalyzerTest.java
        ENLPredictorAlgorithmTest.java
        LRPredictorAlgorithmTest.java
- Classifier
  - src
    - main
      - java
        com
        cse10
        classifier
        AdaBoostClassifierHandler.java
        BaggingClassifierHandler.java
        CeylonTodayClassifierUIHandler.java
        ClassifierConfigurator.java
        ClassifierHandler.java
        ClassifierUIHandler.java
        DailyMirrorClassifierUIHandler.java
        DataHandler.java
        DataHandlerWithGate.java
        DataHandlerWithSampling.java
        EnsembleClassifierHandler.java
        FeatureVectorTransformer.java
        GenericDataHandler.java
        GridSearch.java
        KeyWordClassifierHandler.java
        LibSVMExtended.java
        NewsFirstClassifierUIHandler.java
        SVMClassifierHandler.java
        StanfordCoreNLPLemmatizer.java
        TheIslandClassifierUIHandler.java
        gate
        CorpusPipeLine.java
        DocumentContentFilter.java
        DocumentKeyWordFinder.java
    - test
      - java
        com
        cse10
        classifier
        AdaBoostClassifierHandlerTest.java
        BaggingClassifierHandlerTest.java
        CeylonTodayClassifierUIHandlerTest.java
        DailyMirrorClassifierUIHandlerTest.java
        DataHandlerWithGateTest.java
        DataHandlerWithSamplingTest.java
        FeatureVectorTransformerTest.java
        GenericDataHandlerTest.java
        KeyWordClassifierHandlerTest.java
        NewsFirstClassifierUIHandlerTest.java
        SVMClassifierHandlerTest.java
        StanfordCoreNLPLemmatizerTest.java
        TheIslandClassifierUIHandlerTest.java
        gate
        DocumentContentFilterTest.java
        DocumentKeyWordFinderTest.java
- Crawler
  - src
    - main
      - java
        com
        cse10
        crawler
        CrawlManager.java
        DateHandler.java
        contentHandler
        BasicContentHandler.java
        CeylonTodayContentHandler.java
        DailyMirrorContentHandler.java
        HiruNewsContentHandler.java
        NewYorkTimesContentHandler.java
        NewsFirstContentHandler.java
        TheIslandContentHandler.java
        crawlControler
        BasicCrawlController.java
        CeylonTodayCrawlController.java
        DailyMirrorCrawlController.java
        HiruNewsCrawlController.java
        NewYorkTimesCrawlController.java
        NewsFirstCrawlController.java
        TheIslandCrawlController.java
        paperCrawler
        BasicCrawler.java
        CeylonTodayCrawler.java
        DailyMirrorCrawler.java
        HiruNewsCrawler.java
        NewYorkTimesCrawler.java
        NewsFirstCrawler.java
        TheIslandCrawler.java
        edu
        uci
        ics
        crawler4j
        crawler
        Configurable.java
        CrawlConfig.java
        CrawlController.java
        Page.java
        WebCrawler.java
        fetcher
        CustomFetchStatus.java
        IdleConnectionMonitorThread.java
        PageFetchResult.java
        PageFetcher.java
        frontier
        Counters.java
        DocIDServer.java
        Frontier.java
        InProcessPagesDB.java
        WebURLTupleBinding.java
        WorkQueues.java
        parser
        BinaryParseData.java
        ExtractedUrlAnchorPair.java
        HtmlContentHandler.java
        HtmlParseData.java
        ParseData.java
        Parser.java
        TextParseData.java
        robotstxt
        HostDirectives.java
        RobotstxtConfig.java
        RobotstxtParser.java
        RobotstxtServer.java
        RuleSet.java
        url
        TLDList.java
        URLCanonicalizer.java
        UrlResolver.java
        WebURL.java
        util
        IO.java
        Util.java
    - test
      - java
        com
        cse10
        crawler
        contentHandler
        BasicContentHandlerTest.java
        crawlControler
        BasicCrawlTest.java
        CeylonTodayCrawlControllerTest.java
        DailyMirrorCrawlControllerTest.java
        NewYorkTimesCrawlControllerTest.java
        NewsFirstCrawlControllerTest.java
        TheIslandCrawlControllerTest.java
        edu
        uci
        ics
        crawler4j
        examples
        basic
        BasicCrawlController.java
        BasicCrawler.java
        imagecrawler
        Cryptography.java
        ImageCrawlController.java
        ImageCrawler.java
        localdata
        CrawlStat.java
        Downloader.java
        LocalDataCollectorController.java
        LocalDataCollectorCrawler.java
        multiple
        BasicCrawler.java
        MultipleCrawlerController.java
        shutdown
        BasicCrawler.java
        ControllerWithShutdown.java
        statushandler
        StatusHandlerCrawlController.java
        StatusHandlerCrawler.java
        tests
        TLDListTest.java
        URLCanonicalizerTest.java
- Database
  - src
    - main
      - java
        com
        cse10
        article
        Article.java
        CeylonTodayArticle.java
        CrimeArticle.java
        DailyMirrorArticle.java
        HiruNewsArticle.java
        NewYorkTimesArticle.java
        NewsFirstArticle.java
        TheIslandArticle.java
        TrainingArticle.java
        database
        DatabaseConstants.java
        DatabaseHandler.java
        HibernateUtil.java
        entities
        CrimeEntityGroup.java
        CrimePerson.java
        LocationDistrictMapper.java
        results
        NewsStatistic.java
        Prediction.java
        Prediction_District.java
        Prediction_Type.java
        util
        ArticleConverter.java
        TableCleaner.java
    - test
      - java
        com
        cse10
        database
        DatabaseHandlerTest.java
        util
        ArticleConverterTest.java
        TableCleanerTest.java
- DuplicateDetector
  - src
    - main
      - java
        com
        cse10
        duplicateDetector
        BinoryWordSegmenter.java
        DataHandler.java
        DuplicateDetectorUIHandler.java
        FullWordSegmenter.java
        HammingDistanceCalculator.java
        HashCalculator.java
        SimHashCalculator.java
        WordSegmenter.java
    - test
      - java
        com
        cse10
        duplicateDetector
        BinoryWordSegmenterTest.java
        DataHandlerTest.java
        DuplicateDetectorUIHandlerTest.java
        FullWordSegmenterTest.java
        HammingDistanceCalculatorTest.java
        HashCalculatorTest.java
        SimHashCalculatorTest.java
- Extractor
  - src
    - main
      - java
        com
        cse10
        extractor
        ExtractorConstants.java
        gate
        DistrictExtractor.java
        EntityExtractor.java
        EntityExtractorTask.java
        GateTest.java
        JSONParser.java
        stanfordcorenlp
        ContentFilter.java
        Controller.java
        CrimeRegexPatterns.java
        CrimeSentenceDetector.java
        StanfordCoreNlpMy.java
        detector
        CourtDetector.java
        CrimeTypeDetector.java
        CriminalDetector.java
        LocationDetector.java
        PoliceStationDetector.java
        PossessionDetector.java
        PrisonDetector.java
        VictimDetector.java
    - test
      - com
        cse10
        extractor
        gate
        DistrictExtractorTest.java
        EntityExtractorTaskTest.java
        EntityExtractorTest.java
        GateTestTest.java
        JSONParserTest.java
- Filter
  - src
    - main
      - java
        com
        cse10
        filter
        KeywordsFilter.java
        LengthFilter.java
- GUI
  - src
    - main
      - java
        com
        cse10
        gui
        InfoDialog.java
        Main.java
        NewsStatsGUI.java
        UIComponents.java
        task
        analyze
        AnalyzeTask.java
        PredictTask.java
        UploadDataTask.java
        classify
        CeylonTodayClassifyTask.java
        ClassifyTask.java
        DailyMirrorClassifyTask.java
        NewsFirstClassifyTask.java
        TheIslandClassifyTask.java
        crawl
        CeylonTodayCrawlTask.java
        CrawlTask.java
        DailyMirrorCrawlTask.java
        NewsFirstCrawlTask.java
        TheIslandCrawlTask.java
        duplicateDetect
        DuplicateDetectorTask.java
        extract
        ExtractorTask.java
    - test
      - java
        com
        cse10
        gui
        MainTest.java
- Util
  - src
    - main
      - java
        com
        cse10
        util
        GlobalConstants.java
- src
  - test
    - java
      - com
        cse10
        crawler
        AppTest.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.crawler;

import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.IO;
import org.apache.log4j.Logger;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

/**
 * The controller that manages a crawling session. This class creates the
 * crawler threads and monitors their progress.
 * 
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class CrawlController extends Configurable {

	static final Logger logger = Logger.getLogger(CrawlController.class.getName());

	/**
	 * The 'customData' object can be used for passing custom crawl-related
	 * configurations to different components of the crawler.
	 */
	protected Object customData;

	/**
	 * Once the crawling session finishes the controller collects the local data
	 * of the crawler threads and stores them in this List.
	 */
	protected List<Object> crawlersLocalData = new ArrayList<>();

	/**
	 * Is the crawling of this session finished?
	 */
	protected boolean finished;

	/**
	 * Is the crawling session set to 'shutdown'. Crawler threads monitor this
	 * flag and when it is set they will no longer process new pages.
	 */
	protected boolean shuttingDown;

	protected PageFetcher pageFetcher;
	protected RobotstxtServer robotstxtServer;
	protected Frontier frontier;
	protected DocIDServer docIdServer;

	protected final Object waitingLock = new Object();

	public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtServer robotstxtServer)
			throws Exception {
		super(config);

		config.validate();
		File folder = new File(config.getCrawlStorageFolder());
		if (!folder.exists()) {
			if (!folder.mkdirs()) {
				throw new Exception("Couldn't create this folder: " + folder.getAbsolutePath());
			}
		}

		boolean resumable = config.isResumableCrawling();

		EnvironmentConfig envConfig = new EnvironmentConfig();
		envConfig.setAllowCreate(true);
		envConfig.setTransactional(resumable);
		envConfig.setLocking(resumable);

		File envHome = new File(config.getCrawlStorageFolder() + "/frontier");
		if (!envHome.exists()) {
			if (!envHome.mkdir()) {
				throw new Exception("Couldn't create this folder: " + envHome.getAbsolutePath());
			}
		}
		if (!resumable) {
			IO.deleteFolderContents(envHome);
		}

		Environment env = new Environment(envHome, envConfig);
		docIdServer = new DocIDServer(env, config);
		frontier = new Frontier(env, config, docIdServer);

		this.pageFetcher = pageFetcher;
		this.robotstxtServer = robotstxtServer;

		finished = false;
		shuttingDown = false;
	}

	/**
	 * Start the crawling session and wait for it to finish.
	 * 
	 * @param _c
	 *            the class that implements the logic for crawler threads
	 * @param numberOfCrawlers
	 *            the number of concurrent threads that will be contributing in
	 *            this crawling session.
	 */
	public <T extends WebCrawler> void start(final Class<T> _c, final int numberOfCrawlers) {
		this.start(_c, numberOfCrawlers, true);
	}

	/**
	 * Start the crawling session and return immediately.
	 * 
	 * @param _c
	 *            the class that implements the logic for crawler threads
	 * @param numberOfCrawlers
	 *            the number of concurrent threads that will be contributing in
	 *            this crawling session.
	 */
	public <T extends WebCrawler> void startNonBlocking(final Class<T> _c, final int numberOfCrawlers) {
		this.start(_c, numberOfCrawlers, false);
	}

	protected <T extends WebCrawler> void start(final Class<T> _c, final int numberOfCrawlers, boolean isBlocking) {
		try {
			finished = false;
			crawlersLocalData.clear();
			final List<Thread> threads = new ArrayList<>();
			final List<T> crawlers = new ArrayList<>();

			for (int i = 1; i <= numberOfCrawlers; i++) {
				T crawler = _c.newInstance();
				Thread thread = new Thread(crawler, "Crawler " + i);
				crawler.setThread(thread);
				crawler.init(i, this);
				thread.start();
				crawlers.add(crawler);
				threads.add(thread);
				logger.info("Crawler " + i + " started.");
			}

			final CrawlController controller = this;

			Thread monitorThread = new Thread(new Runnable() {

				@Override
				public void run() {
					try {
						synchronized (waitingLock) {

							while (true) {
								sleep(10);
								boolean someoneIsWorking = false;
								for (int i = 0; i < threads.size(); i++) {
									Thread thread = threads.get(i);
									if (!thread.isAlive()) {
										if (!shuttingDown) {
											logger.info("Thread " + i + " was dead, I'll recreate it.");
											T crawler = _c.newInstance();
											thread = new Thread(crawler, "Crawler " + (i + 1));
											threads.remove(i);
											threads.add(i, thread);
											crawler.setThread(thread);
											crawler.init(i + 1, controller);
											thread.start();
											crawlers.remove(i);
											crawlers.add(i, crawler);
										}
									} else if (crawlers.get(i).isNotWaitingForNewURLs()) {
										someoneIsWorking = true;
									}
								}
								if (!someoneIsWorking) {
									// Make sure again that none of the threads
									// are
									// alive.
									logger.info("It looks like no thread is working, waiting for 10 seconds to make sure...");
									sleep(10);

									someoneIsWorking = false;
									for (int i = 0; i < threads.size(); i++) {
										Thread thread = threads.get(i);
										if (thread.isAlive() && crawlers.get(i).isNotWaitingForNewURLs()) {
											someoneIsWorking = true;
										}
									}
									if (!someoneIsWorking) {
										if (!shuttingDown) {
											long queueLength = frontier.getQueueLength();
											if (queueLength > 0) {
												continue;
											}
											logger.info("No thread is working and no more URLs are in queue waiting for another 10 seconds to make sure...");
											sleep(10);
											queueLength = frontier.getQueueLength();
											if (queueLength > 0) {
												continue;
											}
										}

										logger.info("All of the crawlers are stopped. Finishing the process...");
										// At this step, frontier notifies the
										// threads that were
										// waiting for new URLs and they should
										// stop
										frontier.finish();
										for (T crawler : crawlers) {
											crawler.onBeforeExit();
											crawlersLocalData.add(crawler.getMyLocalData());
										}

										logger.info("Waiting for 10 seconds before final clean up...");
										sleep(10);

										frontier.close();
										docIdServer.close();
										pageFetcher.shutDown();

										finished = true;
										waitingLock.notifyAll();

										return;
									}
								}
							}
						}
					} catch (Exception e) {
						e.printStackTrace();
					}
				}
			});

			monitorThread.start();

			if (isBlocking) {
				waitUntilFinish();
			}

		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Wait until this crawling session finishes.
	 */
	public void waitUntilFinish() {
		while (!finished) {
			synchronized (waitingLock) {
				if (finished) {
					return;
				}
				try {
					waitingLock.wait();
				} catch (InterruptedException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Once the crawling session finishes the controller collects the local data
	 * of the crawler threads and stores them in a List. This function returns
	 * the reference to this list.
	 */
	public List<Object> getCrawlersLocalData() {
		return crawlersLocalData;
	}

	protected static void sleep(int seconds) {
		try {
			Thread.sleep(seconds * 1000);
		} catch (Exception ignored) {
			// Do nothing
		}
	}

	/**
	 * Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
	 * to extract new URLs in it and follow them for crawling.
	 * 
	 * @param pageUrl
	 *            the URL of the seed
	 */
	public void addSeed(String pageUrl) {
		addSeed(pageUrl, -1);
	}

	/**
	 * Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
	 * to extract new URLs in it and follow them for crawling. You can also
	 * specify a specific document id to be assigned to this seed URL. This
	 * document id needs to be unique. Also, note that if you add three seeds
	 * with document ids 1,2, and 7. Then the next URL that is found during the
	 * crawl will get a doc id of 8. Also you need to ensure to add seeds in
	 * increasing order of document ids.
	 * 
	 * Specifying doc ids is mainly useful when you have had a previous crawl
	 * and have stored the results and want to start a new crawl with seeds
	 * which get the same document ids as the previous crawl.
	 * 
	 * @param pageUrl
	 *            the URL of the seed
	 * @param docId
	 *            the document id that you want to be assigned to this seed URL.
	 * 
	 */
	public void addSeed(String pageUrl, int docId) {
		String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
		if (canonicalUrl == null) {
			logger.error("Invalid seed URL: " + pageUrl);
			return;
		}
		if (docId < 0) {
			docId = docIdServer.getDocId(canonicalUrl);
			if (docId > 0) {
				// This URL is already seen.
				return;
			}
			docId = docIdServer.getNewDocID(canonicalUrl);
		} else {
			try {
				docIdServer.addUrlAndDocId(canonicalUrl, docId);
			} catch (Exception e) {
				logger.error("Could not add seed: " + e.getMessage());
			}
		}

		WebURL webUrl = new WebURL();
		webUrl.setURL(canonicalUrl);
		webUrl.setDocid(docId);
		webUrl.setDepth((short) 0);
		if (!robotstxtServer.allows(webUrl)) {
			logger.info("Robots.txt does not allow this seed: " + pageUrl);
		} else {
			frontier.schedule(webUrl);
		}
	}

	/**
	 * This function can called to assign a specific document id to a url. This
	 * feature is useful when you have had a previous crawl and have stored the
	 * Urls and their associated document ids and want to have a new crawl which
	 * is aware of the previously seen Urls and won't re-crawl them.
	 * 
	 * Note that if you add three seen Urls with document ids 1,2, and 7. Then
	 * the next URL that is found during the crawl will get a doc id of 8. Also
	 * you need to ensure to add seen Urls in increasing order of document ids. 
	 * 
	 * @param url
	 *            the URL of the page
	 * @param docId
	 *            the document id that you want to be assigned to this URL.
	 * 
	 */
	public void addSeenUrl(String url, int docId) {
		String canonicalUrl = URLCanonicalizer.getCanonicalURL(url);
		if (canonicalUrl == null) {
			logger.error("Invalid Url: " + url);
			return;
		}
		try {
			docIdServer.addUrlAndDocId(canonicalUrl, docId);
		} catch (Exception e) {
			logger.error("Could not add seen url: " + e.getMessage());
		}
	}

	public PageFetcher getPageFetcher() {
		return pageFetcher;
	}

	public void setPageFetcher(PageFetcher pageFetcher) {
		this.pageFetcher = pageFetcher;
	}

	public RobotstxtServer getRobotstxtServer() {
		return robotstxtServer;
	}

	public void setRobotstxtServer(RobotstxtServer robotstxtServer) {
		this.robotstxtServer = robotstxtServer;
	}

	public Frontier getFrontier() {
		return frontier;
	}

	public void setFrontier(Frontier frontier) {
		this.frontier = frontier;
	}

	public DocIDServer getDocIdServer() {
		return docIdServer;
	}

	public void setDocIdServer(DocIDServer docIdServer) {
		this.docIdServer = docIdServer;
	}

	public Object getCustomData() {
		return customData;
	}

	public void setCustomData(Object customData) {
		this.customData = customData;
	}

	public boolean isFinished() {
		return this.finished;
	}

	public boolean isShuttingDown() {
		return shuttingDown;
	}

	/**
	 * Set the current crawling session set to 'shutdown'. Crawler threads
	 * monitor the shutdown flag and when it is set to true, they will no longer
	 * process new pages.
	 */
	public void shutdown() {
		logger.info("Shutting down...");
		this.shuttingDown = true;
		frontier.finish();
	}
}