/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.frontier;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.frontier.Counters.ReservedCounterNames;
import edu.uci.ics.crawler4j.url.WebURL;
import org.apache.log4j.Logger;
import java.util.List;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class Frontier extends Configurable {
protected static final Logger logger = Logger.getLogger(Frontier.class.getName());
protected WorkQueues workQueues;
protected InProcessPagesDB inProcessPages;
protected final Object mutex = new Object();
protected final Object waitingList = new Object();
protected boolean isFinished = false;
protected long scheduledPages;
protected DocIDServer docIdServer;
protected Counters counters;
public Frontier(Environment env, CrawlConfig config, DocIDServer docIdServer) {
super(config);
this.counters = new Counters(env, config);
this.docIdServer = docIdServer;
try {
workQueues = new WorkQueues(env, "PendingURLsDB", config.isResumableCrawling());
if (config.isResumableCrawling()) {
scheduledPages = counters.getValue(ReservedCounterNames.SCHEDULED_PAGES);
inProcessPages = new InProcessPagesDB(env);
long numPreviouslyInProcessPages = inProcessPages.getLength();
if (numPreviouslyInProcessPages > 0) {
logger.info("Rescheduling " + numPreviouslyInProcessPages + " URLs from previous crawl.");
scheduledPages -= numPreviouslyInProcessPages;
while (true) {
List<WebURL> urls = inProcessPages.get(100);
if (urls.size() == 0) {
break;
}
scheduleAll(urls);
inProcessPages.delete(urls.size());
}
}
} else {
inProcessPages = null;
scheduledPages = 0;
}
} catch (DatabaseException e) {
logger.error("Error while initializing the Frontier: " + e.getMessage());
workQueues = null;
}
}
public void scheduleAll(List<WebURL> urls) {
int maxPagesToFetch = config.getMaxPagesToFetch();
synchronized (mutex) {
int newScheduledPage = 0;
for (WebURL url : urls) {
if (maxPagesToFetch > 0 && (scheduledPages + newScheduledPage) >= maxPagesToFetch) {
break;
}
try {
workQueues.put(url);
newScheduledPage++;
} catch (DatabaseException e) {
logger.error("Error while puting the url in the work queue.");
}
}
if (newScheduledPage > 0) {
scheduledPages += newScheduledPage;
counters.increment(Counters.ReservedCounterNames.SCHEDULED_PAGES, newScheduledPage);
}
synchronized (waitingList) {
waitingList.notifyAll();
}
}
}
public void schedule(WebURL url) {
int maxPagesToFetch = config.getMaxPagesToFetch();
synchronized (mutex) {
try {
if (maxPagesToFetch < 0 || scheduledPages < maxPagesToFetch) {
workQueues.put(url);
scheduledPages++;
counters.increment(Counters.ReservedCounterNames.SCHEDULED_PAGES);
}
} catch (DatabaseException e) {
logger.error("Error while puting the url in the work queue.");
}
}
}
public void getNextURLs(int max, List<WebURL> result) {
while (true) {
synchronized (mutex) {
if (isFinished) {
return;
}
try {
List<WebURL> curResults = workQueues.get(max);
workQueues.delete(curResults.size());
if (inProcessPages != null) {
for (WebURL curPage : curResults) {
inProcessPages.put(curPage);
}
}
result.addAll(curResults);
} catch (DatabaseException e) {
logger.error("Error while getting next urls: " + e.getMessage());
e.printStackTrace();
}
if (result.size() > 0) {
return;
}
}
try {
synchronized (waitingList) {
waitingList.wait();
}
} catch (InterruptedException ignored) {
// Do nothing
}
if (isFinished) {
return;
}
}
}
public void setProcessed(WebURL webURL) {
counters.increment(ReservedCounterNames.PROCESSED_PAGES);
if (inProcessPages != null) {
if (!inProcessPages.removeURL(webURL)) {
logger.warn("Could not remove: " + webURL.getURL() + " from list of processed pages.");
}
}
}
public long getQueueLength() {
return workQueues.getLength();
}
public long getNumberOfAssignedPages() {
return inProcessPages.getLength();
}
public long getNumberOfProcessedPages() {
return counters.getValue(ReservedCounterNames.PROCESSED_PAGES);
}
public void sync() {
workQueues.sync();
docIdServer.sync();
counters.sync();
}
public boolean isFinished() {
return isFinished;
}
public void close() {
sync();
workQueues.close();
counters.close();
}
public void finish() {
isFinished = true;
synchronized (waitingList) {
waitingList.notifyAll();
}
}
}