/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.frontier;
import org.apache.log4j.Logger;
import com.sleepycat.je.*;
import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.util.Util;
/**
 * Maps URLs to integer document ids and stores the mapping in a
 * {@code com.sleepycat.je} {@link Database} named "DocIDs".
 *
 * <p>{@link #getDocId(String)}, {@link #getNewDocID(String)} and
 * {@link #addUrlAndDocId(String, int)} synchronize on {@link #mutex};
 * {@link #getDocCount()}, {@link #sync()} and {@link #close()} do not.
 *
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class DocIDServer extends Configurable {

    protected static final Logger logger = Logger.getLogger(DocIDServer.class.getName());

    /** Database holding the URL -> doc id records. */
    protected Database docIDsDB = null;

    /** Lock guarding lookups and id assignment. */
    protected final Object mutex = new Object();

    /** The most recently assigned doc id; 0 when nothing has been assigned yet. */
    protected int lastDocID;

    /**
     * Opens (creating it if necessary) the "DocIDs" database in the given environment.
     *
     * <p>When resumable crawling is enabled the database is opened as transactional;
     * otherwise it is opened in deferred-write mode. On a resumable crawl the id
     * counter is seeded with the number of records already stored.
     *
     * @param env    the environment in which the database is opened.
     * @param config the crawl configuration.
     * @throws DatabaseException if the database cannot be opened.
     */
    public DocIDServer(Environment env, CrawlConfig config) throws DatabaseException {
        super(config);
        boolean resumable = config.isResumableCrawling();

        DatabaseConfig dbConfig = new DatabaseConfig();
        dbConfig.setAllowCreate(true);
        dbConfig.setTransactional(resumable);
        dbConfig.setDeferredWrite(!resumable);
        docIDsDB = env.openDatabase(null, "DocIDs", dbConfig);

        lastDocID = 0;
        if (resumable) {
            // NOTE(review): the record count equals the highest stored id only if ids
            // were assigned contiguously starting at 1 -- confirm for databases that
            // were populated through addUrlAndDocId.
            int docCount = getDocCount();
            if (docCount > 0) {
                logger.info("Loaded " + docCount + " URLs that had been detected in previous crawl.");
                lastDocID = docCount;
            }
        }
    }

    /**
     * Builds the database key for a URL.
     *
     * <p>The bytes come from {@link String#getBytes()}, i.e. the platform default
     * charset. This is kept as is so that keys match records written by earlier
     * runs; it does mean that non-ASCII URLs only match across runs that share the
     * same default charset.
     */
    private static DatabaseEntry toKey(String url) {
        return new DatabaseEntry(url.getBytes());
    }

    /**
     * Returns the docid of an already seen url.
     *
     * @param url the URL for which the docid is returned.
     * @return the docid of the url if it is seen before. Otherwise -1 is returned.
     *         -1 is also returned when the database is not available or the
     *         lookup fails (the failure is logged).
     */
    public int getDocId(String url) {
        synchronized (mutex) {
            if (docIDsDB == null) {
                return -1;
            }
            try {
                DatabaseEntry value = new DatabaseEntry();
                OperationStatus result = docIDsDB.get(null, toKey(url), value, null);
                if (result == OperationStatus.SUCCESS && value.getData().length > 0) {
                    return Util.byteArray2Int(value.getData());
                }
            } catch (Exception e) {
                logger.error("Exception thrown while getting the doc id of URL: " + url, e);
            }
            return -1;
        }
    }

    /**
     * Returns the doc id of the given URL, assigning the next free id if the URL
     * has no id yet.
     *
     * @param url the URL that needs a doc id.
     * @return the existing id if one is stored, otherwise the newly assigned id;
     *         -1 if the database is not available or the new record could not be
     *         stored (the failure is logged).
     */
    public int getNewDocID(String url) {
        synchronized (mutex) {
            if (docIDsDB == null) {
                return -1;
            }
            try {
                // Make sure that we have not already assigned a docid for this URL
                int docid = getDocId(url);
                if (docid > 0) {
                    return docid;
                }
                // Advance the counter only after the record is stored, so a failed
                // write does not burn an id and leave a gap in the sequence.
                int newDocId = lastDocID + 1;
                docIDsDB.put(null, toKey(url), new DatabaseEntry(Util.int2ByteArray(newDocId)));
                lastDocID = newDocId;
                return newDocId;
            } catch (Exception e) {
                logger.error("Exception thrown while assigning a new doc id to URL: " + url, e);
            }
            return -1;
        }
    }

    /**
     * Registers a URL under a caller-chosen doc id.
     *
     * <p>If the URL is already stored with exactly this id the call is a no-op.
     *
     * @param url   the URL to register.
     * @param docId the id to assign; must be larger than every id handed out so far.
     * @throws Exception if the URL already has a different id, if {@code docId} is
     *                   not larger than the last assigned id, or if storing the
     *                   record fails.
     */
    public void addUrlAndDocId(String url, int docId) throws Exception {
        synchronized (mutex) {
            // Look for an existing mapping first: checking docId against lastDocID
            // before this would reject the repeated (url, docId) pair that the
            // branch below is meant to accept quietly.
            int prevDocid = getDocId(url);
            if (prevDocid > 0) {
                if (prevDocid == docId) {
                    return;
                }
                throw new Exception("Doc id: " + prevDocid + " is already assigned to URL: " + url);
            }

            if (docId <= lastDocID) {
                throw new Exception("Requested doc id: " + docId + " is not larger than: " + lastDocID);
            }

            docIDsDB.put(null, toKey(url), new DatabaseEntry(Util.int2ByteArray(docId)));
            lastDocID = docId;
        }
    }

    /**
     * Tells whether a doc id is stored for the given URL.
     *
     * @param url the URL to check.
     * @return {@code true} if {@link #getDocId(String)} returns anything other than -1.
     */
    public boolean isSeenBefore(String url) {
        return getDocId(url) != -1;
    }

    /**
     * Returns the number of records in the doc id database.
     *
     * @return the record count narrowed to {@code int}, or -1 if counting fails
     *         (the failure is logged).
     */
    public int getDocCount() {
        try {
            return (int) docIDsDB.count();
        } catch (DatabaseException e) {
            logger.error("Exception thrown while counting the stored doc ids", e);
        }
        return -1;
    }

    /**
     * Calls {@code sync()} on the database.
     *
     * <p>Does nothing when resumable crawling is enabled (the database is opened in
     * deferred-write mode only when it is disabled) or when there is no database.
     * Failures are logged, not propagated.
     */
    public void sync() {
        if (config.isResumableCrawling()) {
            return;
        }
        if (docIDsDB == null) {
            return;
        }
        try {
            docIDsDB.sync();
        } catch (DatabaseException e) {
            logger.error("Exception thrown while syncing the doc id database", e);
        }
    }

    /**
     * Closes the database. Does nothing when there is no database; failures are
     * logged, not propagated.
     */
    public void close() {
        if (docIDsDB == null) {
            return;
        }
        try {
            docIDsDB.close();
        } catch (DatabaseException e) {
            logger.error("Exception thrown while closing the doc id database", e);
        }
    }
}