/*
############################################################################
##
## Copyright (C) 2006-2009 University of Utah. All rights reserved.
##
## This file is part of DeepPeep.
##
## This file may be used under the terms of the GNU General Public
## License version 2.0 as published by the Free Software Foundation
## and appearing in the file LICENSE.GPL included in the packaging of
## this file. Please review the following to ensure GNU General Public
## Licensing requirements will be met:
## http://www.opensource.org/licenses/gpl-license.php
##
## If you are unsure which license is appropriate for your use (for
## instance, you are interested in developing a commercial derivative
## of DeepPeep), please contact us at deeppeep@sci.utah.edu.
##
## This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
## WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
##
############################################################################
*/
package focusedCrawler.link.frontier;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import focusedCrawler.util.persistence.PersistentHashtable;
import focusedCrawler.util.persistence.Tuple;
import focusedCrawler.util.persistence.TupleIterator;
public class Frontier {
protected PersistentHashtable<LinkRelevance> urlRelevance;
protected Map<String, Integer> scope = null;
private boolean useScope = false;
public Frontier(String directory, int maxCacheUrlsSize, Map<String, Integer> scope) {
this.urlRelevance = new PersistentHashtable<>(directory, maxCacheUrlsSize, LinkRelevance.class);
if (scope == null) {
this.useScope = false;
this.scope = new HashMap<String, Integer>();
} else {
this.scope = scope;
this.useScope = true;
}
}
public Frontier(String directory, int maxCacheUrlsSize) {
this(directory, maxCacheUrlsSize, null);
}
public void commit() {
urlRelevance.commit();
}
/**
*
* @return
* @throws Exception
*/
public HashSet<String> visitedAuths() throws Exception {
HashSet<String> result = new HashSet<String>();
List<Tuple<LinkRelevance>> tuples = urlRelevance.getTable();
for (Tuple<LinkRelevance> tuple : tuples) {
double value = tuple.getValue().getRelevance();
if (value < -200) {
result.add(URLDecoder.decode(tuple.getKey(), "UTF-8"));
}
}
return result;
}
public HashSet<String> visitedLinks() throws Exception {
HashSet<String> result = new HashSet<String>();
List<Tuple<LinkRelevance>> tuples = urlRelevance.getTable();
for (Tuple<LinkRelevance> tuple : tuples) {
double value = tuple.getValue().getRelevance();
if (value < 0) {
result.add(URLDecoder.decode(tuple.getKey(), "UTF-8"));
}
}
System.out.println(result.size()+" out of "+urlRelevance.getTable().size());
return result;
}
public HashSet<String> unvisitedAuths() throws Exception {
HashSet<String> result = new HashSet<String>();
List<Tuple<LinkRelevance>> tuples = urlRelevance.getTable();
for (Tuple<LinkRelevance> tuple : tuples) {
double value = tuple.getValue().getRelevance();
if (value > 200) {
result.add(URLDecoder.decode(tuple.getKey(), "UTF-8"));
}
}
return result;
}
public HashSet<String> visitedHubs() throws Exception {
HashSet<String> result = new HashSet<String>();
List<Tuple<LinkRelevance>> tuples = urlRelevance.getTable();
for (Tuple<LinkRelevance> tuple : tuples) {
double value = tuple.getValue().getRelevance();
if (value > -200 && value < -100) {
result.add(URLDecoder.decode(tuple.getKey(), "UTF-8"));
}
}
return result;
}
public HashSet<String> unvisitedHubs() throws Exception {
HashSet<String> result = new HashSet<String>();
List<Tuple<LinkRelevance>> tuples = urlRelevance.getTable();
for (Tuple<LinkRelevance> tuple : tuples) {
double value = tuple.getValue().getRelevance();
if (value > 100 && value < 200) {
result.add(URLDecoder.decode(tuple.getKey(), "UTF-8"));
}
}
return result;
}
public void update(LinkRelevance linkRelevance) {
String url = linkRelevance.getURL().toString();
LinkRelevance link = urlRelevance.get(url);
if (link != null) {
if (link.getRelevance() > 0) { // not visited url
urlRelevance.put(url, linkRelevance);
}
}
}
/**
* This method inserts a new link into the frontier
*
* @param linkRelev
* @return
* @throws FrontierPersistentException
*/
public boolean insert(LinkRelevance linkRelev) throws FrontierPersistentException {
boolean inserted = false;
String url = linkRelev.getURL().toString();
Integer rel = exist(linkRelev);
if (rel == null && url.toString().length() < 210) {
urlRelevance.put(url, linkRelev);
inserted = true;
}
return inserted;
}
/**
* It verifies whether a given URL was already visited or does not belong to
* the scope.
*
* @param linkRelev
* @return
* @throws FrontierPersistentException
*/
public Integer exist(LinkRelevance linkRelev) throws FrontierPersistentException {
String url = linkRelev.getURL().toString();
LinkRelevance resStr = urlRelevance.get(url);
if (resStr != null) {
return (int) resStr.getRelevance();
} else {
Integer result = new Integer(-1);
if (useScope == true) {
String host = linkRelev.getURL().getHost();
if (scope.get(host) != null) {
result = null;
}
} else {
result = null;
}
return result;
}
}
/**
* It deletes a URL from frontier (marks as visited).
*
* @param linkRelevance
* @throws FrontierPersistentException
*/
public void delete(LinkRelevance linkRelevance) throws FrontierPersistentException {
String url = linkRelevance.getURL().toString();
if (exist(linkRelevance) != null) {
// we don't want to delete the URL file, it is useful to avoid visiting an old url
double relevance = linkRelevance.getRelevance();
urlRelevance.put(url, new LinkRelevance(linkRelevance.getURL(), -relevance, linkRelevance.getType()));
}
}
public void close() {
urlRelevance.close();
}
public PersistentHashtable<LinkRelevance> getUrlRelevanceHashtable() {
return urlRelevance;
}
public Map<String, Integer> getScope() {
return scope;
}
public TupleIterator<LinkRelevance> iterator() {
return urlRelevance.iterator();
}
}