/*
############################################################################
##
## Copyright (C) 2006-2009 University of Utah. All rights reserved.
##
## This file is part of DeepPeep.
##
## This file may be used under the terms of the GNU General Public
## License version 2.0 as published by the Free Software Foundation
## and appearing in the file LICENSE.GPL included in the packaging of
## this file. Please review the following to ensure GNU General Public
## Licensing requirements will be met:
## http://www.opensource.org/licenses/gpl-license.php
##
## If you are unsure which license is appropriate for your use (for
## instance, you are interested in developing a commercial derivative
## of DeepPeep), please contact us at deeppeep@sci.utah.edu.
##
## This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
## WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
##
############################################################################
*/
package focusedCrawler.link;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.StringTokenizer;
import java.util.Vector;
import focusedCrawler.link.backlink.BacklinkSurfer;
import focusedCrawler.link.classifier.LinkClassifier;
import focusedCrawler.link.classifier.LinkClassifierException;
import focusedCrawler.link.frontier.FrontierManager;
import focusedCrawler.link.frontier.FrontierPersistentException;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.target.model.Page;
import focusedCrawler.util.parser.BackLinkNeighborhood;
import focusedCrawler.util.parser.LinkNeighborhood;
/**
* This class is responsible to manage the info in the graph (backlinks and outlinks).
* @author lbarbosa
*
*/
public class BipartiteGraphManager {
private FrontierManager frontierManager;
private BacklinkSurfer surfer;
private LinkClassifier backlinkClassifier;
private LinkClassifier outlinkClassifier;
private BipartiteGraphRepository graphRepository;
// Data structure for stop conditions //////////////////////////
private int maxPagesPerDomain = 100; // Maximum number of pages per each domain
private HashMap<String, Integer> domainCounter;// Count number of pages for each domain
///////////////////////////////////////////////////////////////
public BipartiteGraphManager(FrontierManager frontierManager,
BipartiteGraphRepository graphRepository,
LinkClassifier outlinkClassifier,
int maxPagesPerDomain,
BacklinkSurfer surfer,
LinkClassifier backlinkClassifier) {
this.frontierManager = frontierManager;
this.graphRepository = graphRepository;
this.outlinkClassifier = outlinkClassifier;
this.backlinkClassifier = backlinkClassifier;
this.domainCounter = new HashMap<String, Integer>();
this.maxPagesPerDomain = maxPagesPerDomain;
this.surfer = surfer;
}
public void setBacklinkClassifier(LinkClassifier classifier){
this.backlinkClassifier = classifier;
}
public void setOutlinkClassifier(LinkClassifier classifier){
this.outlinkClassifier = classifier;
}
public BipartiteGraphRepository getRepository(){
return this.graphRepository;
}
public void insertOutlinks(Page page) throws IOException, FrontierPersistentException, LinkClassifierException {
LinkRelevance[] linksRelevance = outlinkClassifier.classify(page);
ArrayList<LinkRelevance> temp = new ArrayList<LinkRelevance>();
HashSet<String> relevantURLs = new HashSet<String>();
for (int i = 0; i < linksRelevance.length; i++) {
//System.out.println("linksRelevance.length "+linksRelevance.length);
if (frontierManager.isRelevant(linksRelevance[i])) {
String url = linksRelevance[i].getURL().toString();
//System.out.println(url);
if (!relevantURLs.contains(url)) {
String domain = linksRelevance[i].getTopLevelDomainName();
Integer domainCount;
synchronized (domainCounter) {
domainCount = domainCounter.get(domain);
if (domainCount == null) {
domainCount = 0;
} else {
domainCount++;
}
domainCounter.put(domain, domainCount);
}
if (domainCount < maxPagesPerDomain) { // Stop Condition
relevantURLs.add(url);
temp.add(linksRelevance[i]);
}
}
}
}
LinkRelevance[] filteredLinksRelevance = temp.toArray(new LinkRelevance[relevantURLs.size()]);
LinkNeighborhood[] lns = page.getParsedData().getLinkNeighborhood();
for (int i = 0; i < lns.length; i++) {
if (!relevantURLs.contains(lns[i].getLink().toString())) {
lns[i] = null;
}
}
graphRepository.insertOutlinks(page.getURL(), lns);
frontierManager.insert(filteredLinksRelevance);
}
public void insertBacklinks(Page page) throws IOException, FrontierPersistentException, LinkClassifierException{
URL url = page.getURL();
BackLinkNeighborhood[] links = graphRepository.getBacklinks(url);
if(links == null || (links != null && links.length < 10)){
links = surfer.getLNBacklinks(url);
}
if(links != null && links.length > 0){
LinkRelevance[] linksRelevance = new LinkRelevance[links.length];
for (int i = 0; i < links.length; i++){
BackLinkNeighborhood backlink = links[i];
if(backlink != null){
LinkNeighborhood ln = new LinkNeighborhood(new URL(backlink.getLink()));
String title = backlink.getTitle();
if(title != null){
ln.setAround(tokenizeText(title));
}
linksRelevance[i] = backlinkClassifier.classify(ln);
}
}
frontierManager.insert(linksRelevance);
}
URL normalizedURL = new URL(url.getProtocol(), url.getHost(), "/");
graphRepository.insertBacklinks(normalizedURL, links);
}
private String[] tokenizeText(String text) {
StringTokenizer tokenizer = new StringTokenizer(text," ");
Vector<String> anchorTemp = new Vector<String>();
while(tokenizer.hasMoreTokens()){
anchorTemp.add(tokenizer.nextToken());
}
String[] aroundArray = new String[anchorTemp.size()];
anchorTemp.toArray(aroundArray);
return aroundArray;
}
}