/**
 *
 */
package ecologylab.bigsemantics.collecting;

import java.io.File;
import java.util.Collection;
import java.util.HashSet;

import ecologylab.bigsemantics.seeding.Seed;
import ecologylab.bigsemantics.seeding.SeedDistributor;
import ecologylab.bigsemantics.seeding.SeedPeer;
import ecologylab.bigsemantics.seeding.SeedSet;
import ecologylab.bigsemantics.seeding.SemanticsPrefs;
import ecologylab.collections.PrefixCollection;
import ecologylab.collections.PrefixPhrase;
import ecologylab.generic.Debug;
import ecologylab.generic.StringTools;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.ElementState;

/**
 * All state related to Seeding: the seed set itself, traversable/untraversable
 * URL-prefix bounds for crawling, rejected domains, and the begin/end seeding
 * handshake that pauses the crawler while seeds download.
 *
 * @author andruid
 */
public class Seeding extends ElementState implements SemanticsPrefs
{
  private SemanticsSessionScope semanticsSessionScope;

  private Crawler               crawler;

  /**
   * No-arg constructor; the session scope must be supplied later via
   * {@link #setSemanticsSessionScope(SemanticsSessionScope)}.
   */
  public Seeding()
  {
  }

  public Seeding(SemanticsSessionScope semanticsSessionScope)
  {
    setSemanticsSessionScope(semanticsSessionScope);
  }

  /**
   * Bind this Seeding to its session scope, and cache the scope's crawler
   * (may be null) for use in beginSeeding()/endSeeding().
   */
  public void setSemanticsSessionScope(SemanticsSessionScope semanticsSessionScope)
  {
    this.semanticsSessionScope = semanticsSessionScope;
    this.crawler = semanticsSessionScope.getCrawler();
  }

  ////////////////////////////////////////// seeding stuff ////////////////////////////////////////

  /**
   * Bias for ads: if filter matches a url to an ad (for HTMLPages or MediaElements), multiply bias
   * by this number. Special values: 0 reject ads (eg, filter matches), altogether
   */
  protected float             adsBias;

  /**
   * Hashtable of domains the information space author doesn't want any information elements from.
   */
  protected HashSet<String>   rejectDomains          = new HashSet<String>();

  /**
   * A count of seeds whose downloading failed. Used during startup, to determine whether it is time
   * to start the crawler. We make sure to download all seeds before giving resources to the
   * crawler. This makes startup fairer; otherwise, the first recorded seeds get too much priority.
   */
  protected int               badSeeds;

  protected boolean           playOnStart;

  /**
   * Address prefixes derived from original seeding, traversable| seeding specs, and server-side
   * redirects. Defines the spanning set of basis vectors of the information space, when limit
   * traversal (stay close) is true.
   */
  protected PrefixCollection  traversablePrefixes    = new PrefixCollection();

  /**
   * Untraversable urls defined at seeding time. These supercede traversable specs.
   */
  protected PrefixCollection  untraversablePrefixes  = new PrefixCollection();

  private SeedSet             seedSet;

  // NOTE(review): this declaration appeared commented out in the original, yet
  // beginSeeding()/endSeeding() synchronize on it -- restored so the class compiles.
  static final Object         SEEDING_STATE_LOCK     = new Object();

  // Guarded by SEEDING_STATE_LOCK: true while seeds are downloading and the crawler is paused.
  protected boolean           duringSeeding;

  public boolean              heterogeneousSearchScenario = true;

  // When true, accept() short-circuits and admits every URL.
  boolean                     acceptAll;

  public void setHeterogeneousSearchScenario(boolean b)
  {
    this.heterogeneousSearchScenario = b;
  }

  // ++++++++++++++++++++++++++++++++++++++++ //

  /**
   * Add the directory that this URL references to the traversable set; that is, to the bounding set
   * of path prefixes that we are willing to download from, given "limit traversal." This is called
   * automatically, as well as through traversable|; thus it parses to the directory level, removing
   * any filename portion of the URL.
   */
  public void traversable(ParsedURL purl)
  {
    traversable(purl, false);
  }

  /**
   * Add the purl's directory prefix to the traversable set, unless it is rejected
   * (see {@link #isNotReject(ParsedURL)}) or an equivalent prefix is already present.
   *
   * @param purl          the URL whose directory prefix becomes traversable
   * @param ignoreReject  if true, skip the reject-domain / unsupported-protocol check
   */
  public void traversable(ParsedURL purl, boolean ignoreReject)
  {
    // If this purl already matches traversablePrefixes, don't add it again.
    if ((ignoreReject || this.isNotReject(purl)) && !traversablePrefixes.match(purl))
      recordPrefix(traversablePrefixes, purl, "-- allow downloads that start with ");
  }

  /**
   * Add purl's prefix to the given collection and print a message showing the recorded prefix.
   */
  private void recordPrefix(PrefixCollection prefixCollection, ParsedURL purl, String message)
  {
    PrefixPhrase prefixPhrase = prefixCollection.add(purl);
    StringBuilder buffy = new StringBuilder(message);
    // Use the separator of the collection actually being recorded into
    // (the original always used traversablePrefixes.separator(), even for untraversable).
    prefixPhrase.toStringBuilder(buffy, prefixCollection.separator());
    buffy.append(" --");
    Debug.println(buffy);
  }

  /**
   * @return true unless the purl's domain is null, is in rejectDomains, or the purl's
   *         protocol is unsupported. Logs a warning when rejecting.
   */
  public boolean isNotReject(ParsedURL purl)
  {
    String domain = purl.domain();
    boolean result = domain != null;
    if (result)
    {
      result = !rejectDomains.contains(domain);
    }
    if (result)
    {
      result = !purl.isUnsupported();
    }
    if (!result)
      warning("Rejecting navigation to " + purl);
    return result;
  }

  /**
   * Define the directory of the purl as a prefix that will not be crawled to, if limit_traveral is
   * on.
   */
  public void untraversable(ParsedURL purl)
  {
    recordPrefix(untraversablePrefixes, purl, "-- refuse downloads that start with ");
  }

  /**
   * Enter seeding mode (idempotent): pause regular downloads and the crawler, if any,
   * so that seed downloads get all resources during startup.
   */
  public void beginSeeding()
  {
    synchronized (SEEDING_STATE_LOCK)
    {
      if (!duringSeeding)
      {
        debug("beginSeeding() pause crawler");
        duringSeeding = true;
        semanticsSessionScope.getDownloadMonitors().pauseRegular(true);
        if (crawler != null)
          crawler.pause();
      }
    }
  }

  /**
   * Called when seeding is complete. Unpauses regular downloads and (re)starts the crawler.
   */
  public void endSeeding()
  {
    synchronized (SEEDING_STATE_LOCK)
    {
      if (duringSeeding)
      {
        duringSeeding = false;
        debug("endSeeding() unpause crawler");
        semanticsSessionScope.getDownloadMonitors().pauseRegular(false);
        if (crawler != null)
          crawler.start(); // start thread *or* unpause()
      }
    }
  }

  public Collection<String> traversableURLStrings()
  {
    return traversablePrefixes.values();
  }

  public Collection<String> untraversableURLStrings()
  {
    return untraversablePrefixes.values();
  }

  public Collection<String> rejectDomainsCollection()
  {
    return rejectDomains;
  }

  /**
   * @return the number of seeds, or -1 if no seed set has been created yet.
   */
  public int numSeeds()
  {
    return (seedSet == null) ? -1 : seedSet.size();
  }

  // Accessors for InfoCollectorState //
  ///////////////////////////////////////////////////////
  public float adsBias()
  {
    return adsBias;
  }

  /**
   * Are we willing to accept this url into one of the agent's candidate pools?
   *
   * @param purl the candidate URL (must be non-null unless acceptAll is set)
   * @return true if acceptAll is set; otherwise true only when the url is not https,
   *         is not rejected (domain/protocol), does not match an untraversable prefix,
   *         and -- when LIMIT_TRAVERSAL is on -- matches a traversable prefix.
   */
  public boolean accept(ParsedURL purl)
  {
    if (acceptAll)
      return true;
    // BUGFIX: URL.getProtocol() returns the bare scheme ("https"), never "https://",
    // so the original comparison could never match and https urls were always admitted.
    boolean result = !"https".equals(purl.url().getProtocol());
    if (result)
    {
      result = isNotReject(purl);
      if (result)
      {
        result = !untraversablePrefixes.match(purl);
        if (result && LIMIT_TRAVERSAL.value())
        {
          result = traversablePrefixes.match(purl);
        }
      }
    }
    return result;
  }

  public void trackFirstSeedSet(SeedSet seedSet)
  {
    // TODO Auto-generated method stub
  }

  public void setPlayOnStart(boolean b)
  {
    this.playOnStart = b;
  }

  public void clear()
  {
    // TODO Auto-generated method stub
  }

  public void setCurrentFileFromUntitled(File file)
  {
    // TODO Auto-generated method stub
  }

  public SeedPeer constructSeedPeer(Seed seed)
  {
    return null;
    // return DASHBOARD_ENABLED ? new SeedPeerDashboardOperand(seed, this) : null;
  }

  /**
   * Reject all future navigation to the domain of the given site address.
   * No-op when the address or its extracted domain is null.
   */
  public void reject(String siteAddr)
  {
    if (siteAddr != null)
    {
      String domain = StringTools.domain(siteAddr);
      if (domain != null)
      {
        rejectDomains.add(domain);
        Debug.println("-- rejecting all web addresses from domain " + domain + " --");
      }
    }
  }

  /**
   * Lazily create and return the seed set.
   */
  public SeedSet getSeedSet()
  {
    SeedSet result = this.seedSet;
    if (result == null)
    {
      result = new SeedSet();
      this.seedSet = result;
    }
    return result;
  }

  public void clearSeedSet()
  {
    if (seedSet != null)
      seedSet.clear();
  }

  /**
   * Merge newSeeds into this Seeding: adopt the set wholesale when none exists yet,
   * otherwise add each seed individually through the session scope.
   */
  public void addSeeds(SeedSet<? extends Seed> newSeeds)
  {
    if (this.seedSet == null)
      this.seedSet = newSeeds;
    else
    {
      if (!newSeeds.isEmpty())
      {
        for (Seed seed : newSeeds)
        {
          this.seedSet.add(seed, semanticsSessionScope);
        }
      }
    }
  }

  public void getMoreSeedResults()
  {
    if (seedSet != null)
    {
      System.out.println(this + ".getMoreSeedResults()!!! " + seedSet.getStartingResultNum());
      seedSet.performNextSeeding(semanticsSessionScope);
    }
  }

  /**
   * @return the seed distributor for this session's seed set. Uses the lazy accessor,
   *         so this no longer NPEs when called before any seeds were added.
   */
  public SeedDistributor getSeedDistributor()
  {
    return getSeedSet().seedDistributer(semanticsSessionScope);
  }

  /**
   * @return the duringSeeding
   */
  public boolean isDuringSeeding()
  {
    return duringSeeding;
  }

  /**
   * @return the heterogeneousSearchScenario
   */
  public boolean isHeterogeneousSearchScenario()
  {
    return heterogeneousSearchScenario;
  }

  /**
   * @return the playOnStart
   */
  public boolean isPlayOnStart()
  {
    return playOnStart;
  }

  ////////////////////////////////////////// end seeding stuff ////////////////////////////////////
  /////////////////////////////////////////////////////////////////////////////////////////////////
}