/**
 *
 */
package ecologylab.bigsemantics.collecting;

import java.io.File;
import java.util.Collection;
import java.util.HashSet;

import ecologylab.bigsemantics.seeding.Seed;
import ecologylab.bigsemantics.seeding.SeedDistributor;
import ecologylab.bigsemantics.seeding.SeedPeer;
import ecologylab.bigsemantics.seeding.SeedSet;
import ecologylab.bigsemantics.seeding.SemanticsPrefs;
import ecologylab.collections.PrefixCollection;
import ecologylab.collections.PrefixPhrase;
import ecologylab.generic.Debug;
import ecologylab.generic.StringTools;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.ElementState;

/**
 * All state related to Seeding: the seed set itself, traversable/untraversable
 * URL-prefix bounds for crawling, rejected domains, and the begin/end seeding
 * handshake that pauses the crawler while seeds download.
 *
 * @author andruid
 */
public class Seeding extends ElementState implements SemanticsPrefs
{
  private SemanticsSessionScope semanticsSessionScope;

  private Crawler               crawler;

  /**
   * No-arg constructor; the session scope must be supplied later via
   * {@link #setSemanticsSessionScope(SemanticsSessionScope)}.
   */
  public Seeding()
  {
  }

  public Seeding(SemanticsSessionScope semanticsSessionScope)
  {
    setSemanticsSessionScope(semanticsSessionScope);
  }

  /**
   * Bind this Seeding to its session scope, and cache the scope's crawler
   * (may be null) for use in beginSeeding()/endSeeding().
   */
  public void setSemanticsSessionScope(SemanticsSessionScope semanticsSessionScope)
  {
    this.semanticsSessionScope = semanticsSessionScope;
    this.crawler = semanticsSessionScope.getCrawler();
  }

  ////////////////////////////////////////// seeding stuff ////////////////////////////////////////

  /**
   * Bias for ads: if filter matches a url to an ad (for HTMLPages or MediaElements), multiply bias
   * by this number. Special values: 0 reject ads (eg, filter matches), altogether
   */
  protected float             adsBias;

  /**
   * Hashtable of domains the information space author doesn't want any information elements from.
   */
  protected HashSet<String>   rejectDomains          = new HashSet<String>();

  /**
   * A count of seeds whose downloading failed. Used during startup, to determine whether it is time
   * to start the crawler. We make sure to download all seeds before giving resources to the
   * crawler. This makes startup fairer; otherwise, the first recorded seeds get too much priority.
   */
  protected int               badSeeds;

  protected boolean           playOnStart;

  /**
   * Address prefixes derived from original seeding, traversable| seeding specs, and server-side
   * redirects. Defines the spanning set of basis vectors of the information space, when limit
   * traversal (stay close) is true.
   */
  protected PrefixCollection  traversablePrefixes    = new PrefixCollection();

  /**
   * Untraversable urls defined at seeding time. These supercede traversable specs.
   */
  protected PrefixCollection  untraversablePrefixes  = new PrefixCollection();

  private SeedSet             seedSet;

  // NOTE(review): this declaration appeared commented out in the original, yet
  // beginSeeding()/endSeeding() synchronize on it -- restored so the class compiles.
  static final Object         SEEDING_STATE_LOCK     = new Object();

  // Guarded by SEEDING_STATE_LOCK: true while seeds are downloading and the crawler is paused.
  protected boolean           duringSeeding;

  public boolean              heterogeneousSearchScenario = true;

  // When true, accept() short-circuits and admits every URL.
  boolean                     acceptAll;

  public void setHeterogeneousSearchScenario(boolean b)
  {
    this.heterogeneousSearchScenario = b;
  }

  // ++++++++++++++++++++++++++++++++++++++++ //

  /**
   * Add the directory that this URL references to the traversable set; that is, to the bounding set
   * of path prefixes that we are willing to download from, given "limit traversal." This is called
   * automatically, as well as through traversable|; thus it parses to the directory level, removing
   * any filename portion of the URL.
   */
  public void traversable(ParsedURL purl)
  {
    traversable(purl, false);
  }

  /**
   * Add the purl's directory prefix to the traversable set, unless it is rejected
   * (see {@link #isNotReject(ParsedURL)}) or an equivalent prefix is already present.
   *
   * @param purl          the URL whose directory prefix becomes traversable
   * @param ignoreReject  if true, skip the reject-domain / unsupported-protocol check
   */
  public void traversable(ParsedURL purl, boolean ignoreReject)
  {
    // If this purl already matches traversablePrefixes, don't add it again.
    if ((ignoreReject || this.isNotReject(purl)) && !traversablePrefixes.match(purl))
      recordPrefix(traversablePrefixes, purl, "-- allow downloads that start with ");
  }

  /**
   * Add purl's prefix to the given collection and print a message showing the recorded prefix.
   */
  private void recordPrefix(PrefixCollection prefixCollection, ParsedURL purl, String message)
  {
    PrefixPhrase prefixPhrase = prefixCollection.add(purl);
    StringBuilder buffy = new StringBuilder(message);
    // Use the separator of the collection actually being recorded into
    // (the original always used traversablePrefixes.separator(), even for untraversable).
    prefixPhrase.toStringBuilder(buffy, prefixCollection.separator());
    buffy.append(" --");
    Debug.println(buffy);
  }

  /**
   * @return true unless the purl's domain is null, is in rejectDomains, or the purl's
   *         protocol is unsupported. Logs a warning when rejecting.
   */
  public boolean isNotReject(ParsedURL purl)
  {
    String domain = purl.domain();
    boolean result = domain != null;
    if (result)
    {
      result = !rejectDomains.contains(domain);
    }
    if (result)
    {
      result = !purl.isUnsupported();
    }
    if (!result)
      warning("Rejecting navigation to " + purl);
    return result;
  }

  /**
   * Define the directory of the purl as a prefix that will not be crawled to, if limit_traveral is
   * on.
   */
  public void untraversable(ParsedURL purl)
  {
    recordPrefix(untraversablePrefixes, purl, "-- refuse downloads that start with ");
  }

  /**
   * Enter seeding mode (idempotent): pause regular downloads and the crawler, if any,
   * so that seed downloads get all resources during startup.
   */
  public void beginSeeding()
  {
    synchronized (SEEDING_STATE_LOCK)
    {
      if (!duringSeeding)
      {
        debug("beginSeeding() pause crawler");
        duringSeeding = true;
        semanticsSessionScope.getDownloadMonitors().pauseRegular(true);
        if (crawler != null)
          crawler.pause();
      }
    }
  }

  /**
   * Called when seeding is complete. Unpauses regular downloads and (re)starts the crawler.
   */
  public void endSeeding()
  {
    synchronized (SEEDING_STATE_LOCK)
    {
      if (duringSeeding)
      {
        duringSeeding = false;
        debug("endSeeding() unpause crawler");
        semanticsSessionScope.getDownloadMonitors().pauseRegular(false);
        if (crawler != null)
          crawler.start(); // start thread *or* unpause()
      }
    }
  }

  public Collection<String> traversableURLStrings()
  {
    return traversablePrefixes.values();
  }

  public Collection<String> untraversableURLStrings()
  {
    return untraversablePrefixes.values();
  }

  public Collection<String> rejectDomainsCollection()
  {
    return rejectDomains;
  }

  /**
   * @return the number of seeds, or -1 if no seed set has been created yet.
   */
  public int numSeeds()
  {
    return (seedSet == null) ? -1 : seedSet.size();
  }

  // Accessors for InfoCollectorState //
  ///////////////////////////////////////////////////////
  public float adsBias()
  {
    return adsBias;
  }

  /**
   * Are we willing to accept this url into one of the agent's candidate pools?
   *
   * @param purl the candidate URL (must be non-null unless acceptAll is set)
   * @return true if acceptAll is set; otherwise true only when the url is not https,
   *         is not rejected (domain/protocol), does not match an untraversable prefix,
   *         and -- when LIMIT_TRAVERSAL is on -- matches a traversable prefix.
   */
  public boolean accept(ParsedURL purl)
  {
    if (acceptAll)
      return true;
    // BUGFIX: URL.getProtocol() returns the bare scheme ("https"), never "https://",
    // so the original comparison could never match and https urls were always admitted.
    boolean result = !"https".equals(purl.url().getProtocol());
    if (result)
    {
      result = isNotReject(purl);
      if (result)
      {
        result = !untraversablePrefixes.match(purl);
        if (result && LIMIT_TRAVERSAL.value())
        {
          result = traversablePrefixes.match(purl);
        }
      }
    }
    return result;
  }

  public void trackFirstSeedSet(SeedSet seedSet)
  {
    // TODO Auto-generated method stub
  }

  public void setPlayOnStart(boolean b)
  {
    this.playOnStart = b;
  }

  public void clear()
  {
    // TODO Auto-generated method stub
  }

  public void setCurrentFileFromUntitled(File file)
  {
    // TODO Auto-generated method stub
  }

  public SeedPeer constructSeedPeer(Seed seed)
  {
    return null;
    // return DASHBOARD_ENABLED ? new SeedPeerDashboardOperand(seed, this) : null;
  }

  /**
   * Reject all future navigation to the domain of the given site address.
   * No-op when the address or its extracted domain is null.
   */
  public void reject(String siteAddr)
  {
    if (siteAddr != null)
    {
      String domain = StringTools.domain(siteAddr);
      if (domain != null)
      {
        rejectDomains.add(domain);
        Debug.println("-- rejecting all web addresses from domain " + domain + " --");
      }
    }
  }

  /**
   * Lazily create and return the seed set.
   */
  public SeedSet getSeedSet()
  {
    SeedSet result = this.seedSet;
    if (result == null)
    {
      result = new SeedSet();
      this.seedSet = result;
    }
    return result;
  }

  public void clearSeedSet()
  {
    if (seedSet != null)
      seedSet.clear();
  }

  /**
   * Merge newSeeds into this Seeding: adopt the set wholesale when none exists yet,
   * otherwise add each seed individually through the session scope.
   */
  public void addSeeds(SeedSet<? extends Seed> newSeeds)
  {
    if (this.seedSet == null)
      this.seedSet = newSeeds;
    else
    {
      if (!newSeeds.isEmpty())
      {
        for (Seed seed : newSeeds)
        {
          this.seedSet.add(seed, semanticsSessionScope);
        }
      }
    }
  }

  public void getMoreSeedResults()
  {
    if (seedSet != null)
    {
      System.out.println(this + ".getMoreSeedResults()!!! " + seedSet.getStartingResultNum());
      seedSet.performNextSeeding(semanticsSessionScope);
    }
  }

  /**
   * @return the seed distributor for this session's seed set. Uses the lazy accessor,
   *         so this no longer NPEs when called before any seeds were added.
   */
  public SeedDistributor getSeedDistributor()
  {
    return getSeedSet().seedDistributer(semanticsSessionScope);
  }

  /**
   * @return the duringSeeding
   */
  public boolean isDuringSeeding()
  {
    return duringSeeding;
  }

  /**
   * @return the heterogeneousSearchScenario
   */
  public boolean isHeterogeneousSearchScenario()
  {
    return heterogeneousSearchScenario;
  }

  /**
   * @return the playOnStart
   */
  public boolean isPlayOnStart()
  {
    return playOnStart;
  }

  ////////////////////////////////////////// end seeding stuff ////////////////////////////////////
  /////////////////////////////////////////////////////////////////////////////////////////////////
}