package ecologylab.bigsemantics.metadata.builtins; /** * This is not generated code, but a hand-authored base class in the Metadata hierarchy. It is * hand-authored in order to provide specific functionalities **/ import java.io.File; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import ecologylab.bigsemantics.collecting.DownloadStatus; import ecologylab.bigsemantics.collecting.LocalDocumentCollections; import ecologylab.bigsemantics.collecting.SemanticsGlobalScope; import ecologylab.bigsemantics.collecting.SemanticsSite; import ecologylab.bigsemantics.documentparsers.DocumentParser; import ecologylab.bigsemantics.documentparsers.ParserResult; import ecologylab.bigsemantics.html.documentstructure.SemanticAnchor; import ecologylab.bigsemantics.html.documentstructure.SemanticInLinks; import ecologylab.bigsemantics.logging.DocumentLogRecord; import ecologylab.bigsemantics.metadata.Metadata; import ecologylab.bigsemantics.metadata.mm_no; import ecologylab.bigsemantics.metadata.builtins.declarations.DocumentDeclaration; import ecologylab.bigsemantics.metadata.scalar.MetadataParsedURL; import ecologylab.bigsemantics.metametadata.MetaMetadataCompositeField; import ecologylab.bigsemantics.metametadata.MetaMetadataRepository; import ecologylab.bigsemantics.seeding.SearchState; import ecologylab.bigsemantics.seeding.Seed; import ecologylab.generic.Continuation; import ecologylab.net.ParsedURL; import ecologylab.serialization.SIMPLTranslationException; import ecologylab.serialization.SimplTypesScope; import ecologylab.serialization.annotations.FieldUsage; import ecologylab.serialization.annotations.simpl_exclude_usage; import ecologylab.serialization.annotations.simpl_inherit; import ecologylab.serialization.annotations.simpl_scalar; import ecologylab.serialization.formatenums.StringFormat; /** * The Document Class **/ @simpl_inherit public class Document extends DocumentDeclaration { static public final Document RECYCLED_DOCUMENT = new Document(ParsedURL.getAbsolute("http://recycled.document")); static public final Document UNDEFINED_DOCUMENT = new Document(ParsedURL.getAbsolute("http://undefined.document")); protected SemanticsGlobalScope semanticsScope; private SemanticsSite site; private SemanticInLinks semanticInlinks; private DocumentLogRecord logRecord; private DocumentClosure documentClosure; private ParserResult parserResult; protected int badImages; private boolean sameDomainAsPrevious; private boolean alwaysAcceptRedirect; /** * Seed object associated with this, if this is a seed. */ private Seed seed; /** * Indicates that this Container is a truly a seed, not just one that is associated into a Seed's * inverted index. */ private boolean isTrueSeed; /** * Indicates that this Container is processed via drag and drop. */ private boolean isDnd; @simpl_exclude_usage(FieldUsage.SERIALIZATION_IN_STREAM) @simpl_scalar @mm_no private DownloadStatus downloadStatus = DownloadStatus.UNPROCESSED; /** * Used to keep track of transition times. */ private long lastActionTimestamp = 0; /** * Stores time in milliseconds taken before reaching a download status. */ private HashMap<DownloadStatus, Long> transitionTimeToDownloadStatus = new HashMap<DownloadStatus, Long>(); public Document() { super(); } public Document(MetaMetadataCompositeField metaMetadata) { super(metaMetadata); } /** * Construct an instance of this, the base document type, and set its location. * * @param location */ protected Document(ParsedURL location) { super(MetaMetadataRepository.getBaseDocumentMM()); initDocument(this, location); } /** * With the new *Declaration classes, constructor inheritance might be limited. Thus, the * initialization process can be abstracted out for subclasses to use. * * @param document * @param location */ protected static void initDocument(Document document, ParsedURL location) { document.setLocation(location); } /** * Use the local location if there is one; otherwise, just use the regular location. * * @return */ public ParsedURL getDownloadLocation() { ParsedURL result = getLocation(); ParsedURL localLocation = getLocalLocation(); if (localLocation != null) { File localFile = localLocation.file(); if (localFile.exists()) result = localLocation; } return result; } /** * Sets the value of the field location **/ @Override public void setLocation(ParsedURL location) { if (location != null) { this.location().setValue(location); if (logRecord != null) { logRecord.setDocumentLocation(location); } Document ancestor = getAncestor(); if (ancestor != null) { ParsedURL ancestorLocation = ancestor.getLocation(); String domain = location.domain(); sameDomainAsPrevious = (ancestorLocation != null && domain != null && domain.equals(ancestorLocation.domain())); } } } public boolean isLocationNull() { return getLocationMetadata() == null || getLocationMetadata().getValue() == null; } /** * @return the alwaysAcceptRedirects */ public boolean isAlwaysAcceptRedirect() { return alwaysAcceptRedirect; } /** * @param alwaysAcceptRedirects * the alwaysAcceptRedirects to set */ public void setAlwaysAcceptRedirect(boolean alwaysAcceptRedirects) { this.alwaysAcceptRedirect = alwaysAcceptRedirects; } public Document getAncestor() { return semanticInlinks == null ? null : semanticInlinks.getAncestor(); } public int getGeneration() { return semanticInlinks == null ? 0 : semanticInlinks.getGeneration(); } public int getEffectiveGeneration() { return semanticInlinks == null ? 0 : semanticInlinks.getEffectiveGeneration(); } /** * FOR BACKWARD COMPATABILITY. * The heavy weight setter method for field title * FIXME **/ public void hwSetTitle(String title) { } /** * FOR BACKWARD COMPATABILITY. * * FIXME **/ public String getTitle() { ParsedURL location = getLocation(); return location == null ? null : location.toString(); } /** * FOR BACKWARD COMPATABILITY. * * FIXME **/ public void setTitle(String foo) { } /** * @return the sameDomainAsPrevious */ public boolean isSameDomainAsPrevious() { return sameDomainAsPrevious; } @Override public int hashCode() { return (getLocationMetadata() == null) ? -1 : getLocationMetadata().hashCode(); } final Object CREATE_CLOSURE_LOCK = new Object(); public DocumentClosure documentClosure() { return documentClosure; } /** * * @return A closure for this, or null, if this is not fit to be parsed. */ public DocumentClosure getOrConstructClosure() { DocumentClosure result = this.documentClosure; if (result == null && !isRecycled() && getLocation() != null) { synchronized (CREATE_CLOSURE_LOCK) { result = this.documentClosure; if (result == null) { if (semanticInlinks == null) { semanticInlinks = new SemanticInLinks(); } result = constructClosure(); this.documentClosure = result; } } } return result == null || result.getDownloadStatus() == DownloadStatus.RECYCLED ? null : result; } private DocumentClosure constructClosure() { return new DocumentClosure(this, semanticInlinks); } public SemanticsSite getSite() { SemanticsSite result = this.site; if (result == null) { result = semanticsScope.getMetaMetadataRepository().getSite(this, semanticsScope); this.site = result; } return result; } SemanticsSite site() { return site; } /** * @return the infoCollector */ public SemanticsGlobalScope getSemanticsScope() { return semanticsScope; } /** * @param semanticsSessionScope * the infoCollector to set */ @Override public void setSemanticsSessionScope(SemanticsGlobalScope semanticsSessionScope) { this.semanticsScope = semanticsSessionScope; } /** * Get or lazily create a DocumentLogRecord or subclass instance. * * @return */ public DocumentLogRecord logRecord() { if (logRecord == null) { synchronized (this) { if (logRecord == null) { logRecord = new DocumentLogRecord(); } } } return logRecord; } public DocumentLogRecord getLogRecord() { return logRecord; } public void setLogRecord(DocumentLogRecord logRecord) { this.logRecord = logRecord; if (logRecord != null) { logRecord.setDocumentLocation(this.getLocation()); } } public void addAdditionalLocation(ParsedURL newPurl) { addAdditionalLocation(new MetadataParsedURL(newPurl)); } public void addAdditionalLocation(MetadataParsedURL newMPurl) { if (location().equals(newMPurl.getValue())) return; if (containsLocation(getAdditionalLocations(), newMPurl.getValue())) return; if (getAdditionalLocations() == null) setAdditionalLocations(new ArrayList<MetadataParsedURL>(3)); getAdditionalLocations().add(newMPurl); } private boolean containsLocation(List<MetadataParsedURL> list, ParsedURL purl) { if (list != null) for (MetadataParsedURL metadataPurl : list) if (purl.equals(metadataPurl.getValue())) return true; return false; } /** * Get the old location from this. Set the location of this to the newLocation. Add a mapping in * the GlobalCollection from newLocation to this. Add the old location for this as an * additionalLocation for this. * * @param newLocation */ public void changeLocation(final ParsedURL newLocation) { if (newLocation != null) { ParsedURL origLocation = getLocation(); if (!origLocation.equals(newLocation)) ; { setLocation(newLocation); getSemanticsScope().getLocalDocumentCollection().addMapping(newLocation, this); addAdditionalLocation(origLocation); } } } /** * Used when oldDocument turns out to be re-directed from this. * * @param oldDocument */ public void inheritValues(Document oldDocument) { oldDocument.getSemanticsScope().getLocalDocumentCollection().remap(oldDocument, this); if (getLocationMetadata() == null) { setLocationMetadata(oldDocument.getLocationMetadata()); oldDocument.setLocationMetadata(null); } this.semanticsScope = oldDocument.semanticsScope; SemanticInLinks oldInlinks = oldDocument.semanticInlinks; if (semanticInlinks == null || semanticInlinks.size() == 0) { this.semanticInlinks = oldInlinks; oldDocument.semanticInlinks = null; } else if (oldInlinks != null) semanticInlinks.merge(oldInlinks); List<Metadata> oldMixins = oldDocument.getMixins(); if (oldMixins != null) for (Metadata oldMixin : oldMixins) addMixin(oldMixin); List<MetadataParsedURL> oldAdditionalLocations = oldDocument.getAdditionalLocations(); if (oldAdditionalLocations != null) { for (MetadataParsedURL otherLocation : oldAdditionalLocations) addAdditionalLocation(otherLocation); } // TODO -- are there other values that should be propagated?! -- can use // MetadataFieldDescriptors. } public SemanticInLinks getSemanticInlinks() { SemanticInLinks result = this.semanticInlinks; if (result == null) { // TODO add concurrency control?! result = new SemanticInLinks(); this.semanticInlinks = result; } return result; } public void addSemanticInlink(SemanticAnchor semanticAnchor, Document source) { getSemanticInlinks().add(semanticAnchor, source); } public void addInlink(Document source) { getSemanticInlinks().add(source); } public boolean queueDownload(Continuation dispatchTarget) { DocumentClosure documentClosure = getOrConstructClosure(); if (documentClosure == null) return false; if (dispatchTarget != null) documentClosure.addContinuation(dispatchTarget); return documentClosure.queueDownload(); } public boolean queueDownload() { return queueDownload(null); } /** * Queue this document for downloading, and then block to wait for it to be downloaded and parsed * done. * * @param timeoutMs * The longest time to wait, in milliseconds. * @return If the downloading and parsing succeeded, as queueDownload(). * @throws InterruptedException */ public boolean queueDownloadAndWait(long timeoutMs) throws InterruptedException { if (downloadStatus != DownloadStatus.DOWNLOAD_DONE) { final Object lock = new Object(); boolean result = false; synchronized (lock) { result = queueDownload(new Continuation<DocumentClosure>() { @Override public void callback(DocumentClosure o) { synchronized (lock) { lock.notifyAll(); } } }); if (downloadStatus != DownloadStatus.DOWNLOAD_DONE) { lock.wait(timeoutMs); } } return result; } return true; } /** * Lookout for instances of the AnonymousDocument. * * @return false in the base class and most subs. */ public boolean isAnonymous() { return false; } public boolean isRecycled() { return super.isRecycled() || downloadStatus == DownloadStatus.RECYCLED; } void setRecycled() { LocalDocumentCollections globalCollection = semanticsScope.getLocalDocumentCollection(); globalCollection.setRecycled(getLocation()); if (getAdditionalLocations() != null) { for (MetadataParsedURL additionalMPurl : getAdditionalLocations()) globalCollection.setRecycled(additionalMPurl.getValue()); } } @Override public void recycle() { recycle(new HashSet<Metadata>()); } @Override public synchronized void recycle(HashSet<Metadata> visitedMetadata) { super.recycle(visitedMetadata); if (semanticInlinks != null) { semanticInlinks.recycle(); semanticInlinks = null; } if (parserResult != null) { parserResult.recycle(); parserResult = null; } this.downloadStatus = DownloadStatus.RECYCLED; } @Override public String toString() { return super.toString() + "[" + getLocation() + "]"; } public boolean isJustCrawl() { return false; } public void downloadAndParseDone(DocumentParser documentParser) { setDownloadStatus(DownloadStatus.DOWNLOAD_DONE); } public boolean isSeed() { return false; } public void addCandidateOutlink(Document newOutlink) { // Overridden in subclasses } public void perhapsAddDocumentClosureToPool() { // Overridden in subclasses } public String getLocationsString() { String result; if (getAdditionalLocations() == null || getAdditionalLocations().size() == 0) result = getLocationMetadata().toString(); else { StringBuilder buffy = new StringBuilder(getLocationMetadata().toString()); for (MetadataParsedURL otherLocation : getAdditionalLocations()) { buffy.append(','); buffy.append(otherLocation.toString()); } result = buffy.toString(); } return result; } /** * @return the parserResult */ public ParserResult getParserResult() { return parserResult; } /** * @param parserResult * the parserResult to set */ public void setParserResult(ParserResult parserResult) { this.parserResult = parserResult; } /** * @return the seed */ public Seed getSeed() { return seed; } /** * @param seed * the seed to set */ public void setSeed(Seed seed) { this.seed = seed; } /** * If this Container was a search, the index number of that search among the searches being * aggregated at one time. Otherwise, -1. * * @return The search index number or -1 if not a search. */ public int searchNum() { if (isTrueSeed && (seed instanceof SearchState)) { return ((SearchState) seed).searchNum(); } return -1; } /** * Called for true seed Containers. Calling this method does more than bind the Seed object with * the Container in the model. It also sets the crucial isSeed flag, establishing that this * Container is truly a Seed. * <p/> * NB: The seed object will also be bound with ancestors of the Container. * * @param seed */ public void setAsTrueSeed(Seed seed) { // associateSeed(seed); this.seed = seed; isTrueSeed = true; } /** * Indicate that this Container is being processed via DnD. * */ void setDnd() { isDnd = true; } public boolean isDnd() { return isDnd; } @Override public boolean hasLocation() { return getLocationMetadata() != null; } public boolean hasLocation(ParsedURL location) { if (location.equals(getLocation())) return true; List<MetadataParsedURL> additionalLocations = getAdditionalLocations(); if (additionalLocations != null && additionalLocations.size() > 0) { for (MetadataParsedURL mpurl : additionalLocations) { if (mpurl != null && location.equals(mpurl.getValue())) return true; } } return false; } public DownloadStatus getDownloadStatus() { return downloadStatus; } public void setDownloadStatus(DownloadStatus downloadStatus) { if (lastActionTimestamp == 0) { lastActionTimestamp = System.currentTimeMillis(); } long now = System.currentTimeMillis(); long delta = now - lastActionTimestamp; transitionTimeToDownloadStatus.put(downloadStatus, delta); this.downloadStatus = downloadStatus; lastActionTimestamp = now; } public HashMap<DownloadStatus, Long> getTransitionTimeToDownloadStatus() { return transitionTimeToDownloadStatus; } public ParsedURL getLocationOrFirstAdditionLocation() { if (getLocation() != null) return getLocation(); if (getAdditionalLocations() != null && getAdditionalLocations().size() > 0) return getAdditionalLocations().get(0).getValue(); return null; } /** * Get a collection of clippings, if we have one. * * @return always null in the base class */ public List<Clipping> getClippings() { return null; } /** * Deserialize A JSON metadataString to form a Document. If that works, add it to the global * collection, with location as key. Set its downloadStatus to DOWNLOAD_DONE. * * @param metadataString * @param semanticsScope * @param metadataFormat * @return */ public static Document constructAndMapFromJson(String jsonMetadata, SemanticsGlobalScope semanticsScope) { return constructAndMapFromSerialized(jsonMetadata, semanticsScope, StringFormat.JSON); } /** * Deserialize the metadataString to form a Document. If that works, add it to the global * collection, with location as key. Set its downloadStatus to DOWNLOAD_DONE. * * @param metadataString * @param semanticsScope * @param metadataFormat * @return */ public static Document constructAndMapFromSerialized(String metadataString, SemanticsGlobalScope semanticsScope, StringFormat metadataFormat) { Document document = null; try { SimplTypesScope documentsTypeScope = semanticsScope.getDocumentsTypeScope(); document = (Document) documentsTypeScope.deserialize(metadataString, metadataFormat); if (document != null) { document.setSemanticsSessionScope(semanticsScope); document.setDownloadStatus(DownloadStatus.DOWNLOAD_DONE); // andruid 2012/08 -- perhaps this should be unconditional put to change map to latest // extracted metadata semanticsScope.putDocumentIfAbsent(document); } } catch (SIMPLTranslationException e) { e.printStackTrace(); } return document; } }