package ecologylab.bigsemantics.metadata.builtins;

import java.util.ArrayList;
import java.util.List;

import ecologylab.bigsemantics.collecting.Crawler;
import ecologylab.bigsemantics.documentparsers.DocumentParser;
import ecologylab.bigsemantics.documentparsers.RichDocumentParserCrawlerResult;
import ecologylab.bigsemantics.metadata.builtins.declarations.RichDocumentDeclaration;
import ecologylab.bigsemantics.metadata.scalar.MetadataString;
import ecologylab.bigsemantics.metametadata.MetaMetadataCompositeField;
import ecologylab.bigsemantics.metametadata.MetaMetadataRepository;
import ecologylab.bigsemantics.seeding.Seed;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.TranslationContext;
import ecologylab.serialization.annotations.simpl_inherit;
import ecologylab.serialization.annotations.simpl_other_tags;

/**
 * A Document that can be broken down into clippings, including references to other documents.
 * HTML and PDF are prime examples.
 *
 * @author andruid
 */
@simpl_inherit
@simpl_other_tags({"compound_document", "html_document"})
public class RichDocument extends RichDocumentDeclaration
{
  /** pageStructure value recorded when the parser classifies this page as a content page. */
  private static final String CONTENT_PAGE = "content_page";

  /** pageStructure value recorded when the parser classifies this page as an index page. */
  private static final String INDEX_PAGE   = "index_page";

  // NOTE(review): the pageStructure, query, clippings and rootDocument fields that were once
  // declared here (visible in old commented-out code) appear to live in the generated superclass
  // RichDocumentDeclaration now — its accessors (pageStructure(), query(), getClippings(), ...)
  // are used below. rootDocument was intended to let related documents (e.g. a metadata page and
  // its associated PDF) share one merged clippings collection; see clippings() below.

  /**
   * Seed object associated with this, if this is a seed.
   */
  private Seed                seed;

  /**
   * Indicates that this Document is truly a seed, not just one that is associated into a Seed's
   * inverted index.
   */
  private boolean             isTrueSeed;

  /** Number of surrogates from this container that are currently on screen. */
  int                         onScreenCount;

  /** Number of text surrogates from this container that are currently on screen. */
  int                         onScreenTextCount;

  /** Total number of surrogates that have ever been on screen from this container. */
  int                         totalVisualized;

  /**
   * Empty constructor, required for simpl de/serialization.
   */
  public RichDocument()
  {
  }

  /**
   * Construct with the given meta-metadata type descriptor.
   *
   * @param metaMetadata the meta-metadata composite field describing this document's type.
   */
  public RichDocument(MetaMetadataCompositeField metaMetadata)
  {
    super(metaMetadata);
  }

  /**
   * Construct for a location, using the repository's base document meta-metadata.
   *
   * @param location the document's URL.
   */
  public RichDocument(ParsedURL location)
  {
    this(MetaMetadataRepository.getBaseDocumentMM());
    Document.initDocument(this, location);
  }

  @Override
  public boolean isRichDocument()
  {
    return true;
  }

  /**
   * The heavy weight setter method for field pageStructure: sets the value and rebuilds the
   * composite term vector.
   *
   * @param pageStructure new page-structure label (e.g. {@link #INDEX_PAGE}).
   */
  public void hwSetPageStructure(String pageStructure)
  {
    this.pageStructure().setValue(pageStructure);
    rebuildCompositeTermVector();
  }

  /**
   * Heavy weight direct setter method for pageStructure: removes the old value's term vector
   * before replacing it, then rebuilds the composite term vector.
   *
   * @param pageStructure replacement metadata scalar.
   */
  public void hwSetPageStructureMetadata(MetadataString pageStructure)
  {
    if (!isPageStructureNull() && hasTermVector())
      termVector().remove(this.getPageStructureMetadata().termVector());
    this.setPageStructureMetadata(pageStructure);
    rebuildCompositeTermVector();
  }

  /**
   * @return true if the pageStructure scalar is absent or holds no value.
   */
  public boolean isPageStructureNull()
  {
    return this.getPageStructureMetadata() == null
        || this.getPageStructureMetadata().getValue() == null;
  }

  /**
   * The heavy weight setter method for field query: sets the value and rebuilds the composite
   * term vector.
   *
   * @param query the search query string.
   */
  public void hwSetQuery(String query)
  {
    this.query().setValue(query);
    rebuildCompositeTermVector();
  }

  /**
   * Heavy weight direct setter method for query: removes the old value's term vector before
   * replacing it, then rebuilds the composite term vector.
   *
   * @param query replacement metadata scalar.
   */
  public void hwSetQueryMetadata(MetadataString query)
  {
    if (!isQueryNull() && hasTermVector())
      termVector().remove(this.getQueryMetadata().termVector());
    this.setQueryMetadata(query);
    rebuildCompositeTermVector();
  }

  /**
   * @return true if the query scalar is absent or holds no value.
   */
  public boolean isQueryNull()
  {
    return this.getQueryMetadata() == null || this.getQueryMetadata().getValue() == null;
  }

  /**
   * Insert the queryMetadata into the composite term vector FOR THE FIRST TIME. Use a coefficient
   * to control its emphasis, in order to avoid overpowering the weighting with a weak (distantly
   * crawled) relationship to the original search.
   *
   * @param query  the query scalar; must be non-null (its term vector is added directly).
   * @param weight factor to affect the impact of the search query on the composite term vector
   *               weights.
   */
  public void hwInitializeQueryMetadata(MetadataString query, double weight)
  {
    setQueryMetadata(query);
    termVector().add(weight, query.termVector());
  }

  // //////////////////////////////// Downloadable /////////////////////////////////////////////

  /**
   * Called when download and parse complete. If the parse collected clippings, records page
   * structure statistics on the site and, when a Crawler is present in the semantics scope,
   * constructs and collects a crawler result for this document.
   *
   * @param documentParser the parser that produced this document; may be null.
   */
  @Override
  public void downloadAndParseDone(DocumentParser documentParser)
  {
    if (documentParser != null && numClippings() > 0)
    {
      getSite(); // initialize this.site if we haven't already
      if (documentParser.isIndexPage())
      {
        getSite().newIndexPage();
        setPageStructure(INDEX_PAGE);
      }
      else if (documentParser.isContentPage())
      {
        getSite().newContentPage();
        setPageStructure(CONTENT_PAGE);
      }
      // When downloadDone, add best surrogate and best container to infoCollector
      Crawler crawler = semanticsScope.getCrawler();
      if (crawler != null)
      {
        RichDocumentParserCrawlerResult crawlerResult =
            crawler.constructRichDocumentParserResult(this, isJustCrawl());
        crawlerResult.collect();
      }
      // TODO -- completely recycle DocumentParser!?
    }
    else
    {
      // Due to dynamic mime type detection in connect(), we didn't actually turn out to be a
      // Container object -- or the parse didn't collect any information!
      // recycle(); // so free all resources, including connectionRecycle()
    }
  }

  /**
   * @return true if this is a true seed whose Seed is marked just-crawl.
   */
  @Override
  public boolean isJustCrawl()
  {
    return isTrueSeed && seed != null && seed.isJustCrawl();
  }

  /**
   * Hook invoked before serialization. Formerly materialized the clippings collection from
   * candidate image closures and text clippings; currently a no-op.
   *
   * @param translationContext serialization context (unused).
   */
  @Override
  public void serializationPreHook(TranslationContext translationContext)
  {
  }

  /**
   * Mark this Document as a true seed, and associate the Seed object with it.
   *
   * @param seed the originating Seed.
   */
  @Override
  public void setAsTrueSeed(Seed seed)
  {
    associateSeed(seed);
    isTrueSeed = true;
  }

  /**
   * Associate the Seed object with this Container. Calls to this method may reflect that this
   * Container is just a Seed, or they may only reflect that this Container needs to be in the
   * Seed's inverted index.
   *
   * @param seed the Seed to associate.
   */
  public void associateSeed(Seed seed)
  {
    this.seed = seed;
  }

  @Override
  public boolean isSeed()
  {
    return isTrueSeed;
  }

  /**
   * Return the seed from where the container originated.
   *
   * @return the associated Seed, or null if none.
   */
  @Override
  public Seed getSeed()
  {
    return seed;
  }

  /**
   * The heavy weight setter method for field title: sets the value and rebuilds the composite
   * term vector.
   *
   * @param title new title string.
   */
  @Override
  public void hwSetTitle(String title)
  {
    title().setValue(title);
    rebuildCompositeTermVector();
  }

  /**
   * Heavy weight direct setter method for title: removes the old value's term vector before
   * replacing it, then rebuilds the composite term vector.
   *
   * @param title replacement metadata scalar.
   */
  public void hwSetTitleMetadata(MetadataString title)
  {
    if (!isTitleNull() && hasTermVector())
      termVector().remove(getTitleMetadata().termVector());
    setTitleMetadata(title);
    rebuildCompositeTermVector();
  }

  /**
   * @return true if the title scalar is absent or holds no value.
   */
  public boolean isTitleNull()
  {
    return this.getTitleMetadata() == null || this.getTitleMetadata().getValue() == null;
  }

  /**
   * The heavy weight setter method for field description: sets the value and rebuilds the
   * composite term vector.
   *
   * @param description new description string.
   */
  public void hwSetDescription(String description)
  {
    description().setValue(description);
    rebuildCompositeTermVector();
  }

  /**
   * Heavy weight direct setter method for description: removes the old value's term vector before
   * replacing it, then rebuilds the composite term vector.
   *
   * @param description replacement metadata scalar.
   */
  public void hwSetDescriptionMetadata(MetadataString description)
  {
    if (!isDescriptionNull() && hasTermVector())
      termVector().remove(getDescriptionMetadata().termVector());
    setDescriptionMetadata(description);
    rebuildCompositeTermVector();
  }

  /**
   * @return true if the description scalar is absent or holds no value.
   */
  public boolean isDescriptionNull()
  {
    return getDescriptionMetadata() == null || getDescriptionMetadata().getValue() == null;
  }

  /**
   * Lazy evaluation of the clippings field: gets and constructs the list in this document, as
   * necessary.
   *
   * @return this document's clippings list, never null.
   */
  public List<Clipping> clippings()
  {
    // FIXME andruid and yin 2014-08-12: formerly delegated to getRootDocument().selfClippings()
    // when a rootDocument was set; currently always uses this document's own list.
    return selfClippings();
  }

  /**
   * Get this document's own clippings list, constructing an empty one on first use.
   *
   * @return the (possibly freshly constructed) clippings list.
   */
  private List<Clipping> selfClippings()
  {
    List<Clipping> result = this.getClippings();
    if (result == null)
    {
      result = new ArrayList<Clipping>();
      this.setClippings(result);
    }
    return result;
  }

  /**
   * @return the clippings list as stored, possibly null (no lazy construction).
   */
  public List<Clipping> getSelfClippings()
  {
    return getClippings();
  }

  /**
   * Add to collection of clippings, representing our Rich documentness.
   *
   * @param clipping the Clipping to collect.
   */
  public void addClipping(Clipping clipping)
  {
    clippings().add(clipping);
  }

  /**
   * @return the number of Clippings that have been collected, if any; 0 when none.
   */
  public int numClippings()
  {
    List<Clipping> clippings = getClippings();
    return clippings == null ? 0 : clippings.size();
  }

  /**
   * Used when oldDocument turns out to be re-directed from this. Inherits the old document's
   * query metadata (when this has none) and its clippings (when this has none).
   *
   * @param oldDocument the document being superseded by this one.
   */
  @Override
  public void inheritValues(Document oldDocument)
  {
    super.inheritValues(oldDocument);
    if (oldDocument instanceof RichDocument)
    {
      RichDocument oldRich = (RichDocument) oldDocument;
      String queryString = this.getQuery();
      if (queryString == null || queryString.length() == 0)
        this.setQueryMetadata(oldRich.getQueryMetadata());
      // detach the old document's query unconditionally -- presumably so the MetadataString is
      // not shared between two documents; TODO confirm this is intended even when not inherited
      oldRich.setQueryMetadata(null);
      List<Clipping> oldClippings = oldRich.getClippings();
      if (this.getClippings() == null && oldClippings != null)
        this.setClippings(oldClippings);
    }
  }
}