/**
*
*/
package ecologylab.bigsemantics.metadata.builtins;
import java.util.ArrayList;
import java.util.List;
import ecologylab.bigsemantics.collecting.Crawler;
import ecologylab.bigsemantics.documentparsers.DocumentParser;
import ecologylab.bigsemantics.documentparsers.RichDocumentParserCrawlerResult;
import ecologylab.bigsemantics.metadata.builtins.declarations.RichDocumentDeclaration;
import ecologylab.bigsemantics.metadata.scalar.MetadataString;
import ecologylab.bigsemantics.metametadata.MetaMetadataCompositeField;
import ecologylab.bigsemantics.metametadata.MetaMetadataRepository;
import ecologylab.bigsemantics.seeding.Seed;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.TranslationContext;
import ecologylab.serialization.annotations.simpl_inherit;
import ecologylab.serialization.annotations.simpl_other_tags;
/**
* A Document that can be broken down into clippings, including references to other documents.
* HTML and PDF are prime examples.
*
* @author andruid
*/
@simpl_inherit
@simpl_other_tags({"compound_document", "html_document"})
public class RichDocument extends RichDocumentDeclaration
{
private static final String CONTENT_PAGE = "content_page";
private static final String INDEX_PAGE = "index_page";
// /**
// * For debugging. Type of the structure recognized by information extraction.
// **/
// @mm_name("page_structure")
// @simpl_scalar
// private MetadataString pageStructure;
//
// /**
// * The search query
// **/
// @simpl_scalar @simpl_hints(Hint.XML_LEAF)
// private MetadataString query;
//
// /**
// * Clippings that this document contains.
// */
// @mm_name("clippings")
// @simpl_collection
// @simpl_classes({ImageClipping.class, TextClipping.class})
//// @simpl_scope(SemanticsNames.REPOSITORY_CLIPPING_TRANSLATIONS)
// List<Clipping> clippings;
//
// /**
// * The rootDocument is filled in to create an alternative, connected RichDocument instance that is used to
// * store the List<Clipping> clippings object associated with this.
// * It can be used to merge the clippings collection for two or more related documents, such as a metadata page, and an associated PDF.
// */
// @simpl_composite
// private RichDocument rootDocument;
/**
* Seed object associated with this, if this is a seed.
*/
private Seed seed;
/**
* Indicates that this Document is a truly a seed, not just one
* that is associated into a Seed's inverted index.
*/
private boolean isTrueSeed;
/** Number of surrogates from this container that are currently on screen */
int onScreenCount;
int onScreenTextCount;
/** Total number of surrogates that have ever been on screen from this container */
int totalVisualized;
/**
*
*/
public RichDocument()
{
// TODO Auto-generated constructor stub
}
/**
* @param metaMetadata
*/
public RichDocument(MetaMetadataCompositeField metaMetadata)
{
super(metaMetadata);
// TODO Auto-generated constructor stub
}
/**
* @param location
*/
public RichDocument(ParsedURL location)
{
this(MetaMetadataRepository.getBaseDocumentMM());
Document.initDocument(this, location);
}
@Override
public boolean isRichDocument()
{
return true;
}
/**
* The heavy weight setter method for field pageStructure
**/
public void hwSetPageStructure(String pageStructure)
{
this.pageStructure().setValue(pageStructure);
rebuildCompositeTermVector();
}
/**
* Heavy Weight Direct setter method for pageStructure
**/
public void hwSetPageStructureMetadata(MetadataString pageStructure)
{
if (!isPageStructureNull() && hasTermVector())
termVector().remove(this.getPageStructureMetadata().termVector());
this.setPageStructureMetadata(pageStructure);
rebuildCompositeTermVector();
}
public boolean isPageStructureNull()
{
return this.getPageStructureMetadata() == null || this.getPageStructureMetadata().getValue() == null;
}
/**
* The heavy weight setter method for field query
**/
public void hwSetQuery(String query)
{
this.query().setValue(query);
rebuildCompositeTermVector();
}
/**
* Heavy Weight Direct setter method for query
**/
public void hwSetQueryMetadata(MetadataString query)
{
if (!isQueryNull() && hasTermVector())
termVector().remove(this.getQueryMetadata().termVector());
this.setQueryMetadata(query);
rebuildCompositeTermVector();
}
public boolean isQueryNull()
{
return this.getQueryMetadata() == null || this.getQueryMetadata().getValue() == null;
}
/**
* Insert the queryMetadata into the composite term vector FOR THE FIRST TIME.
* Use a coefficient to control its emphasis, in order to avoid overpowering
* the weighting with a weak (distantly crawled) relationship to the original search.
*
* @param query
* @param weight Factor to affect the impact of the search query on the composite term vector weights.
*/
public void hwInitializeQueryMetadata(MetadataString query, double weight)
{
setQueryMetadata(query);
termVector().add(weight, query.termVector());
}
////////////////////////////////// Downloadable /////////////////////////////////////////////////////
@Override
public void downloadAndParseDone(DocumentParser documentParser)
{
// long t0 = System.currentTimeMillis();
if (documentParser != null && numClippings() > 0)
{
getSite(); // initialize this.site if haven't
if (documentParser.isIndexPage())
{
getSite().newIndexPage();
setPageStructure(INDEX_PAGE);
}
else if (documentParser.isContentPage())
{
getSite().newContentPage();
setPageStructure(CONTENT_PAGE);
}
// When downloadDone, add best surrogate and best container to infoCollector
Crawler crawler = semanticsScope.getCrawler();
if (crawler != null)
{
RichDocumentParserCrawlerResult crawlerResult =
crawler.constructRichDocumentParserResult(this, isJustCrawl());
crawlerResult.collect();
}
//TODO -- completely recycle DocumentParser!?
}
else
{
// due to dynamic mime type type detection in connect(),
// we didnt actually turn out to be a Container object.
// or, the parse didn't collect any information!
// recycle(); // so free all resources, including connectionRecycle()
}
// if (documentParser != null)
// {
// documentParser.getLogRecord().setMsRichDocumentDnpDone(System.currentTimeMillis() - t0);
// }
}
@Override
public boolean isJustCrawl()
{
return isTrueSeed && seed != null && seed.isJustCrawl();
}
@Override
public void serializationPreHook(TranslationContext translationContext)
{
// if (clippings == null)
// {
// int size = 0;
// boolean doImages = false;
// if (candidateImageClosures != null)
// {
// size += candidateImageClosures.size();
// doImages = true;
// }
// if (candidateTextClippings != null)
// {
// size += candidateTextClippings.size();
// clippings = new ArrayList<Metadata>(size);
// for (GenericElement<TextClipping> textClippingGE : candidateTextClippings)
// clippings.add(textClippingGE.getGeneric());
// }
// if (doImages)
// {
// if (clippings == null)
// clippings = new ArrayList<Metadata>(size);
// for (ImageClosure ic : candidateImageClosures)
// clippings.add(ic.getDocument());
// }
// }
}
@Override
public void setAsTrueSeed(Seed seed)
{
associateSeed(seed);
isTrueSeed = true;
}
/**
* Associate the Seed object with this Container.
* Calls to this method may reflect that this Container is just a Seed, or
* they may only reflect that this Container needs to be in the Seed's inverted index.
* @param seed
*/
public void associateSeed(Seed seed)
{
this.seed = seed;
}
@Override
public boolean isSeed()
{
return isTrueSeed;
}
/**
* return the seed from where the container originated
* @return
*/
@Override
public Seed getSeed()
{
return seed;
}
/**
* The heavy weight setter method for field title
**/
@Override
public void hwSetTitle(String title)
{
title().setValue(title);
rebuildCompositeTermVector();
}
/**
* Heavy Weight Direct setter method for title
**/
public void hwSetTitleMetadata(MetadataString title)
{
if (!isTitleNull() && hasTermVector())
termVector().remove(getTitleMetadata().termVector());
setTitleMetadata(title);
rebuildCompositeTermVector();
}
public boolean isTitleNull()
{
return this.getTitleMetadata() == null || this.getTitleMetadata().getValue() == null;
}
/**
* The heavy weight setter method for field description
**/
public void hwSetDescription(String description)
{
description().setValue(description);
rebuildCompositeTermVector();
}
/**
* Heavy Weight Direct setter method for description
**/
public void hwSetDescriptionMetadata(MetadataString description)
{
if (!isDescriptionNull() && hasTermVector())
termVector().remove(getDescriptionMetadata().termVector());
setDescriptionMetadata(description);
rebuildCompositeTermVector();
}
public boolean isDescriptionNull()
{
return getDescriptionMetadata() == null || getDescriptionMetadata().getValue() == null;
}
/**
* Lazy evaluation of clippings field.
* If rootDocument non-null, get and construct in that, as necessary; else get and construct in this, as necessary.
* @return
*/
public List<Clipping> clippings()
{
return /* getRootDocument() != null ? getRootDocument().selfClippings() : */ selfClippings(); //FIXME andruid and yin 2014-08-12
}
private List<Clipping> selfClippings()
{
List<Clipping> result = this.getClippings();
if (result == null)
{
result = new ArrayList<Clipping>();
this.setClippings(result);
}
return result;
}
/**
* @return the clippings
*/
public List<Clipping> getSelfClippings()
{
return getClippings();
}
/**
* Add to collection of clippings, representing our Rich documentness.
*/
public void addClipping(Clipping clipping)
{
clippings().add(clipping);
}
/**
*
* @return The number of Clippings that have been collected, if any.
*/
public int numClippings()
{
return getClippings() == null ? 0 : getClippings().size();
}
/**
* Used when oldDocument turns out to be re-directed from this.
* @param oldDocument
*/
@Override
public void inheritValues(Document oldDocument)
{
super.inheritValues(oldDocument);
if (oldDocument instanceof RichDocument)
{
RichDocument oldRich= (RichDocument) oldDocument;
String queryString = this.getQuery();
if (queryString == null || queryString.length() == 0)
this.setQueryMetadata(oldRich.getQueryMetadata());
oldRich.setQueryMetadata(null);
List<Clipping> oldClippings = oldRich.getClippings();
if (this.getClippings() == null && oldClippings != null)
this.setClippings(oldClippings);
}
}
}