package ecologylab.bigsemantics.html.documentstructure;
import java.util.ArrayList;
import ecologylab.bigsemantics.model.text.ITermVector;
import ecologylab.bigsemantics.model.text.TermVector;
import ecologylab.bigsemantics.model.text.TermVectorFeature;
import ecologylab.net.ParsedURL;
/**
* A semantic anchor object that represents a link to a new (probably unparsed) Container
* from one that is currently being parsed.
* These are like AnchorContexts, but use a term vector representation instead of holding onto the strings.
* @author andruid
*
*/
//TODO -- make serializable!
public class SemanticAnchor implements TermVectorFeature
{
  /** Significance used when no other rule in the constructor applies. */
  static final float NO_SPECIAL_SIGNIFICANCE = 1;

  /** Significance used for links of type {@code WILD_CONTENT_BODY}. */
  static final float CONTENT_BODY_SIGNIFICANCE = 1.5f;

  /** Significance used for {@code WILD} links whose source and destination share a domain. */
  static final float SAME_DOMAIN_SIGNIFICANCE_PENALTY = .5f;

  /** Multiplier applied to the caller-supplied significance for {@code CITATION_SEMANTIC_ACTION} links. */
  static final int CITATION_SIGNIFICANCE = 4;

  /** Weight passed to the term vector for anchor text; surrounding context text is added without a weight. */
  public static final Double TEXT_OVER_CONTEXT_EMPHASIS_FACTOR = 3.0;

  /**
   * Location of the source document that this link originated from. May be null.
   */
  protected ParsedURL sourcePurl;

  /** Term vector built from the anchor texts and anchor contexts; null after {@link #recycle()}. */
  TermVector tv;

  //FIXME -- these fields must be merged!!!
  LinkType linkType;

  private float significance;

  /**
   * Builds the anchor, derives its significance from the link type, and adds every supplied
   * anchor context to the term vector.
   *
   * @param linkType          Kind of link; selects how the significance is computed.
   * @param destinationPurl   The linked destination document that this refers to. Only consulted
   *                          for {@code WILD} links, to compare its domain with the source's.
   * @param anchorContexts    Anchor contexts to add to the term vector; may be null.
   * @param sourcePurl        The source document that this link originated from; may be null.
   * @param significanceVal   Base significance; only used for {@code CITATION_SEMANTIC_ACTION} links,
   *                          where it is multiplied by {@link #CITATION_SIGNIFICANCE}.
   * @throws NullPointerException if {@code linkType} is null (thrown by the switch).
   */
  public SemanticAnchor(LinkType linkType,
                        ParsedURL destinationPurl,
                        ArrayList<AnchorContext> anchorContexts,
                        ParsedURL sourcePurl,
                        float significanceVal)
  {
    this.sourcePurl = sourcePurl;
    this.linkType = linkType;
    tv = new TermVector();

    switch (linkType)
    {
    case CITATION_SEMANTIC_ACTION:
      this.significance = CITATION_SIGNIFICANCE * significanceVal;
      break;
    case WILD_CONTENT_BODY:
      this.significance = CONTENT_BODY_SIGNIFICANCE;
      //TODO should there be some (but less) penalty here if same domain?
      break;
    case WILD:
      if (sameDomain(sourcePurl, destinationPurl))
      {
        this.significance = SAME_DOMAIN_SIGNIFICANCE_PENALTY;
        break;
      }
      // Intentional fall-through: a WILD link that is not same-domain gets the default significance.
    default:
      this.significance = NO_SPECIAL_SIGNIFICANCE;
      break;
    }

    if (anchorContexts != null)
    {
      for (AnchorContext anchorContext : anchorContexts)
        addAnchorContextToTV(anchorContext);
    }
  }

  /**
   * Null-safe check of whether two locations report equal domains.
   * A missing location or a missing source domain counts as "not the same domain",
   * so the caller falls back to the default significance instead of throwing.
   */
  private static boolean sameDomain(ParsedURL source, ParsedURL destination)
  {
    if (source == null || destination == null)
      return false;
    Object sourceDomain = source.domain();
    return sourceDomain != null && sourceDomain.equals(destination.domain());
  }

  /**
   * Add the anchor text and the anchor context string of the given context to the term vector.
   *
   * @param anchorContext Source of the two strings; must not be null.
   */
  public void addAnchorContextToTV(AnchorContext anchorContext)
  {
    String anchorText = anchorContext.getAnchorText();
    String anchorContextString = anchorContext.getAnchorContextString();
    addAnchorContextToTV(anchorText, anchorContextString);
  }

  /**
   * Directly add the strings to TV. Null or empty strings are skipped.
   *
   * @param anchorText          Added with weight {@link #TEXT_OVER_CONTEXT_EMPHASIS_FACTOR}.
   * @param anchorContextString Added without an explicit weight.
   */
  public void addAnchorContextToTV(String anchorText, String anchorContextString)
  {
    if (anchorText != null && anchorText.length() > 0)
      tv.add(anchorText, TEXT_OVER_CONTEXT_EMPHASIS_FACTOR);
    if (anchorContextString != null && anchorContextString.length() > 0)
      tv.add(anchorContextString);
  }

  /**
   * @return The term vector held by this anchor; null once {@link #recycle()} has been called.
   */
  public ITermVector termVector()
  {
    return tv;
  }

  /**
   * @return A multi-line description containing the source location and the term vector.
   *         Safe to call after {@link #recycle()}; the term vector then prints as {@code null}.
   */
  @Override
  public String toString()
  {
    // Plain concatenation instead of tv.toString(): tv is null after recycle().
    return "SemanticAnchor:\n\t\t\tSourcePurl: " + sourcePurl + "\n\t\t\t" + tv;
  }

  /**
   * Recycle the term vector and drop the reference to it. Calling this again is a no-op.
   * sourcePurl is not cleared here.
   */
  public void recycle()
  {
    if (tv != null)
    {
      tv.recycle();
      tv = null;
    }
  }

  /**
   * @return The significance computed in the constructor from the link type.
   */
  public float getSignificance()
  {
    return significance;
  }

  /**
   * @return The location of the source document passed to the constructor; may be null.
   */
  public ParsedURL sourcePurl()
  {
    return sourcePurl;
  }

  /**
   * @return true if the link type is {@code TRUSTED_SEMANTIC_ACTION} or
   *         {@code SITE_BOOSTED_SEMANTIC_ACTION}.
   */
  // NOTE(review): CITATION_SEMANTIC_ACTION is not counted here -- confirm that is intended.
  public boolean fromSemanticAction()
  {
    return linkType == LinkType.TRUSTED_SEMANTIC_ACTION || linkType == LinkType.SITE_BOOSTED_SEMANTIC_ACTION;
  }
}