/**
*
*/
package ecologylab.bigsemantics.html;
import java.util.Locale;

import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;

import ecologylab.bigsemantics.html.documentstructure.ImageFeatures;
import ecologylab.generic.Generic;
import ecologylab.net.ParsedURL;
/**
 * HTMLElement that corresponds to the img tag, plus our textContext and extractedCaption.
 * <p>
 * The attribute-backed fields ({@code src}, {@code alt}, {@code width}, {@code height},
 * {@code isMap}) are filled in through {@link #setAttribute(String, String)}.
 * <p>
 * NOTE(review): the fields deliberately carry no initializers. If the superclass constructor
 * feeds attributes through {@code setAttribute} (TODO confirm), an initializer would run
 * afterwards and overwrite the parsed values.
 *
 * @author andruid
 */
public class ImgElement extends WithPurlElement
{
  /** Location built from the src attribute; left null for inline {@code data:} URIs. */
  ParsedURL src;

  /** Value of the alt attribute; cleared by {@link #getNonBogusAlt()} when judged bogus. */
  String alt;

  /** Parsed width attribute; INDEX_NOT_CALCULATED when the attribute value was null. */
  int width;

  /** Parsed height attribute; INDEX_NOT_CALCULATED when the attribute value was null. */
  int height;

  /** True when the ismap boolean attribute is present on the element. */
  boolean isMap;

  /**
   * Text context recognized in document.
   */
  String textContext;

  /**
   * Extracted caption recognized in document.
   */
  String extractedCaption;

  /**
   * Creates the element by delegating to the superclass constructor.
   *
   * @param node     the DOM node this element wraps.
   * @param basePurl base location handed to the superclass; presumably the value later read
   *                 as {@code basePurl} when resolving the src attribute (TODO confirm).
   */
  public ImgElement(Node node, ParsedURL basePurl)
  {
    super(node, basePurl);
  }

  /**
   * Stores one attribute of the img tag. Attributes that are not handled here are passed on
   * to the superclass.
   *
   * @param key   attribute name.
   * @param value attribute value; may be null.
   */
  @Override
  protected void setAttribute(String key, String value)
  {
    if (SRC.equals(key))
    {
      // Inline data: URIs carry the image bytes themselves, so there is no location to keep.
      src = isDataUri(value) ? null : constructPurl(basePurl, value);
    }
    else if (ALT.equals(key))
    {
      alt = value;
    }
    else if (WIDTH.equals(key))
    {
      width = parseSizeAttribute(value);
    }
    else if (HEIGHT.equals(key))
    {
      height = parseSizeAttribute(value);
    }
    else if (ISMAP.equals(key))
    {
      isMap = isBooleanAttributePresent(value);
    }
    else
    {
      super.setAttribute(key, value);
    }
  }

  /**
   * Tells whether an attribute value is a {@code data:} URI. The scheme is compared without
   * regard to case, because URI schemes are case-insensitive.
   */
  private static boolean isDataUri(String value)
  {
    final String scheme = "data:";
    return (value != null) && value.regionMatches(true, 0, scheme, 0, scheme.length());
  }

  /**
   * Interprets the value of an HTML boolean attribute such as ismap. In HTML the mere presence
   * of the attribute means true ({@code ismap}, {@code ismap=""}, {@code ismap="ismap"}).
   * The non-standard value "false" is still honoured as false for lenient markup, and "true"
   * keeps working as before.
   */
  private static boolean isBooleanAttributePresent(String value)
  {
    return (value != null) && !"false".equalsIgnoreCase(value.trim());
  }

  /**
   * Parses a width or height attribute value, using INDEX_NOT_CALCULATED both for a null value
   * and as the fallback handed to {@code Generic.parseInt}.
   */
  private int parseSizeAttribute(String value)
  {
    return (value == null) ? INDEX_NOT_CALCULATED : Generic.parseInt(value, INDEX_NOT_CALCULATED);
  }

  /**
   * Builds a ParsedURL for an attribute value.
   *
   * @param basePurl base location, or null when there is none.
   * @param value    the raw attribute value.
   * @return {@code ParsedURL.getAbsolute(value)} when basePurl is null, otherwise
   *         {@code basePurl.createFromHTML(value)}.
   */
  public static ParsedURL constructPurl(ParsedURL basePurl, String value)
  {
    return (basePurl == null) ? ParsedURL.getAbsolute(value) : basePurl.createFromHTML(value);
  }

  public ParsedURL getSrc()
  {
    return src;
  }

  public void setSrc(ParsedURL src)
  {
    this.src = src;
  }

  public void setAlt(String alt)
  {
    this.alt = alt;
  }

  public int getWidth()
  {
    return width;
  }

  public void setWidth(int width)
  {
    this.width = width;
  }

  public int getHeight()
  {
    return height;
  }

  public void setHeight(int height)
  {
    this.height = height;
  }

  public boolean isMap()
  {
    return isMap;
  }

  public void setMap(boolean isMap)
  {
    this.isMap = isMap;
  }

  public String getTextContext()
  {
    return textContext;
  }

  public void setTextContext(String textContext)
  {
    this.textContext = textContext;
  }

  /**
   * Sets the text context from a buffer.
   *
   * @param buffy buffer holding the text context; null clears the text context, matching what
   *              the String overload does when given null.
   */
  public void setTextContext(StringBuilder buffy)
  {
    this.textContext = (buffy == null) ? null : buffy.toString();
  }

  public String getExtractedCaption()
  {
    return extractedCaption;
  }

  public void setExtractedCaption(String extractedCaption)
  {
    this.extractedCaption = extractedCaption;
  }

  public String getAlt()
  {
    return alt;
  }

  /**
   * Get the alt text of this image, if there is one.
   * The text is checked with {@code ImageFeatures.altIsBogus}; if that reports it as bogus,
   * the stored alt is cleared and null is returned. Otherwise the alt text is returned.
   *
   * @return null, or a usable alt String.
   */
  public String getNonBogusAlt()
  {
    String altText = this.getAlt();
    if ((altText != null) && ImageFeatures.altIsBogus(altText))
    {
      altText = null;
      alt = null;
    }
    return altText;
  }

  /**
   * Recognize whether the image is informative or not based on its attributes and size,
   * aspect ratio.
   * <p>
   * An image without a src is never informative, nor is one whose usable alt text contains
   * "advertis". Otherwise the decision is left to {@code ImageFeatures.designRole}.
   * <p>
   * Side effect: calls {@link #getNonBogusAlt()}, which clears a bogus alt.
   *
   * @return true if image is recognized as informative otherwise false.
   */
  public boolean isInformativeImage()
  {
    if (src == null)
    {
      return false;
    }

    // NOTE(review): if altIsBogus already rejects "advertis" alts, the check below never
    // fires because the alt has been cleared by then -- confirm against ImageFeatures.
    String usableAlt = getNonBogusAlt();
    // Locale.ROOT keeps the match independent of the default locale (e.g. Turkish dotless i).
    if ((usableAlt != null) && usableAlt.toLowerCase(Locale.ROOT).contains("advertis"))
    {
      return false;
    }

    //TODO -- should we do more advertisement filtering here?!
    //TODO -- should we use an encompassing hyperlink and its destination as features ???!
    int mimeIndex = src.mimeIndex();
    int designRole = ImageFeatures.designRole(width, height, mimeIndex, isMap);
    return (designRole == INFORMATIVE) || (designRole == UNKNOWN);
  }

  /**
   * Renders the wrapped node: the node value for a text node, otherwise an opening tag with
   * all of the node's attributes. The result always ends with a newline.
   */
  @Override
  public String toString()
  {
    StringBuilder buffy = new StringBuilder();
    if (node.getNodeType() == Node.TEXT_NODE)
    {
      buffy.append(node.getNodeValue());
    }
    else
    {
      buffy.append('<').append(node.getNodeName());
      // Node.getAttributes() returns null for every node type other than Element.
      NamedNodeMap attributes = node.getAttributes();
      if (attributes != null)
      {
        int attributeCount = attributes.getLength();
        for (int i = 0; i < attributeCount; i++)
        {
          Node attr = attributes.item(i);
          buffy.append(' ').append(attr.getNodeName())
               .append('=').append('"').append(attr.getNodeValue()).append('"');
        }
      }
      buffy.append('>');
    }
    buffy.append('\n');
    return buffy.toString();
  }
}