// ImgElement.java example
//
// Explorer
// BigSemanticsJava-master
/**
 * 
 */
package ecologylab.bigsemantics.html;

import java.util.Locale;

import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;

import ecologylab.bigsemantics.html.documentstructure.ImageFeatures;
import ecologylab.generic.Generic;
import ecologylab.net.ParsedURL;

/**
 * HTMLElement that corresponds to the img tag, plus our textContext and extractedCaption.
 * <p>
 * The <code>src</code>, <code>alt</code>, <code>width</code>, <code>height</code> and
 * <code>ismap</code> attributes are captured by {@link #setAttribute(String, String)};
 * every other attribute is forwarded to the superclass.
 * 
 * @author andruid
 *
 */
public class ImgElement extends WithPurlElement
{
	/** Scheme prefix of inline data URIs; such sources are embedded content, not fetchable locations. */
	private static final String	DATA_SCHEME_PREFIX	= "data:";

	// NOTE(review): these fields deliberately have no initializers. If the superclass
	// constructor populates them through setAttribute() (not visible from this file --
	// confirm), an initializer here would run afterwards and overwrite the parsed value.

	/** Location of the image; null when there is no src value or it is an inline data: URI. */
	ParsedURL				src;
	
	/** Value of the alt attribute; set to null by {@link #getNonBogusAlt()} when judged bogus. */
	String					alt;	
	/** Parsed width attribute, or INDEX_NOT_CALCULATED when the value is null. */
	int							width;
	/** Parsed height attribute, or INDEX_NOT_CALCULATED when the value is null. */
	int							height;
	/** True when the ismap attribute was seen with a value other than "false". */
	boolean					isMap;
	
	/**
	 * Text context recognized in document.
	 */
	String					textContext;
	/**
	 * Extracted caption recognized in document.
	 */
	String					extractedCaption;
	

	/**
	 * @param node			DOM node for the img tag; passed through to the superclass.
	 * @param basePurl	base location passed through to the superclass; {@link #setAttribute(String, String)}
	 * 									hands the inherited basePurl to {@link #constructPurl(ParsedURL, String)} to build src.
	 */
	public ImgElement(Node node, ParsedURL basePurl)
	{
		super(node, basePurl);
	}

	/**
	 * Record one attribute of the img tag.
	 * 
	 * @param key			attribute name.
	 * @param value		attribute value; may be null.
	 */
	@Override
	protected void setAttribute(String key, String value)
	{
		if (SRC.equals(key))
		{
			// Without a value there is nothing to resolve, and inline data: URIs are not locations.
			src 		= ((value == null) || isDataUri(value)) ? null : constructPurl(basePurl, value);
		}
		else if (ALT.equals(key))
		{
			alt			= value;
		}
		else if (WIDTH.equals(key))
		{
			width		= parseDimension(value);
		}
		else if (HEIGHT.equals(key))
		{
			height	= parseDimension(value);
		}
		else if (ISMAP.equals(key))
		{
			// ismap is an HTML boolean attribute: its presence means true, whatever the value
			// ("", "ismap", "true"). An explicit "false" is still honoured as false, as before.
			isMap		= (value != null) && !"false".equalsIgnoreCase(value.trim());
		}
		else
		{
			super.setAttribute(key, value);
		}
	}

	/**
	 * @param value		non-null src attribute value.
	 * @return				true if the value starts with the data: scheme, compared case-insensitively
	 * 								because URI schemes are case-insensitive.
	 */
	private static boolean isDataUri(String value)
	{
		return value.regionMatches(true, 0, DATA_SCHEME_PREFIX, 0, DATA_SCHEME_PREFIX.length());
	}

	/**
	 * Parse a width or height attribute value.
	 * 
	 * @param value		raw attribute value; may be null.
	 * @return				INDEX_NOT_CALCULATED for null, otherwise the result of
	 * 								Generic.parseInt() with INDEX_NOT_CALCULATED as its default argument.
	 */
	private int parseDimension(String value)
	{
		return (value == null) ? INDEX_NOT_CALCULATED : Generic.parseInt(value, INDEX_NOT_CALCULATED);
	}

	/**
	 * Build a ParsedURL from an attribute value.
	 * 
	 * @param basePurl	base location; when null the value is handed to ParsedURL.getAbsolute().
	 * @param value			address taken from the HTML; otherwise handed to basePurl.createFromHTML().
	 * 									NOTE(review): a null value is passed straight through -- confirm how ParsedURL handles it.
	 * @return					whatever the chosen ParsedURL factory returns.
	 */
	public static ParsedURL constructPurl(ParsedURL basePurl, String value)
	{
		if (basePurl == null)
			return ParsedURL.getAbsolute(value);

		return basePurl.createFromHTML(value);
	}

	public ParsedURL getSrc()
	{
		return src;
	}

	public void setSrc(ParsedURL src)
	{
		this.src = src;
	}

	public void setAlt(String alt)
	{
		this.alt = alt;
	}

	public int getWidth()
	{
		return width;
	}

	public void setWidth(int width)
	{
		this.width = width;
	}

	public int getHeight()
	{
		return height;
	}

	public void setHeight(int height)
	{
		this.height = height;
	}

	public boolean isMap()
	{
		return isMap;
	}

	public void setMap(boolean isMap)
	{
		this.isMap = isMap;
	}

	public String getTextContext()
	{
		return textContext;
	}

	public void setTextContext(String textContext)
	{
		this.textContext = textContext;
	}

	/**
	 * Set the text context from a buffer.
	 * 
	 * @param buffy		buffer holding the text context; null clears the text context,
	 * 								matching what the String overload does when given null.
	 */
	public void setTextContext(StringBuilder buffy)
	{
		this.textContext = (buffy == null) ? null : buffy.toString();
	}

	public String getExtractedCaption()
	{
		return extractedCaption;
	}

	public void setExtractedCaption(String extractedCaption)
	{
		this.extractedCaption = extractedCaption;
	}


	public String getAlt()
	{
		return alt;
	}
	
	/**
	 * Get the alt text of this image, if there is one.
	 * If ImageFeatures.altIsBogus() rejects it, clear the alt field of this element
	 * (the underlying DOM node is not touched) and return null.
	 * Otherwise, return it.
	 * 
	 * @return		null, or a usable alt String.
	 */
	public String getNonBogusAlt()
	{
		String altText 					= this.getAlt();
		if ((altText != null) && ImageFeatures.altIsBogus(altText))
		{
			// Forget the bogus text so later callers of getAlt() do not see it either.
			alt										= null;
			return null;
		}
		return altText;
	}

	
	/**
	 * Recognize whether the image is informative or not based on its attributes and size, aspect ratio. 
	 * <p>
	 * Side effect: calls {@link #getNonBogusAlt()}, which may clear the alt field.
	 * 
	 * @return						true if image is recognized as informative otherwise false.
	 */
	public boolean isInformativeImage() 
	{
		if (src == null)
			return false;
		
		String usableAlt 	= getNonBogusAlt();
		
		// Locale.ROOT keeps the match stable: under a Turkish default locale "ADVERTISING"
		// lower-cases with a dotless i and would no longer contain "advertis".
		if ((usableAlt != null) && usableAlt.toLowerCase(Locale.ROOT).contains("advertis"))
			return false;
		
		//TODO -- should we do more advertisement filtering here?!
		
		//TODO -- should we use an encompassing hyperlink and its destination as features ???!
		
		int mimeIndex		= src.mimeIndex();
		int designRole	= ImageFeatures.designRole(width, height, mimeIndex, isMap);
		return (designRole == INFORMATIVE) || (designRole == UNKNOWN);
	}

	/**
	 * Render the underlying DOM node: the node value for a text node, otherwise an opening
	 * tag with all of its attributes. The result always ends with a newline.
	 */
	@Override
	public String toString()
	{
		StringBuilder buffy		= new StringBuilder();
		if (node.getNodeType() == Node.TEXT_NODE)
		{
			buffy.append(node.getNodeValue());
		}
		else
		{
			buffy.append('<').append(node.getNodeName());
			// getAttributes() returns null for anything that is not an Element.
			NamedNodeMap attributes = node.getAttributes();
			if (attributes != null)
			{
				for (int i=0; i<attributes.getLength(); i++)
				{
					Node attr = attributes.item(i);
					buffy.append(' ').append(attr.getNodeName())
							 .append('=').append('"').append(attr.getNodeValue()).append('"');
				}
			}
			buffy.append('>');
		}

		buffy.append('\n');
		
		return buffy.toString();
	}
}