/**
*
*/
package ecologylab.bigsemantics.html.documentstructure;
import java.util.regex.Pattern;
import ecologylab.bigsemantics.html.utils.HTMLNames;
import ecologylab.generic.StringTools;
/**
* Static methods for operating on images, and recognizing features during information extraction.
*
* @author andruid
*/
public class ImageFeatures
implements HTMLNames, ImageConstants
{
  /**
   * Matches alt text that consists solely of ASCII letters and underscores and starts with a
   * letter (e.g. {@code "logo"} or {@code "nav_button_left"}). Such text reads like an
   * identifier or a file stem rather than a human-written description.
   * <p/>
   * Note that java.util.regex has no POSIX bracket classes: {@code [[:alpha:]]} is parsed as
   * the literal character set {@code :}, {@code a}, {@code l}, {@code p}, {@code h}. The letter
   * class is therefore spelled {@code \p{Alpha}}. The expression is also written without nested
   * quantifiers, so a long non-matching input is rejected in linear time.
   */
  static final Pattern BOGUS_ALT_MATCHER = Pattern.compile("^\\p{Alpha}[\\p{Alpha}_]*$");

  /**
   * Test to see if alt attribute from HTML is garbage.
   *
   * @param alt   the value of the alt attribute; may be null.
   *
   * @return true if it is null, the empty string, "null", "image", ends with ".jpg",
   *         is made up only of letters and underscores, or looks like a filename
   *         (no spaces, and a '.' somewhere before the last character).
   */
  public static boolean altIsBogus(String alt)
  {
    if ((alt == null) || (alt.length() == 0))
      return true;

    if ("null".equals(alt) || "image".equals(alt) || alt.endsWith(".jpg"))
      return true;

    if (BOGUS_ALT_MATCHER.matcher(alt).matches())
      return true;

    // text with spaces is taken to be a real description
    if (StringTools.contains(alt, ' '))
      return false;

    // no spaces: contains ., and not at end? then this alt is really a filename
    int dotIndex = alt.indexOf('.');
    return (dotIndex > -1) && (dotIndex < (alt.length() - 1));
  }

  /**
   * Use heuristics on width & height to analyze whether this image is junk, such as a
   * spacer (very small), a nav element, or an advertisement.
   * <p/>
   * If width or height is not positive, the assumption is that we have no data about size.
   * In that case no size heuristic is applied: the result is UNKNOWN for JPG, and
   * UN_INFORMATIVE for every other mimeIndex (only jpegs get the benefit of the doubt).
   *
   * @param width       image width; a value <= 0 means unknown.
   * @param height      image height; a value <= 0 means unknown.
   * @param mimeIndex   mime type index, compared here against JPG and UNKNOWN_MIME.
   * @param isMap       true if the image is a map; presumably an HTML image map -- confirm with callers.
   *
   * @return INFORMATIVE, UN_INFORMATIVE, or UNKNOWN.
   */
  //FIXME -- unify with ImageFeatures.isInformativeImage()
  public static int designRole(int width, int height, int mimeIndex, boolean isMap)
  {
    // Validate the dimensions before dividing. Otherwise a single missing dimension yields
    // an aspect ratio of 0 (or Infinity), which would be misread as an extremely thin image.
    if ((width <= 0) || (height <= 0))
      return (mimeIndex != JPG) ? UN_INFORMATIVE : UNKNOWN; // only give benefit of doubt to jpegs

    // normalize so that the ratio is always short side / long side, in (0, 1]
    float aspectRatio = (float) width / (float) height;
    if (aspectRatio > 1.0f)
      aspectRatio = 1.0f / aspectRatio;

    // very elongated: banner, rule, nav strip
    if (aspectRatio < 0.35f)
      return UN_INFORMATIVE;

    //TODO -- should area be a feature? int area = width * height;
    if ((width < MIN_WIDTH) || (height < MIN_HEIGHT))
      return UN_INFORMATIVE;

    if ((mimeIndex != JPG) && (mimeIndex != UNKNOWN_MIME))
    { /* if (mimeType == GIF, PNG) */
      // stricter criteria for these mime types
      // NOTE(review): aspectRatio is already >= 0.35 here, so (aspectRatio > 0.3f) is always
      // true and every image of these mime types is reported as UN_INFORMATIVE. The intended
      // threshold / comparison direction should be confirmed; behavior is left unchanged.
      if (isMap || (aspectRatio > 0.3f))
        return UN_INFORMATIVE;
    }
    return INFORMATIVE;
  }

  /**
   * Use heuristics on width & height to analyze whether this image is junk, such as a
   * spacer (very small), a nav element, or an advertisement.
   * <p/>
   * Convenience form that treats the image as a JPG that is not a map. If width or height
   * is not positive, the assumption is that we have no data about size, so UNKNOWN is returned.
   *
   * @param width   image width; a value <= 0 means unknown.
   * @param height  image height; a value <= 0 means unknown.
   *
   * @return INFORMATIVE, UN_INFORMATIVE, or UNKNOWN.
   */
  public static int designRole(int width, int height)
  {
    // JPG images are the most privileged
    return designRole(width, height, JPG, false);
  }

  /* We don't need this as we have an advertisement filter in the container.createImgElement(), and the filter is in cf.model.Filter.
   * -- Eunyee
  if( imgUrl!=null )
  {
    String urlChunks[] = imgUrl.split("/");
    for(int j=0; j<urlChunks.length; j++)
    {
      String temp = urlChunks[j];
      // System.out.println("url Chunk:" + temp);
      if( temp.toLowerCase().equals("ad") || temp.toLowerCase().equals("adv") ||
          temp.toLowerCase().contains("advertis") )
        informImg = false;
    }
  }
  */
}