/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.meaningfulweb.imgext;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.math.BigInteger;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import org.meaningfulweb.imgext.ImageSizeExtractor.ImageSize;
import org.meaningfulweb.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Filters and sorts images based on data about the IMG tags from HTML. The filtering and sorting uses a variety of heuristics
* that aim to give the highest score to the IMG most likely to represent the article.
* @version $Revision: 113091 $
*/
public class ImageFilter{
// Shared class logger; final so the reference cannot be reassigned after class initialisation.
private static final Logger logger = LoggerFactory.getLogger(ImageFilter.class);
// Exclusive upper bound on width/height for an image to have an acceptable aspect ratio.
private final int _maxAcceptableAspectRatio;
// Exclusive lower bound on width/height for an image to have an acceptable aspect ratio.
private final double _minAcceptableAspectRatio;
// Pixel area used as the size-score denominator when no maximum image area is known (maxSize == 0).
private final int _preferedImageSizeInPixels;
// An image's height must be strictly greater than this to pass the size filter.
private final int _minAcceptableImageHeight;
// An image's width must be strictly greater than this to pass the size filter.
private final int _minAcceptableImageWidth;
// Number of images found after the title that get a score boost for proximity to the title.
private final int _numberOfImagesBoostedAfterTitle;
// An image header's content length must be strictly greater than this to be accepted.
private final int _minAcceptableImageSizeInBytes;
// In metadata-only filtering, an image is kept only if its total score is strictly greater than this.
private final double _minScoreThreshold;
// When true the format filter accepts everything; when false, URLs ending in .GIF are rejected.
private final boolean _allowAllImageFormats;
// Multipliers applied to the individual partial scores when they are summed into the total score.
private final int _positionScoreWeight;
private final double _formatScoreWeight;
private final int _sizeScoreWeight;
private final double _filenameScoreWeight;
private final double _attributeScoreWeight;
/**
 * Creates a filter with every threshold and score weight supplied explicitly.
 * Each argument is stored unchanged in the field of the same name; no validation is performed.
 *
 * @param maxAcceptableAspectRatio exclusive upper bound on width/height
 * @param minAcceptableAspectRatio exclusive lower bound on width/height
 * @param preferedImageSizeInPixels fallback pixel area used when scoring size
 * @param minAcceptableImageHeight exclusive lower bound on image height
 * @param minAcceptableImageWidth exclusive lower bound on image width
 * @param numberOfImagesBoostedAfterTitle how many images after the title receive a position boost
 * @param minAcceptableImageSizeInBytes exclusive lower bound on header content length
 * @param minScoreThreshold exclusive lower bound on the total score in metadata-only filtering
 * @param allowAllImageFormats true to disable the format filter
 * @param positionScoreWeight multiplier for the position score
 * @param formatScoreWeight multiplier for the format score
 * @param sizeScoreWeight multiplier for the size score
 * @param filenameScoreWeight multiplier for the filename score
 * @param attributeScoreWeight multiplier for the alt/title attribute score
 */
public ImageFilter(int maxAcceptableAspectRatio,
                   double minAcceptableAspectRatio,
                   int preferedImageSizeInPixels,
                   int minAcceptableImageHeight,
                   int minAcceptableImageWidth,
                   int numberOfImagesBoostedAfterTitle,
                   int minAcceptableImageSizeInBytes,
                   double minScoreThreshold,
                   boolean allowAllImageFormats,
                   int positionScoreWeight,
                   double formatScoreWeight,
                   int sizeScoreWeight,
                   double filenameScoreWeight,
                   double attributeScoreWeight)
{
  // acceptance thresholds
  this._maxAcceptableAspectRatio = maxAcceptableAspectRatio;
  this._minAcceptableAspectRatio = minAcceptableAspectRatio;
  this._preferedImageSizeInPixels = preferedImageSizeInPixels;
  this._minAcceptableImageHeight = minAcceptableImageHeight;
  this._minAcceptableImageWidth = minAcceptableImageWidth;
  this._numberOfImagesBoostedAfterTitle = numberOfImagesBoostedAfterTitle;
  this._minAcceptableImageSizeInBytes = minAcceptableImageSizeInBytes;
  this._minScoreThreshold = minScoreThreshold;
  this._allowAllImageFormats = allowAllImageFormats;

  // score weights
  this._positionScoreWeight = positionScoreWeight;
  this._formatScoreWeight = formatScoreWeight;
  this._sizeScoreWeight = sizeScoreWeight;
  this._filenameScoreWeight = filenameScoreWeight;
  this._attributeScoreWeight = attributeScoreWeight;
}
/**
 * Creates a filter using the built-in default thresholds and weights listed below.
 */
public ImageFilter()
{
  this(
      4,     // maxAcceptableAspectRatio
      0.25,  // minAcceptableAspectRatio
      6400,  // preferedImageSizeInPixels
      50,    // minAcceptableImageHeight
      50,    // minAcceptableImageWidth
      2,     // numberOfImagesBoostedAfterTitle
      2048,  // minAcceptableImageSizeInBytes
      0.1,   // minScoreThreshold
      true,  // allowAllImageFormats
      5,     // positionScoreWeight
      1.5,   // formatScoreWeight
      1,     // sizeScoreWeight
      0.3,   // filenameScoreWeight
      0);    // attributeScoreWeight
}
/**
 * Scores, filters and sorts images using only the metadata available from their IMG tags
 * (URL, declared width/height, alt/title attributes and position relative to the title).
 * <p>
 * Images are first de-duplicated by URL. Images rejected by {@link #accept(ImageMeta)} are dropped,
 * as are images whose declared width or height is 0 or 1. The remaining images are scored and only
 * those scoring strictly above the minimum score threshold are kept, ordered from highest to lowest score.
 *
 * @param extractedContents the extraction result providing the images, base URL and title position
 * @return the result of {@code extractedContents.clone(...)} invoked with the surviving, sorted images
 */
public ExtractedContents sortAndFilterByMetadataOnly(ExtractedContents extractedContents)
{
  List<ImageMeta> uniqueImages = removeDuplicatesByURL(extractedContents.getImages());
  Map<ImageMeta, Double> scores = new HashMap<ImageMeta, Double>();
  List<ImageMeta> candidates = new ArrayList<ImageMeta>();
  List<ImageMeta> results = new ArrayList<ImageMeta>();
  String baseURL = extractedContents.getBaseURL();
  if (baseURL != null)
  {
    baseURL = baseURL.trim();
  }
  String domain = URLUtil.extractDomainFromUrl(baseURL);
  int adjustedTitlePosition = extractedContents.getTitlePosition();
  long maxSize = 0L;
  // Pass 1: drop unacceptable images and find the largest declared pixel area among the survivors.
  for (ImageMeta image : uniqueImages)
  {
    if (accept(image))
    {
      long area = 0L;
      if (image.getWidth() != null && image.getHeight() != null)
      {
        int width = image.getWidth();
        int height = image.getHeight();
        // Declared dimensions of 0 or 1 are skipped outright.
        // NOTE(review): unlike the rejected-image branch below, this skip does not adjust the title position -- confirm that is intended.
        if (width == 1 || height == 1) continue;
        if (width == 0 || height == 0) continue;
        // Widen before multiplying: int * int would overflow for large declared dimensions.
        area = (long) width * height;
      }
      if (area > maxSize) maxSize = area;
      candidates.add(image);
    }
    else if (image.getPosition() > adjustedTitlePosition)
    {
      adjustedTitlePosition++; // we need the title position to be relative only to
                               // acceptable images, so if an image is unacceptable, we
                               // increment to adjust for the gap in the numbering the
                               // unacceptable image creates. This is a bit of a hack...
    }
  }
  // Pass 2: score every candidate (no decoded size is available here) and keep those above the threshold.
  for (ImageMeta image : candidates)
  {
    double score = score(adjustedTitlePosition, domain, image, null, maxSize);
    if (score > _minScoreThreshold)
    {
      scores.put(image, score);
      results.add(image);
    }
  }
  Collections.sort(results, new ScoreComparator(scores));
  return extractedContents.clone(results);
}
/**
 * Filters images using only the data obtained from the HTTP headers of each image.
 * Images are de-duplicated by URL first; an image survives only if a header is present for it
 * in {@code headers} and that header passes {@link #accept(ImageHeader)}. No sorting is applied yet.
 *
 * @param extractedContents the extraction result providing the images
 * @param headers the retrieved header for each image, keyed by image
 * @return the result of {@code extractedContents.clone(...)} invoked with the surviving images
 */
public ExtractedContents sortAndFilterByImageHeaders(ExtractedContents extractedContents, Map<ImageMeta, ImageHeader> headers)
{
  // TODO LOW we could potentially remove duplicates by hash or signature for this case as well
  List<ImageMeta> deduplicated = removeDuplicatesByURL(extractedContents.getImages());
  List<ImageMeta> kept = new ArrayList<ImageMeta>();
  for (ImageMeta candidate : deduplicated)
  {
    ImageHeader candidateHeader = headers.get(candidate);
    if (candidateHeader == null) continue;    // no header was retrieved for this image
    if (!accept(candidateHeader)) continue;   // header fails the filter
    kept.add(candidate);
  }
  // TODO: sort
  return extractedContents.clone(kept);
}
/**
 * Filters and sorts images using the size information obtained by retrieving the images themselves.
 * <p>
 * An image survives only if {@code imagesContents} has an entry for it that passes
 * {@link #accept(ImageSize)}. Each surviving image has its size set from the entry
 * ({@code image.setSize(imgSize.size)}), is scored against the largest surviving pixel area,
 * and the survivors are ordered from highest to lowest score. No score threshold is applied here.
 *
 * @param extractedContents the extraction result providing the images, base URL and title position
 * @param imagesContents the measured size of each retrieved image, keyed by image
 * @return the result of {@code extractedContents.clone(...)} invoked with the surviving, sorted images
 */
public ExtractedContents sortAndFilterByImageContents(ExtractedContents extractedContents, Map<ImageMeta, ImageSize> imagesContents)
{
  List<ImageMeta> images = extractedContents.getImages();
  //List<ImageMeta> uniqueImages = removeDuplicatesBySignature(removeDuplicatesByURL(images), imagesContents);
  Map<ImageMeta, Double> scores = new HashMap<ImageMeta, Double>();
  List<ImageMeta> results = new ArrayList<ImageMeta>();
  String baseURL = extractedContents.getBaseURL();
  if (baseURL != null)
  {
    baseURL = baseURL.trim();
  }
  String domain = URLUtil.extractDomainFromUrl(baseURL);
  long maxSize = 0L;
  // Pass 1: keep acceptable images and track the largest pixel area among them.
  for (ImageMeta image : images)
  {
    ImageSize imgSize = imagesContents.get(image);
    if (imgSize != null && accept(imgSize))
    {
      // Widen before multiplying so very large images cannot overflow int arithmetic.
      long area = (long) imgSize.width * imgSize.height;
      if (maxSize < area) maxSize = area;
      image.setSize(imgSize.size);
      results.add(image);
    }
  }
  // Pass 2: score each survivor now that the maximum area is known.
  for (ImageMeta result : results)
  {
    ImageSize imgSize = imagesContents.get(result);
    double score = score(extractedContents.getTitlePosition(), domain, result, imgSize, maxSize);
    scores.put(result, score);
  }
  Collections.sort(results, new ScoreComparator(scores));
  return extractedContents.clone(results);
}
// TODO LOW: This might be the wrong approach, rather than just keeping a single copy of each image, should we keep the count for each
// image, we will likely need to either not show images that appear more than once or penalize them when scoring
/**
 * Returns the images in their original order, keeping only the first image seen for each URI.
 * Images whose URI is null are dropped.
 *
 * @param images the images to de-duplicate
 * @return a new list holding the first occurrence of every distinct, non-null URI
 */
public List<ImageMeta> removeDuplicatesByURL(List<ImageMeta> images)
{
  Set<String> seenUris = new HashSet<String>();
  List<ImageMeta> unique = new ArrayList<ImageMeta>();
  for (ImageMeta candidate : images)
  {
    String uri = candidate.getUri();
    if (uri == null) continue;
    // Set.add reports whether the URI was new, so one call both tests and records it.
    if (seenUris.add(uri))
    {
      unique.add(candidate);
    }
  }
  return unique;
}
/**
 * Computes the MD5 digest of the given bytes and renders it as a 32-character,
 * zero-padded, lower-case hexadecimal string.
 *
 * @param bytes the data to fingerprint
 * @return the 32-character hexadecimal MD5 digest
 * @throws Exception if the MD5 algorithm is not available on this platform
 */
public static String generateSignature(byte[] bytes) throws Exception
{
  final MessageDigest md5;
  try
  {
    md5 = MessageDigest.getInstance("MD5");
  }
  catch (NoSuchAlgorithmException e)
  {
    throw new Exception("unable to generate image signature", e);
  }
  md5.reset();
  md5.update(bytes);
  // Signum 1 makes BigInteger treat the digest as an unsigned magnitude.
  String hex = new BigInteger(1, md5.digest()).toString(16);
  // BigInteger drops leading zeros, so left-pad back to the full 32 hex digits.
  StringBuilder signature = new StringBuilder(32);
  for (int missing = 32 - hex.length(); missing > 0; missing--)
  {
    signature.append('0');
  }
  return signature.append(hex).toString();
}
/**
 * Filters out duplicate images by checking if they have the same signature, which means they are
 * almost certainly identical. The signature is the MD5 of the image re-encoded as JPEG, or as PNG
 * when no JPEG writer can handle the image.
 * <p>
 * Images with no entry in {@code imagesContents}, and images whose signature cannot be generated
 * (the failure is logged), are left out of the result.
 *
 * @param images the images to de-duplicate, in priority order (the first of each duplicate set is kept)
 * @param imagesContents the decoded contents of each image, keyed by image
 * @return a new list holding the first image seen for every distinct signature
 */
public List<ImageMeta> removeDuplicatesBySignature(List<ImageMeta> images, Map<ImageMeta, BufferedImage> imagesContents)
{
  Set<String> signatures = new HashSet<String>();
  List<ImageMeta> results = new ArrayList<ImageMeta>();
  for (ImageMeta image : images)
  {
    BufferedImage imageContents = imagesContents.get(image);
    if (imageContents == null) continue;
    try
    {
      ByteArrayOutputStream baos = new ByteArrayOutputStream(1000);
      // ImageIO.write returns false (writing nothing) when no writer supports the image.
      // Ignoring that would hash an empty buffer and make all such images look identical.
      boolean encoded = ImageIO.write(imageContents, "jpeg", baos);
      if (!encoded)
      {
        baos.reset();
        encoded = ImageIO.write(imageContents, "png", baos);
      }
      if (!encoded)
      {
        logger.error("no image writer available to generate signature for: " + image.getUri());
        continue;
      }
      String signature = generateSignature(baos.toByteArray());
      if (signatures.add(signature))
      {
        results.add(image);
      }
    }
    catch (IOException e)
    {
      logger.error("IOException generating signature for: " + image.getUri(), e);
    }
    catch (Exception e)
    {
      logger.error(" InternalException generating signature for: " + image.getUri(), e);
    }
  }
  return results;
}
/**
 * Orders images by descending score, looking each image's score up in the map supplied at construction.
 * Every compared image must have a score in that map.
 * @version $Revision: 113091 $
 */
private static class ScoreComparator implements Comparator<ImageMeta>
{
  private final Map<ImageMeta, Double> _scores;

  public ScoreComparator(Map<ImageMeta, Double> scores)
  {
    _scores = scores;
  }

  public int compare(ImageMeta image1, ImageMeta image2)
  {
    final Double first = _scores.get(image1);
    final Double second = _scores.get(image2);
    // Comparing the second score against the first yields highest-to-lowest ordering.
    return second.compareTo(first);
  }
}
/**
 * Decides whether an image is acceptable based on its IMG-tag metadata alone.
 * Only the format check is applied; a null image is rejected.
 *
 * @param image the image metadata, may be null
 * @return true if the image is non-null and has an acceptable format
 */
public boolean accept(ImageMeta image)
{
  // using width/height as a filter for html image attributes does not work, it results in to many false negatives
  //return isAcceptableSize(image) &&
  //       isAcceptableAspectRatio(image);
  return image != null && isAcceptableFormat(image);
}
/**
 * Decides whether an image is acceptable based on its measured dimensions.
 *
 * @param imageSize the measured size, may be null
 * @return true if the size is non-null and both its dimensions and its aspect ratio are acceptable
 */
public boolean accept(ImageSize imageSize)
{
  if (imageSize == null)
  {
    return false;
  }
  if (!isAcceptableSize(imageSize.width, imageSize.height))
  {
    return false;
  }
  return isAcceptableAspectRatio(imageSize.width, imageSize.height);
}
/**
 * Decides whether an image is acceptable based on its HTTP header data.
 *
 * @param imageHeader the header data, may be null
 * @return true if the header is non-null and passes the size check
 */
public boolean accept(ImageHeader imageHeader)
{
  return imageHeader != null && isAcceptableSize(imageHeader);
}
/**
 * Combines the attribute, filename, size, format and position scores of an image into one weighted total.
 *
 * @param titlePosition position of the title, used for the position score
 * @param domain currently unused (domain scoring is disabled)
 * @param imageMetadata the IMG-tag metadata of the image being scored
 * @param imageSize the measured size of the image, or null to fall back to the declared width/height attributes
 * @param maxSize the largest pixel area among the images being compared
 * @return the weighted sum of the partial scores
 */
private double score(int titlePosition, String domain, ImageMeta imageMetadata, ImageSize imageSize,long maxSize)
{
  // TODO: weigh the various factors?
  final double attributes = scoreAttributes(imageMetadata);
  //double domainScore = scoreDomain(domain, image); // initial numbers seem to indicate this just makes things worse
  final double filename = scoreImageName(imageMetadata);
  // use the best data available: the actual image's size info if we have it, otherwise the html width/height attributes
  final double size = (imageSize != null)
      ? scoreSize(imageSize.width, imageSize.height, maxSize)
      : scoreSize(imageMetadata.getWidth(), imageMetadata.getHeight(), maxSize);
  final double format = scoreFormat(imageMetadata);
  final double position = scorePosition(imageMetadata, titlePosition);

  double total = attributes * _attributeScoreWeight;
  /* total += domainScore*DOMAIN_SCORE_WEIGHT; */
  total += filename * _filenameScoreWeight;
  total += size * _sizeScoreWeight;
  total += format * _formatScoreWeight;
  total += position * _positionScoreWeight;
  return total;
}
/**
 * Checks the declared width/height attributes of an image against the minimum dimensions.
 *
 * @param image the image metadata
 * @return false if either dimension is missing, otherwise the result of the dimension check
 */
private boolean isAcceptableSize(ImageMeta image)
{
  Integer declaredWidth = image.getWidth();
  Integer declaredHeight = image.getHeight();
  return declaredWidth != null
      && declaredHeight != null
      && isAcceptableSize(declaredWidth, declaredHeight);
}
/**
 * Checks that both dimensions are strictly greater than the configured minimums.
 *
 * @param width image width
 * @param height image height
 * @return true if width and height both exceed their minimum
 */
private boolean isAcceptableSize(int width, int height)
{
  if (width <= _minAcceptableImageWidth)
  {
    return false;
  }
  return height > _minAcceptableImageHeight;
}
/**
 * Checks that the content length reported by the header is strictly greater than the configured minimum.
 *
 * @param imageHeader the header data
 * @return true if the content length exceeds the minimum
 */
private boolean isAcceptableSize(ImageHeader imageHeader)
{
  return _minAcceptableImageSizeInBytes < imageHeader.getContentLength();
}
/**
 * Checks that width/height lies strictly between the configured minimum and maximum aspect ratios.
 * The division is done in floating point, so a zero height produces an infinite or NaN ratio,
 * which fails the bounds check rather than throwing.
 *
 * @param width image width
 * @param height image height
 * @return true if the ratio is strictly inside the configured bounds
 */
private boolean isAcceptableAspectRatio(int width, int height)
{
  final double ratio = (double) width / height;
  if (!(ratio > _minAcceptableAspectRatio))
  {
    return false;
  }
  return ratio < _maxAcceptableAspectRatio;
}
/**
 * Checks whether the image's format, judged from its URI, is acceptable. GIFs are the only rejected format.
 * When all formats are allowed the check is skipped entirely. An image with no URI is rejected,
 * because its format cannot be determined.
 *
 * @param image the image metadata
 * @return true if all formats are allowed, or the URI is present and does not end in ".gif" (any case)
 */
private boolean isAcceptableFormat(ImageMeta image)
{
  // images from a variety of sources, including bloomberg do not include image format as a file extension in the URL... so disabling the filter
  if (_allowAllImageFormats) return true;
  String uri = image.getUri();
  // Without this guard a null URI would reach the fallback below and fail on uri.trim().
  if (uri == null) return false;
  String candidate;
  try
  {
    // we first try getting the path from the URL because this will strip off any query parameters, giving us the best possible filename to check
    candidate = new URL(uri).getPath();
  }
  catch (MalformedURLException e)
  {
    // if we can't work with the path from the URL we fallback to a simple URL string check
    logger.warn("unable to determine format because URL could not be parsed by java.net.URL");
    candidate = uri.trim();
  }
  // TODO: extract image file extension list
  // A fixed locale keeps the comparison stable; under a Turkish default locale "gif" upper-cases to "GİF".
  return !candidate.toUpperCase(Locale.ENGLISH).endsWith(".GIF");
}
/**
 * Scores an image by its pixel area relative to a reference area.
 * The reference is {@code maxSize}, or the configured preferred size when {@code maxSize} is 0.
 *
 * @param width image width, may be null
 * @param height image height, may be null
 * @param maxSize the largest pixel area among the images being compared, or 0 if unknown
 * @return 0 if a dimension is missing, the aspect ratio is unacceptable or no positive reference
 *         area exists; otherwise area / reference area
 */
private double scoreSize(Integer width,Integer height,long maxSize)
{
  if (width == null || height == null) return 0;
  if (!isAcceptableAspectRatio(width, height)) return 0;
  // Multiply as long: int * int would overflow for large dimensions before being widened.
  long size = width.longValue() * height.longValue();
  long reference = (maxSize == 0L) ? _preferedImageSizeInPixels : maxSize;
  // Avoid dividing by zero (Infinity/NaN score) when neither reference area is usable.
  if (reference <= 0L) return 0;
  return (double) size / (double) reference;
}
/**
 * Scores an image by the presence of its alt and title attributes: each non-null attribute contributes 0.5.
 *
 * @param image the image metadata
 * @return 0, 0.5 or 1 depending on how many of the two attributes are present
 */
private double scoreAttributes(ImageMeta image)
{
  // score can be improved by looking at length, then potentially language of text, keywords ...
  double total = 0;
  if (image.getAlt() != null)
  {
    total += .5;
  }
  if (image.getTitle() != null)
  {
    total += .5;
  }
  return total;
}
/**
 * Scores an image by whether its URI's domain equals the given domain.
 *
 * @param domain the domain to compare against
 * @param image the image metadata
 * @return 1 if the domain extracted from the image URI equals {@code domain}; 0 otherwise or when the URI is null
 */
private double scoreDomain(String domain, ImageMeta image)
{
  // TODO: this is way to simple, most images come from CDNs now, we either need to whitelist those (difficult)
  // or blacklist ad sites, but that is also relatively difficult..
  String imgUri = image.getUri();
  if (imgUri == null)
  {
    return 0;
  }
  String imgDomain = URLUtil.extractDomainFromUrl(imgUri.trim());
  if (imgDomain.equals(domain))
  {
    return 1;
  }
  return 0;
}
/**
 * Scores an image by its file name, penalising names that look like social buttons, feeds,
 * logos or spacers (for example "facebook_connect.png"). Matching ignores case.
 * The "file name" is everything from the last '/' of the URI onward, so it includes any text after that slash.
 *
 * @param image the image metadata
 * @return 0 if the URI is null, contains no '/', or its file name contains a blacklisted term; 1 otherwise
 */
private double scoreImageName(ImageMeta image)
{
  String src = image.getUri();
  // A missing URI cannot be inspected; score it like any other unusable name instead of throwing.
  if (src == null) return 0;
  int slashPosition = src.lastIndexOf('/');
  if (slashPosition < 0) return 0;
  // Lower-case with a fixed locale so "Facebook.PNG" or "LOGO.gif" are caught, whatever the default locale.
  String filename = src.substring(slashPosition).toLowerCase(Locale.ENGLISH);
  final String[] blacklist = { "twitter", "facebook", "rss", "logo", "spacer" }; // TODO: put a blacklist somewhere
  for (String term : blacklist)
  {
    if (filename.contains(term)) return 0;
  }
  return 1;
}
/**
 * Scores an image by the file extension found in the path of its URI.
 *
 * @param image the image metadata
 * @return 1 for JPG/JPEG, 0.4 for PNG, 0.3 for BMP, 0 for GIF, 0.1 for any other extension,
 *         and 0 when the URI cannot be parsed as a URL
 */
private double scoreFormat(ImageMeta image)
{
  final String path;
  try
  {
    // A fixed locale keeps the extension checks stable; under a Turkish default locale "gif" upper-cases to "GİF".
    path = new URL(image.getUri()).getPath().toUpperCase(Locale.ENGLISH);
  }
  catch (MalformedURLException e)
  {
    logger.warn("unable to determine format because URL could not be parsed by java.net.URL");
    return 0;
  }
  if (path.endsWith(".JPG") || path.endsWith(".JPEG")) return 1;
  if (path.endsWith(".PNG")) return .4;
  if (path.endsWith(".BMP")) return .3;
  if (path.endsWith(".GIF")) return 0; // we don't like gifs
  return .1; // unknown format-- we score these slightly higher than gifs
}
/**
 * Scores an image by how soon it appears after the title. The first image after the title scores 1
 * and the score falls linearly to 0 over the configured number of boosted images.
 *
 * @param image the image metadata
 * @param titlePosition position of the title, or -1 if unknown
 * @return a value in [0, 1]; 0 when the title position is unknown, boosting is disabled,
 *         or the image lies before the title or beyond the boosted range
 */
private double scorePosition(ImageMeta image, int titlePosition)
{
  if (titlePosition == -1) return 0; // we don't know where the title is so we cannot use it to help with scoring
  // With no boosted images configured the division below would be 0/0 = NaN and poison the total score.
  if (_numberOfImagesBoostedAfterTitle <= 0) return 0;
  // position may be negative if it is before where the article title appears in the body
  // for first pass we are only looking at images appearing after the title.
  int relativePosition = image.getPosition() - (titlePosition + 1);
  if (relativePosition < 0 || relativePosition > _numberOfImagesBoostedAfterTitle) return 0;
  return (double) (_numberOfImagesBoostedAfterTitle - relativePosition) / (double) _numberOfImagesBoostedAfterTitle;
}
}