/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.meaningfulweb.imgext;

import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.math.BigInteger;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.imageio.ImageIO;

import org.meaningfulweb.imgext.ImageSizeExtractor.ImageSize;
import org.meaningfulweb.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Filters and sorts images based on data about the IMG tags from HTML. The filtering and sorting
 * use a variety of heuristics that aim to give the highest score to the IMG most likely to
 * represent the article.
 *
 * @version $Revision: 113091 $
 */
public class ImageFilter {

  private static Logger logger = LoggerFactory.getLogger(ImageFilter.class);

  private final int _maxAcceptableAspectRatio;
  private final double _minAcceptableAspectRatio;
  private final int _preferedImageSizeInPixels;
  private final int _minAcceptableImageHeight;
  private final int _minAcceptableImageWidth;
  // number of images found after the title that get a score boost for proximity to the title
  private final int _numberOfImagesBoostedAfterTitle;
  private final int _minAcceptableImageSizeInBytes;
  private final double _minScoreThreshold;
  private final boolean _allowAllImageFormats;
  private final int _positionScoreWeight;
  private final double _formatScoreWeight;
  private final int _sizeScoreWeight;
  private final double _filenameScoreWeight;
  private final double _attributeScoreWeight;

  public ImageFilter(int maxAcceptableAspectRatio,
                     double minAcceptableAspectRatio,
                     int preferedImageSizeInPixels,
                     int minAcceptableImageHeight,
                     int minAcceptableImageWidth,
                     int numberOfImagesBoostedAfterTitle,
                     int minAcceptableImageSizeInBytes,
                     double minScoreThreshold,
                     boolean allowAllImageFormats,
                     int positionScoreWeight,
                     double formatScoreWeight,
                     int sizeScoreWeight,
                     double filenameScoreWeight,
                     double attributeScoreWeight) {
    _maxAcceptableAspectRatio = maxAcceptableAspectRatio;
    _minAcceptableAspectRatio = minAcceptableAspectRatio;
    _preferedImageSizeInPixels = preferedImageSizeInPixels;
    _minAcceptableImageHeight = minAcceptableImageHeight;
    _minAcceptableImageWidth = minAcceptableImageWidth;
    _numberOfImagesBoostedAfterTitle = numberOfImagesBoostedAfterTitle;
    _minAcceptableImageSizeInBytes = minAcceptableImageSizeInBytes;
    _minScoreThreshold = minScoreThreshold;
    _allowAllImageFormats = allowAllImageFormats;
    _positionScoreWeight = positionScoreWeight;
    _formatScoreWeight = formatScoreWeight;
    _sizeScoreWeight = sizeScoreWeight;
    _filenameScoreWeight = filenameScoreWeight;
    _attributeScoreWeight = attributeScoreWeight;
  }

  public ImageFilter() {
    this(4, 0.25, 6400, 50, 50, 2, 2048, 0.1, true, 5, 1.5, 1, 0.3, 0);
  }

  /**
   * Scores the images on a variety of factors including size, aspect ratio, placement and URL.
   * Sorts the images by score and filters out any that are unacceptable (too small, bad image,
   * anchor, domain, etc.).
   *
   * @param extractedContents
   * @return
   */
  public ExtractedContents sortAndFilterByMetadataOnly(ExtractedContents extractedContents) {
    List<ImageMeta> images = extractedContents.getImages();
    List<ImageMeta> uniqueImages = removeDuplicatesByURL(images);
    Map<ImageMeta, Double> scores = new HashMap<ImageMeta, Double>();
    List<ImageMeta> result1 = new ArrayList<ImageMeta>();
    List<ImageMeta> results = new ArrayList<ImageMeta>();
    String baseURL = extractedContents.getBaseURL();
    if (baseURL != null) {
      baseURL = baseURL.trim();
    }
    String domain = URLUtil.extractDomainFromUrl(baseURL);
    int adjustedTitlePosition = extractedContents.getTitlePosition();
    long maxSize = 0L;
    for (ImageMeta image : uniqueImages) {
      if (accept(image)) {
        long s = 0L;
        if (image.getWidth() != null && image.getHeight() != null) {
          if (image.getWidth() == 1 || image.getHeight() == 1) continue;
          if (image.getWidth() == 0 || image.getHeight() == 0) continue;
          s = image.getWidth() * image.getHeight();
        }
        if (s > maxSize) maxSize = s;
        result1.add(image);
      }
      else if (image.getPosition() > adjustedTitlePosition) {
        // we need the title position to be relative only to acceptable images, so if an image is
        // unacceptable, we increment to adjust for the gap in the numbering the unacceptable
        // image creates. This is a bit of a hack...
        adjustedTitlePosition++;
      }
    }
    for (ImageMeta image : result1) {
      double score = score(adjustedTitlePosition, domain, image, null, maxSize);
      if (score > _minScoreThreshold) {
        scores.put(image, score);
        results.add(image);
      }
    }
    Collections.sort(results, new ScoreComparator(scores));
    return extractedContents.clone(results);
  }

  /**
   * Filters images using data available only by retrieving the HTTP headers for the images.
   * @param extractedContents
   * @param headers
   * @return
   */
  public ExtractedContents sortAndFilterByImageHeaders(ExtractedContents extractedContents,
                                                       Map<ImageMeta, ImageHeader> headers) {
    List<ImageMeta> images = extractedContents.getImages();
    // TODO LOW we could potentially remove duplicates by hash or signature for this case as well
    List<ImageMeta> uniqueImages = removeDuplicatesByURL(images);
    List<ImageMeta> results = new ArrayList<ImageMeta>();
    for (ImageMeta image : uniqueImages) {
      ImageHeader header = headers.get(image);
      if (header != null && accept(header)) {
        results.add(image);
      }
    }
    // TODO: sort
    return extractedContents.clone(results);
  }

  /**
   * Filters images using data available only by retrieving the image contents.
   * @param extractedContents
   * @param imagesContents
   * @return
   */
  public ExtractedContents sortAndFilterByImageContents(ExtractedContents extractedContents,
                                                        Map<ImageMeta, ImageSize> imagesContents) {
    List<ImageMeta> images = extractedContents.getImages();
    //List<ImageMeta> uniqueImages = removeDuplicatesBySignature(removeDuplicatesByURL(images), imagesContents);
    Map<ImageMeta, Double> scores = new HashMap<ImageMeta, Double>();
    List<ImageMeta> results = new ArrayList<ImageMeta>();
    String baseURL = extractedContents.getBaseURL();
    if (baseURL != null) {
      baseURL = baseURL.trim();
    }
    String domain = URLUtil.extractDomainFromUrl(baseURL);
    long maxSize = 0L;
    for (ImageMeta image : images) {
      ImageSize imgSize = imagesContents.get(image);
      if (imgSize != null && accept(imgSize)) {
        long s = imgSize.width * imgSize.height;
        if (maxSize < s) maxSize = s;
        image.setSize(imgSize.size);
        results.add(image);
      }
    }
    for (ImageMeta result : results) {
      ImageSize imgSize = imagesContents.get(result);
      double score = score(extractedContents.getTitlePosition(), domain, result, imgSize, maxSize);
      scores.put(result, score);
    }
    Collections.sort(results, new ScoreComparator(scores));
    return extractedContents.clone(results);
  }

  // TODO LOW: This might be the wrong approach. Rather than just keeping a single copy of each
  // image, should we keep the count for each image? We will likely need to either not show images
  // that appear more than once or penalize them when scoring.
  public List<ImageMeta> removeDuplicatesByURL(List<ImageMeta> images) {
    Set<String> urls = new HashSet<String>();
    ArrayList<ImageMeta> results = new ArrayList<ImageMeta>();
    for (ImageMeta image : images) {
      if (image.getUri() != null && !urls.contains(image.getUri())) {
        urls.add(image.getUri());
        results.add(image);
      }
    }
    return results;
  }

  public static String generateSignature(byte[] bytes) throws Exception {
    MessageDigest md;
    try {
      md = MessageDigest.getInstance("MD5");
      md.reset();
      md.update(bytes);
      byte[] digest = md.digest();
      BigInteger bigInt = new BigInteger(1, digest);
      String hashtext = bigInt.toString(16);
      while (hashtext.length() < 32) {
        hashtext = "0" + hashtext;
      }
      return hashtext;
    } catch (NoSuchAlgorithmException e) {
      throw new Exception("unable to generate image signature", e);
    }
  }

  /**
   * Filters out duplicate images by checking whether they have the same signature, which means
   * they are almost certainly identical.
   * @param images
   * @param imagesContents
   * @return
   */
  public List<ImageMeta> removeDuplicatesBySignature(List<ImageMeta> images,
                                                     Map<ImageMeta, BufferedImage> imagesContents) {
    Set<String> signatures = new HashSet<String>();
    ArrayList<ImageMeta> results = new ArrayList<ImageMeta>();
    for (ImageMeta image : images) {
      BufferedImage imageContents = imagesContents.get(image);
      if (imageContents == null) continue;
      byte[] ds;
      String signature = null;
      try {
        ByteArrayOutputStream baos = new ByteArrayOutputStream(1000);
        ImageIO.write(imageContents, "jpeg", baos);
        baos.flush();
        ds = baos.toByteArray();
        baos.close();
        signature = generateSignature(ds);
        if (signature != null && !signatures.contains(signature)) {
          signatures.add(signature);
          results.add(image);
        }
      } catch (IOException e) {
        logger.error("IOException generating signature for: " + image.getUri(), e);
      } catch (Exception e) {
        logger.error("InternalException generating signature for: " + image.getUri(), e);
      }
    }
    return results;
  }

  /**
   * Sorts entries using the provided scores.
   * @version $Revision: 113091 $
   */
  private static class ScoreComparator implements Comparator<ImageMeta> {

    private final Map<ImageMeta, Double> _scores;

    public ScoreComparator(Map<ImageMeta, Double> scores) {
      _scores = scores;
    }

    public int compare(ImageMeta image1, ImageMeta image2) {
      final Double score1 = _scores.get(image1);
      final Double score2 = _scores.get(image2);
      // arguments swapped here to sort by score from highest to lowest
      return Double.compare(score2, score1);
    }
  }

  public boolean accept(ImageMeta image) {
    if (image == null) return false;
    return isAcceptableFormat(image);
    // using width/height as a filter for html image attributes does not work; it results in too
    // many false negatives
    //return isAcceptableSize(image) &&
    //       isAcceptableAspectRatio(image);
  }

  public boolean accept(ImageSize imageSize) {
    if (imageSize == null) return false;
    return isAcceptableSize(imageSize.width, imageSize.height)
        && isAcceptableAspectRatio(imageSize.width, imageSize.height);
  }

  public boolean accept(ImageHeader imageHeader) {
    if (imageHeader == null) return false;
    return isAcceptableSize(imageHeader);
  }

  private double score(int titlePosition, String domain, ImageMeta imageMetadata,
                       ImageSize imageSize, long maxSize) {
    // TODO: weigh the various factors?
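    // The final score is a weighted sum of the individual heuristic scores below;
    // the weights come from the constructor arguments.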
    double attributesScore = scoreAttributes(imageMetadata);
    //double domainScore = scoreDomain(domain, image); // initial numbers seem to indicate this just makes things worse
    double filenameScore = scoreImageName(imageMetadata);
    double sizeScore;
    // use the best data available: if we have the actual image, use its size info, otherwise try
    // to use the html width/height attributes
    if (imageSize != null) {
      sizeScore = scoreSize(imageSize.width, imageSize.height, maxSize);
    }
    else {
      sizeScore = scoreSize(imageMetadata.getWidth(), imageMetadata.getHeight(), maxSize);
    }
    double formatScore = scoreFormat(imageMetadata);
    double positionScore = scorePosition(imageMetadata, titlePosition);
    double score = attributesScore * _attributeScoreWeight +
                   /*domainScore*DOMAIN_SCORE_WEIGHT +*/
                   filenameScore * _filenameScoreWeight +
                   sizeScore * _sizeScoreWeight +
                   formatScore * _formatScoreWeight +
                   positionScore * _positionScoreWeight;
    return score;
  }

  private boolean isAcceptableSize(ImageMeta image) {
    if (image.getWidth() == null || image.getHeight() == null) return false;
    return isAcceptableSize(image.getWidth(), image.getHeight());
  }

  private boolean isAcceptableSize(int width, int height) {
    return (width > _minAcceptableImageWidth) && (height > _minAcceptableImageHeight);
  }

  private boolean isAcceptableSize(ImageHeader imageHeader) {
    return (imageHeader.getContentLength() > _minAcceptableImageSizeInBytes);
  }

  private boolean isAcceptableAspectRatio(int width, int height) {
    double aspectRatio = ((double) width) / height;
    boolean okayRatio = (aspectRatio > _minAcceptableAspectRatio && aspectRatio < _maxAcceptableAspectRatio);
    return okayRatio;
  }

  private boolean isAcceptableFormat(ImageMeta image) {
    // images from a variety of sources, including bloomberg, do not include the image format as a
    // file extension in the URL... so disabling the filter
    if (_allowAllImageFormats) return true;
    String uri = image.getUri();
    try {
      // we first try getting the path from the URL because this will strip off any query
      // parameters, giving us the best possible filename to check
      URL url = new URL(uri);
      String path = url.getPath().toUpperCase();
      // TODO: extract image file extension list
      if (path.endsWith(".GIF")) return false;
      else return true;
    } catch (MalformedURLException e) {
      // if we can't work with the path from the URL we fall back to a simple URL string check
      logger.warn("unable to determine format because URL could not be parsed by java.net.URL");
      String url = uri.trim().toUpperCase();
      if (url.endsWith(".GIF")) return false;
      else return true;
    }
  }

  private double scoreSize(Integer width, Integer height, long maxSize) {
    if (width == null || height == null) return 0;
    long size = width * height;
    if (!isAcceptableAspectRatio(width, height)) return 0;
    long ms = maxSize == 0L ? _preferedImageSizeInPixels : maxSize;
    return (double) size / (double) ms;
  }

  private double scoreAttributes(ImageMeta image) {
    double altScore = (image.getAlt() == null) ? 0 : .5;
    double titleScore = (image.getTitle() == null) ? 0 : .5;
    // score can be improved by looking at length, then potentially language of text, keywords ...
    return altScore + titleScore;
  }

  private double scoreDomain(String domain, ImageMeta image) {
    // TODO: this is way too simple; most images come from CDNs now, so we either need to
    // whitelist those (difficult) or blacklist ad sites, but that is also relatively difficult..
    if (image.getUri() == null) return 0;
    String imgUri = image.getUri();
    if (imgUri != null) {
      imgUri = URLUtil.extractDomainFromUrl(imgUri.trim());
      return imgUri.equals(domain) ? 1 : 0;
    }
    return 0;
  }

  private double scoreImageName(ImageMeta image) {
    // facebook_connect.png
    String src = image.getUri();
    if (src == null) return 0; // guard against a missing URI to avoid a NullPointerException
    int slashPosition = src.lastIndexOf('/');
    if (slashPosition < 0) return 0;
    String filename = src.substring(slashPosition, src.length());
    if (filename.contains("twitter") ||
        filename.contains("facebook") ||
        filename.contains("rss") ||
        filename.contains("logo") ||
        filename.contains("spacer")) return 0; // TODO: put a blacklist somewhere
    return 1;
  }

  private double scoreFormat(ImageMeta image) {
    try {
      String uri = image.getUri();
      URL url = new URL(uri);
      String path = url.getPath().toUpperCase();
      if (path.endsWith(".JPG") || path.endsWith(".JPEG")) return 1;
      if (path.endsWith(".PNG")) return .4;
      if (path.endsWith(".BMP")) return .3;
      if (path.endsWith(".GIF")) return 0; // we don't like gifs
      else return .1; // unknown format; we score these slightly higher than gifs
    } catch (MalformedURLException e) {
      logger.warn("unable to determine format because URL could not be parsed by java.net.URL");
      return 0;
    }
  }

  private double scorePosition(ImageMeta image, int titlePosition) {
    // we don't know where the title is so we cannot use it to help with scoring
    if (titlePosition == -1) return 0;
    // position may be negative if it is before where the article title appears in the body;
    // for the first pass we are only looking at images appearing after the title.
    int relativePosition = image.getPosition() - (titlePosition + 1);
    if (relativePosition < 0 || relativePosition > _numberOfImagesBoostedAfterTitle) return 0;
    return (double) (_numberOfImagesBoostedAfterTitle - relativePosition)
        / (double) _numberOfImagesBoostedAfterTitle;
  }
}
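
// A minimal usage sketch (the ExtractedContents instance is assumed to come from the HTML
// extraction step elsewhere in this package; the "extractedContents" variable is hypothetical):
//
//   ImageFilter filter = new ImageFilter(); // default thresholds and score weights
//   ExtractedContents sorted = filter.sortAndFilterByMetadataOnly(extractedContents);
//   List<ImageMeta> candidates = sorted.getImages(); // highest-scoring images first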