/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.meaningfulweb.imgext;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.meaningfulweb.imgext.ImageSizeExtractor.ImageSize;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Manages selection of the best images from a list of candidate images. The {@link ImageFilter} is used to sort and
 * filter the candidates, and the {@link ImageSizeExtractor} is used to fetch size information for selected images so
 * that the sorting and filtering can be refined.
 */
public class ImageSelector {

  private static final Logger logger = LoggerFactory.getLogger(ImageSelector.class);

  private final ImageFilter _imageFilter;
  private final ImageSizeExtractor _imageSizeExtractor;
  private final int _imagesToFetchPerArticle;
  private final int _maxImagesToScrape;

  /**
   * Creates a selector that fetches at most 10 images per article and returns at most 7 images.
   *
   * NOTE(review): this forwards an {@code ImageFetcher} to a parameter declared as {@code ImageSizeExtractor};
   * presumably {@code ImageFetcher} is a subtype of it -- confirm.
   *
   * @param imageFilter filter used to sort and accept/reject images
   * @param imageFetcher used to obtain image sizes
   */
  public ImageSelector(ImageFilter imageFilter, ImageFetcher imageFetcher) {
    this(imageFilter, imageFetcher, 10, 7);
  }

  /**
   * Creates a selector with explicit limits.
   *
   * @param imageFilter filter used to sort and accept/reject images
   * @param imageSizeExtractor used to obtain image sizes
   * @param imagesToFetchPerArticle number of leading images whose size is fetched when filtering by image contents
   * @param maxImagesToScrape upper bound on the number of images returned by {@link #fetchAndSortImages}
   */
  public ImageSelector(ImageFilter imageFilter, ImageSizeExtractor imageSizeExtractor, int imagesToFetchPerArticle,
      int maxImagesToScrape) {
    _imageFilter = imageFilter;
    _imageSizeExtractor = imageSizeExtractor;
    _imagesToFetchPerArticle = imagesToFetchPerArticle;
    _maxImagesToScrape = maxImagesToScrape;
  }

  /**
   * Given the extracted contents of a web page, sorts and filters the images by their HTML metadata, then optionally
   * fetches size information for the leading images and filters them again. If the extracted contents carry a
   * thumbnail image URL, that image is placed first in the result (subject to the filter when
   * {@code filterFirstImage} is set).
   *
   * @param extractedContents the extracted page contents holding the candidate images
   * @param articleURL URL of the article; not used by this method
   * @param filterByImageContents if {@code true} and more than one image is present, the first
   *        {@code imagesToFetchPerArticle} images are size-checked and re-sorted by the filter
   * @param filterFirstImage if {@code true} (and {@code filterByImageContents} did not apply) and more than one image
   *        is present, leading images are dropped until one is accepted by the filter; also controls whether the
   *        thumbnail image must be accepted by the filter
   * @return at most {@code maxImagesToScrape} images, best first
   */
  public List<ImageInfo> fetchAndSortImages(ExtractedContents extractedContents, String articleURL,
      boolean filterByImageContents, boolean filterFirstImage) {
    // Metadata-only sort/filter is always done first; the lists involved are very small.
    ExtractedContents sortedByMetadata = _imageFilter.sortAndFilterByMetadataOnly(extractedContents);

    List<ImageMeta> images;
    int count = sortedByMetadata._images.size();
    if (filterByImageContents && count > 1) {
      // Fetch sizes for the first N images only; the remainder keep their metadata order and are appended as-is.
      ExtractedContents imagesToFetch = sortedByMetadata.subList(0, 0, 0, _imagesToFetchPerArticle);
      List<ImageMeta> allImages = sortedByMetadata.getImages();
      List<ImageMeta> imagesNotToFetch =
          allImages.subList(Math.min(allImages.size(), _imagesToFetchPerArticle), allImages.size());
      Map<ImageMeta, ImageSize> fetchedImages = _imageSizeExtractor.extractSize(imagesToFetch.getImages());
      ExtractedContents imagesSortedByContents =
          _imageFilter.sortAndFilterByImageContents(imagesToFetch, fetchedImages);
      images = new ArrayList<ImageMeta>(imagesSortedByContents.getImages());
      images.addAll(imagesNotToFetch);
    } else if (filterFirstImage && count > 1) {
      images = filterFirstImage(new ArrayList<ImageMeta>(sortedByMetadata.getImages()));
    } else {
      images = new ArrayList<ImageMeta>(sortedByMetadata.getImages());
    }

    // prefer the image_src if we have it
    String thumbnailUrl = extractedContents.getThumbnailImageUrl();
    if (thumbnailUrl != null) {
      prependThumbnail(images, thumbnailUrl, filterFirstImage);
    }

    // Copy only the retained prefix so the result does not keep the discarded tail alive.
    int limit = Math.min(_maxImagesToScrape, images.size());
    return new ArrayList<ImageInfo>(images.subList(0, limit));
  }

  /**
   * Returns the first image produced by {@link #fetchAndSortImages}, or {@code null} if there is none.
   *
   * @param extractedContents the extracted page contents holding the candidate images
   * @param articleURL URL of the article; passed through to {@link #fetchAndSortImages}
   * @param filterByImageContents see {@link #fetchAndSortImages}
   * @param filterFirstImage see {@link #fetchAndSortImages}
   * @return the best image, or {@code null} when no image is available
   */
  public ImageInfo getBestImage(ExtractedContents extractedContents, String articleURL,
      boolean filterByImageContents, boolean filterFirstImage) {
    List<ImageInfo> images =
        fetchAndSortImages(extractedContents, articleURL, filterByImageContents, filterFirstImage);
    return images.isEmpty() ? null : images.get(0);
  }

  /**
   * Fetches the size of the thumbnail image and, if it is acceptable, inserts it at the head of {@code images}.
   * Failures are logged and leave {@code images} untouched.
   *
   * @param images mutable list to prepend the thumbnail to
   * @param thumbnailUrl non-null thumbnail image URL
   * @param filterFirstImage when {@code true} the thumbnail must be accepted by the filter to be added
   */
  private void prependThumbnail(List<ImageMeta> images, String thumbnailUrl, boolean filterFirstImage) {
    ImageMeta thumbnail = new ImageMeta(-1, null, null, null, null, null, thumbnailUrl, null);
    try {
      // The size is fetched even when no filtering is requested because it is stored on the thumbnail below.
      ImageSize imgSize = _imageSizeExtractor.extractSize(thumbnail);
      if (!filterFirstImage || _imageFilter.accept(imgSize)) {
        thumbnail.setSize(imgSize.size);
        images.add(0, thumbnail);
      }
    } catch (Exception e) {
      logger.error("failed to fetch image_src image: " + thumbnail.getUri(), e);
    }
  }

  /**
   * Fetches and removes unacceptable images from the beginning of the ordered image list until an acceptable first
   * image has been found. Images whose size cannot be fetched are treated as unacceptable and removed.
   *
   * @param imagesSortedByMetadata mutable list of images, best candidate first; modified in place
   * @return the same list instance, with rejected leading images removed (possibly empty)
   */
  private List<ImageMeta> filterFirstImage(List<ImageMeta> imagesSortedByMetadata) {
    for (Iterator<ImageMeta> imageIter = imagesSortedByMetadata.iterator(); imageIter.hasNext();) {
      ImageMeta image = imageIter.next();
      boolean acceptable = false;
      try {
        ImageSize imageSize = _imageSizeExtractor.extractSize(image);
        acceptable = _imageFilter.accept(imageSize);
      } catch (Exception e) {
        // Best effort: a failed fetch disqualifies this image, but record why instead of dropping it silently.
        logger.warn("failed to fetch image, skipping: " + (image != null ? image.getUri() : null), e);
      }
      if (acceptable) {
        return imagesSortedByMetadata;
      }
      imageIter.remove();
    }
    return imagesSortedByMetadata;
  }
}