/*
* Copyright (c) 2009-2010 Lockheed Martin Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.eurekastreams.server.service.actions.strategies.links;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eurekastreams.server.domain.stream.LinkInformation;
/**
 * Parses images out of HTML.
 *
 * <p>
 * If the link itself points at an image file, that URL is reported as the only image. Otherwise every
 * {@code <img>} tag in the page is inspected, its {@code src} is resolved against the link URL, and images whose
 * height and width (as reported by the {@link ConnectionFacade}) both exceed {@link #MIN_IMG_SIZE} are collected.
 * </p>
 */
public class BasicLinkImageParser implements HtmlLinkInformationParserStrategy
{
    /**
     * Minimum size for images.
     */
    public static final int MIN_IMG_SIZE = 60;

    /**
     * Logger.
     */
    private static final Log LOG = LogFactory.getLog(BasicLinkImageParser.class);

    /**
     * Matches a URL that ends in a known image extension, optionally followed by a query string or fragment.
     * Anchored at the end so that hosts such as "foo.giftshop.com" or paths such as "a.jpg.html" do not match.
     */
    private static final Pattern IMAGE_URL_PATTERN = Pattern.compile("\\.(jpg|jpeg|png|gif)([?#].*)?$",
            Pattern.CASE_INSENSITIVE);

    /**
     * Matches a complete img tag.
     */
    private static final Pattern IMG_TAG_PATTERN = Pattern.compile("<img[^>]*>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches a width or height attribute whose value is a one or two digit number in the range 0-59, i.e. an
     * image that declares itself to be smaller than the minimum size. The value may be quoted or unquoted and may
     * be the last attribute of the tag.
     */
    private static final Pattern SMALL_SIZE_HINT_PATTERN = Pattern.compile(
            "(width|height)\\s*=\\s*[\"']?([0-5]?[0-9])[\"'\\s/>]", Pattern.CASE_INSENSITIVE);

    /**
     * Captures the value of the src attribute (group 1). The closing angle bracket of the tag is excluded so an
     * unquoted value at the end of the tag is captured cleanly.
     */
    private static final Pattern IMG_SRC_PATTERN = Pattern.compile("src\\s*=\\s*[\"']?([^\"'\\s>]*)",
            Pattern.CASE_INSENSITIVE);

    /**
     * URL Utilities.
     */
    private final ConnectionFacade urlUtilities;

    /**
     * The max results to look for. A value of zero or less means no limit is applied.
     */
    private final int maxResults;

    /**
     * Time out in milliseconds.
     */
    private final long timeOut;

    /**
     * Constructor.
     *
     * @param inUrlUtilities
     *            The URL utilities.
     * @param inMaxResults
     *            the max number of images to return.
     * @param inTimeOut
     *            the amount of milliseconds to allow image searching for.
     */
    public BasicLinkImageParser(final ConnectionFacade inUrlUtilities, final int inMaxResults, final long inTimeOut)
    {
        urlUtilities = inUrlUtilities;
        maxResults = inMaxResults;
        timeOut = inTimeOut;
    }

    /**
     * Parse the HTML and store the discovered image URLs (and the largest image) on the link.
     *
     * <p>
     * The search stops early once the configured maximum number of images has been collected, or once the
     * configured time out has elapsed (checked after each img tag has been handled).
     * </p>
     *
     * @param htmlString
     *            the HTML as a string; {@code null} is treated as a page without images.
     * @param link
     *            the link.
     * @param inAccountId
     *            account id of the user making the request for the link image.
     */
    public void parseInformation(final String htmlString, final LinkInformation link, final String inAccountId)
    {
        Set<String> imageUrls = new HashSet<String>();
        String linkUrl = link.getUrl();

        // The link itself is an image: it is the only result and no HTML needs to be scanned.
        if (IMAGE_URL_PATTERN.matcher(linkUrl).find())
        {
            imageUrls.add(linkUrl);
            link.setImageUrls(imageUrls);
            return;
        }

        if (htmlString == null)
        {
            link.setImageUrls(imageUrls);
            return;
        }

        Matcher imgMatcher = IMG_TAG_PATTERN.matcher(htmlString);
        long start = System.currentTimeMillis();
        // Tracked as a long so that height * width cannot overflow.
        long largestImgArea = 0;

        while (imgMatcher.find())
        {
            String src = extractImageSource(imgMatcher.group());
            if (src != null)
            {
                try
                {
                    String imgUrl = resolveImageUrl(src, linkUrl);

                    int height = urlUtilities.getImgHeight(imgUrl, inAccountId);
                    int width = urlUtilities.getImgWidth(imgUrl, inAccountId);

                    if (height > MIN_IMG_SIZE && width > MIN_IMG_SIZE)
                    {
                        long area = (long) height * width;
                        if (area > largestImgArea)
                        {
                            largestImgArea = area;
                            link.setLargestImageUrl(imgUrl);
                        }

                        imageUrls.add(imgUrl);

                        // A non-positive limit never stops the search.
                        if (maxResults > 0 && imageUrls.size() >= maxResults)
                        {
                            break;
                        }
                    }
                }
                catch (IOException e)
                {
                    // Also covers MalformedURLException; one bad image must not abort the whole scan.
                    LOG.error("Error parsing URL: " + src, e);
                }
            }

            if (System.currentTimeMillis() - start > timeOut)
            {
                break;
            }
        }

        link.setImageUrls(imageUrls);
    }

    /**
     * Pulls the src value out of a single img tag.
     *
     * @param imgTag
     *            the text of one img tag.
     * @return the raw src value, or {@code null} when the tag should be ignored: it declares a width or height
     *         below the minimum size, it has no usable src, or its src is an inline data URI.
     */
    private String extractImageSource(final String imgTag)
    {
        // Look for size attributes so images that are clearly too small are never downloaded.
        if (SMALL_SIZE_HINT_PATTERN.matcher(imgTag).find())
        {
            return null;
        }

        Matcher srcMatcher = IMG_SRC_PATTERN.matcher(imgTag);
        if (!srcMatcher.find())
        {
            return null;
        }

        String src = srcMatcher.group(1);
        if (src.length() == 0 || startsWithIgnoreCase(src, "data:"))
        {
            return null;
        }
        return src;
    }

    /**
     * Turns the src value of an img tag into an absolute URL, using the link URL as the base.
     *
     * @param src
     *            the raw, non-empty src value.
     * @param linkUrl
     *            the URL of the page the tag was found on.
     * @return the absolute image URL.
     * @throws IOException
     *             if the URL utilities cannot determine the protocol or host of the link URL.
     */
    private String resolveImageUrl(final String src, final String linkUrl) throws IOException
    {
        if (src.startsWith("//"))
        {
            // Protocol-relative reference: it already names a host and only lacks the scheme.
            LOG.trace("Image URL started with '//'");
            return urlUtilities.getProtocol(linkUrl) + ":" + src;
        }

        if (src.startsWith("/"))
        {
            LOG.trace("Image URL started with '/'");
            return urlUtilities.getProtocol(linkUrl) + "://" + urlUtilities.getHost(linkUrl) + src;
        }

        if (startsWithIgnoreCase(src, "http://") || startsWithIgnoreCase(src, "https://"))
        {
            return src;
        }

        LOG.trace("No protocol found");
        LOG.trace("Link URL: " + linkUrl);

        // Relative references resolve against the directory of the page, ignoring its query string and fragment.
        String base = linkUrl;
        int queryStart = base.indexOf('?');
        if (queryStart != -1)
        {
            base = base.substring(0, queryStart);
        }
        int fragmentStart = base.indexOf('#');
        if (fragmentStart != -1)
        {
            base = base.substring(0, fragmentStart);
        }

        // Only a slash after the "scheme://" prefix marks the start of the path.
        int schemeEnd = base.indexOf("://");
        int pathSearchFrom = 0;
        if (schemeEnd != -1)
        {
            pathSearchFrom = schemeEnd + "://".length();
        }
        int lastSlash = base.lastIndexOf('/');

        String resolved;
        if (lastSlash >= pathSearchFrom)
        {
            resolved = base.substring(0, lastSlash + 1) + src;
        }
        else
        {
            resolved = base + "/" + src;
        }

        LOG.trace("New Image Url: " + resolved);
        return resolved;
    }

    /**
     * Case-insensitive prefix check.
     *
     * @param value
     *            the string to inspect.
     * @param prefix
     *            the prefix to look for.
     * @return true if value starts with prefix, ignoring case.
     */
    private static boolean startsWithIgnoreCase(final String value, final String prefix)
    {
        return value.regionMatches(true, 0, prefix, 0, prefix.length());
    }
}