package ecologylab.bigsemantics.documentparsers;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.Map;
import java.util.regex.Pattern;
import ecologylab.net.ParsedURL;
/**
*
* @author quyin
*
*/
public class SpecialImageUrlHandler
{
/**
* For Google Image, we need to change the source of the image since we don't support HTTPS.
*
* @param imgSrcAttr
* The HTTPS image source.
* @return The HTTP image source that points to the same image.
*/
public String changeImageUrlIfNeeded(String imgSrcAttr)
{
if (imgSrcAttr != null)
{
if (Pattern.matches("https://encrypted-tbn\\d+.google.com/images?.*", imgSrcAttr)
|| Pattern.matches("https://encrypted-tbn\\d+.gstatic.com/images?.*", imgSrcAttr))
{
imgSrcAttr = imgSrcAttr.replace("https://", "http://");
imgSrcAttr = imgSrcAttr.replace("//encrypted-tbn", "//tbn");
imgSrcAttr = imgSrcAttr.replace("gstatic.com", "google.com");
}
}
return imgSrcAttr;
}
/**
* For some other cases with Google Image, we need to parse the image URL from URL parameters.
*
* @param hrefString
* @return The true image URL if contained in the URL parameters, or null if not contained.
*/
public String getImageUrlFromParameters(String hrefString)
{
if (hrefString != null)
{
if (hrefString.startsWith("http://www.google.com/imgres?")
|| hrefString.startsWith("http://images.google.com/imgres?"))
{
ParsedURL hrefPURL = ParsedURL.getAbsolute(hrefString);
Map<String, String> params = hrefPURL.extractParams(false);
if (params != null && params.containsKey("imgurl"))
return params.get("imgurl");
}
}
return null;
}
/**
* Image ref URL is the URL of the referring page where the image appears in. In some cases, like
* Google Image, this ref URL is encoded as a URL parameter, and we need to extract it.
*
* @param imgHref
* The original image ref URL.
* @param outNewImgHref
* Buffer to hold the real image ref URL. By default it is the same as the imgHref, but
* in cases needed it will be different.
* @return If we should change the image's source_doc to outNewImgHref.
* @throws UnsupportedEncodingException
*/
public boolean changeImageRefUrlAndSourceDocIfNeeded(String imgHref,
StringBuilder outNewImgHref)
throws UnsupportedEncodingException
{
if (imgHref != null)
{
if (imgHref.startsWith("http://www.google.com/imgres?")
|| imgHref.startsWith("http://images.google.com/imgres?"))
{
ParsedURL hrefPURL = ParsedURL.getAbsolute(imgHref);
Map<String, String> params = hrefPURL.extractParams(false);
if (params != null)
{
if (params.containsKey("imgrefurl"))
{
String newImgHref = params.get("imgrefurl");
newImgHref = URLDecoder.decode(newImgHref, "utf-8");
if (outNewImgHref != null)
outNewImgHref.append(newImgHref);
return true;
}
}
}
else
{
if (outNewImgHref != null)
outNewImgHref.append(imgHref);
}
}
return false;
}
}