/*
 * Copyright 2012 The Stanford MobiSocial Laboratory
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package mobisocial.musubi.util;

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

import android.graphics.Bitmap;
import android.graphics.Bitmap.CompressFormat;
import android.graphics.BitmapFactory;
import android.util.Log;

/**
 * Fetches a URL and extracts Open Graph style metadata (title, description,
 * canonical url and a thumbnail) from it, guessing from plain HTML tags when
 * the page carries no explicit og: meta tags.
 */
public class OGUtil {
    /** Metadata collected for a shared URL; any field may be null if it was not found. */
    public static class OGData {
        /** Text of the og:title meta tag, or of the page's title element. */
        String mTitle;
        /** Value of the og:url meta tag. */
        String mUrl;
        /** PNG-encoded thumbnail, at most 200 pixels on its longest edge. */
        byte[] mImage;
        /** Value of og:description, falling back to the plain description meta tag. */
        String mDescription;
        /** Raw value of the Content-Type header of the fetched URL. */
        String mMimeType;
    }

    private static final String TAG = "OGUtil";

    /** Longest edge, in pixels, of a generated thumbnail. */
    private static final int MAX_THUMBNAIL_EDGE = 200;
    /** Images narrower or shorter than this are ignored by the biggest-image search. */
    private static final int MIN_IMAGE_EDGE = 32;
    /** Charset used for HTML when the Content-Type header does not name a usable one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    //TODO: these could be better, unit tests as well
    private static final Pattern sTitleRegex =
            Pattern.compile("<\\s*title\\s*>([^<]+)<\\s*/title\\s*>", Pattern.CASE_INSENSITIVE);
    private static final Pattern sImageRegex =
            Pattern.compile("<\\s*img\\s+[^>]+>", Pattern.CASE_INSENSITIVE);
    private static final Pattern sMetaRegex =
            Pattern.compile("<\\s*meta\\s+[^>]+>", Pattern.CASE_INSENSITIVE);
    private static final Pattern sPropertyOfMeta =
            Pattern.compile("\\b(?:name|property)\\s*=\\s*(\"[^\"]+\"|'[^']+')", Pattern.CASE_INSENSITIVE);
    private static final Pattern sContentOfMeta =
            Pattern.compile("\\bcontent\\s*=\\s*(\"[^\"]+\"|'[^']+')", Pattern.CASE_INSENSITIVE);
    private static final Pattern sSrcOfImage =
            Pattern.compile("\\bsrc\\s*=\\s*(\"[^\"]+\"|'[^']+')", Pattern.CASE_INSENSITIVE);
    private static final Pattern sCharsetOfContentType =
            Pattern.compile("\\bcharset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /**
     * Fetches {@code url} and builds an {@link OGData} describing it.
     * <ul>
     * <li>If the URL is itself an image, only the mime type and a thumbnail are filled in.</li>
     * <li>If it is HTML/XHTML, the title and meta tags are parsed; when no og:image could be
     * loaded, the largest image referenced by an img tag (at least 32x32) is used instead.</li>
     * <li>For any other content type only the mime type is filled in.</li>
     * </ul>
     * This method performs blocking network I/O.
     *
     * @param url the address to inspect
     * @return the collected data, or null if the page could not be fetched, has no body or
     *         Content-Type header, or is an image that could not be decoded
     */
    public static OGData getOrGuess(String url) {
        DefaultHttpClient hc = new DefaultHttpClient();
        try {
            return extract(hc, url);
        } finally {
            // the client is private to this call, so release its connections when done
            hc.getConnectionManager().shutdown();
        }
    }

    /** Does the actual work of {@link #getOrGuess(String)} using the supplied client. */
    private static OGData extract(DefaultHttpClient hc, String url) {
        HttpResponse res;
        try {
            res = hc.execute(new HttpGet(url));
        } catch (Exception e) {
            Log.e(TAG, "unable to fetch page to get og tags", e);
            return null;
        }
        String location = url;
        //TODO: if some kind of redirect magic happened, then
        //make the location match that

        HttpEntity he = res.getEntity();
        if (he == null) {
            Log.e(TAG, "page has no content ..abandoning: " + url);
            return null;
        }
        Header content_type = he.getContentType();
        //TODO: check the content directly if they forget the type header
        if (content_type == null || content_type.getValue() == null) {
            Log.e(TAG, "page missing content type ..abandoning: " + url);
            return null;
        }

        OGData og = new OGData();
        og.mMimeType = content_type.getValue();
        // media types are case-insensitive, so compare against a lower-cased copy
        String mime = og.mMimeType.toLowerCase(Locale.US);

        //just make a thumbnail if the shared item is an image
        if (mime.startsWith("image/")) {
            Bitmap b = decodeAndClose(he);
            if (b == null) {
                return null;
            }
            og.mImage = toThumbnailPng(b);
            return og;
        }

        //if its not html, we can't extract more details, the caller
        //should rely on what they already know.
        if (!mime.startsWith("text/html") && !mime.startsWith("application/xhtml")) {
            Log.e(TAG, "shared content is not a known type for meta data processing " + og.mMimeType);
            return og;
        }

        String html;
        InputStream in = null;
        try {
            in = he.getContent();
            html = IOUtils.toString(in, charsetOf(og.mMimeType));
        } catch (Exception e) {
            Log.e(TAG, "failed to read html content", e);
            return og;
        } finally {
            // the body must be closed before the client is reused for image requests
            IOUtils.closeQuietly(in);
        }

        Matcher m = sTitleRegex.matcher(html);
        if (m.find()) {
            og.mTitle = StringEscapeUtils.unescapeHtml4(m.group(1));
        }

        String raw_description = null;
        m = sMetaRegex.matcher(html);
        while (m.find()) {
            String meta_tag = m.group();
            String type = quotedValue(sPropertyOfMeta, meta_tag);
            if (type == null) {
                continue;
            }
            String data = quotedValue(sContentOfMeta, meta_tag);
            if (data == null) {
                continue;
            }
            data = StringEscapeUtils.unescapeHtml4(data);

            if (type.equalsIgnoreCase("og:title")) {
                og.mTitle = data;
            } else if (type.equalsIgnoreCase("og:image")) {
                // a broken og:image must not discard the rest of the metadata
                byte[] png = fetchThumbnail(hc, location, data);
                if (png != null) {
                    og.mImage = png;
                }
            } else if (type.equalsIgnoreCase("description")) {
                raw_description = data;
            } else if (type.equalsIgnoreCase("og:description")) {
                og.mDescription = data;
            } else if (type.equalsIgnoreCase("og:url")) {
                og.mUrl = data;
            }
        }

        if (og.mImage == null) {
            og.mImage = findBiggestImage(hc, location, html);
        }
        if (og.mDescription == null) {
            og.mDescription = raw_description;
        }
        return og;
    }

    /**
     * Scans the img tags of {@code html} and returns a PNG thumbnail of the one with the
     * largest pixel area, or null if none of at least 32x32 could be loaded.
     */
    private static byte[] findBiggestImage(DefaultHttpClient hc, String location, String html) {
        HashSet<String> already_fetched = new HashSet<String>();
        byte[] best = null;
        // area of the unscaled image behind 'best', so candidates are compared like for like
        long max_area = 0;

        Matcher m = sImageRegex.matcher(html);
        while (m.find()) {
            String img_src = quotedValue(sSrcOfImage, m.group());
            if (img_src == null) {
                continue;
            }
            img_src = StringEscapeUtils.unescapeHtml4(img_src);
            //don't fetch an image twice (like little 1x1 images)
            if (!already_fetched.add(img_src)) {
                continue;
            }
            String image_url = resolve(location, img_src);
            if (image_url == null) {
                continue;
            }
            Bitmap b = fetchBitmap(hc, image_url);
            if (b == null) {
                continue;
            }
            int w = b.getWidth();
            int h = b.getHeight();
            long area = (long) w * h;
            if (area <= max_area || w < MIN_IMAGE_EDGE || h < MIN_IMAGE_EDGE) {
                b.recycle();
                continue;
            }
            byte[] png = toThumbnailPng(b);
            if (png != null) {
                best = png;
                max_area = area;
            }
        }
        return best;
    }

    /** Resolves {@code ref} against {@code location}, downloads it and returns its PNG thumbnail, or null. */
    private static byte[] fetchThumbnail(DefaultHttpClient hc, String location, String ref) {
        String image_url = resolve(location, ref);
        if (image_url == null) {
            return null;
        }
        Bitmap b = fetchBitmap(hc, image_url);
        return b == null ? null : toThumbnailPng(b);
    }

    /** Downloads and decodes the image at {@code imageUrl}; returns null on any failure. */
    private static Bitmap fetchBitmap(DefaultHttpClient hc, String imageUrl) {
        HttpResponse resi;
        try {
            resi = hc.execute(new HttpGet(imageUrl));
        } catch (Exception e) {
            Log.e(TAG, "unable to fetch image url " + imageUrl, e);
            return null;
        }
        HttpEntity hei = resi.getEntity();
        if (hei == null) {
            Log.w(TAG, "image response missing entity ..skipping: " + imageUrl);
            return null;
        }
        Header type = hei.getContentType();
        if (type == null || type.getValue() == null) {
            Log.w(TAG, "image missing content type ..trying anyway: " + imageUrl);
        } else if (!type.getValue().toLowerCase(Locale.US).startsWith("image/")) {
            // still attempt to decode; the header may simply be wrong
            Log.w(TAG, "image tag points to non image data " + type.getValue() + " " + imageUrl);
        }
        return decodeAndClose(hei);
    }

    /** Decodes the entity body as a bitmap and always closes the body; returns null if undecodable. */
    private static Bitmap decodeAndClose(HttpEntity entity) {
        InputStream in = null;
        try {
            in = entity.getContent();
            Bitmap b = BitmapFactory.decodeStream(in);
            if (b == null) {
                Log.e(TAG, "unable to decode image data");
            }
            return b;
        } catch (Exception e) {
            Log.e(TAG, "failed to read image data", e);
            return null;
        } finally {
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * Shrinks {@code source} so its longest edge is at most 200 pixels (never enlarging it),
     * encodes the result as PNG and recycles {@code source}.
     *
     * @return the PNG bytes, or null if scaling or encoding failed
     */
    private static byte[] toThumbnailPng(Bitmap source) {
        Bitmap scaled = null;
        try {
            //TODO: scaling
            int w = source.getWidth();
            int h = source.getHeight();
            int longest = Math.max(w, h);
            int target = Math.min(MAX_THUMBNAIL_EDGE, longest);
            // keep both edges >= 1 so extremely elongated images stay scalable
            int tw = Math.max(1, w * target / longest);
            int th = Math.max(1, h * target / longest);

            scaled = Bitmap.createScaledBitmap(source, tw, th, true);
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            if (!scaled.compress(CompressFormat.PNG, 100, baos)) {
                Log.e(TAG, "failed to compress thumbnail");
                return null;
            }
            return baos.toByteArray();
        } catch (Exception e) {
            Log.e(TAG, "failed to fetch image for og", e);
            return null;
        } finally {
            // createScaledBitmap may hand back 'source' itself when no resize is needed,
            // so it must only be recycled after compression and only once
            if (scaled != null && scaled != source) {
                scaled.recycle();
            }
            source.recycle();
        }
    }

    /** Returns the first quoted attribute value matched by {@code attr} in {@code tag}, without its quotes, or null. */
    private static String quotedValue(Pattern attr, String tag) {
        Matcher m = attr.matcher(tag);
        if (!m.find()) {
            return null;
        }
        String quoted = m.group(1);
        return quoted.substring(1, quoted.length() - 1);
    }

    /** Resolves a possibly relative {@code ref} against {@code base}; returns null if either is malformed. */
    private static String resolve(String base, String ref) {
        try {
            return new URL(new URL(base), ref).toString();
        } catch (MalformedURLException e) {
            Log.e(TAG, "unable to resolve image url " + ref, e);
            return null;
        }
    }

    /** Extracts a supported charset name from a Content-Type value, defaulting to UTF-8. */
    private static String charsetOf(String contentType) {
        Matcher m = sCharsetOfContentType.matcher(contentType);
        if (m.find()) {
            String name = m.group(1);
            try {
                if (Charset.isSupported(name)) {
                    return name;
                }
            } catch (IllegalArgumentException e) {
                // syntactically illegal charset name; fall through to the default
            }
            Log.w(TAG, "unsupported charset " + name + " ..using " + DEFAULT_CHARSET);
        }
        return DEFAULT_CHARSET;
    }
}