package org.loklak.harvester;

import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.*;
import org.jsoup.select.Elements;
import org.loklak.http.CookieRequestManager;
import java.util.regex.Matcher;

/**
 * Requests a page through a {@link CookieRequestManager} and returns the HTML
 * fragments that the page carries inside its {@code <script>} elements.
 */
public class WeiboInfoScraper {

    /** Marker that precedes an embedded HTML fragment inside a script body. */
    private static final Pattern HTML_FIELD = Pattern.compile("\"html\":\"");

    /**
     * Number of characters dropped from the end of a script body after the
     * embedded fragment (presumably the closing {@code "})} of the wrapping
     * call -- TODO confirm against a live page).
     */
    private static final int TRAILER_LENGTH = 3;

    /**
     * Extracts the HTML nested inside the script elements of a page.
     *
     * <p>All backslashes are removed from the input before parsing. For every
     * script element whose content contains {@code "html":"}, the text between
     * the first occurrence of that marker and the last three characters of the
     * script content is collected; the collected pieces are concatenated in
     * document order.
     *
     * @param raw_html the page source as returned by the server; may be null
     * @return the concatenated nested HTML, or an empty string when the input
     *         is null/empty or no script carries an embedded fragment
     */
    private static String getNestedHtml(String raw_html) {
        if (raw_html == null || raw_html.isEmpty()) {
            return "";
        }

        // Strip every backslash so the escaped markup becomes parseable.
        String unescaped = raw_html.replace("\\", "");
        Document doc = Jsoup.parse(unescaped);
        Elements scripts = doc.getElementsByTag("script");

        StringBuilder nested = new StringBuilder();
        for (Element script : scripts) {
            String body = script.html();
            Matcher m = HTML_FIELD.matcher(body);
            if (!m.find()) {
                continue;
            }
            int start = m.end();
            int end = body.length() - TRAILER_LENGTH;
            // A script with fewer than TRAILER_LENGTH characters after the
            // marker has no payload; skipping it avoids an out-of-range
            // substring.
            if (end > start) {
                nested.append(body, start, end);
            }
        }
        return nested.toString();
    }

    /** Issues the HTTP requests and holds the cookies sent with them. */
    private final CookieRequestManager manager = new CookieRequestManager();

    public WeiboInfoScraper() {}

    /**
     * Hands a cookie string to the underlying request manager.
     *
     * @param cookie the cookie string to add
     */
    public void addCookie(String cookie) {
        manager.addCookie(cookie);
    }

    /**
     * Requests {@code url} through the request manager and extracts the HTML
     * nested in the script elements of the response body.
     *
     * @param url the address of the page to fetch
     * @return the nested HTML found in the response; empty if there is none
     */
    public String get(String url) {
        return getNestedHtml(manager.buildRequest(url).makeRequest().body());
    }

    /**
     * Manual smoke test: fetches one profile info page with a fixed cookie
     * and prints the extracted HTML to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String test_url = "http://weibo.com/p/1005051666978981/info?mod=pedit_more";
        WeiboInfoScraper scraper = new WeiboInfoScraper();

        // NOTE(review): this is a hard-coded session cookie. Credentials
        // should not live in source control; consider loading it from the
        // environment or a local, untracked configuration file.
        scraper.addCookie(
            "SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5CpyuN-4ys57wHiW6l3Lk35JpX5KMhUgL.FoeE1h2c1Ke7ehM2dJLoI74WMJf0UGiadJUkxsHV97tt; "
            + "SINAGLOBAL=7613128031177.77.1468064007468; "
            + "ULV=1469247293505:4:4:2:6888389703798.142.1469247293469:1469195743689; "
            + "SCF=AiBpidaOFHvAe4IdkfIvwMnQPbwC_X6-mWARH-VfeZgGBg2aI_9nTP3RooOHIZTUgc-HvQ3WJ0i3lJoxBuuV5MI.; "
            + "SUHB=0u5hf-IIrTOdnN; "
            + "wb_bub_hot_3281693007=1; "
            + "UOR=,,developer.51cto.com; "
            + "ALF=1500780807; "
            + "_s_tentry=developer.51cto.com; "
            + "SUB=_2A256lpXXDeTxGeVM41MX-S3MyzuIHXVZ5YAfrDV8PUNbmtBeLXflkW9PNY1IURQCXFeXkAya64Lev-0VGQ..; "
            + "SSOLoginState=1469244807; "
            + "YF-Ugrow-G0=56862bac2f6bf97368b95873bc687eef; "
            + "wvr=6; "
            + "YF-V5-G0=c99031715427fe982b79bf287ae448f6; "
            + "Apache=6888389703798.142.1469247293469; "
            + "YF-Page-G0=c6cf9d248b30287d0e884a20bac2c5ff");

        String nested = scraper.get(test_url);
        System.out.println(nested);
    }
}