/******************************************************************************* * Copyright 2015 htd0324@gmail.com * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package com.laudandjolynn.mytv.utils; import java.io.IOException; import java.net.MalformedURLException; import java.util.Random; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.gargoylesoftware.htmlunit.BrowserVersion; import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; import com.gargoylesoftware.htmlunit.Page; import com.gargoylesoftware.htmlunit.ProxyConfig; import com.gargoylesoftware.htmlunit.WebClient; import com.laudandjolynn.mytv.exception.MyTvException; import com.laudandjolynn.mytv.model.Proxy; import com.laudandjolynn.mytv.proxy.MyTvProxyManager; /** * @author: Laud * @email: htd0324@gmail.com * @date: 2015年3月24日 下午2:25:00 * @copyright: www.laudandjolynn.com */ public class WebCrawler { private final static Logger logger = LoggerFactory .getLogger(WebCrawler.class); private final static BrowserVersion[] USER_AGENTS = new BrowserVersion[] { BrowserVersion.CHROME, BrowserVersion.FIREFOX_24, BrowserVersion.INTERNET_EXPLORER_11 }; /** * 使用htmlunit抓取网页 * * @param url * @return */ public static Page crawl(String url) { return crawl(url, randomUserAgent()); } /** * 使用htmlunit抓取网页 * * @param url * @param userAgent * @param cookie * @return */ public static Page crawl(String url, String userAgent) { return crawl(url, new BrowserVersion(WebCrawler.class.getName(), "1.0", userAgent, 1.0f)); } /** * 使用htmlunit抓取网页 * * @param url * @param browserVersion * @return */ private static Page crawl(String url, BrowserVersion browserVersion) { Proxy proxy = MyTvProxyManager.getInstance().pickProxy(); WebClient webClient = new WebClient(browserVersion); if (proxy != null) { ProxyConfig pc = new ProxyConfig(proxy.getIp(), proxy.getPort()); webClient.getOptions().setProxyConfig(pc); } webClient.getOptions().setJavaScriptEnabled(true); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.setAjaxController(new NicelyResynchronizingAjaxController()); try { logger.info("begin to get page: " + url + (proxy != null ? ", using: " + proxy : "")); return webClient.getPage(url); } catch (FailingHttpStatusCodeException e) { throw new MyTvException("can't connect to " + url, e); } catch (MalformedURLException e) { throw new MyTvException("invalid url " + url, e); } catch (IOException e) { throw new MyTvException("error occur while connect to " + url, e); } finally { webClient.closeAllWindows(); } } /** * 取得随机浏览器标识 * * @return */ private static BrowserVersion randomUserAgent() { Random random = new Random(); int max = USER_AGENTS.length; return USER_AGENTS[random.nextInt(max)]; } }