/** * Copyright 2014 Eediom Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.araqne.logdb.crawler.query; import java.io.ByteArrayOutputStream; import java.io.FileNotFoundException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.security.cert.X509Certificate; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; import javax.net.ssl.HostnameVerifier; import javax.net.ssl.HttpsURLConnection; import javax.net.ssl.SSLContext; import javax.net.ssl.SSLSession; import javax.net.ssl.SSLSocketFactory; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager; import org.araqne.api.Io; import org.araqne.codec.Base64; import org.araqne.logdb.DriverQueryCommand; import org.araqne.logdb.Row; import org.araqne.logdb.Strings; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * @author xeraph@eediom.com */ public class WgetQueryCommand extends DriverQueryCommand { private static long WGET_MAX_SIZE; static { WGET_MAX_SIZE = 10485760; // override default max size String s = System.getProperty("araqne.logdb.wget_max_size"); if (s != null) { try { WGET_MAX_SIZE = Long.valueOf(s); } catch (NumberFormatException e) { } } } private final Logger slog = LoggerFactory.getLogger(WgetQueryCommand.class); private final TrustManager[] trustAllCerts = new TrustManager[] { new IgnoreTrustManager() }; private final HostnameVerifier hostnameVerifier = new IgnoreHostnameVerifier(); private String url; private String selector; private int timeout; private String method; private String encoding; private String auth; private String authHeader; private boolean isHttps; public WgetQueryCommand(String url, String selector, int timeout, String method, String encoding, String auth) { this.url = url; this.selector = selector; this.timeout = timeout; this.method = method; this.encoding = encoding; this.auth = auth; if (auth != null) { try { String base64Auth = new String(Base64.encode(auth.getBytes("utf-8"))); this.authHeader = "Basic " + base64Auth; } catch (UnsupportedEncodingException e) { // unreachable } } this.isHttps = false; try { new URL(url).getProtocol().equals("https"); } catch (MalformedURLException e) { } } @Override public String getName() { return "wget"; } @Override public boolean isDriver() { return url != null; } @Override public void run() { try { Row row = new Row(); if (selector != null) fetchUrlByJsoup(row, url); else fetchUrl(row, url, true); pushPipe(row); } catch (FileNotFoundException e) { throw new IllegalStateException("404 not found: " + e.getMessage()); } catch (Throwable t) { slog.debug("araqne logdb crawler: wget failed - " + url, t); throw new IllegalStateException("wget: " + t.getMessage()); } } @Override public void onPush(Row row) { Object o = row.get("url"); String url = null; try { if (o == null) return; url = o.toString(); if (selector != null) fetchUrlByJsoup(row, url); else fetchUrl(row, url, false); } catch (Throwable t) { if (slog.isDebugEnabled()) slog.debug("araqne logdb crawler: wget failed - " + url, t); } finally { pushPipe(row); } } private void fetchUrl(Row row, String url, boolean throwException) throws Exception { HttpURLConnection conn = null; InputStream is = null; byte[] b = new byte[8096]; ByteArrayOutputStream bos = new ByteArrayOutputStream(10240); int total = 0; try { conn = (HttpURLConnection) new URL(url).openConnection(); if (conn instanceof HttpsURLConnection) { final SSLContext sslContext = SSLContext.getInstance("SSL"); sslContext.init(null, trustAllCerts, new java.security.SecureRandom()); ((HttpsURLConnection) conn).setSSLSocketFactory(sslContext.getSocketFactory()); ((HttpsURLConnection) conn).setHostnameVerifier(hostnameVerifier); } conn.setConnectTimeout(timeout * 1000); conn.setReadTimeout(timeout * 1000); conn.setRequestMethod(method.toUpperCase()); if (authHeader != null) conn.setRequestProperty("Authorization", authHeader); is = conn.getInputStream(); while (true) { int len = is.read(b); if (len < 0) break; total += len; if (total >= WGET_MAX_SIZE) { if (throwException) throw new IllegalStateException("Too large HTTP response, exceeds max size " + WGET_MAX_SIZE); row.put("_wget_error", "exceeds-max-size"); return; } bos.write(b, 0, len); } row.put("line", bos.toString(encoding)); } finally { Io.ensureClose(is); conn.disconnect(); } } private void fetchUrlByJsoup(Row row, String url) throws Exception { SSLSocketFactory oldSocketFactory = null; HostnameVerifier oldHostnameVerifier = null; try { if (isHttps) { oldSocketFactory = HttpsURLConnection.getDefaultSSLSocketFactory(); oldHostnameVerifier = HttpsURLConnection.getDefaultHostnameVerifier(); final SSLContext sslContext = SSLContext.getInstance("SSL"); sslContext.init(null, trustAllCerts, new java.security.SecureRandom()); HttpsURLConnection.setDefaultSSLSocketFactory(sslContext.getSocketFactory()); HttpsURLConnection.setDefaultHostnameVerifier(hostnameVerifier); } Connection conn = Jsoup.connect(url); if (authHeader != null) conn.header("Authorization", authHeader); conn.ignoreContentType(true); conn.timeout(timeout * 1000); Document doc = null; if (method.equals("get")) doc = conn.get(); else if (method.equals("post")) doc = conn.post(); if (doc != null) { if (selector != null) { Elements elements = doc.select(selector); ArrayList<Object> l = new ArrayList<Object>(elements.size()); for (Element e : elements) { Map<String, Object> m = new HashMap<String, Object>(); for (Attribute attr : e.attributes()) { m.put(attr.getKey(), attr.getValue()); } m.put("own_text", e.ownText()); m.put("text", e.text()); l.add(m); } row.put("elements", l); } else { row.put("html", doc.outerHtml()); } } } finally { if (isHttps) { HttpsURLConnection.setDefaultSSLSocketFactory(oldSocketFactory); HttpsURLConnection.setDefaultHostnameVerifier(oldHostnameVerifier); } } } @Override public String toString() { String s = "wget"; if (url != null) s += " url=\"" + url + "\""; if (selector != null) s += " selector=\"" + selector + "\""; if (timeout != 30000) s += " timeout=" + timeout; if (!method.equals("get")) s += " method=" + method; if (!encoding.equals("utf-8")) s += " encoding=" + encoding; if (auth != null) s += " auth=" + Strings.doubleQuote(auth); return s; } static class IgnoreTrustManager implements X509TrustManager { @Override public void checkClientTrusted(final X509Certificate[] chain, final String authType) { } @Override public void checkServerTrusted(final X509Certificate[] chain, final String authType) { } @Override public X509Certificate[] getAcceptedIssuers() { return null; } } static class IgnoreHostnameVerifier implements HostnameVerifier { @Override public boolean verify(String s, SSLSession sslSession) { return true; } } }