/*
 * Copyright 2011 ArcBees Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.gwtplatform.crawler.server;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.inject.Inject;
import javax.inject.Singleton;
import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

/**
 * Servlet filter that makes this application crawlable.
 * <p>
 * Requests whose query string contains {@code _escaped_fragment_=} are not
 * passed down the filter chain. Instead, the equivalent {@code #!} URL is
 * rebuilt and handed to an external service, whose response is written back
 * as the response body. All other requests are forwarded untouched.
 */
@Singleton
public final class CrawlFilter implements Filter {

  private static final String CHAR_ENCODING = "UTF-8";

  /**
   * Special URL token that gets passed from the crawler to the servlet filter.
   * This form (no leading {@code &}) is the one found when the token is the
   * first query parameter, i.e. there are no other query parameters before it.
   */
  private static final String ESCAPED_FRAGMENT_FORMAT1 = "_escaped_fragment_=";

  /**
   * Special URL token that gets passed from the crawler to the servlet filter.
   * This form (with a leading {@code &}) is the one found when other query
   * parameters precede the token.
   */
  private static final String ESCAPED_FRAGMENT_FORMAT2 = "&" + ESCAPED_FRAGMENT_FORMAT1;

  private static final int ESCAPED_FRAGMENT_LENGTH1 = ESCAPED_FRAGMENT_FORMAT1.length();
  private static final int ESCAPED_FRAGMENT_LENGTH2 = ESCAPED_FRAGMENT_FORMAT2.length();

  /**
   * First response line on which the request to the external service is
   * repeated instead of being copied to the client.
   */
  private static final String FETCH_IN_PROGRESS = "FETCH_IN_PROGRESS";

  /** Connect and read timeout, in milliseconds, for calls to the external service. */
  private static final int SERVICE_TIMEOUT_MILLIS = 10000;

  private final String serviceUrl;
  private final String key;
  private final Logger log;

  @Inject
  CrawlFilter(@ServiceUrl String serviceUrl, @ServiceKey String key, Logger log) {
    this.serviceUrl = serviceUrl;
    this.key = key;
    this.log = log;
  }

  /**
   * Maps from the query string that contains _escaped_fragment_ to one that
   * doesn't, but is instead followed by a hash fragment. It also unescapes any
   * characters that were escaped by the crawler. If the query string does not
   * contain _escaped_fragment_, it is not modified.
   *
   * @param queryString query string, must not be {@code null}
   * @return A modified query string (prefixed with {@code ?} when parameters
   *         precede the token) followed by a {@code #!} hash fragment if the
   *         decoded fragment is not empty. The non-modified query string
   *         otherwise.
   * @throws UnsupportedEncodingException if the UTF-8 encoding is not supported
   */
  private static String rewriteQueryString(String queryString)
      throws UnsupportedEncodingException {
    // Look for the "&"-prefixed form first so that the separator is not left
    // dangling at the end of the preceding parameters.
    int index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT2);
    int length = ESCAPED_FRAGMENT_LENGTH2;
    if (index == -1) {
      index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT1);
      length = ESCAPED_FRAGMENT_LENGTH1;
    }
    if (index == -1) {
      return queryString;
    }

    StringBuilder rewritten = new StringBuilder();
    if (index > 0) {
      // Keep whatever parameters came before the token.
      rewritten.append("?");
      rewritten.append(queryString.substring(0, index));
    }
    String hashFragmentWithoutBang =
        URLDecoder.decode(queryString.substring(index + length), CHAR_ENCODING).trim();
    if (hashFragmentWithoutBang.length() > 0) {
      rewritten.append("#!");
      rewritten.append(hashFragmentWithoutBang);
    }
    return rewritten.toString();
  }

  /**
   * Tells whether an I/O failure should be treated as a timeout, in which case
   * the request to the external service is retried.
   *
   * @param exception the failure raised while talking to the external service
   * @return {@code true} if the exception message contains {@code "Timeout"};
   *         {@code false} otherwise, including when there is no message
   */
  private static boolean isTimeout(IOException exception) {
    // getMessage() may legitimately be null; treat that as "not a timeout"
    // rather than failing with a NullPointerException.
    String message = exception.getMessage();
    return message != null && message.contains("Timeout");
  }

  /**
   * Does nothing; this filter holds no resources that need releasing.
   */
  @Override
  public void destroy() {
  }

  /**
   * Filters all requests and invokes the external service if necessary.
   * <p>
   * When the query string contains {@code _escaped_fragment_=}, the response
   * of the external service is written as UTF-8 HTML and the filter chain is
   * not invoked. I/O failures while doing so are logged and not propagated.
   * Otherwise the request is passed down the chain unchanged.
   *
   * @param request the incoming request, expected to be an {@link HttpServletRequest}
   * @param response the outgoing response, expected to be an {@link HttpServletResponse}
   * @param chain the remainder of the filter chain
   * @throws IOException if obtaining the response writer or the chain fails
   * @throws ServletException if the chain fails
   */
  @Override
  public void doFilter(ServletRequest request, ServletResponse response, FilterChain chain)
      throws IOException, ServletException {
    HttpServletRequest req = (HttpServletRequest) request;
    HttpServletResponse res = (HttpServletResponse) response;
    String queryString = req.getQueryString();

    // Does this request contain an _escaped_fragment_?
    if (queryString == null || !queryString.contains(ESCAPED_FRAGMENT_FORMAT1)) {
      chain.doFilter(request, response);
      return;
    }

    res.setHeader("Content-Type", "text/html; charset=" + CHAR_ENCODING);
    res.setCharacterEncoding(CHAR_ENCODING);
    PrintWriter writer = res.getWriter();
    try {
      String pageName = buildPageUrl(req, queryString);
      log.info("Crawl filter encountered escaped fragment, will open: " + pageName);

      String encodedPageName = URLEncoder.encode(pageName, CHAR_ENCODING);
      String serviceRequest = serviceUrl + "?key=" + URLEncoder.encode(key, CHAR_ENCODING)
          + "&url=" + encodedPageName;
      // The service key is a credential: keep it out of the logs.
      log.info("Full service request: " + serviceUrl + "?key=<redacted>&url="
          + encodedPageName);

      streamRenderedPage(serviceRequest, writer);
    } catch (IOException e) {
      log.log(Level.SEVERE, "Crawl filter failed to retrieve the page from the service.", e);
    } finally {
      writer.close();
    }
    log.info("Crawl filter exiting, no chaining.");
  }

  /**
   * Rebuilds the absolute URL of the requested page, with the escaped fragment
   * turned back into a hash fragment.
   *
   * @param req the incoming request
   * @param queryString the raw query string of {@code req}, containing the token
   * @return scheme, server name, port (unless it is 0), request URI and the
   *         rewritten query string, concatenated
   * @throws UnsupportedEncodingException if the UTF-8 encoding is not supported
   */
  private static String buildPageUrl(HttpServletRequest req, String queryString)
      throws UnsupportedEncodingException {
    StringBuilder pageUrl = new StringBuilder(req.getScheme() + "://");
    pageUrl.append(req.getServerName());
    int port = req.getServerPort();
    if (port != 0) {
      pageUrl.append(":");
      pageUrl.append(port);
    }
    pageUrl.append(req.getRequestURI());
    pageUrl.append(rewriteQueryString(queryString));
    return pageUrl.toString();
  }

  /**
   * Calls the external service and copies its response, line by line, to the
   * given writer. The call is repeated for as long as the first response line
   * is {@code FETCH_IN_PROGRESS} or the call fails with a timeout.
   *
   * @param serviceRequest full URL of the request to the external service
   * @param writer destination for the response lines
   * @throws IOException if the call fails for a reason other than a timeout
   */
  private static void streamRenderedPage(String serviceRequest, PrintWriter writer)
      throws IOException {
    // Retry until we're cut off
    while (true) {
      BufferedReader reader = null;
      try {
        URL url = new URL(serviceRequest);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setConnectTimeout(SERVICE_TIMEOUT_MILLIS);
        connection.setReadTimeout(SERVICE_TIMEOUT_MILLIS);
        connection.setRequestProperty("charset", CHAR_ENCODING);
        reader = new BufferedReader(
            new InputStreamReader(connection.getInputStream(), CHAR_ENCODING));

        String line = reader.readLine();
        if (!FETCH_IN_PROGRESS.equals(line)) {
          // An empty response yields a null first line: write nothing in that
          // case instead of the literal text "null".
          while (line != null) {
            writer.println(line);
            line = reader.readLine();
          }
          return;
        }
      } catch (IOException exception) {
        if (!isTimeout(exception)) {
          throw exception;
        }
      } finally {
        if (reader != null) {
          reader.close();
        }
      }
    }
  }

  /**
   * Does nothing; this filter needs no initialization.
   *
   * @param filterConfig ignored
   */
  @Override
  public void init(FilterConfig filterConfig) throws ServletException {
  }
}