ParsedUrl.java example

Explorer
manager.v3-master
- projects
// Copyright 2007 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.enterprise.connector.servlet;

import com.google.common.base.Strings;

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Splits a document URL into a connector name and a document id.
 *
 * <p>Two URL shapes are recognized, tried in this order:
 * <ol>
 * <li>URLs that start with {@code ServletUtil.PROTOCOL}. The connector name
 *     is the text following the protocol up to the first {@code '.'} or
 *     {@code '/'}; the docid is the value of the {@code docid} query
 *     parameter of an optional {@code /doc?} path. Both values are used
 *     exactly as they appear in the URL (they are not URL-decoded).</li>
 * <li>{@code http.../getDocumentContent?connectorname=...&docid=...} URLs.
 *     Both values are URL-decoded as UTF-8.</li>
 * </ol>
 *
 * <p>The constructor does not throw on unparseable input; the outcome is
 * reported through {@link #getStatus()}. Instances are immutable.
 */
public class ParsedUrl {
  // TODO (bmj): This discards fragments at the end of the URLs,
  // which possibly makes Issue 214 (b/6514016) worse.
  private static final Pattern GOOGLECONNECTOR_URL_PATTERN =
      Pattern.compile("^" + ServletUtil.PROTOCOL + "([^./]*)(?:[^/]*)?"
          + "(?:/[dD][oO][cC]\\?(?:[^&]*&)*[dD][oO][cC][iI][dD]=([^&#]*))?");

  private static final Pattern RETRIEVER_URL_PATTERN =
      Pattern.compile("^http.+/getDocumentContent\\?[cC][oO][nN][nN][eE][cC]"
          + "[tT][oO][rR][nN][aA][mM][eE]=([^&]*)&[dD][oO][cC][iI][dD]="
          + "([^&#]*)");
  // TODO: We should handle the case where the query parameters are swapped.

  private final int urlStatus;
  private final String url;
  private final String connectorName;
  private final String docid;

  /**
   * Parses the given URL.
   *
   * @param urlparam the URL to parse; {@code null} is treated like a URL
   *     that matches neither pattern
   */
  ParsedUrl(String urlparam) {
    url = urlparam;

    String parsedName = null;
    String parsedDocid = null;

    if (urlparam != null) {
      Matcher matcher = GOOGLECONNECTOR_URL_PATTERN.matcher(urlparam);
      if (matcher.find()) {
        // After a successful find(), group() cannot throw; it returns null
        // for the optional docid group when the "/doc?...docid=" part is
        // absent.
        parsedName = matcher.group(1);
        parsedDocid = matcher.group(2);
      } else {
        // TODO: Use java.net.URI instead of URLDecoder. Better we should
        // write our own RFC 3986 compliant decoder instead.
        matcher = RETRIEVER_URL_PATTERN.matcher(urlparam);
        if (matcher.find()) {
          parsedName = decodeOrNull(matcher.group(1));
          parsedDocid = decodeOrNull(matcher.group(2));
        }
      }
    }

    connectorName = parsedName;
    docid = parsedDocid;

    // A URL that matched neither pattern leaves the connector name null,
    // so it is reported as a missing connector as well.
    if (Strings.isNullOrEmpty(parsedName)) {
      urlStatus = ConnectorMessageCode.RESPONSE_NULL_CONNECTOR;
    } else if (Strings.isNullOrEmpty(parsedDocid)) {
      urlStatus = ConnectorMessageCode.RESPONSE_NULL_DOCID;
    } else {
      urlStatus = ConnectorMessageCode.SUCCESS;
    }
  }

  /**
   * URL-decodes a query parameter value as UTF-8.
   *
   * @param encoded the raw, still-encoded value
   * @return the decoded value, or {@code null} if it cannot be decoded
   */
  private static String decodeOrNull(String encoded) {
    try {
      return URLDecoder.decode(encoded, "UTF-8");
    } catch (UnsupportedEncodingException ignored) {
      // Can't happen with UTF-8.
      return null;
    } catch (IllegalArgumentException malformedEscape) {
      // Malformed percent escape (e.g. "%zz" or a trailing "%"). Leave the
      // value null so that the status code reports the problem instead of
      // the constructor throwing.
      return null;
    }
  }

  /**
   * Returns the connector name extracted from the URL.
   *
   * @return the connector name; may be {@code null} or empty when the
   *     status is not {@code ConnectorMessageCode.SUCCESS}
   */
  public String getConnectorName() {
    return connectorName;
  }

  /**
   * Returns the document id extracted from the URL.
   *
   * @return the docid; may be {@code null} or empty when the status is
   *     not {@code ConnectorMessageCode.SUCCESS}
   */
  public String getDocid() {
    return docid;
  }

  /**
   * Returns the outcome of parsing.
   *
   * @return {@code ConnectorMessageCode.SUCCESS} if both a connector name
   *     and a docid were found,
   *     {@code ConnectorMessageCode.RESPONSE_NULL_CONNECTOR} if the
   *     connector name is missing or empty, otherwise
   *     {@code ConnectorMessageCode.RESPONSE_NULL_DOCID}
   */
  public int getStatus() {
    return urlStatus;
  }

  /**
   * Returns the URL exactly as it was passed to the constructor.
   *
   * @return the original, unparsed URL
   */
  public String getUrl() {
    return url;
  }
}