// Copyright 2007 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.enterprise.connector.servlet;
import com.google.common.base.Strings;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ParsedUrl {
// TODO (bmj): This discards fragments at the end of the URLs,
// which possibly makes Issue 214 (b/6514016) worse.
private static final Pattern GOOGLECONNECTOR_URL_PATTERN =
Pattern.compile("^" + ServletUtil.PROTOCOL + "([^./]*)(?:[^/]*)?"
+ "(?:/[dD][oO][cC]\\?(?:[^&]*&)*[dD][oO][cC][iI][dD]=([^]*))?");
private static final Pattern RETRIEVER_URL_PATTERN =
Pattern.compile("^http.+/getDocumentContent\\?[cC][oO][nN][nN][eE][cC]"
+ "[tT][oO][rR][nN][aA][mM][eE]=([^&]*)&[dD][oO][cC][iI][dD]="
+ "([^]*)");
// TODO: We should handle the case where the query parameters are swapped.
private int urlStatus = ConnectorMessageCode.SUCCESS;
private String url = null;
private String connectorName = null;
private String docid = null;
ParsedUrl(String urlparam) {
url = urlparam;
Matcher matcher = GOOGLECONNECTOR_URL_PATTERN.matcher(url);
boolean found = matcher.find();
if (found) {
try {
connectorName = matcher.group(1);
} catch (IllegalStateException e) {
// just leave the connectorName null - we'll catch the error later
}
try {
docid = matcher.group(2);
} catch (IllegalStateException e) {
// just leave the docid null - we'll catch the error later
}
} else {
// TODO: Use java.net.URI instead of URLDecoder. Better we should write
// our own RFC 3986 compliant decoder instead.
matcher = RETRIEVER_URL_PATTERN.matcher(url);
found = matcher.find();
if (found) {
try {
connectorName = URLDecoder.decode(matcher.group(1), "UTF-8");
} catch (UnsupportedEncodingException ignored) {
// Can't happen with UTF-8.
} catch (IllegalStateException e) {
// just leave the connectorName null - we'll catch the error later
}
try {
docid = URLDecoder.decode(matcher.group(2), "UTF-8");
} catch (UnsupportedEncodingException ignored) {
// Can't happen with UTF-8.
} catch (IllegalStateException e) {
// just leave the docid null - we'll catch the error later
}
}
}
if (!found || Strings.isNullOrEmpty(connectorName)) {
urlStatus = ConnectorMessageCode.RESPONSE_NULL_CONNECTOR;
} else if (Strings.isNullOrEmpty(docid)) {
urlStatus = ConnectorMessageCode.RESPONSE_NULL_DOCID;
}
}
public String getConnectorName() {
return connectorName;
}
public String getDocid() {
return docid;
}
public int getStatus() {
return urlStatus;
}
public String getUrl() {
return url;
}
}