/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.util;
/**
 * Java-side view of a URL that is parsed by the native {@code GoogleURL_jni}
 * library.
 *
 * <p>An instance holds a canonical URL string plus one
 * {@link GoogleURLComponent} (a {@code begin}/{@code len} pair) per URL part.
 * Every getter slices the canonical string using those offsets.
 *
 * <p>NOTE(review): none of the fields are assigned by Java code in this file;
 * they are presumably filled in by the native {@code initializeFromURL} call.
 * For that reason the field names, types and mutability are left untouched.
 *
 * @author rana
 */
public class GoogleURL {

    /** Value returned by the string getters when a component is absent or empty. */
    public static final String emptyString = "";

    static {
        // Ensure native JNI library is loaded
        System.loadLibrary("GoogleURL_jni");
        internal_init(GoogleURLComponent.class);
    }

    /**
     * Native entry point invoked once per instance from the constructor.
     *
     * @param urlString the URL text handed to the native library
     */
    private native void initializeFromURL(String urlString);

    /**
     * Native one-time setup invoked from the static initializer.
     *
     * <p>Declared with {@code Class<?>} instead of a raw {@code Class}; the
     * erased signature (and therefore the JNI binding) is unchanged.
     *
     * @param componentClass the component class passed to the native side
     */
    private static native void internal_init(Class<?> componentClass);

    /**
     * Creates a URL object by passing {@code urlString} to the native library.
     *
     * @param urlString the URL text to process
     */
    public GoogleURL(String urlString) {
        initializeFromURL(urlString);
    }

    // Identifies different components.
    enum ComponentType {
        SCHEME,
        USERNAME,
        PASSWORD,
        HOST,
        PORT,
        PATH,
        QUERY,
        REF,
    }

    /**
     * @return the validity flag; it is {@code false} unless changed outside the
     *         Java code in this file (presumably by the native initializer)
     */
    public boolean isValid() {
        return _isValid;
    }

    /** @return {@code true} when the scheme component has a non-negative length */
    public boolean has_scheme() {
        return _scheme.len >= 0;
    }

    /** @return {@code true} when the user name component has a non-negative length */
    public boolean has_username() {
        return _userName.len >= 0;
    }

    /** @return {@code true} when the password component has a non-negative length */
    public boolean has_password() {
        return _password.len >= 0;
    }

    /** @return {@code true} when the host component has a strictly positive length */
    public boolean has_host() {
        // Note that hosts are special: absence of a host means length 0.
        return _host.len > 0;
    }

    /** @return {@code true} when the port component has a non-negative length */
    public boolean has_port() {
        return _port.len >= 0;
    }

    /** @return {@code true} when the path component has a non-negative length */
    public boolean has_path() {
        // Note that "http://www.google.com/" has a path, the path is "/". This can
        // return false only for invalid or nonstandard URLs.
        return _path.len >= 0;
    }

    /** @return {@code true} when the query component has a non-negative length */
    public boolean has_query() {
        return _query.len >= 0;
    }

    /** @return {@code true} when the ref component has a non-negative length */
    public boolean has_ref() {
        return _ref.len >= 0;
    }

    // Getters for various components of the URL. The returned string will be
    // empty if the component is empty or is not present.

    /** @return the scheme, not including the colon; empty if absent */
    public String getScheme() {
        return getComponentString(_scheme);
    }

    /** @return the user name; empty if absent */
    public String getUserName() {
        return getComponentString(_userName);
    }

    /** @return the password; empty if absent */
    public String getPassword() {
        return getComponentString(_password);
    }

    /** @return the host; empty if absent */
    public String getHost() {
        return getComponentString(_host);
    }

    /** @return the host component object itself (not a copy) */
    public GoogleURLComponent getHostComponent() {
        return _host;
    }

    /**
     * @return the port as text; the empty string (not {@code -1}) when the port
     *         component is absent or empty
     */
    public String getPort() {
        return getComponentString(_port);
    }

    /** @return the path, including the first slash following the host; empty if absent */
    public String getPath() {
        return getComponentString(_path);
    }

    /** @return the text following '?'; empty if absent */
    public String getQuery() {
        return getComponentString(_query);
    }

    /**
     * Returns the path and query as one string, e.g. {@code /path?query}.
     *
     * <p>The slice starts at the path (or one character before the query when
     * there is no path) and spans the path length plus, when a non-empty query
     * exists, the query length and one extra character. This assumes the query
     * immediately follows the path with a single separator in the canonical
     * string.
     *
     * @return the combined text, or the empty string when the canonical URL is
     *         {@code null} or neither component has a positive length
     */
    public String getPathAndQuery() {
        if (_canonicalURL == null) {
            return emptyString;
        }

        final boolean pathPresent = _path.len > 0;
        final boolean queryPresent = _query.len > 0;
        if (!pathPresent && !queryPresent) {
            return emptyString;
        }

        // Step back one character when starting from the query so the separator
        // preceding it is included.
        final int start = pathPresent ? _path.begin : _query.begin - 1;
        if (start == -1) {
            return emptyString;
        }

        int length = pathPresent ? _path.len : 0;
        if (queryPresent) {
            length += _query.len + 1;
        }
        if (length == 0) {
            return emptyString;
        }
        return _canonicalURL.substring(start, start + length);
    }

    /** @return the text following '#'; empty if absent */
    public String getRef() {
        return getComponentString(_ref);
    }

    /** @return the ref component object itself (not a copy) */
    public GoogleURLComponent getRefComponent() {
        return _ref;
    }

    /** @return the canonical URL string, or {@code null} if it was never set */
    public String getCanonicalURL() {
        return _canonicalURL;
    }

    /** Prints every component on its own line to standard output. */
    public void dump() {
        System.out.println("Scheme:" + getScheme());
        System.out.println("UserName:" + getUserName());
        System.out.println("Password:" + getPassword());
        System.out.println("Host:" + getHost());
        System.out.println("Port:" + getPort());
        System.out.println("Path:" + getPath());
        System.out.println("Query:" + getQuery());
        System.out.println("Ref:" + getRef());
    }

    /**
     * Extracts the text of one component from the canonical URL.
     *
     * @param comp the component whose {@code begin}/{@code len} describe the slice;
     *             {@code null} is tolerated and treated as absent
     * @return the component text, or the empty string when the canonical URL is
     *         {@code null}, {@code comp} is {@code null}, or {@code comp.len <= 0}
     */
    public String getComponentString(GoogleURLComponent comp) {
        if (_canonicalURL == null || comp == null || comp.len <= 0) {
            return emptyString;
        }
        return _canonicalURL.substring(comp.begin, comp.begin + comp.len);
    }

    // State below is never assigned by Java code in this file.
    // NOTE(review): presumably written by the native initializer; keep names,
    // types and non-final mutability intact unless the native side is updated.
    private boolean _isValid = false;
    public GoogleURLComponent _scheme = new GoogleURLComponent();
    public GoogleURLComponent _userName = new GoogleURLComponent();
    public GoogleURLComponent _password = new GoogleURLComponent();
    public GoogleURLComponent _host = new GoogleURLComponent();
    public GoogleURLComponent _port = new GoogleURLComponent();
    public GoogleURLComponent _path = new GoogleURLComponent();
    public GoogleURLComponent _query = new GoogleURLComponent();
    public GoogleURLComponent _ref = new GoogleURLComponent();
    public String _canonicalURL = null;

    /**
     * Small smoke test: processes a fixed URL and prints its validity flag
     * followed directly by its canonical form.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        GoogleURL test = new GoogleURL("http://www.google.com/");
        System.out.print(test.isValid());
        System.out.print(test.getCanonicalURL());
    }
}