/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.url;

import java.io.Serializable;

import com.sleepycat.persist.model.Entity;
import com.sleepycat.persist.model.PrimaryKey;

/**
 * A URL together with the crawl bookkeeping attached to it: document ids,
 * parent page, depth, anchor text and priority. The URL string is the
 * primary key of the entity, and {@link #equals(Object)} and
 * {@link #hashCode()} are based on the URL string alone.
 *
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
@Entity
public class WebURL implements Serializable {

    private static final long serialVersionUID = 1L;

    @PrimaryKey
    private String url;

    private int docid;
    private int parentDocid;
    private String parentUrl;
    private short depth;
    private String domain;
    private String subDomain;
    private String path;
    private String anchor;
    private byte priority;

    /**
     * Returns the unique document id assigned to this Url.
     */
    public int getDocid() {
        return docid;
    }

    public void setDocid(int docid) {
        this.docid = docid;
    }

    /**
     * Returns the hash code of the Url string, or 0 when no Url has been set
     * yet, so that an instance without a Url can still be hashed.
     */
    @Override
    public int hashCode() {
        return (url == null) ? 0 : url.hashCode();
    }

    /**
     * Two instances are equal when they are of the same class and carry the
     * same non-null Url string. Apart from the identity case, an instance
     * whose Url is null is not equal to anything.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }

        WebURL otherUrl = (WebURL) o;
        return url != null && url.equals(otherUrl.getURL());
    }

    /**
     * Returns the Url string (null when no Url has been set).
     */
    @Override
    public String toString() {
        return url;
    }

    /**
     * Returns the Url string
     */
    public String getURL() {
        return url;
    }

    /**
     * Stores the Url and derives the domain, sub-domain and path from it.
     *
     * <p>The host is the text after the first "//" (or from the start of the
     * string when there is no "//") up to the first '/' or '?', or up to the
     * end of the string when neither is present. The path is everything from
     * the end of the host up to, but not including, the first '?'.
     *
     * @param url the Url string; must not be null
     * @throws NullPointerException if {@code url} is null
     */
    public void setURL(String url) {
        this.url = url;

        // indexOf returns -1 when there is no "//"; start at 0 in that case
        // instead of silently skipping the first character of the host.
        int schemeSeparatorIdx = url.indexOf("//");
        int domainStartIdx = (schemeSeparatorIdx >= 0) ? schemeSeparatorIdx + 2 : 0;

        // The host ends at the first '/' or '?', whichever comes first. A Url
        // such as "http://www.example.com" has neither, so fall back to the
        // end of the string rather than passing -1 to substring.
        int domainEndIdx = url.length();
        int slashIdx = url.indexOf('/', domainStartIdx);
        if (slashIdx >= 0) {
            domainEndIdx = slashIdx;
        }
        int queryIdx = url.indexOf('?', domainStartIdx);
        if (queryIdx >= 0 && queryIdx < domainEndIdx) {
            domainEndIdx = queryIdx;
        }

        domain = url.substring(domainStartIdx, domainEndIdx);
        subDomain = "";

        String[] parts = domain.split("\\.");
        if (parts.length > 2) {
            // By default the domain is the last two labels of the host.
            domain = parts[parts.length - 2] + "." + parts[parts.length - 1];
            int limit = 2;
            // When those two labels are themselves listed in TLDList
            // (presumably multi-part suffixes such as "co.uk" - confirm
            // against TLDList), the domain takes one more label.
            if (TLDList.getInstance().contains(domain)) {
                domain = parts[parts.length - 3] + "." + domain;
                limit = 3;
            }

            // Every label in front of the domain belongs to the sub-domain.
            StringBuilder subDomainBuilder = new StringBuilder();
            for (int i = 0; i < parts.length - limit; i++) {
                if (subDomainBuilder.length() > 0) {
                    subDomainBuilder.append('.');
                }
                subDomainBuilder.append(parts[i]);
            }
            subDomain = subDomainBuilder.toString();
        }

        path = url.substring(domainEndIdx);
        int pathEndIdx = path.indexOf('?');
        if (pathEndIdx >= 0) {
            path = path.substring(0, pathEndIdx);
        }
    }

    /**
     * Returns the unique document id of the parent page. The parent page is the
     * page in which the Url of this page is first observed.
     */
    public int getParentDocid() {
        return parentDocid;
    }

    public void setParentDocid(int parentDocid) {
        this.parentDocid = parentDocid;
    }

    /**
     * Returns the url of the parent page. The parent page is the page in which
     * the Url of this page is first observed.
     */
    public String getParentUrl() {
        return parentUrl;
    }

    public void setParentUrl(String parentUrl) {
        this.parentUrl = parentUrl;
    }

    /**
     * Returns the crawl depth at which this Url is first observed. Seed Urls
     * are at depth 0. Urls that are extracted from seed Urls are at depth 1,
     * etc.
     */
    public short getDepth() {
        return depth;
    }

    public void setDepth(short depth) {
        this.depth = depth;
    }

    /**
     * Returns the domain of this Url. For 'http://www.example.com/sample.htm',
     * domain will be 'example.com'
     */
    public String getDomain() {
        return domain;
    }

    /**
     * Returns the sub-domain of this Url. For
     * 'http://www.example.com/sample.htm', sub-domain will be 'www'. It is
     * the empty string when the host has no labels in front of the domain.
     */
    public String getSubDomain() {
        return subDomain;
    }

    /**
     * Returns the path of this Url. For 'http://www.example.com/sample.htm',
     * path will be '/sample.htm'. The path derived by
     * {@link #setURL(String)} does not include the query string.
     */
    public String getPath() {
        return path;
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Returns the anchor string. For example, in
     * {@code <a href="example.com">A sample anchor</a>}
     * the anchor string is 'A sample anchor'
     */
    public String getAnchor() {
        return anchor;
    }

    public void setAnchor(String anchor) {
        this.anchor = anchor;
    }

    /**
     * Returns the priority for crawling this URL.
     * A lower number results in higher priority.
     */
    public byte getPriority() {
        return priority;
    }

    public void setPriority(byte priority) {
        this.priority = priority;
    }
}