/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.crawl;

import java.io.*;
import java.util.*;
import java.util.Map.Entry;

import org.apache.hadoop.io.*;
import org.apache.nutch.util.*;

/** The crawl state of a URL. */
public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {

  public static final String GENERATE_DIR_NAME = "crawl_generate";
  public static final String FETCH_DIR_NAME = "crawl_fetch";
  public static final String PARSE_DIR_NAME = "crawl_parse";

  private final static byte CUR_VERSION = 7;

  /** Compatibility values for on-the-fly conversion from versions < 5. */
  private static final byte OLD_STATUS_SIGNATURE = 0;
  private static final byte OLD_STATUS_DB_UNFETCHED = 1;
  private static final byte OLD_STATUS_DB_FETCHED = 2;
  private static final byte OLD_STATUS_DB_GONE = 3;
  private static final byte OLD_STATUS_LINKED = 4;
  private static final byte OLD_STATUS_FETCH_SUCCESS = 5;
  private static final byte OLD_STATUS_FETCH_RETRY = 6;
  private static final byte OLD_STATUS_FETCH_GONE = 7;

  private static HashMap<Byte, Byte> oldToNew = new HashMap<Byte, Byte>();

  /** Page was not fetched yet. */
  public static final byte STATUS_DB_UNFETCHED = 0x01;
  /** Page was successfully fetched. */
  public static final byte STATUS_DB_FETCHED = 0x02;
  /** Page no longer exists. */
  public static final byte STATUS_DB_GONE = 0x03;
  /** Page temporarily redirects to another page. */
  public static final byte STATUS_DB_REDIR_TEMP = 0x04;
  /** Page permanently redirects to another page. */
  public static final byte STATUS_DB_REDIR_PERM = 0x05;
  /** Page was successfully fetched and found not modified. */
  public static final byte STATUS_DB_NOTMODIFIED = 0x06;
  /** Maximum value of DB-related status. */
  public static final byte STATUS_DB_MAX = 0x1f;

  /** Fetching was successful. */
  public static final byte STATUS_FETCH_SUCCESS = 0x21;
  /** Fetching unsuccessful, needs to be retried (transient errors). */
  public static final byte STATUS_FETCH_RETRY = 0x22;
  /** Fetching temporarily redirected to another page. */
  public static final byte STATUS_FETCH_REDIR_TEMP = 0x23;
  /** Fetching permanently redirected to another page. */
  public static final byte STATUS_FETCH_REDIR_PERM = 0x24;
  /** Fetching unsuccessful - page is gone. */
  public static final byte STATUS_FETCH_GONE = 0x25;
  /** Fetching successful - page is not modified. */
  public static final byte STATUS_FETCH_NOTMODIFIED = 0x26;
  /** Maximum value of fetch-related status. */
  public static final byte STATUS_FETCH_MAX = 0x3f;

  /** Page signature. */
  public static final byte STATUS_SIGNATURE = 0x41;
  /** Page was newly injected. */
  public static final byte STATUS_INJECTED = 0x42;
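  // Note on the encoding (illustrative sketch, not part of the original
  // source): status values are partitioned into numeric ranges, so
  // classifying a datum is a plain range check rather than a set lookup:
  //
  //   CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 0);
  //   CrawlDatum.hasDbStatus(d);     // true:  0x01 <= STATUS_DB_MAX (0x1f)
  //   d.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
  //   CrawlDatum.hasFetchStatus(d);  // true:  0x1f < 0x21 <= STATUS_FETCH_MAX (0x3f)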
  /** Page discovered through a link. */
  public static final byte STATUS_LINKED = 0x43;

  public static final HashMap<Byte, String> statNames = new HashMap<Byte, String>();
  static {
    statNames.put(STATUS_DB_UNFETCHED, "db_unfetched");
    statNames.put(STATUS_DB_FETCHED, "db_fetched");
    statNames.put(STATUS_DB_GONE, "db_gone");
    statNames.put(STATUS_DB_REDIR_TEMP, "db_redir_temp");
    statNames.put(STATUS_DB_REDIR_PERM, "db_redir_perm");
    statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified");
    statNames.put(STATUS_SIGNATURE, "signature");
    statNames.put(STATUS_INJECTED, "injected");
    statNames.put(STATUS_LINKED, "linked");
    statNames.put(STATUS_FETCH_SUCCESS, "fetch_success");
    statNames.put(STATUS_FETCH_RETRY, "fetch_retry");
    statNames.put(STATUS_FETCH_REDIR_TEMP, "fetch_redir_temp");
    statNames.put(STATUS_FETCH_REDIR_PERM, "fetch_redir_perm");
    statNames.put(STATUS_FETCH_GONE, "fetch_gone");
    statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified");

    oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED);
    oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED);
    oldToNew.put(OLD_STATUS_DB_GONE, STATUS_DB_GONE);
    oldToNew.put(OLD_STATUS_FETCH_GONE, STATUS_FETCH_GONE);
    oldToNew.put(OLD_STATUS_FETCH_SUCCESS, STATUS_FETCH_SUCCESS);
    oldToNew.put(OLD_STATUS_FETCH_RETRY, STATUS_FETCH_RETRY);
    oldToNew.put(OLD_STATUS_LINKED, STATUS_LINKED);
    oldToNew.put(OLD_STATUS_SIGNATURE, STATUS_SIGNATURE);
  }

  private byte status;
  private long fetchTime = System.currentTimeMillis();
  private byte retries;
  private int fetchInterval;
  private float score = 1.0f;
  private byte[] signature = null;
  private long modifiedTime;
  private org.apache.hadoop.io.MapWritable metaData;

  public static boolean hasDbStatus(CrawlDatum datum) {
    return datum.status <= STATUS_DB_MAX;
  }

  public static boolean hasFetchStatus(CrawlDatum datum) {
    return datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX;
  }

  public CrawlDatum() {
    metaData = new org.apache.hadoop.io.MapWritable();
  }

  public CrawlDatum(int status, int fetchInterval) {
    this();
    this.status = (byte) status;
    this.fetchInterval = fetchInterval;
  }

  public CrawlDatum(int status, int fetchInterval, float score) {
    this(status, fetchInterval);
    this.score = score;
  }

  //
  // accessor methods
  //

  public byte getStatus() { return status; }

  public static String getStatusName(byte value) {
    String res = statNames.get(value);
    if (res == null) res = "unknown";
    return res;
  }

  public void setStatus(int status) { this.status = (byte) status; }
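  // Construction sketch (illustrative; the 30-day interval and score are
  // assumed example values, not defaults of this class):
  //
  //   CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED,
  //       30 * 24 * 60 * 60,                       // fetch interval in seconds
  //       1.0f);                                   // initial score
  //   CrawlDatum.getStatusName(datum.getStatus()); // -> "injected"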
  /**
   * Returns either the time of the last fetch, or the next fetch time,
   * depending on whether Fetcher or CrawlDbReducer set the time.
   */
  public long getFetchTime() { return fetchTime; }

  /**
   * Sets either the time of the last fetch or the next fetch time,
   * depending on whether Fetcher or CrawlDbReducer set the time.
   */
  public void setFetchTime(long fetchTime) { this.fetchTime = fetchTime; }

  public long getModifiedTime() { return modifiedTime; }

  public void setModifiedTime(long modifiedTime) { this.modifiedTime = modifiedTime; }

  public byte getRetriesSinceFetch() { return retries; }

  public void setRetriesSinceFetch(int retries) { this.retries = (byte) retries; }

  public int getFetchInterval() { return fetchInterval; }

  public void setFetchInterval(int fetchInterval) { this.fetchInterval = fetchInterval; }

  public void setFetchInterval(float fetchInterval) { this.fetchInterval = Math.round(fetchInterval); }

  public float getScore() { return score; }

  public void setScore(float score) { this.score = score; }

  public byte[] getSignature() { return signature; }

  public void setSignature(byte[] signature) {
    if (signature != null && signature.length > 256)
      throw new RuntimeException("Max signature length (256) exceeded: " + signature.length);
    this.signature = signature;
  }

  public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) {
    this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable);
  }

  /**
   * Add all metadata from the other CrawlDatum to this CrawlDatum.
   *
   * @param other CrawlDatum
   */
  public void putAllMetaData(CrawlDatum other) {
    for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) {
      metaData.put(e.getKey(), e.getValue());
    }
  }

  /**
   * Returns the metadata, as set explicitly or read in
   * {@link #readFields(DataInput)}; returns an empty map (lazily
   * instantiated) in case the CrawlDatum was freshly created.
   */
  public org.apache.hadoop.io.MapWritable getMetaData() {
    if (this.metaData == null) this.metaData = new org.apache.hadoop.io.MapWritable();
    return this.metaData;
  }

  //
  // writable methods
  //

  public static CrawlDatum read(DataInput in) throws IOException {
    CrawlDatum result = new CrawlDatum();
    result.readFields(in);
    return result;
  }

  public void readFields(DataInput in) throws IOException {
    byte version = in.readByte();                 // read version
    if (version > CUR_VERSION)                    // check version
      throw new VersionMismatchException(CUR_VERSION, version);

    status = in.readByte();
    fetchTime = in.readLong();
    retries = in.readByte();
    if (version > 5) {
      fetchInterval = in.readInt();
    } else fetchInterval = Math.round(in.readFloat());
    score = in.readFloat();
    if (version > 2) {
      modifiedTime = in.readLong();
      int cnt = in.readByte();
      if (cnt > 0) {
        signature = new byte[cnt];
        in.readFully(signature);
      } else signature = null;
    }
    metaData = new org.apache.hadoop.io.MapWritable();
    if (version > 3) {
      if (version < 7) {
        // the unqualified MapWritable resolves from this package, i.e. the
        // legacy Nutch implementation used before version 7
        MapWritable oldMetaData = new MapWritable();
        if (in.readBoolean()) {
          oldMetaData.readFields(in);
        }
        for (Writable key : oldMetaData.keySet()) {
          metaData.put(key, oldMetaData.get(key));
        }
      } else {
        if (in.readBoolean()) {
          metaData.readFields(in);
        }
      }
    }
    // translate status codes
    if (version < 5) {
      if (oldToNew.containsKey(status))
        status = oldToNew.get(status);
      else
        status = STATUS_DB_UNFETCHED;
    }
  }
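  // Metadata usage sketch (illustrative; the key and value are hypothetical,
  // not keys defined by this class):
  //
  //   org.apache.hadoop.io.MapWritable meta = new org.apache.hadoop.io.MapWritable();
  //   meta.put(new Text("exampleKey"), new Text("exampleValue"));
  //   datum.setMetaData(meta);          // stores a copy of the supplied map
  //   other.putAllMetaData(datum);      // merges entries, overwriting equal keys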
  /** The number of bytes into a serialized CrawlDatum at which the score is stored. */
  private static final int SCORE_OFFSET = 1 + 1 + 8 + 1 + 4;
  private static final int SIG_OFFSET = SCORE_OFFSET + 4 + 8;

  public void write(DataOutput out) throws IOException {
    out.writeByte(CUR_VERSION);                   // store current version
    out.writeByte(status);
    out.writeLong(fetchTime);
    out.writeByte(retries);
    out.writeInt(fetchInterval);
    out.writeFloat(score);
    out.writeLong(modifiedTime);
    if (signature == null) {
      out.writeByte(0);
    } else {
      out.writeByte(signature.length);
      out.write(signature);
    }
    if (metaData.size() > 0) {
      out.writeBoolean(true);
      metaData.write(out);
    } else {
      out.writeBoolean(false);
    }
  }

  /** Copy the contents of another instance into this instance. */
  public void set(CrawlDatum that) {
    this.status = that.status;
    this.fetchTime = that.fetchTime;
    this.retries = that.retries;
    this.fetchInterval = that.fetchInterval;
    this.score = that.score;
    this.modifiedTime = that.modifiedTime;
    this.signature = that.signature;
    this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); // make a deep copy
  }

  //
  // compare methods
  //

  /** Sort by decreasing score. */
  public int compareTo(CrawlDatum that) {
    if (that.score != this.score)
      return (that.score - this.score) > 0 ? 1 : -1;
    if (that.status != this.status)
      return this.status - that.status;
    if (that.fetchTime != this.fetchTime)
      return (that.fetchTime - this.fetchTime) > 0 ? 1 : -1;
    if (that.retries != this.retries)
      return that.retries - this.retries;
    if (that.fetchInterval != this.fetchInterval)
      return (that.fetchInterval - this.fetchInterval) > 0 ? 1 : -1;
    if (that.modifiedTime != this.modifiedTime)
      return (that.modifiedTime - this.modifiedTime) > 0 ? 1 : -1;
    return SignatureComparator._compare(this, that);
  }
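  // On-disk record layout produced by write() above, which the raw-byte
  // Comparator below indexes into (offsets relative to the record start):
  //
  //   offset  0: version        1 byte
  //   offset  1: status         1 byte
  //   offset  2: fetchTime      8 bytes (long)
  //   offset 10: retries        1 byte
  //   offset 11: fetchInterval  4 bytes (int)
  //   offset 15: score          4 bytes (float)  -> SCORE_OFFSET = 1+1+8+1+4
  //   offset 19: modifiedTime   8 bytes (long)
  //   offset 27: signature len  1 byte           -> SIG_OFFSET = SCORE_OFFSET+4+8
  //   offset 28: signature      'len' bytes, followed by optional metadata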
  /** A Comparator optimized for CrawlDatum. */
  public static class Comparator extends WritableComparator {
    public Comparator() { super(CrawlDatum.class); }

    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
      float score1 = readFloat(b1, s1 + SCORE_OFFSET);
      float score2 = readFloat(b2, s2 + SCORE_OFFSET);
      if (score2 != score1) {
        return (score2 - score1) > 0 ? 1 : -1;
      }
      int status1 = b1[s1 + 1];
      int status2 = b2[s2 + 1];
      if (status2 != status1)
        return status1 - status2;
      long fetchTime1 = readLong(b1, s1 + 1 + 1);
      long fetchTime2 = readLong(b2, s2 + 1 + 1);
      if (fetchTime2 != fetchTime1)
        return (fetchTime2 - fetchTime1) > 0 ? 1 : -1;
      int retries1 = b1[s1 + 1 + 1 + 8];
      int retries2 = b2[s2 + 1 + 1 + 8];
      if (retries2 != retries1)
        return retries2 - retries1;
      int fetchInterval1 = readInt(b1, s1 + 1 + 1 + 8 + 1);
      int fetchInterval2 = readInt(b2, s2 + 1 + 1 + 8 + 1);
      if (fetchInterval2 != fetchInterval1)
        return (fetchInterval2 - fetchInterval1) > 0 ? 1 : -1;
      long modifiedTime1 = readLong(b1, s1 + SCORE_OFFSET + 4);
      long modifiedTime2 = readLong(b2, s2 + SCORE_OFFSET + 4);
      if (modifiedTime2 != modifiedTime1)
        return (modifiedTime2 - modifiedTime1) > 0 ? 1 : -1;
      int sigl1 = b1[s1 + SIG_OFFSET];
      int sigl2 = b2[s2 + SIG_OFFSET];
      // the signature bytes start right after the length byte at SIG_OFFSET,
      // so compare from the record-relative offset
      return SignatureComparator._compare(b1, s1 + SIG_OFFSET + 1, sigl1,
                                          b2, s2 + SIG_OFFSET + 1, sigl2);
    }
  }

  static {                                        // register this comparator
    WritableComparator.define(CrawlDatum.class, new Comparator());
  }

  //
  // basic methods
  //

  public String toString() {
    StringBuilder buf = new StringBuilder();
    buf.append("Version: " + CUR_VERSION + "\n");
    buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus()) + ")\n");
    buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
    buf.append("Modified time: " + new Date(getModifiedTime()) + "\n");
    buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
    buf.append("Retry interval: " + getFetchInterval() + " seconds ("
        + (getFetchInterval() / FetchSchedule.SECONDS_PER_DAY) + " days)\n");
    buf.append("Score: " + getScore() + "\n");
    buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n");
    buf.append("Metadata: ");
    for (Entry<Writable, Writable> e : metaData.entrySet()) {
      buf.append(e.getKey());
      buf.append(": ");
      buf.append(e.getValue());
    }
    buf.append('\n');
    return buf.toString();
  }

  private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) {
    HashSet<Entry<Writable, Writable>> set1 =
        new HashSet<Entry<Writable, Writable>>(metaData.entrySet());
    HashSet<Entry<Writable, Writable>> set2 =
        new HashSet<Entry<Writable, Writable>>(otherMetaData.entrySet());
    return set1.equals(set2);
  }

  public boolean equals(Object o) {
    if (!(o instanceof CrawlDatum))
      return false;
    CrawlDatum other = (CrawlDatum) o;
    boolean res =
        (this.status == other.status) &&
        (this.fetchTime == other.fetchTime) &&
        (this.modifiedTime == other.modifiedTime) &&
        (this.retries == other.retries) &&
        (this.fetchInterval == other.fetchInterval) &&
        (SignatureComparator._compare(this.signature, other.signature) == 0) &&
        (this.score == other.score);
    if (!res) return res;
    return metadataEquals(other.metaData);
  }

  public int hashCode() {
    int res = 0;
    if (signature != null) {
      // fold the signature into the hash four bytes at a time; the masks and
      // parentheses are required because '+' binds tighter than '<<' in Java
      for (int i = 0; i + 3 < signature.length; i += 4) {
        res ^= ((signature[i] & 0xff) << 24) |
               ((signature[i + 1] & 0xff) << 16) |
               ((signature[i + 2] & 0xff) << 8) |
               (signature[i + 3] & 0xff);
      }
    }
    res ^= metaData.entrySet().hashCode();
    return res ^ status ^
        ((int) fetchTime) ^
        ((int) modifiedTime) ^
        retries ^
        fetchInterval ^
        Float.floatToIntBits(score);
  }

  public Object clone() {
    try {
      return super.clone();
    } catch (CloneNotSupportedException e) {
      throw new RuntimeException(e);
    }
  }
}
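// Serialization round-trip sketch (illustrative only; DataOutputBuffer and
// DataInputBuffer are org.apache.hadoop.io helpers, and the values are
// assumed examples):
//
//   CrawlDatum before = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 7 * 24 * 60 * 60);
//   DataOutputBuffer out = new DataOutputBuffer();
//   before.write(out);                        // CUR_VERSION byte, then the fields
//   DataInputBuffer in = new DataInputBuffer();
//   in.reset(out.getData(), out.getLength());
//   CrawlDatum after = CrawlDatum.read(in);   // readFields() also upgrades records < CUR_VERSION
//   assert before.equals(after);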