package org.apache.nutchbase.util.hbase;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.*;

import org.apache.hadoop.hbase.io.BatchUpdate;
import org.apache.hadoop.hbase.io.RowResult;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Writables;
import org.apache.nutch.crawl.Inlink;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.protocol.ProtocolStatus;

/**
 * Mutable counterpart of {@link ImmutableRowPart}.
 *
 * <p>Modifications are not applied to the underlying row. They are buffered in
 * an operation map keyed by column name, where a non-null value is a pending
 * put and a {@code null} value is a pending delete. Getters consult the pending
 * operations first and fall back to the superclass otherwise.
 * {@link #makeBatchUpdate()} converts the pending operations into a
 * {@link BatchUpdate}.
 */
public class RowPart extends ImmutableRowPart {

  /** Pending operations: column -> new value, or {@code null} for a delete. */
  private final Map<byte[], byte[]> opMap =
    new TreeMap<byte[], byte[]>(Bytes.BYTES_COMPARATOR);

  /** For Writable. Do not use directly. */
  public RowPart() {
    super();
  }

  /**
   * Creates a row part on top of an existing row.
   *
   * @param rowResult the row handed to the superclass
   */
  public RowPart(RowResult rowResult) {
    super(rowResult);
  }

  /**
   * Creates a row part for the given row id.
   *
   * @param rowId the row id handed to the superclass
   */
  public RowPart(byte[] rowId) {
    super(rowId);
  }

  /**
   * Rejects {@code null} arguments of the typed setters.
   *
   * @param o the value to check
   * @throws IllegalArgumentException if {@code o} is {@code null}
   */
  private static void checkForNull(Object o) {
    if (o == null) {
      // If null, the PUT becomes a DELETE operation.
      throw new IllegalArgumentException("Passed value cannot be null");
    }
  }

  // TODO: way too slow. Find a faster approach
  /**
   * Marks every column whose name starts with {@code colStart} as deleted.
   *
   * @param colStart the column name prefix
   */
  private void deleteColumnAll(String colStart) {
    // first clear new additions/deletions
    final Iterator<Map.Entry<byte[], byte[]>> it = opMap.entrySet().iterator();
    while (it.hasNext()) {
      final String pendingColumn = Bytes.toString(it.next().getKey());
      if (pendingColumn.startsWith(colStart)) {
        it.remove();
      }
    }
    // then add 'delete' operations for existing columns
    for (final byte[] col : rowResult.keySet()) {
      if (Bytes.toString(col).startsWith(colStart)) {
        opMap.put(col, null);
      }
    }
  }

  /** @return the pending base URL if one was set, else the stored one */
  @Override
  public String getBaseUrl() {
    if (!opMap.containsKey(BASE_URL)) {
      return super.getBaseUrl();
    }
    return Bytes.toString(opMap.get(BASE_URL));
  }

  /**
   * @param baseUrl the new base URL
   * @throws IllegalArgumentException if {@code baseUrl} is {@code null}
   */
  public void setBaseUrl(String baseUrl) {
    checkForNull(baseUrl);
    opMap.put(BASE_URL, Bytes.toBytes(baseUrl));
  }

  /** @return the pending content if one was set, else the stored one */
  @Override
  public byte[] getContent() {
    if (!opMap.containsKey(CONTENT)) {
      return super.getContent();
    }
    return opMap.get(CONTENT);
  }

  /**
   * @param content the new content; stored by reference, not copied
   * @throws IllegalArgumentException if {@code content} is {@code null}
   */
  public void setContent(byte[] content) {
    checkForNull(content);
    opMap.put(CONTENT, content);
  }

  /** @return the pending content type if one was set, else the stored one */
  @Override
  public String getContentType() {
    if (!opMap.containsKey(CONTENT_TYPE)) {
      return super.getContentType();
    }
    return Bytes.toString(opMap.get(CONTENT_TYPE));
  }

  /**
   * @param contentType the new content type
   * @throws IllegalArgumentException if {@code contentType} is {@code null}
   */
  public void setContentType(String contentType) {
    checkForNull(contentType);
    opMap.put(CONTENT_TYPE, Bytes.toBytes(contentType));
  }

  /** @return the pending fetch interval if one was set, else the stored one */
  @Override
  public int getFetchInterval() {
    if (!opMap.containsKey(FETCH_INTERVAL)) {
      return super.getFetchInterval();
    }
    return Bytes.toInt(opMap.get(FETCH_INTERVAL));
  }

  /** @param fetchInterval the new fetch interval */
  public void setFetchInterval(int fetchInterval) {
    opMap.put(FETCH_INTERVAL, Bytes.toBytes(fetchInterval));
  }

  /** @return the pending fetch time if one was set, else the stored one */
  @Override
  public long getFetchTime() {
    if (!opMap.containsKey(FETCH_TIME)) {
      return super.getFetchTime();
    }
    return Bytes.toLong(opMap.get(FETCH_TIME));
  }

  /** @param fetchTime the new fetch time */
  public void setFetchTime(long fetchTime) {
    opMap.put(FETCH_TIME, Bytes.toBytes(fetchTime));
  }

  /**
   * @return the pending previous fetch time if one was set, {@code -1} if the
   *         column is pending deletion, else the stored value
   */
  @Override
  public long getPrevFetchTime() {
    if (!opMap.containsKey(PREV_FETCH_TIME)) {
      return super.getPrevFetchTime();
    }
    final byte[] val = opMap.get(PREV_FETCH_TIME);
    if (val == null) {
      return -1L;
    }
    return Bytes.toLong(val);
  }

  /** @param prevFetchTime the new previous fetch time */
  public void setPrevFetchTime(long prevFetchTime) {
    opMap.put(PREV_FETCH_TIME, Bytes.toBytes(prevFetchTime));
  }

  /**
   * Returns the stored outlinks with pending additions and deletions applied.
   *
   * <p>Links are keyed by target URL, so a pending put replaces a stored link
   * to the same URL and a pending delete removes it.
   *
   * @return the effective outlinks
   */
  @Override
  public Collection<Outlink> getOutlinks() {
    final Map<String, Outlink> linkMap = new HashMap<String, Outlink>();
    for (final Outlink outlink : super.getOutlinks()) {
      linkMap.put(outlink.getToUrl(), outlink);
    }
    for (final Map.Entry<byte[], byte[]> entry : opMap.entrySet()) {
      final String column = Bytes.toString(entry.getKey());
      if (!column.startsWith(OUTLINKS_STR)) {
        continue;
      }
      // Strip the column prefix so pending ops use the same key (the target
      // URL) as the stored links inserted above.
      final String toUrl = column.substring(OUTLINKS_STR_LEN);
      final byte[] val = entry.getValue();
      if (val == null) {
        // outlink deleted
        linkMap.remove(toUrl);
      } else {
        // new outlink
        linkMap.put(toUrl, new Outlink(toUrl, Bytes.toString(val)));
      }
    }
    return linkMap.values();
  }

  /**
   * Queues a put for the given outlink.
   *
   * @param outlink the outlink to add
   * @throws IllegalArgumentException if {@code outlink} is {@code null}
   */
  public void addOutlink(Outlink outlink) {
    checkForNull(outlink);
    final byte[] key = Bytes.toBytes(OUTLINKS_STR + outlink.getToUrl());
    opMap.put(key, Bytes.toBytes(outlink.getAnchor()));
  }

  /** Drops pending outlink operations and queues deletes for stored outlinks. */
  public void deleteAllOutlinks() {
    deleteColumnAll(OUTLINKS_STR);
  }

  /**
   * Returns the stored inlinks with pending additions and deletions applied.
   *
   * <p>Links are keyed by source URL, so a pending put replaces a stored link
   * from the same URL and a pending delete removes it.
   *
   * @return the effective inlinks
   */
  @Override
  public Collection<Inlink> getInlinks() {
    final Map<String, Inlink> linkMap = new HashMap<String, Inlink>();
    for (final Inlink inlink : super.getInlinks()) {
      linkMap.put(inlink.getFromUrl(), inlink);
    }
    for (final Map.Entry<byte[], byte[]> entry : opMap.entrySet()) {
      final String column = Bytes.toString(entry.getKey());
      if (!column.startsWith(INLINKS_STR)) {
        continue;
      }
      // Strip the column prefix so pending ops use the same key (the source
      // URL) as the stored links inserted above.
      final String fromUrl = column.substring(INLINKS_STR_LEN);
      final byte[] val = entry.getValue();
      if (val == null) {
        // inlink deleted
        linkMap.remove(fromUrl);
      } else {
        // new inlink
        linkMap.put(fromUrl, new Inlink(fromUrl, Bytes.toString(val)));
      }
    }
    return linkMap.values();
  }

  /**
   * Queues a put for the given inlink.
   *
   * @param inlink the inlink to add
   * @throws IllegalArgumentException if {@code inlink} is {@code null}
   */
  public void addInlink(Inlink inlink) {
    checkForNull(inlink);
    final String fullKey = INLINKS_STR + inlink.getFromUrl();
    opMap.put(Bytes.toBytes(fullKey), Bytes.toBytes(inlink.getAnchor()));
  }

  /** Drops pending inlink operations and queues deletes for stored inlinks. */
  public void deleteAllInlinks() {
    deleteColumnAll(INLINKS_STR);
  }

  /**
   * @return the pending parse status if one was set, else the stored one
   * @throws RuntimeException if the pending value cannot be deserialized
   */
  @Override
  public ParseStatus getParseStatus() {
    if (!opMap.containsKey(PARSE_STATUS)) {
      return super.getParseStatus();
    }
    final ParseStatus parseStatus = new ParseStatus();
    try {
      return (ParseStatus) Writables.getWritable(opMap.get(PARSE_STATUS),
                                                 parseStatus);
    } catch (final IOException e) {
      throw new RuntimeException(e); // TODO: really?
    }
  }

  /**
   * @param parseStatus the new parse status
   * @throws IllegalArgumentException if {@code parseStatus} is {@code null}
   * @throws RuntimeException if the status cannot be serialized
   */
  public void setParseStatus(ParseStatus parseStatus) {
    checkForNull(parseStatus);
    try {
      opMap.put(PARSE_STATUS, Writables.getBytes(parseStatus));
    } catch (final IOException e) {
      throw new RuntimeException(e); // TODO: really?
    }
  }

  /** @return the pending representative URL if one was set, else the stored one */
  @Override
  public String getReprUrl() {
    if (!opMap.containsKey(REPR_URL)) {
      return super.getReprUrl();
    }
    return Bytes.toString(opMap.get(REPR_URL));
  }

  /**
   * @param reprUrl the new representative URL
   * @throws IllegalArgumentException if {@code reprUrl} is {@code null}
   */
  public void setReprUrl(String reprUrl) {
    checkForNull(reprUrl);
    opMap.put(REPR_URL, Bytes.toBytes(reprUrl));
  }

  /** @return the pending retry count if one was set, else the stored one */
  @Override
  public int getRetriesSinceFetch() {
    if (!opMap.containsKey(RETRIES)) {
      return super.getRetriesSinceFetch();
    }
    return Bytes.toInt(opMap.get(RETRIES));
  }

  /** @param retries the new retry count */
  public void setRetriesSinceFetch(int retries) {
    opMap.put(RETRIES, Bytes.toBytes(retries));
  }

  /**
   * @return the pending protocol status if one was set, else the stored one
   * @throws RuntimeException if the pending value cannot be deserialized
   */
  @Override
  public ProtocolStatus getProtocolStatus() {
    if (!opMap.containsKey(PROTOCOL_STATUS)) {
      return super.getProtocolStatus();
    }
    final ProtocolStatus protocolStatus = new ProtocolStatus();
    final byte[] val = opMap.get(PROTOCOL_STATUS);
    try {
      return (ProtocolStatus) Writables.getWritable(val, protocolStatus);
    } catch (final IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * @param protocolStatus the new protocol status
   * @throws IllegalArgumentException if {@code protocolStatus} is {@code null}
   * @throws RuntimeException if the status cannot be serialized
   */
  public void setProtocolStatus(ProtocolStatus protocolStatus) {
    checkForNull(protocolStatus);
    try {
      opMap.put(PROTOCOL_STATUS, Writables.getBytes(protocolStatus));
    } catch (final IOException e) {
      throw new RuntimeException(e);
    }
  }

  /** @return the pending score if one was set, else the stored one */
  @Override
  public float getScore() {
    if (!opMap.containsKey(SCORE)) {
      return super.getScore();
    }
    return TableUtil.toFloat(opMap.get(SCORE));
  }

  /** @param score the new score */
  public void setScore(float score) {
    opMap.put(SCORE, TableUtil.toBytes(score));
  }

  /** @return the pending pagerank if one was set, else the stored one */
  @Override
  public float getPagerank() {
    if (!opMap.containsKey(PAGERANK)) {
      return super.getPagerank();
    }
    return TableUtil.toFloat(opMap.get(PAGERANK));
  }

  /** @param PR the new pagerank */
  public void setPagerank(float PR) {
    opMap.put(PAGERANK, TableUtil.toBytes(PR));
  }

  /** @return the pending votes value if one was set, else the stored one */
  @Override
  public float getVotes() {
    if (!opMap.containsKey(VOTES)) {
      return super.getVotes();
    }
    return TableUtil.toFloat(opMap.get(VOTES));
  }

  /** @param votes the new votes value */
  public void setVotes(float votes) {
    opMap.put(VOTES, TableUtil.toBytes(votes));
  }

  /** @return the pending status byte if one was set, else the stored one */
  @Override
  public byte getStatus() {
    if (!opMap.containsKey(STATUS)) {
      return super.getStatus();
    }
    return opMap.get(STATUS)[0];
  }

  /** @param status the new status byte */
  public void setStatus(byte status) {
    opMap.put(STATUS, new byte[] { status });
  }

  /** @return the pending signature if one was set, else the stored one */
  @Override
  public byte[] getSignature() {
    if (!opMap.containsKey(SIGNATURE)) {
      return super.getSignature();
    }
    return opMap.get(SIGNATURE);
  }

  /**
   * @param signature the new signature; stored by reference, not copied
   * @throws IllegalArgumentException if {@code signature} is {@code null}
   */
  public void setSignature(byte[] signature) {
    checkForNull(signature);
    opMap.put(SIGNATURE, signature);
  }

  /** @return the pending previous signature if one was set, else the stored one */
  @Override
  public byte[] getPrevSignature() {
    if (!opMap.containsKey(PREV_SIGNATURE)) {
      return super.getPrevSignature();
    }
    return opMap.get(PREV_SIGNATURE);
  }

  /**
   * @param prevSig the new previous signature; stored by reference, not copied
   * @throws IllegalArgumentException if {@code prevSig} is {@code null}
   */
  public void setPrevSignature(byte[] prevSig) {
    checkForNull(prevSig);
    opMap.put(PREV_SIGNATURE, prevSig);
  }

  /** @return the pending modified time if one was set, else the stored one */
  @Override
  public long getModifiedTime() {
    // Without this guard a row with no pending modified-time op would pass
    // null to Bytes.toLong instead of reading the stored value.
    if (!opMap.containsKey(MODIFIED_TIME)) {
      return super.getModifiedTime();
    }
    return Bytes.toLong(opMap.get(MODIFIED_TIME));
  }

  /** @param modifiedTime the new modified time */
  public void setModifiedTime(long modifiedTime) {
    opMap.put(MODIFIED_TIME, Bytes.toBytes(modifiedTime));
  }

  /** @return the pending text if one was set, else the stored one */
  @Override
  public String getText() {
    if (!opMap.containsKey(TEXT)) {
      return super.getText();
    }
    return Bytes.toString(opMap.get(TEXT));
  }

  /**
   * @param text the new text
   * @throws IllegalArgumentException if {@code text} is {@code null}
   */
  public void setText(String text) {
    checkForNull(text);
    opMap.put(TEXT, Bytes.toBytes(text));
  }

  /** @return the pending title if one was set, else the stored one */
  @Override
  public String getTitle() {
    if (!opMap.containsKey(TITLE)) {
      return super.getTitle();
    }
    return Bytes.toString(opMap.get(TITLE));
  }

  /**
   * @param title the new title
   * @throws IllegalArgumentException if {@code title} is {@code null}
   */
  public void setTitle(String title) {
    checkForNull(title);
    opMap.put(TITLE, Bytes.toBytes(title));
  }

  /**
   * @param col the column name
   * @return {@code false} if the column is pending deletion, {@code true} if
   *         it has a pending put, else whatever the superclass reports
   */
  @Override
  public boolean hasColumn(byte[] col) {
    if (!opMap.containsKey(col)) {
      return super.hasColumn(col);
    }
    return opMap.get(col) != null; // check if column is deleted
  }

  /**
   * Queues a raw operation on a column.
   *
   * @param key the column name
   * @param val the new value; a {@code null} value queues a delete
   */
  public void putColumn(byte[] key, byte[] val) {
    opMap.put(key, val);
  }

  /**
   * Queues a put of a string value on a column.
   *
   * @param key the column name
   * @param val the new value
   */
  public void putColumn(String key, String val) {
    opMap.put(Bytes.toBytes(key), Bytes.toBytes(val));
  }

  /**
   * Queues a delete of a column.
   *
   * @param key the column name
   */
  public void deleteColumn(String key) {
    opMap.put(Bytes.toBytes(key), null);
  }

  /**
   * @param metaKey the metadata key, without the metadata column prefix
   * @return whether the metadata column exists, as decided by
   *         {@link #hasColumn(byte[])}
   */
  @Override
  public boolean hasMeta(String metaKey) {
    return hasColumn(Bytes.toBytes(METADATA_STR + metaKey));
  }

  /**
   * @param key the header name, without the headers column prefix
   * @return the pending header value, {@code null} if the header is pending
   *         deletion, else the stored value passed through {@code stringify}
   */
  @Override
  public String getHeader(String key) {
    final byte[] headerKey = Bytes.toBytes(HEADERS_STR + key);
    if (!opMap.containsKey(headerKey)) {
      return stringify(rowResult.get(headerKey));
    }
    final byte[] val = opMap.get(headerKey);
    if (val == null) {
      // deleted !!!
      return null;
    }
    return Bytes.toString(val);
  }

  /**
   * Queues a put for a header.
   *
   * @param key the header name, without the headers column prefix
   * @param value the header value
   * @throws IllegalArgumentException if {@code value} is {@code null}
   */
  public void addHeader(String key, String value) {
    checkForNull(value);
    opMap.put(Bytes.toBytes(HEADERS_STR + key), Bytes.toBytes(value));
  }

  /** Drops pending header operations and queues deletes for stored headers. */
  public void deleteHeaders() {
    deleteColumnAll(HEADERS_STR);
  }

  /**
   * @param key the column name
   * @return the pending value ({@code null} when pending deletion) if an
   *         operation is queued for the column, else the stored value
   */
  @Override
  public byte[] getColumn(String key) {
    final byte[] bKey = Bytes.toBytes(key);
    if (opMap.containsKey(bKey)) {
      return opMap.get(bKey);
    }
    return super.getColumn(key);
  }

  /**
   * Returns the effective column names: the stored columns plus columns with a
   * pending put, minus columns pending deletion (consistent with
   * {@link #hasColumn(byte[])}).
   *
   * @return a new set, ordered and de-duplicated by byte content
   */
  @Override
  public Set<byte[]> getColumns() {
    // byte[] hashes by identity, so a hash set would list a column twice when
    // it is both stored and pending; compare by content instead.
    final Set<byte[]> columns = new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR);
    columns.addAll(super.getColumns());
    for (final Map.Entry<byte[], byte[]> op : opMap.entrySet()) {
      if (op.getValue() == null) {
        columns.remove(op.getKey());
      } else {
        columns.add(op.getKey());
      }
    }
    return columns;
  }

  /**
   * @param metaKey the metadata key, without the metadata column prefix
   * @return the pending value ({@code null} when pending deletion) if an
   *         operation is queued for the key, else the stored value
   */
  @Override
  public byte[] getMeta(String metaKey) {
    final byte[] key = Bytes.toBytes(METADATA_STR + metaKey);
    if (!opMap.containsKey(key)) {
      return super.get(key);
    }
    return opMap.get(key);
  }

  /**
   * @param metaKey the metadata key, without the metadata column prefix
   * @return {@link #getMeta(String)} decoded as a string, or {@code null}
   */
  @Override
  public String getMetaAsString(String metaKey) {
    final byte[] val = getMeta(metaKey);
    return val == null ? null : Bytes.toString(val);
  }

  /**
   * Queues a put for a metadata entry.
   *
   * @param metaKey the metadata key, without the metadata column prefix
   * @param val the new value; stored by reference, not copied
   * @throws IllegalArgumentException if {@code val} is {@code null}
   */
  public void putMeta(String metaKey, byte[] val) {
    checkForNull(val);
    opMap.put(Bytes.toBytes(METADATA_STR + metaKey), val);
  }

  /**
   * Queues a delete for a metadata entry.
   *
   * @param metaKey the metadata key, without the metadata column prefix
   */
  public void deleteMeta(String metaKey) {
    opMap.put(Bytes.toBytes(METADATA_STR + metaKey), null);
  }

  /**
   * Converts the pending operations into a batch update for this row,
   * timestamped with the current system time.
   *
   * @return a new batch update holding one put or delete per pending operation
   */
  public BatchUpdate makeBatchUpdate() {
    final BatchUpdate bu =
      new BatchUpdate(getRowId(), System.currentTimeMillis());
    for (final Map.Entry<byte[], byte[]> entry : opMap.entrySet()) {
      final byte[] val = entry.getValue();
      if (val == null) {
        // delete op
        bu.delete(entry.getKey());
      } else {
        // put op
        bu.put(entry.getKey(), val);
      }
    }
    return bu;
  }

  /**
   * Reads the superclass state, then replaces the pending operations with the
   * ones in the stream (format written by {@link #write(DataOutput)}).
   *
   * @param in the input to read from
   * @throws IOException if reading fails
   */
  @Override
  public void readFields(DataInput in) throws IOException {
    super.readFields(in);
    opMap.clear();
    final int size = in.readInt();
    for (int i = 0; i < size; i++) {
      final byte[] key = Bytes.readByteArray(in);
      byte[] val = null;
      // the flag tells a put (value follows) from a delete (no value)
      if (in.readBoolean()) {
        val = Bytes.readByteArray(in);
      }
      opMap.put(key, val);
    }
  }

  /**
   * Writes the superclass state, then the operation count, then for each
   * pending operation its key, a put/delete flag and (for puts) the value.
   *
   * @param out the output to write to
   * @throws IOException if writing fails
   */
  @Override
  public void write(DataOutput out) throws IOException {
    super.write(out);
    out.writeInt(opMap.size());
    for (final Map.Entry<byte[], byte[]> op : opMap.entrySet()) {
      Bytes.writeByteArray(out, op.getKey());
      final byte[] val = op.getValue();
      if (val == null) {
        // a delete operation
        out.writeBoolean(false);
      } else {
        // a put operation
        out.writeBoolean(true);
        Bytes.writeByteArray(out, val);
      }
    }
  }
}