/* * Cloud9: A MapReduce Library for Hadoop * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package edu.umd.cloud9.webgraph.data; import it.unimi.dsi.fastutil.ints.IntIterator; import it.unimi.dsi.fastutil.ints.IntOpenHashSet; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.util.Iterator; import org.apache.hadoop.io.WritableComparable; /** * <p> * This data structure represents a line of anchor text. A line of anchor text has * some text, a weight, and a set of sources (targets) associated with it. * Sources (targets) are the pages a line of anchor text originates * from (points to) when the underlying link is an incoming (outgoing) link. * </p> * * <p> * The implemented iterator makes it possible to iterate through the source or target * documents for each line of anchor text. * </p> * * @author Nima Asadi * */ public class AnchorText implements WritableComparable<AnchorText>, AnchorTextConstants, Iterable<Integer> { //AnchorText objects can be incoming or outgoing, //external or internal, etc. depending on the type private byte type; //holds the text for a line of anchor text private String text; //sources (or targets in case the underlying link is an outgoing one) private IntOpenHashSet documentList; //weight for a line of anchor text, if defined private float weight; /** * Creates an empty Internal Incoming Link AnchorText object */ public AnchorText() { documentList = new IntOpenHashSet(); resetToType(Type.INTERNAL_IN_LINK.val); } /** * Creates a new AnchorText object * * @param type Internal or external, incoming or outgoing, etc. * (see {@link AnchorTextConstants}) * @param text Text associated with a line of anchor text */ public AnchorText(byte type, String text) { this(); resetToType(type); setText(text); } /** * Creates a new AnchorText object and adds a new source/target document if * the AnchorText object is allowed to have text. * (see {@link AnchorTextConstants interface} * * @param type Internal or external, incoming or outgoing, etc. * (see {@link AnchorTextConstants} * @param text Text associated with a line of anchor text * @param docno Source/Target document id */ public AnchorText(byte type, String text, int docno) { this(); resetToType(type); setText(text); if(hasValidDocumentList()) { addDocument(docno); } } /** * Deserializes an AnchorText object. * * @param in Input Stream */ public void readFields(DataInput in) throws IOException { this.type = in.readByte(); resetToType(this.type); if(hasValidText()) { this.text = in.readUTF(); } if(hasValidDocumentList()) { int size = in.readInt(); for(int i = 0; i < size; i++) { this.documentList.add(in.readInt()); } } if(hasValidWeight()) { this.weight = in.readFloat(); } } /** * Serializes an AnchorText object * * @param out * Output Stream */ public void write(DataOutput out) throws IOException { out.writeByte(type); if(hasValidText()) { out.writeUTF(text); } if(hasValidDocumentList()) { out.writeInt(documentList.size()); IntIterator iterator = documentList.iterator(); while(iterator.hasNext()) { out.writeInt(iterator.next()); } } if(hasValidWeight()) { out.writeFloat(weight); } } /** * @return the type of this AnchorText */ public byte getType() { return type; } /** * Clears this object and initializes the type * * @param type * New type */ public void resetToType(byte type) { this.type = type; if(hasValidText()) { this.text = EMPTY_STRING; } else { this.text = null; } weight = 0; documentList.clear(); } /** * @return the text of this line of anchor text. * Null if the AnchorText object does not * have any text associated with it. */ public String getText() { return text; } /** * Sets the text for this line of anchor text * @param text New text for this anchor text */ public void setText(String text) { if(hasValidText()) { this.text = text; } } /** * @return the weight for this anchor text */ public float getWeight() { return weight; } /** * Sets a new weight for this line of anchor text and changes the type to "weighted" * * @param weight New weight */ public void setWeight(float weight) { if(isExternalInLink()) { this.weight = weight; this.type = Type.WEIGHTED_EXTERNAL_IN_LINK.val; } } /** * @return the cardinality of the set of sources/targets */ public int getSize() { return documentList.size(); } /** * Returns a list of all the sources/targets * * @return An array of ints that contains all the * sources/targets of the current object */ public int[] getDocuments() { return documentList.toIntArray(); } /** * Adds a new source/target to this anchor text. * * @param docno The new document id to be added to the source/target list */ public void addDocument(int docno) { if(!hasValidDocumentList()) { return; } documentList.add(docno); } /** * Adds the sources/targets from another AnchorText to the current object * * @param other The other AnchorText object from which the * sources/targets are to be copied. */ public void addDocumentsFrom(AnchorText other) { if(!hasValidDocumentList()) { return; } IntIterator iterator = other.documentList.iterator(); while(iterator.hasNext()) { addDocument(iterator.next()); } } /** * Checks whether a document is a source or a target for this anchor text * * @param docno Document to be checked * @return True if the document is a source/target of this anchor text */ public boolean containsDocument(int docno) { if(!hasValidDocumentList()) { return false; } return documentList.contains(docno); } /** * Checks whether two lines of anchor text share a source/target document * * @param other The other anchor text * @return True if this line of anchor text has a common source/target * with the other line of anchor text. */ public boolean intersects(AnchorText other) { if(!hasValidDocumentList() || !other.hasValidDocumentList()) { return false; } if(getSize() < other.getSize()) { IntIterator iterator = documentList.iterator(); while(iterator.hasNext()) { if(other.containsDocument(iterator.next())) { return true; } } } else { IntIterator iterator = other.documentList.iterator(); while(iterator.hasNext()) { if(documentList.contains(iterator.next())) { return true; } } } return false; } /** * Checks whether two lines of anchor text are equal, * regardless of their source/target lists. * * @param other The other anchor text to check against. * @return True if the text fields, weights, and types of two * AnchorText objects are equal, regardless * of their source/targets lists. */ public boolean equalsIgnoreSources(AnchorText other) { if(type != other.getType()) { return false; } if(hasValidWeight() && weight != other.getWeight()) { return false; } if(hasValidText()) { return text.equals(other.getText()); } return true; } /** * Does a thorough comparison of two AnchorText objects. */ public boolean equals(Object obj) { AnchorText other = (AnchorText) obj; if(hasValidDocumentList() && other.hasValidDocumentList()) { if(documentList.size() != other.documentList.size()) { return false; } IntIterator iterator = documentList.iterator(); while(iterator.hasNext()) { if(!other.containsDocument(iterator.next())) { return false; } } } return equalsIgnoreSources(other); } /** * For sorting purposes, the comparison is only * limited to the type and the text * of two AnchorText objects. To check complete * equality of the objects use the equals method. */ public int compareTo(AnchorText obj) { byte fl = obj.getType(); String text = obj.getText(); if(type != fl) { return type < fl ? -1 : 1; } if(hasValidText()) { return this.text.compareTo(text); } return 0; } public int hashCode() { if(hasValidText()) { return text.hashCode() + type; } return type; } public String toString() { StringBuilder builder = new StringBuilder(); builder.append("("); if(isExternalInLink()) { builder.append("ExternalInLink"); } else if(isInternalInLink()) { builder.append("InternalInLink"); } else if(isExternalOutLink()) { builder.append("ExternalOutLink"); } else if(isInternalOutLink()) { builder.append("InternalOutLink"); } else if(isDocnoField()) { builder.append("Docno"); } else if(isURL()) { builder.append("URL"); } else if(isInDegree()) { builder.append("Indegree"); } else if(isOutDegree()) { builder.append("Outdegree"); } else { builder.append("OtherType"); } if(hasValidText()) { builder.append(", " + text); } if(hasValidDocumentList()) { builder.append(", " + documentList.toString()); } if(hasValidWeight()) { builder.append(", w:" + weight); } builder.append(")"); return builder.toString(); } /** * Clones (deep copies) this object and returns a new AnchorText object. * * @return A new AnchorText object that is a copy of the current object. */ public AnchorText clone() { AnchorText cloned = new AnchorText(); cloned.resetToType(type); cloned.setText(text); if(hasValidDocumentList()) { cloned.addDocumentsFrom(this); } if(hasValidWeight()) { cloned.weight = weight; } return cloned; } /** * @return True if the current object is any type of * external incoming link (i.e., Weighted, non-weighted, etc.). */ public boolean isExternalInLink() { return type == Type.EXTERNAL_IN_LINK.val || type == Type.WEIGHTED_EXTERNAL_IN_LINK.val; } /** * @return True if the current object is any type of internal incoming link. */ public boolean isInternalInLink() { return type == Type.INTERNAL_IN_LINK.val; } /** * @return True if the current object is any type of external outgoing link. */ public boolean isExternalOutLink() { return type == Type.EXTERNAL_OUT_LINK.val; } /** * @return True if the current object is any type of internal outgoing link. */ public boolean isInternalOutLink() { return type == Type.INTERNAL_OUT_LINK.val; } /** * @return True if the current line of anchor text has a weight associated with it. */ public boolean isWeighted() { return type == Type.WEIGHTED_EXTERNAL_IN_LINK.val; } /** * @return True if the current object is a special type of * AnchorText (i.e., is not a real line of anchor text) - InDegree */ public boolean isInDegree() { return type == Type.IN_DEGREE.val; } /** * @return True if the current object is a special type of * AnchorText (i.e., is not a real line of anchor text) - OutDegree */ public boolean isOutDegree() { return type == Type.OUT_DEGREE.val; } /** * @return True if the current object is a special type of * AnchorText (i.e., is not a real line of anchor text) - Holds document number */ public boolean isDocnoField() { return type == Type.DOCNO_FIELD.val; } /** * @return True if the current object is a special type of * AnchorText (i.e., is not a real line of anchor text) - Holds a URL address */ public boolean isURL() { return type == Type.URL_FIELD.val; } /** * @return True if the current object is a special type of * AnchorText (i.e., is not a real line of anchor text) - Any other types of AnchorText */ public boolean isOfOtherTypes() { return type == Type.OTHER_TYPES.val; } //checks whether the anchor text object has a valid text field. public boolean hasValidText() { return (type == Type.EXTERNAL_IN_LINK.val || type == Type.INTERNAL_IN_LINK.val || type == Type.URL_FIELD.val || type == Type.OTHER_TYPES.val || type == Type.WEIGHTED_EXTERNAL_IN_LINK.val); } //checks whether the current object has a valid list of sources/targets. private boolean hasValidDocumentList() { return type != Type.URL_FIELD.val; } //checks wether the current object has a valid weight associated with it. private boolean hasValidWeight() { return type == Type.WEIGHTED_EXTERNAL_IN_LINK.val; } /** * Creates a new iterator for the current object. The iterator can be used to iterate through * the list of sources/targets associated with the current line of anchor text. * * @return A new iterator object. */ public Iterator<Integer> iterator() { return new Iterator<Integer>() { IntIterator iterator = documentList.iterator(); public boolean hasNext() { return iterator.hasNext(); } public Integer next() { return iterator.next(); } public void remove() { iterator.remove(); } }; } }