/*
* Cloud9: A MapReduce Library for Hadoop
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package edu.umd.cloud9.webgraph.data;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import tl.lin.data.array.ArrayListWritable;
import com.google.common.base.Preconditions;
import edu.umd.cloud9.collection.Indexable;
/**
 * An {@link Indexable} implementation for anchor text/web graph collections, used in
 * generating ForwardIndex.
 *
 * <p>An instance holds a text buffer (either the concatenated anchor text of a document
 * or an HTML rendering of its link structure) plus an optional docid. The buffer is
 * rebuilt from scratch by {@link #concatenateAnchors} and {@link #createHTML}.
 *
 * @author Nima Asadi
 */
public class IndexableAnchorText extends Indexable {
  /** Content-length limit (in chars) used when the caller does not supply one. */
  private static final int DEFAULT_MAX_CONTENT_LENGTH = 1024 * 1024;

  /**
   * Matches a (possibly empty) run of digits delimited on the left by '[' or ',' and on
   * the right by ',' or ']'. Compiled once because {@link Pattern} compilation is
   * comparatively expensive and the pattern never changes.
   */
  private static final Pattern DOCNO_PATTERN =
      Pattern.compile("[\\[,]([\\d&&[^,\\[\\]]]*)[,\\]]");

  // Initialised at declaration so that every constructor starts with a usable buffer.
  private final StringBuilder content = new StringBuilder();
  private boolean hasDocid = false;
  private String docid = null;

  /** Creates an empty instance with no content and no docid. */
  public IndexableAnchorText() {
  }

  /**
   * Creates an instance for {@code docid} whose content is built from {@code anchors}
   * using the default maximum content length.
   *
   * @param docid document identifier; a {@code null} value leaves the docid unset
   * @param anchors anchors to concatenate; must not be {@code null}; sorted in place
   */
  public IndexableAnchorText(String docid, ArrayListWritable<AnchorText> anchors) {
    this(docid, anchors, DEFAULT_MAX_CONTENT_LENGTH);
  }

  /**
   * Creates an instance for {@code docid} whose content is built from {@code anchors}.
   *
   * @param docid document identifier; a {@code null} value leaves the docid unset
   * @param anchors anchors to concatenate; must not be {@code null}; sorted in place
   * @param maxContentLength upper bound used by {@link #concatenateAnchors(ArrayListWritable, int)}
   */
  public IndexableAnchorText(String docid, ArrayListWritable<AnchorText> anchors,
      int maxContentLength) {
    setDocid(docid);
    concatenateAnchors(anchors, maxContentLength);
  }

  /** Empties the content buffer and forgets the docid. */
  public void clear() {
    content.setLength(0);
    hasDocid = false;
    docid = null;
  }

  /**
   * Sets the docid. A {@code null} argument is ignored, i.e. any previously set docid
   * is kept.
   *
   * @param docid the new document identifier, or {@code null} for no change
   */
  public void setDocid(String docid) {
    if (docid == null) {
      return;
    }
    this.docid = docid;
    hasDocid = true;
  }

  /**
   * Same as {@link #concatenateAnchors(ArrayListWritable, int)} with the default maximum
   * content length.
   *
   * @param anchors anchors to concatenate; must not be {@code null}; sorted in place
   */
  public void concatenateAnchors(ArrayListWritable<AnchorText> anchors) {
    concatenateAnchors(anchors, DEFAULT_MAX_CONTENT_LENGTH);
  }

  /**
   * Replaces the content with the space-separated text of the external incoming links
   * found in {@code anchors}.
   *
   * <p>The list is sorted <em>in place</em> first (see {@code AnchorWeightComparator}).
   * Anchors that are not external in-links are skipped. Iteration stops at the first
   * external in-link whose text would push the buffer length past
   * {@code maxContentLength} (the separating space is not counted). An anchor whose text
   * equals the text appended immediately before it is not appended again.
   *
   * @param anchors anchors to concatenate; must not be {@code null}
   * @param maxContentLength length limit, in chars, applied as described above
   * @throws NullPointerException if {@code anchors} is {@code null}
   */
  public void concatenateAnchors(ArrayListWritable<AnchorText> anchors, int maxContentLength) {
    Preconditions.checkNotNull(anchors);
    content.setLength(0);

    Collections.sort(anchors, new AnchorWeightComparator());

    String previous = "";
    for (AnchorText anchor : anchors) {
      if (!anchor.isExternalInLink()) {
        continue;
      }

      String anchorText = anchor.getText();
      if (content.length() + anchorText.length() > maxContentLength) {
        break;
      }

      // Only adjacent duplicates are suppressed; the sort order is what brings
      // identical texts next to each other.
      if (!previous.equals(anchorText)) {
        content.append(anchorText).append(' ');
        previous = anchorText;
      }
    }
  }

  /**
   * Replaces the content with an HTML page describing {@code anchors}.
   *
   * <p>The page title is the text of the last anchor for which {@code isURL()} is true
   * (empty if there is none). Incoming links (external or internal) are listed first,
   * then outgoing links, each rendered with {@code toString()}. Finally every digit run
   * matched by {@code DOCNO_PATTERN} is wrapped in a link to
   * {@code /fetch_docno?docno=<digits>}.
   *
   * <p>NOTE(review): anchor text and URLs are inserted without HTML escaping. If this
   * page is ever served for untrusted crawl data, the values should be escaped; this is
   * left unchanged here because the expected output format cannot be confirmed from
   * this file alone.
   *
   * @param anchors anchors to render
   */
  public void createHTML(ArrayListWritable<AnchorText> anchors) {
    content.setLength(0);

    String url = "";
    for (AnchorText anchor : anchors) {
      if (anchor.isURL()) {
        url = anchor.getText();
      }
    }

    content.append("<html><head><title>").append(url)
        .append("</title></head><body> Incoming Links:<br />");
    for (AnchorText anchor : anchors) {
      if (anchor.isExternalInLink() || anchor.isInternalInLink()) {
        content.append(anchor.toString()).append("<br />");
      }
    }

    content.append("<br />Outgoing Links: <br />");
    for (AnchorText anchor : anchors) {
      if (anchor.isExternalOutLink() || anchor.isInternalOutLink()) {
        content.append(anchor.toString()).append("<br />");
      }
    }

    // Matches are located in the unmodified text while the replacements are applied to
    // a separate copy, so match offsets are not disturbed by the growing output.
    String plain = content.toString();
    String html = plain;
    Matcher m = DOCNO_PATTERN.matcher(plain);
    int start = 0;
    while (m.find(start)) {
      String match = m.group();
      String docno = m.group(1);
      String linked = match.charAt(0) + "<a href=\"/fetch_docno?docno=" + docno + "\">"
          + docno + "</a>" + match.charAt(match.length() - 1);
      html = html.replace(match, linked);
      // Step back one char: the closing delimiter of this match may be the opening
      // delimiter of the next one (e.g. "[1,2,3]").
      start = m.end() - 1;
    }

    content.setLength(0);
    content.append(html);
  }

  /** Returns the current content with leading and trailing whitespace removed. */
  @Override
  public String getContent() {
    return content.toString().trim();
  }

  /** Returns the MIME type under which the content is displayed: {@code text/html}. */
  @Override
  public String getDisplayContentType() {
    return "text/html";
  }

  /** Returns the docid, or {@code null} if none has been set. */
  @Override
  public String getDocid() {
    return docid;
  }

  @Override
  public String toString() {
    return "Docid: " + docid + "\n" + content.toString();
  }

  /**
   * Restores this object from {@code in}, replacing any existing content and docid.
   * The layout mirrors {@link #write(DataOutput)}: an int char count, that many chars,
   * a boolean docid flag and, if the flag is set, the docid in modified UTF-8.
   *
   * @param in source to read from
   * @throws IOException if reading fails
   */
  public void readFields(DataInput in) throws IOException {
    content.setLength(0);
    docid = null;

    char[] stream = new char[in.readInt()];
    for (int i = 0; i < stream.length; i++) {
      stream[i] = in.readChar();
    }
    content.append(stream);

    hasDocid = in.readBoolean();
    if (hasDocid) {
      docid = in.readUTF();
    }
  }

  /**
   * Serialises this object to {@code out}: an int char count, the content as chars,
   * a boolean docid flag and, if the flag is set, the docid in modified UTF-8.
   *
   * @param out sink to write to
   * @throws IOException if writing fails
   */
  public void write(DataOutput out) throws IOException {
    String text = content.toString();
    out.writeInt(text.length());
    out.writeChars(text);

    out.writeBoolean(hasDocid);
    if (hasDocid) {
      out.writeUTF(docid);
    }
  }

  /**
   * Orders anchors by ascending type, then (when both are weighted) by descending
   * weight, then (when both have valid text) by text.
   *
   * <p>Equal weights fall through to the text comparison instead of returning a fixed
   * non-zero value. Returning the same sign for {@code compare(a, b)} and
   * {@code compare(b, a)} would violate the {@link Comparator} contract and can make
   * {@link Collections#sort} throw.
   */
  private static final class AnchorWeightComparator implements Comparator<AnchorText> {
    public int compare(AnchorText a, AnchorText b) {
      if (a.getType() != b.getType()) {
        return a.getType() < b.getType() ? -1 : 1;
      }

      if (a.isWeighted() && b.isWeighted()) {
        if (a.getWeight() > b.getWeight()) {
          return -1;
        }
        if (a.getWeight() < b.getWeight()) {
          return 1;
        }
        // Equal weights: fall through to the text tie-break.
      }

      if (a.hasValidText() && b.hasValidText()) {
        return a.getText().compareTo(b.getText());
      }

      return 0;
    }
  }
}