IndexableAnchorTextForwardIndex.java example

Explorer
Cloud9-master
- src
/*
 * Cloud9: A MapReduce Library for Hadoop
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package edu.umd.cloud9.webgraph.data;

import java.io.IOException;
import java.text.DecimalFormat;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;

import tl.lin.data.array.ArrayListWritable;
import edu.umd.cloud9.collection.DocnoMapping;
import edu.umd.cloud9.collection.DocumentForwardIndex;

/**
 * Forward index giving random access to anchor text records by docno or docid.
 *
 * <p>The index file read by {@link #loadIndex(Path, Path, FileSystem)} holds, for
 * each block, a docno, an offset, and a file number. Records are fetched from
 * SequenceFiles named {@code <collectionPath>/part-NNNNN} whose keys are
 * {@link IntWritable} docnos and whose values are
 * {@code ArrayListWritable<AnchorText>}.
 *
 * <p>NOTE: {@link #getDocument(int)} refills and returns one shared static
 * {@link IndexableAnchorText} instance, so a returned document is overwritten
 * by the next call. Callers that need to keep a document must consume it before
 * asking for another one.
 */
public class IndexableAnchorTextForwardIndex implements DocumentForwardIndex<IndexableAnchorText> {

	// Single reusable result object; every getDocument call overwrites its content.
	private static final IndexableAnchorText indexableAnchorText = new IndexableAnchorText();

	// Zero-padded width of the numeric suffix in "part-NNNNN" file names.
	private static final String PART_NUMBER_PATTERN = "00000";

	private Configuration conf;
	private FileSystem fs;

	// Parallel arrays, one entry per index block. docnos is binary-searched, so it
	// is assumed to be stored in ascending order in the index file.
	private int[] docnos;
	private int[] offsets;
	private short[] filenos;
	private String collectionPath;

	private DocnoMapping docnoMapping;

	// Cached result of getLastDocno(); -1 means "not computed yet".
	private int mLastDocno = -1;

	/**
	 * Creates a forward index that resolves docids through the given mapping.
	 *
	 * @param docnoMapping mapping between docids and docnos; its data is loaded
	 *        later by {@link #loadIndex(Path, Path, FileSystem)}
	 */
	public IndexableAnchorTextForwardIndex(DocnoMapping docnoMapping) {
		this.docnoMapping = docnoMapping;
	}

	/**
	 * Loads the docno mapping and the block index, and remembers the file system
	 * so that documents can be read later.
	 *
	 * @param index path of the forward index file
	 * @param mapping path of the docno mapping data
	 * @param fs file system holding the index, the mapping and the collection
	 * @throws IOException if the mapping or the index cannot be read
	 */
	@Override
	public void loadIndex(Path index, Path mapping, FileSystem fs) throws IOException {
		// Keep the file system and its configuration: getDocument and
		// getLastDocno need both to open the collection's SequenceFiles.
		this.fs = fs;
		this.conf = fs.getConf();

		// A previously cached last docno belongs to the previously loaded index.
		mLastDocno = -1;

		docnoMapping.loadMapping(mapping, fs);

		FSDataInputStream in = fs.open(index);
		try {
			// class name; throw away
			in.readUTF();
			collectionPath = in.readUTF();

			int blocks = in.readInt();

			docnos = new int[blocks];
			offsets = new int[blocks];
			filenos = new short[blocks];

			for (int i = 0; i < blocks; i++) {
				docnos[i] = in.readInt();
				offsets[i] = in.readInt();
				filenos[i] = in.readShort();
			}
		} finally {
			// Release the stream even when the index is truncated or unreadable.
			in.close();
		}
	}

	/**
	 * Returns the collection path read from the index file.
	 */
	public String getCollectionPath() {
		return collectionPath;
	}

	/**
	 * Fetches the anchor text of a document.
	 *
	 * @param docno document number to look up
	 * @return the shared {@link IndexableAnchorText} instance filled with the
	 *         document's anchor text, or {@code null} if the docno precedes the
	 *         first indexed block, is not present in the collection file, or an
	 *         I/O error occurs (the error is printed to standard error)
	 */
	public IndexableAnchorText getDocument(int docno) {
		int idx = Arrays.binarySearch(docnos, docno);

		// No exact hit: binarySearch returns -(insertionPoint) - 1, so this picks
		// the block that starts just before docno.
		if (idx < 0)
			idx = -idx - 2;

		// docno is smaller than every indexed docno (or the index is empty).
		if (idx < 0)
			return null;

		String file = partFile(idx);

		try {
			SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(file), conf);
			try {
				IntWritable key = new IntWritable();
				ArrayListWritable<AnchorText> value = new ArrayListWritable<AnchorText>();

				reader.seek(offsets[idx]);

				// Scan forward from the start of the block until the key matches.
				boolean found = false;
				while (reader.next(key)) {
					if (key.get() == docno) {
						found = true;
						break;
					}
				}

				// Reached the end of the file without seeing docno: there is no
				// such record, so do not hand back whatever was read last.
				if (!found)
					return null;

				reader.getCurrentValue(value);

				indexableAnchorText.createHTML(value);
				return indexableAnchorText;
			} finally {
				reader.close();
			}
		} catch (IOException e) {
			// Existing contract: report the failure and return null.
			e.printStackTrace();
		}

		return null;
	}

	/**
	 * Fetches the anchor text of a document identified by its docid.
	 *
	 * @param docid document identifier, translated through the docno mapping
	 * @return same as {@link #getDocument(int)} for the mapped docno
	 */
	public IndexableAnchorText getDocument(String docid) {
		return getDocument(docnoMapping.getDocno(docid));
	}

	/**
	 * Returns the docno that the docno mapping assigns to the given docid.
	 */
	public int getDocno(String docid) {
		return docnoMapping.getDocno(docid);
	}

	/**
	 * Returns the docid that the docno mapping assigns to the given docno.
	 */
	public String getDocid(int docno) {
		return docnoMapping.getDocid(docno);
	}

	/**
	 * Returns the docno stored in the first index block.
	 */
	public int getFirstDocno() {
		return docnos[0];
	}

	/**
	 * Returns the last docno of the collection, computing it on first use by
	 * reading the final block to the end of its file and caching the result.
	 *
	 * @return the last key read from the final block, or {@code -1} if it could
	 *         not be determined because of an I/O error (the error is printed to
	 *         standard error and the lookup is retried on the next call)
	 */
	public int getLastDocno() {
		if (mLastDocno != -1)
			return mLastDocno;

		// find the last entry, and then scan all the way to the end of the
		// collection
		int idx = docnos.length - 1;

		String file = partFile(idx);

		try {
			SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(file), conf);
			try {
				IntWritable key = new IntWritable();

				reader.seek(offsets[idx]);

				// Consume every remaining key; the last one read is what we want.
				while (reader.next(key))
					;
				mLastDocno = key.get();
			} finally {
				reader.close();
			}
		} catch (IOException e) {
			e.printStackTrace();
		}

		return mLastDocno;
	}

	/**
	 * Returns the content type of documents served by this index.
	 */
	public String getContentType() {
		return "text/html";
	}

	/**
	 * Returns the content type used when displaying documents from this index.
	 */
	public String getDisplayContentType() {
		return "text/html";
	}

	/**
	 * Builds the path of the collection file that holds the given index block.
	 * A new formatter is created per call because DecimalFormat instances must
	 * not be shared between threads without synchronization.
	 */
	private String partFile(int idx) {
		return collectionPath + "/part-" + new DecimalFormat(PART_NUMBER_PATTERN).format(filenos[idx]);
	}

}