package edu.umd.cloud9.collection.aquaint2;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import edu.umd.cloud9.collection.DocumentForwardIndex;
/**
* Object representing a document forward index for AQUAINT2 collections.
*
* @author Jimmy Lin
*/
public class Aquaint2ForwardIndex implements DocumentForwardIndex<Aquaint2Document> {
  private static final Logger LOG = Logger.getLogger(Aquaint2ForwardIndex.class);

  // Byte offset of each document in the collection file, indexed by docno (slot 0 is unused).
  private long[] offsets;
  // Byte length of each document, indexed by docno (slot 0 is unused).
  private int[] lengths;
  // Open stream on the collection file; populated by loadIndex().
  private FSDataInputStream input;
  private Aquaint2DocnoMapping docnoMapping = new Aquaint2DocnoMapping();
  private String collectionPath;

  /**
   * Returns the docno for a docid, as reported by the docno mapping.
   *
   * @param docid document identifier
   * @return the corresponding docno
   */
  @Override
  public int getDocno(String docid) {
    return docnoMapping.getDocno(docid);
  }

  /**
   * Returns the docid for a docno, as reported by the docno mapping.
   *
   * @param docno document number
   * @return the corresponding docid
   */
  @Override
  public String getDocid(int docno) {
    return docnoMapping.getDocid(docno);
  }

  /**
   * Returns the last docno covered by the loaded index. Requires that
   * {@link #loadIndex(Path, Path, FileSystem)} has been called.
   *
   * @return the largest valid index into the offset table
   */
  @Override
  public int getLastDocno() {
    return offsets.length - 1;
  }

  /**
   * Returns the first docno, which is always 1.
   *
   * @return 1
   */
  @Override
  public int getFirstDocno() {
    return 1;
  }

  /**
   * Returns the collection path that was read from the index file.
   *
   * @return path of the collection, or {@code null} if no index has been loaded
   */
  @Override
  public String getCollectionPath() {
    return collectionPath;
  }

  /**
   * Fetches a document by docid by first resolving it to a docno.
   *
   * @param docid document identifier
   * @return the document, as returned by {@link #getDocument(int)}
   */
  @Override
  public Aquaint2Document getDocument(String docid) {
    return getDocument(docnoMapping.getDocno(docid));
  }

  /**
   * Fetches a document by docno by seeking to its recorded byte offset in the collection file
   * and reading its recorded number of bytes.
   *
   * <p>If an {@link IOException} occurs, the error is logged and the unpopulated document object
   * is returned; the exception is not propagated.
   *
   * @param docno document number, between {@link #getFirstDocno()} and {@link #getLastDocno()}
   * @return the document
   */
  @Override
  public Aquaint2Document getDocument(int docno) {
    Aquaint2Document doc = new Aquaint2Document();

    try {
      if (LOG.isDebugEnabled()) {
        LOG.debug("docno " + docno + ": byte offset " + offsets[docno] + ", length "
            + lengths[docno]);
      }

      input.seek(offsets[docno]);

      byte[] arr = new byte[lengths[docno]];
      // read(byte[]) may return fewer bytes than requested; readFully either fills the
      // whole buffer or throws, so the document is never silently truncated.
      input.readFully(arr);

      // NOTE(review): decodes with the platform default charset; confirm the collection's
      // encoding and consider specifying it explicitly.
      Aquaint2Document.readDocument(doc, new String(arr));
    } catch (IOException e) {
      LOG.error("Error reading docno " + docno + " from " + collectionPath, e);
    }

    return doc;
  }

  /**
   * Loads the forward index and the docno mapping, and opens the collection file for reading.
   *
   * <p>The index file is read as: a UTF string (ignored), a UTF string holding the collection
   * path, an int document count, and then one (long offset, int length) pair per document.
   *
   * @param index path of the forward index file
   * @param mapping path of the docno mapping file
   * @param fs file system used to open the index, the collection, and the mapping
   * @throws IOException if the index or the collection cannot be read
   */
  @Override
  public void loadIndex(Path index, Path mapping, FileSystem fs) throws IOException {
    FSDataInputStream in = fs.open(index);

    try {
      // Read and throw away.
      in.readUTF();
      collectionPath = in.readUTF();

      // Docnos start at one, so we need an array that's one larger than number of docs.
      int sz = in.readInt() + 1;

      offsets = new long[sz];
      lengths = new int[sz];

      for (int i = 1; i < sz; i++) {
        offsets[i] = in.readLong();
        lengths[i] = in.readInt();
      }
    } finally {
      // Release the index stream even when the index is truncated or malformed.
      in.close();
    }

    input = fs.open(new Path(collectionPath));
    docnoMapping.loadMapping(mapping, fs);
  }
}