package com.twitter.elephantbird.mapreduce.input; import java.io.IOException; import java.util.Collection; import com.google.common.base.Preconditions; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import java.util.ArrayList; import java.util.List; /** * Implementation of a Lucene {@link Directory} for reading indexes directly off HDFS. * Note: This implementation is READ ONLY, it cannot be used to write to HDFS. * All non read only methods throw {@link UnsupportedOperationException} * * @author Jimmy Lin */ public class LuceneHdfsDirectory extends Directory { private static final String[] EMPTY_STRING_LIST = new String[0]; private final FileSystem fs; private final Path dir; public LuceneHdfsDirectory(String name, FileSystem fs) { this.fs = Preconditions.checkNotNull(fs, "FileSystem provided to LuceneHdfsDirectory cannot be null"); Preconditions.checkNotNull(name, "File name provided to LuceneHdfsDirectory cannot be null"); dir = new Path(name); try { Preconditions.checkArgument(fs.exists(dir), "Directory: " + dir + " does not exist!"); } catch (IOException e) { throw new RuntimeException(e); } } public LuceneHdfsDirectory(Path path, FileSystem fs) { this.fs = Preconditions.checkNotNull(fs); dir = path; } @Override public void close() throws IOException { } @Override public boolean fileExists(String name) throws IOException { return fs.exists(new Path(dir, name)); } @Override public long fileLength(String name) throws IOException { return fs.getFileStatus(new Path(dir, name)).getLen(); } @Override public String[] listAll() throws IOException { FileStatus[] statuses = fs.listStatus(dir); // some versions of hadoop return null instead of an empty list // or throwing an exception for non-existent directories if (statuses == null) { if (fs.exists(dir)) { return EMPTY_STRING_LIST; } else { throw new IllegalArgumentException("Directory: " + dir + " does not exist!"); } } String[] files = new String[statuses.length]; for (int i = 0; i < statuses.length; i++) { files[i] = statuses[i].getPath().getName(); } return files; } @Override public IndexInput openInput(String name, IOContext context) throws IOException { return new HDFSIndexInput(new Path(dir, name).toString()); } private class HDFSIndexInput extends IndexInput { private Path path; private final FSDataInputStream in; private String resourceDescription; // Lucene never closes cloned IndexInputs, it will only do this on the original one. private final List<HDFSIndexInput> clonedList; protected HDFSIndexInput(String resourceDescription) throws IOException { super(resourceDescription); this.resourceDescription = resourceDescription; path = new Path(resourceDescription); this.in = fs.open(path); this.clonedList = new ArrayList<HDFSIndexInput>(); } @Override public void close() throws IOException { for(HDFSIndexInput clonedIndexInput : clonedList) { clonedIndexInput.close(); } in.close(); } @Override public long getFilePointer() { try { return in.getPos(); } catch (IOException e) { throw new RuntimeException(e); } } @Override public long length() { try { return fs.getFileStatus(path).getLen(); } catch (IOException e) { throw new RuntimeException(e); } } @Override public byte readByte() throws IOException { return in.readByte(); } @Override public void readBytes(byte[] b, int offset, int len) throws IOException { // Important: use readFully instead of read. in.readFully(b, offset, len); } @Override public void seek(long pos) throws IOException { in.seek(pos); } @Override public IndexInput clone() { try { HDFSIndexInput copy = new HDFSIndexInput(resourceDescription); clonedList.add(copy); copy.seek(getFilePointer()); return copy; } catch (IOException e) { throw new RuntimeException(e); } } } // This is a read only implementation, so the following methods are not supported @Override public void deleteFile(String name) throws IOException { throw new UnsupportedOperationException(); } @Override public IndexOutput createOutput(String name, IOContext context) throws IOException { throw new UnsupportedOperationException(); } @Override public void sync(Collection<String> strings) throws IOException { throw new UnsupportedOperationException(); } }