PetaboxFileSystem.java example

Explorer

ia-hadoop-tools-master
- src
  - main
    - java
      - org
        archive
        cassandra
        BOP.java
        CDXImporter.java
        CDXToSSTable.java
        CassCDXOutputFormat.java
        CassCDXRecordWriter.java
        CassCDXStore.java
        hadoop
        cdx
        BlockLoader.java
        CDXCluster.java
        CDXClusterRangeDumper.java
        CDXConverterTool.java
        ClusterRange.java
        HDFSBlockLoader.java
        HDFSLSR.java
        HDFSRangeDumper.java
        ManifestAggregator.java
        SplitFile.java
        SummaryGenerator.java
        ZipNumBlock.java
        ZipNumBlockIterator.java
        fs
        CollectionIndexItemSearcher.java
        IAS3.java
        ItemSearcher.java
        MetaManagerItemSearcher.java
        PetaboxFileSystem.java
        SearchEngineItemSearcher.java
        io
        HDFSTouch.java
        MergeClusterRangesInputFormat.java
        MergeClusterRangesInputSplit.java
        jobs
        ArchiveFileExtractor.java
        BuildCluster.java
        CDXGenerator.java
        CDXTransformer.java
        HTTPImportJob.java
        JobDriver.java
        MergeCluster.java
        MergeClusterRangesJob.java
        MergeClusters.java
        ProgramDriver.java
        WARCMetadataRecordGenerator.java
        WATExtractorJob.java
        WATGenerator.java
        mapreduce
        AlphaPartitioner.java
        CDXInputFormat.java
        CDXMapper.java
        GZIPMembersLineInputFormat.java
        GZIPMembersLineRecordReader.java
        GZIPRangeLineDereferencingInputFormat.java
        GZIPRangeLineDereferencingRecordReader.java
        GlobalWaybackCDXReducer.java
        GlobalWaybackMergeMapper.java
        HTTPImportMapper.java
        HttpLineRecordReader.java
        HttpTextInputFormat.java
        IdentityTextReducer.java
        LFOnlyLineReader.java
        LFOnlyLineRecordReader.java
        LineDereferencingInputFormat.java
        LineDereferencingRecordReader.java
        SimpleTextMapper.java
        SortMergeInputFormat.java
        SortMergeInputSplit.java
        WATExtractorMapper.java
        ZipNumAllOutputFormat.java
        ZipNumAllShardRecordWriter.java
        ZipNumOutputFormat.java
        ZipNumPartitioner.java
        ZipNumRecordWriter.java
        ZipNumRecordWriterOld.java
        pig
        CDXLoader.java
        CrawlLogLoader.java
        DateFilter.java
        DateFilterLSRLoader.java
        DisablablePathFilter.java
        FirstPigJobOnlyFilter.java
        HBaseStorage.java
        HFileStorage.java
        HttpClusterInputSplit.java
        HttpInputLineRecordReader.java
        HttpTextLoader.java
        HttpZipNumDerefLineRecordReader.java
        LSRPigLoader.java
        PerMapTextLoader.java
        RecordReaderValueIterator.java
        SequenceFileStorage.java
        SortedDistinctBagFactory.java
        ZipNumInputFormat.java
        ZipNumLoader.java
        ZipNumPartitioner.java
        ZipNumRecordReader.java
        ZipNumStorage.java
        udf
        AccessControlAllowCapture.java
        DateTime14ToTimestamp.java
        FromJsonFunc.java
        Recanonicalize.java
        ToJsonFunc.java
        toSURT.java
        streaming
        CdxDedupReducer.java
        CdxFilterMap.java
        CombineZipNumInputFormat.java
        CombinedGzipInputFormat.java
        GzipInputFormat.java
        GzipSingleFileRecordReader.java
        NativeZipNumOutputFormat.java
        NativeZipNumRecordWriter.java
        RecordReaderValueIterator.java
        ZipNumInputFormat.java
        ZipNumOutputFormat.java
        ZipNumPartitioner.java
        ZipNumRecordReader.java
        ZipNumRecordWriter.java
        util
        FilenameInputFormat.java
        HDFSMove.java
        HDFSSync.java
        HDFSeeko.java
        HadoopUtil.java
        PartitionName.java
        io
        ZipNumWriterTool.java
        petabox
        CookieFilePetaboxCredentialProvider.java
        HMACPetaboxAuthProvider.java
        ItemFile.java
        ItemMetadata.java
        PetaboxAuthProvider.java
        PetaboxClient.java
        PetaboxClientConfig.java
        PetaboxCredentialProvider.java
        server
        FileBackedInputStream.java
        GZRangeClient.java
        GZRangeClientTool.java
        GZRangeServer.java
  - test
    - java
      - org
        archive
        hadoop
        cdx
        CDXClusterTest.java
        SplitFileTest.java
        fs
        PetaboxFileSystemTest.java
        func
        URLResolverFuncTest.java
        pig
        CDXLoaderTest.java
        udf
        DateTime14ToTimestampTest.java
        petabox
        ItemMetadataTest.java
        server
        GZRangeClientTest.java

/**
 * 
 */
package org.archive.hadoop.fs;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.collections.map.LRUMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.util.Progressable;
import org.apache.http.HttpEntity;
import org.apache.http.client.HttpClient;
import org.archive.hadoop.util.HadoopUtil;
import org.archive.petabox.CookieFilePetaboxCredentialProvider;
import org.archive.petabox.HMACPetaboxAuthProvider;
import org.archive.petabox.ItemFile;
import org.archive.petabox.ItemMetadata;
import org.archive.petabox.PetaboxClient;
import org.archive.petabox.PetaboxClientConfig;
import org.archive.petabox.PetaboxCredentialProvider;

/**
 * PetaboxFileSystem allows Hadoop MapReduce to read data directly out of Internet Archive's
 * Petabox storage.
 * Only read operations are supported.
 * @author Kenji Nagahashi
 *
 */
public class PetaboxFileSystem extends FileSystem {
	private static Log LOG = LogFactory.getLog(PetaboxFileSystem.class);
	
	/**
	 * default value for {@link #downloadPrefix}. 
	 * "/serve" is the preferred value as it doesn't increment download counter.
	 */
	public static final String DEFAULT_DOWNLOAD_PREFIX = "/serve";
	/**
	 * URL path prefix for downloading a file in an item.
	 */
	private String downloadPrefix;
	
	protected URI fsUri;
	private Path cwd = new Path("/");

	private static PetaboxFileSystem defaultInstance;

	protected PetaboxClient pbclient;

	protected String urlTemplate = null;

	//  protected HttpClient client;

	private String hadoopJobInfo;

	// metadata cache
	private LRUMap metadataCache = new LRUMap(200);
	/**
	 * maximum number of concurrent metadata API queries
	 * (i.e. max number of threads to be used for making metadata API
	 * queries)
	 */
    private int maxMetadataConcurrency = 10;


	private Class<? extends ItemSearcher> itemSearcherClass = SearchEngineItemSearcher.class;
	protected ItemSearcher itemSearcher;

	/**
	 * if non-null and non-emtpy, {@link #listStatus(Path)} will only return
	 * those files of specified media types (unless explicitly specified).
	 */
	protected String[] fileTypes;

	public PetaboxFileSystem() {
	}

	public static class HadoopPetaboxClientConfig implements PetaboxClientConfig {
		private Configuration conf;
		private String confbase;
		public HadoopPetaboxClientConfig(Configuration conf, String confbase) {
			this.conf = conf;
			this.confbase = confbase;
		}
		@Override
		public String getString(String name) {
			return conf.get(confbase + "." + name);
		}
		@Override
		public int getInt(String name, int defaultValue) {
			return conf.getInt(confbase + "." + name, defaultValue);
		}
		@Override
		public boolean getBoolean(String name, boolean defaultValue) {
			return conf.getBoolean(confbase + "." + name, defaultValue);
		}
	}

	/**
	 * loads credentials from cookie file if not set in configuration. loaded credentials
	 * are put into Configuration to make it available in MR tasks. this may not work well
	 * in non-Pig environment, where PBFS is not instantiated in submitting host.
	 * @author kenji
	 */
	public static class ConfigurationPetaboxCredentialProvider extends PetaboxCredentialProvider {
		private String user;
		private String sig;
		public ConfigurationPetaboxCredentialProvider(Configuration conf, String confbase) {
			user = conf.get(confbase + ".user");
			sig = conf.get(confbase + ".sig");
			// if not set as configuration property, try loading ~/.iaauth
			if (user == null) {
				CookieFilePetaboxCredentialProvider provider = new CookieFilePetaboxCredentialProvider();
				user = provider.getUser();
				sig = provider.getSignature();
				if (user == null) {
					LOG.warn("could not load petabox credentials with " + provider.getClass().getName());
				} else {
					LOG.info("loaded petabox credentials for user \"" + user + "\" with " +
							provider.getClass().getName());
					conf.set(confbase + ".user", user);
					conf.set(confbase + ".sig", sig);
				}
			} else {
				LOG.info("loaded petabox credentials for user \"" + user + "\" from Hadoop configuration");
			}
		}
		@Override
		public String getUser() {
			return user;
		}
		@Override
		public String getSignature() {
			return sig;
		}
	}
	/**
	 * new HMAC-based authentication.
	 * @author kenji
	 */
	public static class ConfigurationHMACPetaboxAuthProvider extends HMACPetaboxAuthProvider {
	    public ConfigurationHMACPetaboxAuthProvider(Configuration conf, String confbase) {
	        secretKey = conf.get(confbase + ".secret");
	        if (secretKey == null) {
	            LOG.warn("HMAC secret key is unavailable - please set " + confbase + ".secret parameter");
	            secretKey = "";
	        }
        }
	}
	
	@Override
	public void initialize(URI name, final Configuration conf) throws IOException {
		super.initialize(name, conf);
		// name.path is not necessarily be "/". it typically is what's passed to LOAD (i.e.
		// may contain wild-cards.)
		this.fsUri = name;
		this.cwd = new Path("/");

		final String confbase = "fs." + fsUri.getScheme();

		PetaboxClientConfig pbconf = new HadoopPetaboxClientConfig(conf, confbase);	
		this.pbclient = new PetaboxClient(pbconf);
		this.pbclient.setPetaboxHost(fsUri.getAuthority());
		this.pbclient.setReferer(getHadoopJobInfo());
		
		String authType = conf.get(confbase + ".auth-type", "hmac");
		if ("user".equals(authType)) {
	        this.pbclient.setAuthProvider(new ConfigurationPetaboxCredentialProvider(conf, confbase));
		} else {
		    this.pbclient.setAuthProvider(new ConfigurationHMACPetaboxAuthProvider(conf, confbase));
		}

		this.downloadPrefix = conf.get(confbase + ".download-prefix", DEFAULT_DOWNLOAD_PREFIX);
		
		this.urlTemplate = conf.get(confbase + ".url-template");

		// this is hdfs:// URI for job staging, which contains job tracker
		// host:port, user name and JobID. Somewhat ugly, but provides useful info.
		this.hadoopJobInfo = conf.get("mapreduce.job.dir", "-");

		// ensure dependencies get shipped
		HadoopUtil.addDependencyJars(conf, HttpClient.class, HttpEntity.class, LRUMap.class);

		String itemSearcherClassName = conf.get(confbase + ".itemsearcher");
		if (itemSearcherClassName != null) {
			try {
				this.itemSearcherClass = Class.forName(itemSearcherClassName).asSubclass(ItemSearcher.class);
			} catch (ClassNotFoundException ex) {
				throw new IOException(confbase + ".itemsearcher=" + itemSearcherClassName + ": undefined class");
			}
		}
		try {
			LOG.info("initializing ItemSearcher " + this.itemSearcherClass.getName());
			this.itemSearcher = this.itemSearcherClass.newInstance();
			this.itemSearcher.initialize(this, fsUri, conf);
		} catch (Exception ex) {
			throw new IOException(this.itemSearcherClass.getName() + ": instantiation failed");
		}

		String fileTypes = conf.get(confbase + ".file-types");
		if (fileTypes != null && !fileTypes.isEmpty()) {
			String[] a = fileTypes.split("(\\s*,)+\\s*");
			for (int i = 0; i < a.length; i++) {
				a[i] = a[i].trim();
			}
			this.fileTypes = a;
		}
		
		maxMetadataConcurrency = conf.getInt(confbase + ".max-metadata-concurrency",
		        maxMetadataConcurrency);

		LOG.info("PetaboxFileSystem.initialize:fsUri=" + fsUri);
		if (defaultInstance == null)
			defaultInstance = this;
	}

	/**
	 * return the first PetaboxFileSystem initialized in this JVM.
	 * convenience method for Hadoop scripting classes that want to provide
	 * quick access to PetaboxFileSystem instance.
	 * @return PetaboxFileSystem or null if no instance has been initialized yet.
	 */
	public static PetaboxFileSystem getDefaultInstance() {
		return defaultInstance;
	}

	protected String getHadoopJobInfo() {
		return hadoopJobInfo;
	}
	protected static class ReadOnlyFileSystemException extends IOException {
		private static final long serialVersionUID = -6540111017708582671L;
		public ReadOnlyFileSystemException() {
			super("PetaboxFileSystem is read-only");
		}
	}
	final static long parseLong(String o) {
		if (o == null || o.equals("")) return 0;
		try {
			return Long.parseLong(o);
		} catch (NumberFormatException ex) {
			return 0;
		}
	}
	final static String getString(Map<String, Object> map, String key) {
		Object o = map.get(key);
		return o != null ? o.toString() : null;
	}
	final static boolean getBoolean(Map<String, Object> map, String key) {
		return getBoolean(map, key, false);
	}
	final static boolean getBoolean(Map<String, Object> map, String key, boolean defaultValue) {
		Object o = map.get(key);
		if (o instanceof Boolean) {
			return (Boolean)o;
		} else {
			return defaultValue;
		}
	}
	protected ItemMetadata getItemMetadata(String itemid) throws IOException {
		ItemMetadata md;
		synchronized (metadataCache) {
		    md = (ItemMetadata)metadataCache.get(itemid);
		}
		if (md == null) {
			md = pbclient.getItemMetadata(itemid);
			if (md != null) {
			    synchronized (metadataCache) {
			        metadataCache.put(itemid, md);
			    }
			}
		}
		return md;
	}

	/* (non-Javadoc)
	 * @see org.apache.hadoop.fs.FileSystem#append(org.apache.hadoop.fs.Path, int, org.apache.hadoop.util.Progressable)
	 */
	@Override
	public FSDataOutputStream append(Path f, int bufferSize, Progressable progress)
			throws IOException {
		throw new ReadOnlyFileSystemException();
	}

	/* (non-Javadoc)
	 * @see org.apache.hadoop.fs.FileSystem#create(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.permission.FsPermission, boolean, int, short, long, org.apache.hadoop.util.Progressable)
	 */
	@Override
	public FSDataOutputStream create(Path f, FsPermission permission,
			boolean overwrite, int bufferSize, short replication, long blockSize,
			Progressable progress) throws IOException {
		throw new ReadOnlyFileSystemException();
	}

	/* (non-Javadoc)
	 * @see org.apache.hadoop.fs.FileSystem#delete(org.apache.hadoop.fs.Path)
	 */
	@Override
	public boolean delete(Path f) throws IOException {
		throw new ReadOnlyFileSystemException();
	}

	/* (non-Javadoc)
	 * @see org.apache.hadoop.fs.FileSystem#delete(org.apache.hadoop.fs.Path, boolean)
	 */
	@Override
	public boolean delete(Path f, boolean recursive) throws IOException {
		throw new ReadOnlyFileSystemException();
	}

	/* (non-Javadoc)
	 * @see org.apache.hadoop.fs.FileSystem#getFileStatus(org.apache.hadoop.fs.Path)
	 * Path is mapped to Petabox structure as follows:
	 * <ul>
	 * <li><tt>/</tt><it>ITEMID</it>: directory
	 * <li><tt>/</tt><it>ITEMID</it><tt>/</tt><it>FILENAME</it>: file
	 * </ul>
	 */
	@Override
	public FileStatus getFileStatus(Path f) throws IOException {
		LOG.info("getFileStatus:" + f);
		FileStatus fstat = null;
		int depth = f.depth();
		if (depth == 1) {
			// Item
			String itemid = f.getName();
			ItemMetadata md = getItemMetadata(itemid);
			//long mtime = md.getUpdated();
			// in a strict sense, mtime should be the largest of
			// addeddate, mtime of _meta.xml and mtime of _files.xml. As file entry for _files.xml
			// does not have mtime property, we'd need to take the largest of file's mtime. Since
			// we don't expect any serious use of FileStatus's mtime, we simply return addeddate as
			// mtime.
			long mtime = md.getAddedDate();
			Path qf = makeQualified(f);
			// Note mtime is in seconds and FileStatus wants milliseconds.
			fstat = new FileStatus(0, true, md.isSolo() ? 1 : 2, 4096, mtime * 1000, qf);
		} else if (depth == 2) {
			// Path is relative (more precisely, no scheme and authority) while Pig is 
			// enumerating target files, but it can be absolute in other use cases. 
			// Don't assume Path is relative.
			Path itempath = f.getParent();
			if (itempath == null) {
				// this should not happen because depth == 2
				throw new FileNotFoundException(f + "parent is null despite depth==2");
			}
			String itemid = itempath.getName();
			String fn = f.getName();
			ItemMetadata md = getItemMetadata(itemid);
			for (int i = 0; i < md.getFiles().length; i++) {
				ItemFile ifile = md.getFiles()[i];
				if (ifile != null && fn.equals(ifile.getName())) {
					// we need to fully URI-qualified Path. Since f has path part only, it will
					// be interpreted as a local/HDFS file.
					Path qf = makeQualified(f);
					// Note ifile.mtime is in seconds and FileStatus wants milliseconds.
					fstat = new FileStatus(ifile.getSize(), false, md.isSolo() ? 1 : 2, 4096, 
							ifile.getMtimeMS(), qf);
					break;
				}
			}
		} else {
			// file - currently only depth <= 2 is supported.
			throw new IOException(f.toString() + ": only 2-depth path is supported.");
		}
		return fstat;
	}

	/* (non-Javadoc)
	 * @see org.apache.hadoop.fs.FileSystem#getUri()
	 */
	@Override
	public URI getUri() {
		return fsUri;
	}

	/* (non-Javadoc)
	 * @see org.apache.hadoop.fs.FileSystem#getWorkingDirectory()
	 */
	@Override
	public Path getWorkingDirectory() {
		return cwd;
	}

	protected static long isodatetime2timestamp(String isodatetime) {
		if (isodatetime == null) return 0;
		DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ENGLISH);
		try {
			Date date = df.parse(isodatetime);
			return date.getTime();
		} catch (ParseException ex) {
			return 0;
		}
	}

	/**
	 * return PetaboxClient setup for this instance.
	 * this method may be removed by refactoring in near future.
	 * @return
	 */
	public PetaboxClient getPetaboxClient() {
		return pbclient;
	}

	/**
	 * enumerate items in collection {@code itemid} with Search Engine.
	 * only returns index items (i.e. NoIndex items are excluded).
	 * @param itemid collection identifier
	 * @return array of FileStatus for (non-collection) items. 
	 * @throws IOException
	 */
	protected FileStatus[] searchItems(String itemid) throws IOException {
	    LOG.info("querying items in collection " + itemid + " with " + 
	            itemSearcher.getClass().getName());
		return itemSearcher.searchItems(itemid);
	}

	private boolean accepted(ItemFile ifile) {
		if (fileTypes == null || fileTypes.length == 0) return true;
		if (ifile.getFormat() == null) return false;
		for (int i = 0; i < fileTypes.length; i++) {
			if (ifile.getFormat().equals(fileTypes[i])) return true;
		}
		return false;
	}

	/* (non-Javadoc)
	 * @see org.apache.hadoop.fs.FileSystem#listStatus(org.apache.hadoop.fs.Path)
	 */
	@Override
	public FileStatus[] listStatus(Path f) throws IOException {
		// TODO: we could optimize the query and return values, by inferring actually wanted files
		// from fsUri, which contains original FileSpec pattern string.
		int depth = f.depth();
		if (depth == 1) {
			// Item
			String itemid = f.getName();
			ItemMetadata md = getItemMetadata(itemid);
			// if itemid is a collection, list all items in the collection via search engine.
			// this returns FileStatuses with 1-depth canonical Path. Hadoop is just fine with it.  
			if (md.isCollection()) {
			    LOG.info("enumerating items in collection " + itemid);
				return searchItems(itemid);
			}
			// for regular item, return list of files in it.
			LOG.debug("enumerating files in item " + itemid);
			if (md.getFiles() != null) {
				List<FileStatus> files = new ArrayList<FileStatus>();
				Path qf = makeQualified(f);
				for (int i = 0; i < md.getFiles().length; i++) {
					ItemFile ifile = md.getFiles()[i];
					if (!accepted(ifile)) continue;
					// perhaps blocksize should be much larger than this to prevent Hadoop from splitting input
					// into overly small fragments, as range requests incur relatively high overhead.
					files.add(new FileStatus(ifile.getSize(), false, md.isSolo() ? 1 : 2, 4096, 
							ifile.getMtimeMS(), new Path(qf, ifile.getName())));
				}
				return files.toArray(new FileStatus[files.size()]);
			} else {
				// metadata API response had no "files" key - assume there's no files.
				return new FileStatus[0];
			}
		} else {
			// file - currently only depth 2 is supported.
			return new FileStatus[0];
		}
	}
	
	class ListStatusTask implements Runnable {
	    private List<FileStatus> results;
	    private Path file;
	    private PathFilter filter;
	    public ListStatusTask(List<FileStatus> results, Path file, PathFilter filter) {
	        this.results = results;
	        this.file = file;
	        this.filter = filter;
	    }
	    @Override
	    public void run() {
	        try {
	            FileStatus listing[] = listStatus(file);
	            if (listing != null) {
	                for (FileStatus s : listing) {
	                    if (filter.accept(s.getPath())) {
	                        results.add(s);
	                    }
	                }
	            }
	        } catch (IOException ex) {
	            // TODO: would want to abort the job if too many
	            // items are failing to resolve.
	            LOG.error("listStatus(" + file + " failed", ex);
	        }
	    }
	}
	
	/**
	 * {@inheritDoc}
	 * <p>overridden to make concurrent metadata API queries for large
	 * number of {@code files} (currently threashold is hard-coded as 50).
	 * maximum number of concurrent queries is controlled by
	 * configuration parameter {@code max-metadata-concurrency}.</p>
	 */
	@Override
	public FileStatus[] listStatus(Path[] files, PathFilter filter)
	        throws IOException {
	    // don't use multiple threads for small number of files.
	    if (files.length < 50 || maxMetadataConcurrency < 2) {
	        return super.listStatus(files, filter);
	    }

	    LOG.info("listing files in " + files.length + " paths concurrently.");
	    ExecutorService executor = Executors.newFixedThreadPool(
	            Math.min(maxMetadataConcurrency, files.length));
	    List<FileStatus> results = Collections.synchronizedList(new ArrayList<FileStatus>());
	    for (Path file : files) {
	        // listStatus(Path) can return something only for Path of depth==1.
	        if (file.depth() == 1) {
	            executor.execute(new ListStatusTask(results, file, filter));
	        }
	    }
	    executor.shutdown();
	    try {
	        executor.awaitTermination(24L, TimeUnit.HOURS);
	    } catch (InterruptedException ex) {
	        throw new IOException("interrupted.");
	    }
	    return results.toArray(new FileStatus[results.size()]);
	}
	
	/* (non-Javadoc)
	 * @see org.apache.hadoop.fs.FileSystem#mkdirs(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.permission.FsPermission)
	 */
	@Override
	public boolean mkdirs(Path f, FsPermission permission) throws IOException {
		throw new ReadOnlyFileSystemException();
	}

	/**
	 * returns real URI for reading a resource represented by FileSystem URI {@code uri}.
	 * {@link #urlTemplate} is used as template if non-null.
	 * @param uri
	 * @return
	 * @throws URISyntaxException generated URI is in bad syntax (bad uriTemplate)
	 */
	protected URI getRealURI(URI uri) throws URISyntaxException {
		if (urlTemplate == null) {
			return new URI("http", fsUri.getAuthority(), downloadPrefix + uri.getPath(), null);
		}
		String path = uri.getPath(); // should be in /ITEM/FILE format.
		StringBuffer sb = new StringBuffer();
		// TODO: we could pre-parse urlTemplate for faster processing and early error reporting.
		Pattern p = Pattern.compile("%.");
		Matcher m = p.matcher(urlTemplate);
		while (m.find()) {
			switch (urlTemplate.charAt(m.end() - 1)) {
			case 'f':
				m.appendReplacement(sb, path);
				break;
			case '%':
				m.appendReplacement(sb, "%");
				break;
			default:
				throw new URISyntaxException(urlTemplate, "unrecognized %-escape in urlTemplate:" + m.group());
			}
		}
		m.appendTail(sb);
		return URI.create(sb.toString());
	}

	/* (non-Javadoc)
	 * @see org.apache.hadoop.fs.FileSystem#open(org.apache.hadoop.fs.Path, int)
	 */
	@Override
	public FSDataInputStream open(Path f, int bufferSize) throws IOException {
		LOG.info("PetaboxFileSystem.open(" + f + ")");
		if (f.depth() == 1) {
			throw new IOException("is an item");
		}
		URI vURI = f.toUri();
		// XXX what if f has scheme and authority different from this FileSystem?
		// TODO: probably we should resolve item location by ourselves and let user put
		// preference on either primary or secondary.
		URI physURI;
		try {
			physURI = getRealURI(vURI);
			return new FSDataInputStream(pbclient.openURI(physURI));
		} catch (URISyntaxException ex) {
			throw new IOException("download URI construction failed: " + vURI.getPath());
		}
	}

	/* (non-Javadoc)
	 * @see org.apache.hadoop.fs.FileSystem#rename(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path)
	 */
	@Override
	public boolean rename(Path src, Path dst) throws IOException {
		throw new ReadOnlyFileSystemException();
	}

	/* (non-Javadoc)
	 * @see org.apache.hadoop.fs.FileSystem#setWorkingDirectory(org.apache.hadoop.fs.Path)
	 */
	@Override
	public void setWorkingDirectory(Path newDir) {
		cwd = newDir;
	}

	@Override
	public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
			long len) throws IOException {
		if (file == null) return null;
		if (start < 0 || len < 0) throw new IllegalArgumentException("Invalid start or len parameter");
		if (file.getLen() < start) return new BlockLocation[0];

		Path f = file.getPath();
		if (f.depth() == 2) {
			// Path is qualified.
			URI uri = f.toUri();
			String sf = uri.getPath();
			int ps = sf.indexOf(Path.SEPARATOR_CHAR, 1);
			String itemid = ps == -1 ? /* should not happen */ sf : sf.substring(1, ps);
			ItemMetadata md = getItemMetadata(itemid);
			List<String> name = new ArrayList<String>(2);
			List<String> host = new ArrayList<String>(2);
			if (md.getD1() == null && md.getD2() == null) {
				// unlikely to happen unless the item has just disappeared.
				if (md.getServer() != null) {
					name.add(md.getServer() + ":80");
					host.add(md.getServer());
				}
			} else {
				if (md.getD1() != null && !md.getD1().equals("")) {
					name.add(md.getD1() + ":80");
					host.add(md.getD1());
				}
				if (md.getD2() != null && !md.getD2().equals("")) {
					name.add(md.getD2() + ":80");
					host.add(md.getD2());
				}
			}
			LOG.debug("getFileBlockLocations sf=" + sf + " : " + host);
			if (name.isEmpty()) {
				return new BlockLocation[0];
			} else {
				return new BlockLocation[] {
						new BlockLocation(name.toArray(new String[name.size()]),
								host.toArray(new String[host.size()]), 0, file.getLen())
				};
			}
		} else {
			return new BlockLocation[0];
		}
	}	
}