package eu.fbk.knowledgestore.data;

import java.io.File;
import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.net.URI;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;

import javax.annotation.Nullable;

import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import eu.fbk.rdfpro.util.IO;

// NOTE: the current implementation rewrites the whole dictionary file each time a mapping is
// added, always keeping a backup copy. This is safe, but performance is severely limited when
// many insertions are performed. A better scheme based on a log file should be used.

/**
 * A persistent, synchronized, monotonic (add-only) dictionary mapping positive {@code int} keys
 * to {@code Serializable} objects.
 * <p>
 * Keys are 1-based: the object with key {@code k} is stored at position {@code k - 1} of the
 * in-memory list. The in-memory indexes are immutable snapshots that are replaced as a whole
 * whenever the backing file is (re)loaded or a new mapping is written; reloads and updates are
 * serialized through {@code synchronized} methods.
 * </p>
 *
 * @param <T>
 *            the type of object stored in the dictionary
 */
public abstract class Dictionary<T extends Serializable> {

    private static final long MAX_CLOCK_SKEW = 60 * 1000; // 60 sec

    private final Class<T> clazz;

    private final String url;

    private volatile List<T> keyToObjectIndex; // immutable, replaced on reload

    private volatile Map<T, Integer> objectToKeyIndex; // immutable, replaced on reload

    private long lastAccessed; // accessed only inside synchronized methods

    /**
     * Creates a dictionary backed by a file of the local filesystem, loading its current content
     * (if any).
     *
     * @param objectClass
     *            the class of the objects stored in the dictionary, not null
     * @param file
     *            the file holding the dictionary data, not null
     * @return the initialized dictionary
     * @throws IOException
     *             if the existing dictionary data cannot be read
     */
    public static <T extends Serializable> Dictionary<T> createLocalDictionary(
            final Class<T> objectClass, final File file) throws IOException {

        // Check parameters
        Preconditions.checkNotNull(objectClass);
        Preconditions.checkNotNull(file);

        // Build, initialize and return the dictionary
        final Dictionary<T> dictionary = new LocalDictionary<T>(objectClass, file.toURI()
                .toString(), file);
        dictionary.reload();
        return dictionary;
    }

    /**
     * Creates a dictionary backed by a file accessed through the Hadoop {@code FileSystem} API,
     * loading its current content (if any).
     *
     * @param objectClass
     *            the class of the objects stored in the dictionary, not null
     * @param fileURL
     *            the Hadoop URL of the file holding the dictionary data, not null
     * @return the initialized dictionary
     * @throws IOException
     *             if the filesystem cannot be accessed or the existing data cannot be read
     */
    public static <T extends Serializable> Dictionary<T> createHadoopDictionary(
            final Class<T> objectClass, final String fileURL) throws IOException {

        // Check parameters
        Preconditions.checkNotNull(objectClass);
        Preconditions.checkNotNull(fileURL);

        // Resolve the supplied Hadoop URL, retrieving FileSystem and Path objects
        final FileSystem fs = FileSystem.get(URI.create(fileURL),
                new org.apache.hadoop.conf.Configuration(true));
        final Path path = new Path(URI.create(fileURL).getPath());

        // Build normalized URL: exactly one '/' between filesystem URI and path
        String urlBase = fs.getUri().toString();
        String urlPath = path.toString();
        if (urlBase.endsWith("/")) {
            urlBase = urlBase.substring(0, urlBase.length() - 1);
        }
        if (!urlPath.startsWith("/")) {
            urlPath = "/" + urlPath;
        }
        final String url = urlBase + urlPath;

        // Build a Dictionary using the Hadoop API for all the I/O
        final Dictionary<T> dictionary = new HadoopDictionary<T>(objectClass, url, fs, path);

        // Load dictionary data and return the initialized dictionary
        dictionary.reload();
        return dictionary;
    }

    Dictionary(final Class<T> objectClass, final String url) {
        this.clazz = Preconditions.checkNotNull(objectClass);
        this.url = Preconditions.checkNotNull(url);
        // start from empty immutable snapshots, consistently with what reload() / update() store
        this.keyToObjectIndex = ImmutableList.of();
        this.objectToKeyIndex = ImmutableMap.of();
    }

    /**
     * Returns the last modification time of the dictionary file with the suffix specified, or
     * null if that file does not exist.
     */
    @Nullable
    abstract Long lastModified(String suffix) throws IOException;

    /** Opens for reading the dictionary file with the suffix specified. */
    abstract InputStream read(String suffix) throws IOException;

    /** Opens for writing the dictionary file with the suffix specified. */
    abstract OutputStream write(String suffix) throws IOException;

    /** Deletes the dictionary file with the suffix specified, if it exists. */
    abstract void delete(String suffix) throws IOException;

    /** Renames the dictionary file with suffix {@code oldSuffix} so to use {@code newSuffix}. */
    abstract void rename(String oldSuffix, String newSuffix) throws IOException;

    /**
     * Returns the class of the objects stored in this dictionary.
     *
     * @return the object class
     */
    public Class<T> getObjectClass() {
        return this.clazz;
    }

    /**
     * Returns the URL of the file backing this dictionary.
     *
     * @return the dictionary URL
     */
    public String getDictionaryURL() {
        return this.url;
    }

    /**
     * Returns the object mapped to the key specified, failing if there is none.
     *
     * @param key
     *            the positive key
     * @return the corresponding object
     * @throws IOException
     *             if the dictionary data has to be reloaded and this operation fails
     * @throws NoSuchElementException
     *             if there is no object for the key specified
     */
    public T objectFor(final int key) throws IOException, NoSuchElementException {
        return this.objectFor(key, true);
    }

    /**
     * Returns the object mapped to the key specified. If the key is not known in memory, the
     * dictionary file is reloaded before giving up.
     *
     * @param key
     *            the positive key
     * @param mustExist
     *            if true, a missing mapping causes a {@code NoSuchElementException}; if false,
     *            null is returned instead
     * @return the corresponding object, or null if missing and {@code mustExist} is false
     * @throws IOException
     *             if the dictionary data has to be reloaded and this operation fails
     * @throws NoSuchElementException
     *             if there is no object for the key and {@code mustExist} is true
     */
    @Nullable
    public T objectFor(final int key, final boolean mustExist) throws IOException,
            NoSuchElementException {

        // Guava message templates only substitute %s placeholders
        Preconditions.checkArgument(key > 0, "Non-positive key %s", key);

        // local cache of keyToObjectIndex, which may change concurrently
        List<T> index = this.keyToObjectIndex;

        if (key > index.size()) {
            this.reload(); // object might have been added by another process
            index = this.keyToObjectIndex; // pick up updated index
        }

        if (key <= index.size()) {
            return index.get(key - 1); // 1-based indexes
        } else if (!mustExist) {
            return null;
        }

        throw new NoSuchElementException("No object for key " + key);
    }

    /**
     * Returns the objects mapped to the keys specified, in the iteration order of the keys. When
     * {@code mustExist} is false, keys without a mapping are skipped, so the returned list may be
     * shorter than the supplied keys.
     *
     * @param keys
     *            the positive keys to look up
     * @param mustExist
     *            if true, any missing mapping causes a {@code NoSuchElementException}
     * @return the list of objects found
     * @throws IOException
     *             if the dictionary data has to be reloaded and this operation fails
     * @throws NoSuchElementException
     *             if some key has no object and {@code mustExist} is true
     */
    public List<T> objectsFor(final Iterable<? extends Integer> keys, final boolean mustExist)
            throws IOException, NoSuchElementException {

        // local cache of keyToObjectIndex, which may change within for cycles
        List<T> index = this.keyToObjectIndex;

        // validate ALL the keys, reloading (at most once) if some key is not known in memory
        boolean reloaded = false;
        for (final int key : keys) {
            Preconditions.checkArgument(key > 0, "Non-positive key %s", key);
            if (!reloaded && key > index.size()) {
                this.reload(); // missing objects might have been added by other processes
                index = this.keyToObjectIndex;
                reloaded = true;
            }
        }

        final List<T> result = Lists.newArrayListWithCapacity(Iterables.size(keys));
        List<Integer> missing = null;

        for (final int key : keys) {
            if (key <= index.size()) {
                result.add(index.get(key - 1)); // 1-based indexes
            } else if (mustExist) {
                if (missing == null) {
                    missing = Lists.newArrayList();
                }
                missing.add(key);
            }
        }

        if (missing != null) {
            throw new NoSuchElementException("No objects for keys "
                    + Joiner.on(", ").join(missing));
        }

        return result;
    }

    /**
     * Returns the key mapped to the object specified, generating (and persisting) a new key if
     * the object is not in the dictionary yet.
     *
     * @param object
     *            the object, not null
     * @return the corresponding key
     * @throws IOException
     *             if reading or writing the dictionary file fails
     */
    public Integer keyFor(final T object) throws IOException {
        final Integer key = this.keyFor(object, true);
        assert key != null;
        return key;
    }

    /**
     * Returns the key mapped to the object specified, optionally generating (and persisting) a
     * new key if the object is not in the dictionary yet.
     *
     * @param object
     *            the object, not null
     * @param mayGenerate
     *            true if a new key can be generated for an unknown object
     * @return the corresponding key; null if the object is unknown and {@code mayGenerate} is
     *         false
     * @throws IOException
     *             if reading or writing the dictionary file fails
     */
    @Nullable
    public Integer keyFor(final T object, final boolean mayGenerate) throws IOException {

        Preconditions.checkNotNull(object);

        Integer key = this.objectToKeyIndex.get(object);

        if (key == null && mayGenerate) {
            this.update(Collections.singletonList(object));
            key = this.objectToKeyIndex.get(object);
        }

        return key;
    }

    /**
     * Returns the keys mapped to the objects specified, in the iteration order of the objects.
     * The returned list always has one element per supplied object; the element is null for an
     * unknown object when {@code mayGenerate} is false.
     *
     * @param objects
     *            the objects to look up, not null
     * @param mayGenerate
     *            true if new keys can be generated (and persisted) for unknown objects
     * @return the list of keys, aligned with the supplied objects
     * @throws IOException
     *             if reading or writing the dictionary file fails
     */
    public List<Integer> keysFor(final Iterable<? extends T> objects, final boolean mayGenerate)
            throws IOException {

        Preconditions.checkNotNull(objects);

        // local cache of objectToKeyIndex, which may change within for cycles
        Map<T, Integer> index = this.objectToKeyIndex;

        final List<Integer> result = Lists.newArrayListWithCapacity(Iterables.size(objects));
        List<T> missingObjects = null;
        List<Integer> missingOffsets = null;

        for (final T object : objects) {
            final Integer key = index.get(object);
            if (key == null) {
                Preconditions.checkNotNull(object);
                if (missingOffsets == null) {
                    missingObjects = Lists.newArrayList();
                    missingOffsets = Lists.newArrayList();
                }
                assert missingObjects != null; // to make Eclipse happy :-(
                missingObjects.add(object);
                // record the position BEFORE appending, i.e., the slot the key will occupy
                missingOffsets.add(result.size());
            }
            result.add(key); // null placeholder for a missing object
        }

        if (missingObjects != null && mayGenerate) {
            assert missingOffsets != null; // to make Eclipse happy :-(
            this.update(missingObjects);
            index = this.objectToKeyIndex; // pick up updated index
            for (int i = 0; i < missingObjects.size(); ++i) {
                final int offset = missingOffsets.get(i);
                final T object = missingObjects.get(i);
                result.set(offset, index.get(object));
            }
        }

        return result;
    }

    /**
     * Copies all the key to object mappings to a map, after reloading the dictionary file. Keys
     * put in the map are the same 1-based keys accepted by {@link #objectFor(int)}.
     *
     * @param map
     *            the map to fill; if null, a new {@code HashMap} is created
     * @return the map supplied or created, filled with the dictionary mappings
     * @throws IOException
     *             if reloading the dictionary file fails
     */
    public <M extends Map<? super Integer, ? super T>> M toMap(@Nullable final M map)
            throws IOException {

        @SuppressWarnings("unchecked")
        final M actualMap = map != null ? map : (M) Maps.newHashMap();

        this.reload(); // make sure to read the most recently persisted data

        // local cache of keyToObjectIndex, which may change within the for cycle
        final List<T> index = this.keyToObjectIndex;

        for (int i = 0; i < index.size(); ++i) {
            actualMap.put(i + 1, index.get(i)); // position i holds the object with key i + 1
        }

        return actualMap;
    }

    /**
     * Appends all the objects in the dictionary to a list, in key order, after reloading the
     * dictionary file.
     *
     * @param list
     *            the list to fill; if null, a new {@code ArrayList} is created
     * @return the list supplied or created
     * @throws IOException
     *             if reloading the dictionary file fails
     */
    public <L extends List<? super T>> L toList(@Nullable final L list) throws IOException {

        @SuppressWarnings("unchecked")
        final L actualList = list != null ? list : (L) Lists.newArrayList();

        this.reload(); // make sure to read the most recently persisted data

        actualList.addAll(this.keyToObjectIndex);
        return actualList;
    }

    /**
     * Re-reads the dictionary file (or its backup) and replaces the in-memory indexes, unless
     * the file is missing or its modification time is older than the last access by more than
     * {@code MAX_CLOCK_SKEW}.
     */
    private synchronized void reload() throws IOException {

        // abort if the file was not modified after the last time we loaded it
        final Long lastModified = lastModifiedWithBackup();
        if (lastModified == null
                || lastModified < this.lastAccessed - Dictionary.MAX_CLOCK_SKEW) {
            return;
        }

        // readWithBackup() returns null if both files are gone (e.g., removed after the check
        // above): in that case there is nothing to load and current indexes are kept
        final InputStream input = readWithBackup();
        if (input == null) {
            return;
        }

        // prepare two builders for re-creating the in-memory indexes
        final ImmutableList.Builder<T> keyToObjectIndexBuilder = ImmutableList.builder();
        final ImmutableMap.Builder<T, Integer> objectToKeyIndexBuilder = ImmutableMap.builder();

        // read from the file, putting data in the builders. The raw stream is declared as a
        // resource too, so that it is closed even if the ObjectInputStream cannot be created
        Object raw = null; // last object deserialized, kept for error reporting
        try (InputStream in = input; ObjectInputStream stream = new ObjectInputStream(in)) {
            final int size = stream.readInt();
            for (int key = 1; key <= size; ++key) {
                raw = stream.readObject();
                final T object = this.clazz.cast(raw);
                keyToObjectIndexBuilder.add(object);
                objectToKeyIndexBuilder.put(object, key);
            }

        } catch (final ClassCastException ex) {
            throw new IOException("Cannot read from " + this.url + ": found "
                    + (raw == null ? "null" : raw.getClass().getName()) + ", expected "
                    + this.clazz.getName(), ex);

        } catch (final ClassNotFoundException ex) {
            throw new IOException("Cannot read from " + this.url + ": either the content is "
                    + "malformed, or it encodes data of another dictionary using classes not "
                    + "available in this JVM", ex);
        }

        // on success, build the new in-memory indexes and store them in the object fields
        this.keyToObjectIndex = keyToObjectIndexBuilder.build();
        this.objectToKeyIndex = objectToKeyIndexBuilder.build();

        // update the last accessed time
        this.lastAccessed = System.currentTimeMillis();
    }

    /**
     * Assigns new keys to the objects specified that are not in the dictionary yet, rewriting
     * the dictionary file and replacing the in-memory indexes. Objects already mapped, and
     * repeated occurrences of the same new object, are ignored.
     */
    private synchronized void update(final Iterable<T> newObjects) throws IOException {

        // make sure to have the most recent data (we rely on the fact locking is reentrant)
        this.reload();

        // access current versions of in-memory indexes (after the reload)
        final List<T> oldKeyToObjectIndex = this.keyToObjectIndex;
        final Map<T, Integer> oldObjectToKeyIndex = this.objectToKeyIndex;

        // detect missing objects, assigning keys in encounter order. An insertion-ordered map
        // is used so that an object occurring twice in the input gets a single key
        final Map<T, Integer> missing = Maps.newLinkedHashMap();
        int key = oldKeyToObjectIndex.size();
        for (final T object : newObjects) {
            if (!oldObjectToKeyIndex.containsKey(object) && !missing.containsKey(object)) {
                missing.put(object, ++key);
            }
        }

        // nothing to do if there are no missing objects
        if (missing.isEmpty()) {
            return;
        }

        // create new key -> object index that includes the missing objects
        final List<T> newKeyToObjectIndex = ImmutableList.copyOf(Iterables.concat(
                oldKeyToObjectIndex, missing.keySet()));

        // create new object -> key index that includes the missing objects
        final ImmutableMap.Builder<T, Integer> builder = ImmutableMap.builder();
        builder.putAll(oldObjectToKeyIndex);
        builder.putAll(missing);
        final Map<T, Integer> newObjectToKeyIndex = builder.build();

        // write the new index to the '.new' file; if anything fails here the exception
        // propagates and the partially written file is NOT promoted to main dictionary file
        try (OutputStream out = writeWithBackup();
                ObjectOutputStream stream = new ObjectOutputStream(out)) {
            stream.writeInt(newKeyToObjectIndex.size());
            for (final T object : newKeyToObjectIndex) {
                stream.writeObject(object);
            }
        }

        // the data was fully written and the stream closed: promote the '.new' file
        commitWrite();

        // update last accessed time
        this.lastAccessed = System.currentTimeMillis();

        // update index member variables
        this.keyToObjectIndex = newKeyToObjectIndex;
        this.objectToKeyIndex = newObjectToKeyIndex;
    }

    /**
     * Opens the dictionary file for reading, falling back to its backup copy. Returns null if
     * neither file exists.
     */
    @Nullable
    private InputStream readWithBackup() throws IOException {

        // we keep track of filesystem exceptions (but it's unclear when they are thrown)
        IOException exception = null;

        try {
            // 1. try to read the requested file
            final InputStream result = read("");
            if (result != null) {
                return result;
            }
        } catch (final IOException ex) {
            exception = ex;
        }

        try {
            // 2. on failure, try to read its backup
            final InputStream result = read(".backup");
            if (result != null) {
                return result;
            }
        } catch (final IOException ex) {
            if (exception == null) {
                exception = ex;
            }
        }

        // 3. only on failure check whether the two files exist
        final boolean fileExists = lastModified("") != null;
        final boolean backupExists = lastModified(".backup") != null;

        // 4. if they don't exist it's ok, just report this returning null
        if (!fileExists && !backupExists) {
            return null;
        }

        // 5. otherwise we throw an exception (possibly the ones got before)
        if (exception == null) {
            exception = new IOException("Cannot read "
                    + (fileExists ? this.url : this.url + ".backup")
                    + " (file reported to exist)");
        }
        throw exception;
    }

    /**
     * Prepares a write: moves the current dictionary file (if any) to the backup location and
     * returns a stream for writing the new content to the '.new' file. After the stream has been
     * successfully written and closed, {@link #commitWrite()} must be called to make the new
     * content the current dictionary file.
     */
    private OutputStream writeWithBackup() throws IOException {

        // 1. delete filename.new if it exists
        delete(".new");

        // 2. if filename exists, rename it to filename.backup (deleting old backup)
        if (lastModified("") != null) {
            delete(".backup");
            rename("", ".backup");
        }

        // 3. create filename.new, returning a stream for writing its content
        return write(".new");
    }

    /** Completes a write started with {@code writeWithBackup()}, promoting the '.new' file. */
    private void commitWrite() throws IOException {
        rename(".new", "");
    }

    /**
     * Returns the modification time of the dictionary file or, if it is missing, of its backup;
     * null if neither exists.
     */
    @Nullable
    private Long lastModifiedWithBackup() throws IOException {
        Long lastModified = lastModified("");
        if (lastModified == null) {
            lastModified = lastModified(".backup");
        }
        return lastModified;
    }

    /** Dictionary implementation storing its data in a file of the local filesystem. */
    private static final class LocalDictionary<T extends Serializable> extends Dictionary<T> {

        private final File file;

        LocalDictionary(final Class<T> objectClass, final String url, final File file) {
            super(objectClass, url);
            this.file = file;
        }

        @Override
        @Nullable
        Long lastModified(final String suffix) throws IOException {
            // File.lastModified() returns 0 for a file that does not exist
            final long modifiedTime = applySuffix(suffix).lastModified();
            return modifiedTime > 0 ? modifiedTime : null;
        }

        @Override
        InputStream read(final String suffix) throws IOException {
            return IO.read(applySuffix(suffix).getAbsolutePath());
        }

        @Override
        OutputStream write(final String suffix) throws IOException {
            return IO.write(applySuffix(suffix).getAbsolutePath());
        }

        @Override
        void delete(final String suffix) throws IOException {
            // best effort: the outcome reported by File.delete() is deliberately not checked
            applySuffix(suffix).delete();
        }

        @Override
        void rename(final String oldSuffix, final String newSuffix) throws IOException {
            java.nio.file.Files.move(applySuffix(oldSuffix).toPath(), applySuffix(newSuffix)
                    .toPath());
        }

        /** Returns the dictionary file, or the sibling file obtained appending the suffix. */
        private File applySuffix(final String suffix) {
            return Strings.isNullOrEmpty(suffix) ? this.file : new File(
                    this.file.getAbsolutePath() + suffix);
        }

    }

    /** Dictionary implementation performing all the I/O via the Hadoop FileSystem API. */
    private static final class HadoopDictionary<T extends Serializable> extends Dictionary<T> {

        private final FileSystem fs;

        private final Path path;

        HadoopDictionary(final Class<T> objectClass, final String url, final FileSystem fs,
                final Path path) {
            super(objectClass, url);
            this.fs = fs;
            this.path = path;
        }

        @Override
        @Nullable
        Long lastModified(final String suffix) throws IOException {
            final Path path = applySuffix(suffix);
            try {
                final FileStatus status = this.fs.getFileStatus(path);
                if (status != null) {
                    return status.getModificationTime();
                }
            } catch (final IOException ex) {
                // a failure on a missing file just means 'no file'; otherwise it is an error
                if (this.fs.exists(path)) {
                    throw ex;
                }
            }
            return null;
        }

        @Override
        InputStream read(final String suffix) throws IOException {
            return this.fs.open(applySuffix(suffix));
        }

        @Override
        OutputStream write(final String suffix) throws IOException {
            return this.fs.create(applySuffix(suffix));
        }

        @Override
        void delete(final String suffix) throws IOException {
            final Path path = applySuffix(suffix);
            IOException exception = null;
            try {
                if (this.fs.delete(path, false)) {
                    return;
                }
            } catch (final IOException ex) {
                exception = ex;
            }
            // failing to delete is an error only if the file is still there
            if (this.fs.exists(path)) {
                throw exception != null ? exception : new IOException("Cannot delete " + path);
            }
        }

        @Override
        void rename(final String oldSuffix, final String newSuffix) throws IOException {
            if (oldSuffix.equals(newSuffix)) {
                return;
            }
            final Path from = applySuffix(oldSuffix);
            final Path to = applySuffix(newSuffix);
            final boolean renamed = this.fs.rename(from, to);
            if (!renamed) {
                // try to explain the failure in the error message
                String message = "Cannot rename " + from + " to " + to;
                if (this.fs.exists(to)) {
                    message += ": destination already exists";
                } else if (!this.fs.exists(from)) {
                    message += ": source does not exist";
                }
                throw new IOException(message);
            }
        }

        /** Returns the dictionary path, or the sibling path obtained appending the suffix. */
        private Path applySuffix(final String suffix) {
            return Strings.isNullOrEmpty(suffix) ? this.path : new Path(this.path.getParent()
                    + "/" + this.path.getName() + suffix);
        }

    }

}