/** * This file is part of General Entity Annotator Benchmark. * * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * General Entity Annotator Benchmark is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>. */ package org.aksw.gerbil.dataset.check.impl; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.util.concurrent.Semaphore; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.carrotsearch.hppc.ObjectLongOpenHashMap; /** * <p> * This class is an extension of the {@link EntityCheckerManagerImpl} that * caches the results using a persistent file-based cache. Note that the cache * stores the time stamp at which a URI is added to the cache. Cache entries * have a maximal lifetime. If it is exceeded, the entries might be deleted from * the cache. * </p> * <p> * Internally the result of the checking is stored using the lowest bit of the * timestamp. * </p> * * @author Michael Röder (roeder@informatik.uni-leipzig.de) * */ public class FileBasedCachingEntityCheckerManager extends EntityCheckerManagerImpl { private static final Logger LOGGER = LoggerFactory.getLogger(FileBasedCachingEntityCheckerManager.class); private static final int MAX_CONCURRENT_READERS = 1000; private static final int EXISTS_FLAG_MASK = 0x1; private static final int ENTITY_EXISTS_BIT = 0x1; private static final int ENTITY_DOES_NOT_EXIST_BIT = 0x0; private static final int ERASE_EXISTS_FLAG_MASK = ~EXISTS_FLAG_MASK; public static FileBasedCachingEntityCheckerManager create(long cacheEntryLifetime, File cacheFile) { File tempCacheFile = new File(cacheFile.getAbsolutePath() + "_temp"); ObjectLongOpenHashMap<String> cache = null; // try to read the cache file cache = readCacheFile(cacheFile); // if this doesn't work, try to read the temp file if (cache == null) { LOGGER.warn("Couldn't read the cache file. Trying the temporary file..."); cache = readCacheFile(tempCacheFile); // if this worked, rename the temp file to the real file if (cache != null) { try { if (!tempCacheFile.renameTo(cacheFile)) { LOGGER.warn("Reading from the temporary cache file worked, but I couldn't rename it."); } } catch (Exception e) { LOGGER.warn("Reading from the temporary cache file worked, but I couldn't rename it.", e); } } } // if the reading didn't worked, create new cache objects if (cache == null) { LOGGER.warn("Couldn't read cache from files. Creating new empty cache."); cache = new ObjectLongOpenHashMap<String>(); } return new FileBasedCachingEntityCheckerManager(cache, cacheEntryLifetime, cacheFile, tempCacheFile); } public static ObjectLongOpenHashMap<String> readCacheFile(File cacheFile) { if (!cacheFile.exists() || cacheFile.isDirectory()) { return null; } ObjectInputStream ois = null; try { ois = new ObjectInputStream(new BufferedInputStream(new FileInputStream(cacheFile))); // first, read the number of URIs int count = ois.readInt(); String uri; ObjectLongOpenHashMap<String> cache = new ObjectLongOpenHashMap<String>(2 * count); for (int i = 0; i < count; ++i) { uri = (String) ois.readObject(); cache.put(uri, ois.readLong()); } return cache; } catch (Exception e) { LOGGER.error("Exception while reading cache file.", e); } finally { IOUtils.closeQuietly(ois); } return null; } protected ObjectLongOpenHashMap<String> cache; private long cacheEntryLifetime; private int cacheChanges = 0; private int forceStorageAfterChanges = 10000; private Semaphore cacheReadMutex = new Semaphore(MAX_CONCURRENT_READERS); private Semaphore cacheWriteMutex = new Semaphore(1); protected File cacheFile; protected File tempCacheFile; protected FileBasedCachingEntityCheckerManager(ObjectLongOpenHashMap<String> cache, long cacheEntryLifetime, File cacheFile, File tempCacheFile) { this.cache = cache; this.cacheEntryLifetime = cacheEntryLifetime; this.cacheFile = cacheFile; this.tempCacheFile = tempCacheFile; } @Override public boolean checkUri(String uri) { try { cacheReadMutex.acquire(); } catch (InterruptedException e) { LOGGER.error("Exception while waiting for read mutex. Returning true.", e); return true; } boolean uriIsCached = cache.containsKey(uri); long timestamp, resultBit = ENTITY_EXISTS_BIT; if (uriIsCached) { timestamp = cache.get(uri); if ((System.currentTimeMillis() - timestamp) < cacheEntryLifetime) { resultBit = timestamp & EXISTS_FLAG_MASK; } else { uriIsCached = false; } } // If the URI is not in the cache, or it has been cached but the result // is null and the request should be retried if (!uriIsCached) { cacheReadMutex.release(); resultBit = super.checkUri(uri) ? ENTITY_EXISTS_BIT : ENTITY_DOES_NOT_EXIST_BIT; // Set the new timestamp inside the cache try { cacheWriteMutex.acquire(); // now we need all others cacheReadMutex.acquire(MAX_CONCURRENT_READERS); } catch (InterruptedException e) { LOGGER.error("Exception while waiting for read mutex. Returning.", e); return resultBit != ENTITY_DOES_NOT_EXIST_BIT; } timestamp = (System.currentTimeMillis() & ERASE_EXISTS_FLAG_MASK) | resultBit; cache.put(uri, timestamp); ++cacheChanges; if ((forceStorageAfterChanges > 0) && (cacheChanges >= forceStorageAfterChanges)) { LOGGER.info("Storing the cache has been forced..."); try { performCacheStorage(); } catch (IOException e) { LOGGER.error("Exception while writing cache to file. Aborting.", e); } } // The last one will be released at the end cacheReadMutex.release(MAX_CONCURRENT_READERS - 1); cacheWriteMutex.release(); } cacheReadMutex.release(); return resultBit != ENTITY_DOES_NOT_EXIST_BIT; } protected boolean performCheck(String uri) { return super.checkUri(uri); } public void storeCache() { try { cacheWriteMutex.acquire(); } catch (InterruptedException e) { LOGGER.error("Exception while waiting for write mutex for storing the cache. Aborting.", e); return; } try { performCacheStorage(); } catch (IOException e) { LOGGER.error("Exception while writing cache to file. Aborting.", e); } cacheWriteMutex.release(); } /** * Writes the cache to the {@link #tempCacheFile}. After that the * {@link #cacheFile} is deleted and the {@link #tempCacheFile} is renamed. * <b>NOTE</b> that this method should only be called if the * {@link #cacheWriteMutex} has been acquired. * * @throws IOException */ private void performCacheStorage() throws IOException { eraseOldEntries(); FileOutputStream fout = null; ObjectOutputStream oout = null; try { fout = new FileOutputStream(tempCacheFile); oout = new ObjectOutputStream(fout); // first, serialize the number of URIs oout.writeInt(cache.assigned); // go over the mapping and serialize all existing pairs for (int i = 0; i < cache.allocated.length; ++i) { if (cache.allocated[i]) { oout.writeObject(((Object[]) cache.keys)[i]); oout.writeLong(cache.values[i]); } } } finally { IOUtils.closeQuietly(oout); IOUtils.closeQuietly(fout); } if (cacheFile.exists() && !cacheFile.delete()) { LOGGER.error("Cache file couldn't be deleted. Aborting."); return; } if (!tempCacheFile.renameTo(cacheFile)) { LOGGER.error("Temporary cache file couldn't be renamed. Aborting."); return; } cacheChanges = 0; } private void eraseOldEntries() { // TODO Add the erasing of old entries // long currentTime = System.currentTimeMillis(); // for (int i = 0; i < cache.allocated.length; ++i) { // if (cache.allocated[i]) { // // If this entry is to old // if ((currentTime - cache.values[i]) > cacheEntryLifetime) { // ((Object[]) cache.keys)[i] = null; // cache.values[i] = 0L; // --cache.assigned; // cache.allocated[i] = false; // } // } // } } }