/**
* This file is part of General Entity Annotator Benchmark.
*
* General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* General Entity Annotator Benchmark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
*/
package org.aksw.gerbil.semantic.sameas.impl.cache;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Semaphore;
import org.aksw.gerbil.semantic.sameas.SameAsRetriever;
import org.aksw.gerbil.semantic.sameas.impl.AbstractSameAsRetrieverDecorator;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.IntOpenHashSet;
import com.carrotsearch.hppc.ObjectIntOpenHashMap;
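/**
 * A {@link SameAsRetriever} decorator that caches retrieved sameAs sets in a
 * file. Internally, every known URI is mapped to the id of its sameAs set
 * inside a list of sets, or to {@link #ENTITY_NOT_FOUND} if the decorated
 * retriever couldn't find anything for the URI. Access to the cache is
 * synchronized with semaphores so that several readers, but only a single
 * writer, can use the cache at the same time. The cache is written to its
 * file after a fixed number of changes or when {@link #storeCache()} is
 * called.
 */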
public class FileBasedCachingSameAsRetriever extends AbstractSameAsRetrieverDecorator {
private static final Logger LOGGER = LoggerFactory.getLogger(FileBasedCachingSameAsRetriever.class);
private static final int MAX_CONCURRENT_READERS = 1000;
protected static final int ENTITY_NOT_FOUND = -1;
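    /**
     * Factory method that tries to load an existing cache from the given cache
     * file. If that fails, the temporary cache file is tried as a fallback. If
     * neither can be read, a new, empty cache is created.
     *
     * @param decoratedRetriever
     *            the retriever that is asked for URIs missing in the cache
     * @param requestEntitiesNotFound
     *            if true, URIs for which the cache contains an empty result are
     *            requested again from the decorated retriever
     * @param cacheFile
     *            the file in which the cache is persisted
     * @return a new {@link FileBasedCachingSameAsRetriever} instance
     */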
@SuppressWarnings("unchecked")
public static FileBasedCachingSameAsRetriever create(SameAsRetriever decoratedRetriever,
boolean requestEntitiesNotFound, File cacheFile) {
File tempCacheFile = new File(cacheFile.getAbsolutePath() + "_temp");
        Object[] objects = null;
// try to read the cache file
objects = readCacheFile(cacheFile);
// if this doesn't work, try to read the temp file
if (objects == null) {
LOGGER.warn("Couldn't read the cache file. Trying the temporary file...");
objects = readCacheFile(tempCacheFile);
// if this worked, rename the temp file to the real file
if (objects != null) {
try {
if (!tempCacheFile.renameTo(cacheFile)) {
LOGGER.warn("Reading from the temporary cache file worked, but I couldn't rename it.");
}
} catch (Exception e) {
LOGGER.warn("Reading from the temporary cache file worked, but I couldn't rename it.", e);
}
}
}
ObjectIntOpenHashMap<String> uriSetIdMapping;
List<Set<String>> sets;
        // if the reading didn't work, create new cache objects
if (objects == null) {
LOGGER.warn("Couldn't read cache from files. Creating new empty cache.");
uriSetIdMapping = new ObjectIntOpenHashMap<String>();
sets = new ArrayList<Set<String>>();
} else {
uriSetIdMapping = (ObjectIntOpenHashMap<String>) objects[0];
sets = (List<Set<String>>) objects[1];
}
return new FileBasedCachingSameAsRetriever(decoratedRetriever, uriSetIdMapping, sets, requestEntitiesNotFound,
cacheFile, tempCacheFile);
}
protected ObjectIntOpenHashMap<String> uriSetIdMapping;
protected List<Set<String>> sets;
private int cacheChanges = 0;
private int forceStorageAfterChanges = 1000;
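    /*
     * The two semaphores implement a simple readers-writer lock: a reader
     * acquires a single permit of the read mutex, while a writer has to acquire
     * the write mutex as well as all MAX_CONCURRENT_READERS read permits.
     */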
private Semaphore cacheReadMutex = new Semaphore(MAX_CONCURRENT_READERS);
private Semaphore cacheWriteMutex = new Semaphore(1);
private boolean requestEntitiesNotFound;
protected File cacheFile;
protected File tempCacheFile;
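    /**
     * Constructor used by {@link #create(SameAsRetriever, boolean, File)} after
     * the cache content has been loaded or newly created.
     */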
protected FileBasedCachingSameAsRetriever(SameAsRetriever decoratedRetriever,
ObjectIntOpenHashMap<String> uriSetIdMapping, List<Set<String>> sets, boolean requestEntitiesNotFound,
File cacheFile, File tempCacheFile) {
super(decoratedRetriever);
this.uriSetIdMapping = uriSetIdMapping;
this.sets = sets;
this.requestEntitiesNotFound = requestEntitiesNotFound;
this.cacheFile = cacheFile;
this.tempCacheFile = tempCacheFile;
}
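    /**
     * Returns the set of URIs connected to the given URI via sameAs links. The
     * cache is checked first while holding a read permit. If the URI is
     * unknown, or its cached result is empty and {@link #requestEntitiesNotFound}
     * is set, the decorated retriever is asked and its result is inserted into
     * the cache while holding the write mutex and all read permits.
     */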
@Override
public Set<String> retrieveSameURIs(String uri) {
        // If the cache contains the URI, return the cached set (the read mutex
        // has to be acquired before accessing the cache!)
Set<String> result = null;
try {
cacheReadMutex.acquire();
} catch (InterruptedException e) {
LOGGER.error("Exception while waiting for read mutex. Returning null.", e);
return null;
}
boolean uriIsCached = uriSetIdMapping.containsKey(uri);
if (uriIsCached) {
int setId = uriSetIdMapping.get(uri);
if (setId != ENTITY_NOT_FOUND) {
result = sets.get(setId);
}
}
// If the URI is not in the cache, or it has been cached but the result
// is null and the request should be retried
if (!uriIsCached || (uriIsCached && (result == null) && requestEntitiesNotFound)) {
cacheReadMutex.release();
result = decoratedRetriever.retrieveSameURIs(uri);
            try {
                cacheWriteMutex.acquire();
            } catch (InterruptedException e) {
                LOGGER.error("Exception while waiting for write mutex. Returning null.", e);
                return null;
            }
            try {
                // now we need all read permits so that nobody else is reading the cache
                cacheReadMutex.acquire(MAX_CONCURRENT_READERS);
            } catch (InterruptedException e) {
                LOGGER.error("Exception while waiting for read permits. Returning null.", e);
                // release the already acquired write mutex to avoid a deadlock
                cacheWriteMutex.release();
                return null;
            }
            // Check again whether another thread has added the URI in the meantime
if (uriSetIdMapping.containsKey(uri)) {
// use the cached result
int setId = uriSetIdMapping.get(uri);
if (setId != ENTITY_NOT_FOUND) {
result = sets.get(setId);
} else {
result = null;
}
} else {
if (result != null) {
mergeSetIntoCache(result);
} else {
uriSetIdMapping.put(uri, ENTITY_NOT_FOUND);
}
++cacheChanges;
if ((forceStorageAfterChanges > 0) && (cacheChanges >= forceStorageAfterChanges)) {
LOGGER.info("Storing the cache has been forced...");
try {
performCacheStorage();
} catch (IOException e) {
LOGGER.error("Exception while writing cache to file. Aborting.", e);
}
}
}
            // Keep one read permit; it is released at the end of the method
cacheReadMutex.release(MAX_CONCURRENT_READERS - 1);
cacheWriteMutex.release();
}
cacheReadMutex.release();
return result;
}
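    /**
     * Inserts the given sameAs set into the cache. If one of its URIs is
     * already mapped to an existing set, the existing sets are merged into the
     * given set and their former positions in {@link #sets} are set to null
     * (these gaps are removed later on by {@link #checkSetMapping()}). Finally,
     * all URIs of the resulting set are mapped to the id of the newly added
     * set. This method must only be called while holding the write mutex and
     * all read permits.
     */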
protected void mergeSetIntoCache(Set<String> result) {
        // In most cases we shouldn't need these objects
IntOpenHashSet alreadyExistingSets = null;
int setId;
for (String uri : result) {
if (uriSetIdMapping.containsKey(uri)) {
setId = uriSetIdMapping.get(uri);
if (setId != ENTITY_NOT_FOUND) {
if (alreadyExistingSets == null) {
alreadyExistingSets = new IntOpenHashSet();
}
alreadyExistingSets.add(setId);
}
}
}
        // if already existing sets have to be joined
if (alreadyExistingSets != null) {
for (int i = 0; i < alreadyExistingSets.allocated.length; i++) {
if (alreadyExistingSets.allocated[i] && (alreadyExistingSets.keys[i] != ENTITY_NOT_FOUND)) {
result.addAll(sets.get(alreadyExistingSets.keys[i]));
sets.set(alreadyExistingSets.keys[i], null);
}
}
}
setId = sets.size();
sets.add(result);
for (String uri : result) {
uriSetIdMapping.put(uri, setId);
}
}
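    /**
     * Acquires the write mutex and writes the current content of the cache to
     * the cache file.
     */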
public void storeCache() {
try {
cacheWriteMutex.acquire();
} catch (InterruptedException e) {
LOGGER.error("Exception while waiting for write mutex for storing the cache. Aborting.", e);
return;
}
try {
performCacheStorage();
} catch (IOException e) {
LOGGER.error("Exception while writing cache to file. Aborting.", e);
}
cacheWriteMutex.release();
}
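    /**
     * Serializes the URI-to-set-id mapping and the sameAs sets to the temporary
     * cache file. Before writing, {@link #checkSetMapping()} removes sets that
     * have been merged. If writing succeeds, the old cache file is deleted and
     * the temporary file is renamed to replace it.
     *
     * @throws IOException
     *             if an error occurs while writing the temporary cache file
     */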
private void performCacheStorage() throws IOException {
checkSetMapping();
FileOutputStream fout = null;
ObjectOutputStream oout = null;
try {
fout = new FileOutputStream(tempCacheFile);
oout = new ObjectOutputStream(fout);
// first, serialize the number of URIs
oout.writeInt(uriSetIdMapping.assigned);
// go over the mapping and serialize all existing pairs
for (int i = 0; i < uriSetIdMapping.allocated.length; ++i) {
if (uriSetIdMapping.allocated[i]) {
oout.writeObject(((Object[]) uriSetIdMapping.keys)[i]);
oout.writeInt(uriSetIdMapping.values[i]);
}
}
// write the number of sets
oout.writeInt(sets.size());
// write the single sets
for (Set<String> set : sets) {
oout.writeInt(set.size());
for (String uri : set) {
oout.writeObject(uri);
}
}
} finally {
IOUtils.closeQuietly(oout);
IOUtils.closeQuietly(fout);
}
if (cacheFile.exists() && !cacheFile.delete()) {
LOGGER.error("Cache file couldn't be deleted. Aborting.");
return;
}
if (!tempCacheFile.renameTo(cacheFile)) {
LOGGER.error("Temporary cache file couldn't be renamed. Aborting.");
return;
}
cacheChanges = 0;
}
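    /**
     * Removes sets that have been set to null by a former merge from the list
     * of sets and updates the set ids inside the URI mapping accordingly.
     */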
private void checkSetMapping() {
IntArrayList missingSets = null;
for (int i = 0; i < sets.size(); ++i) {
if (sets.get(i) == null) {
if (missingSets == null) {
missingSets = new IntArrayList();
}
missingSets.add(i);
}
}
if (missingSets != null) {
LOGGER.info("The cache contains sets that have been merged. Renumbering the existing sets.");
// very simple approach: go through the missing sets starting with
// the highest id, remove the null from the list of sets and reduce
// the ids of all uris referencing a set with a higher id than the
// deleted one
int setId;
for (int i = (missingSets.elementsCount - 1); i >= 0; --i) {
setId = missingSets.buffer[i];
sets.remove(setId);
for (int j = 0; j < uriSetIdMapping.allocated.length; ++j) {
if (uriSetIdMapping.allocated[j]) {
if (uriSetIdMapping.values[j] == setId) {
LOGGER.error("Found a uri pointing to a non existing set!");
}
if (uriSetIdMapping.values[j] > setId) {
--uriSetIdMapping.values[j];
}
}
}
}
}
}
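    /**
     * Reads a serialized cache from the given file.
     *
     * @param cacheFile
     *            the file from which the cache should be read
     * @return an Object array containing the URI-to-set-id mapping at index 0
     *         and the list of sameAs sets at index 1, or null if the file
     *         couldn't be read
     */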
public static Object[] readCacheFile(File cacheFile) {
if (!cacheFile.exists() || cacheFile.isDirectory()) {
return null;
}
ObjectInputStream oin = null;
try {
oin = new ObjectInputStream(new BufferedInputStream(new FileInputStream(cacheFile)));
// first, read the number of URIs
int count = oin.readInt();
String uri;
ObjectIntOpenHashMap<String> uriSetIdMapping = new ObjectIntOpenHashMap<String>(count);
for (int i = 0; i < count; ++i) {
uri = (String) oin.readObject();
uriSetIdMapping.put(uri, oin.readInt());
}
count = oin.readInt();
List<Set<String>> sets = new ArrayList<Set<String>>(count);
Set<String> set;
int setSize;
for (int i = 0; i < count; ++i) {
setSize = oin.readInt();
set = new HashSet<String>(setSize);
for (int j = 0; j < setSize; ++j) {
set.add((String) oin.readObject());
}
sets.add(set);
}
return new Object[] { uriSetIdMapping, sets };
} catch (Exception e) {
LOGGER.error("Exception while reading cache file.", e);
} finally {
IOUtils.closeQuietly(oin);
}
return null;
}
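    /**
     * The given domain is ignored by this cache; the call is delegated to
     * {@link #retrieveSameURIs(String)}.
     */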
@Override
public Set<String> retrieveSameURIs(String domain, String uri) {
return retrieveSameURIs(uri);
}
}