package uk.ac.imperial.lsds.seepworker.core;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import uk.ac.imperial.lsds.seep.api.DataReference;
import uk.ac.imperial.lsds.seep.api.DataStoreType;
import uk.ac.imperial.lsds.seep.api.data.OTuple;
import uk.ac.imperial.lsds.seep.api.data.Schema;
import uk.ac.imperial.lsds.seep.api.data.TupleInfo;
import uk.ac.imperial.lsds.seep.comm.Connection;
import uk.ac.imperial.lsds.seep.comm.OutgoingConnectionRequest;
import uk.ac.imperial.lsds.seep.core.DataStoreSelector;
import uk.ac.imperial.lsds.seep.core.DatasetMetadata;
import uk.ac.imperial.lsds.seep.core.DatasetMetadataPackage;
import uk.ac.imperial.lsds.seep.core.IBuffer;
import uk.ac.imperial.lsds.seep.core.OBuffer;
import uk.ac.imperial.lsds.seep.infrastructure.DataEndPoint;
import uk.ac.imperial.lsds.seepworker.WorkerConfig;
import uk.ac.imperial.lsds.seepworker.comm.NetworkSelector;
import uk.ac.imperial.lsds.seepworker.core.output.CoreOutput;

/**
 * This has to:
 * - keep track of all DataReferences that the node manages (store and serve types)
 * - keep track of those datasets that correspond to DataReferences of type store
 * - provide on-demand access to both datasets and DataReferences
 * @author ra
 */
public class DataReferenceManager {

    private final Logger LOG = LoggerFactory.getLogger(DataReferenceManager.class.getName());

    private static DataReferenceManager instance;

    private Map<Integer, DataReference> catalogue;
    private Map<Integer, Dataset> datasets;
    private List<DataStoreSelector> dataStoreSelectors;

    /**
     * This list keeps datasets ordered by priority of staying in memory. Such order
     * is determined by the master and used by the DRM to choose which datasets to evict
     * to disk and which datasets to load from disk.
     */
    private List<Integer> rankedDatasets;

    private DiskCacher cacher;
    private int syntheticDatasetGenerator;
    private BufferPool bufferPool;

    // metrics
    private long __time_freeDatasets = 0;

    private DataReferenceManager(WorkerConfig wc) {
        this.catalogue = new HashMap<>();
        this.datasets = new HashMap<>();
        int rnd = new Random().nextInt();
        // Get from WC the data reference ID for the synthetic generator and create a dataset for it
        this.syntheticDatasetGenerator = wc.getInt(WorkerConfig.SYNTHETIC_DATA_GENERATOR_ID) + rnd;
        this.bufferPool = BufferPool.createBufferPool(wc);
        this.cacher = DiskCacher.makeDiskCacher(wc);
    }

    public static DataReferenceManager makeDataReferenceManager(WorkerConfig wc) {
        if (instance == null) {
            instance = new DataReferenceManager(wc);
        }
        return instance;
    }

    public void updateRankedDatasets(List<Integer> rankedDatasets) {
        this.rankedDatasets = rankedDatasets;
        freeDatasets();
        //loadToMemoryEvictedDatasets();
    }

    private void loadToMemoryEvictedDatasets() {
        // Iterate in order until detecting the first dataset not in memory
        for (Integer i : rankedDatasets) {
            if (datasets.containsKey(i)) {
                Dataset d = datasets.get(i);
                if (!cacher.inMem(d)) {
                    // Check if there's enough memory to load it back again
                    // Overcalculate to account for cached buffer
                    long size = d.size() + bufferPool.getMinimumBufferSize();
                    if (bufferPool.isThereXMemAvailable(size)) {
                        this.retrieveDatasetFromDisk(i);
                    }
                }
            }
        }
    }
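    // Illustrative usage sketch (not part of this class): how worker-side code might obtain
    // the singleton and push a new ranking received from the master. The `wc` (WorkerConfig)
    // instance and the ranking values are assumptions for the sake of the example.
    //
    //   DataReferenceManager drm = DataReferenceManager.makeDataReferenceManager(wc);
    //   List<Integer> ranking = Arrays.asList(3, 1, 2); // hypothetical dataset ids, highest priority first
    //   drm.updateRankedDatasets(ranking);              // frees datasets that fell out of the ranking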
    private void freeDatasets() {
        long start = System.currentTimeMillis();
        // Free datasets that are no longer part of the list of rankedDatasets
        int totalFreedMemory = 0;
        Set<Integer> toRemove = new HashSet<>();
        for (Integer dId : datasets.keySet()) {
            if (!rankedDatasets.contains(dId)) {
                // Eliminate dataset
                LOG.info("Marked Dataset for removal: {}", dId);
                totalFreedMemory = totalFreedMemory + datasets.get(dId).freeDataset();
                toRemove.add(dId);
            }
        }
        for (int index = 0; index < rankedDatasets.size(); index++) {
            LOG.info("Dataset {} ranked {}, is in mem? {}", rankedDatasets.get(index), index,
                    datasetIsInMem(rankedDatasets.get(index)));
        }
        for (Integer tr : toRemove) {
            datasets.remove(tr);
            catalogue.remove(tr);
        }
        LOG.info("Total freed memory: {}", totalFreedMemory);
        long end = System.currentTimeMillis();
        __time_freeDatasets = __time_freeDatasets + (end - start);
    }

    public DatasetMetadataPackage getManagedDatasetsMetadata(Set<Integer> usedSet) {
        Set<DatasetMetadata> oldDatasets = new HashSet<>();
        Set<DatasetMetadata> newDatasets = new HashSet<>();
        Set<DatasetMetadata> usedDatasets = new HashSet<>();
        // Iterate over all datasets
        for (Dataset d : this.datasets.values()) {
            int id = d.id();
            long size = d.size();
            boolean inMem = datasetIsInMem(id);
            long estimatedCreationCost = d.creationCost();
            int diskAccess = d.getDiskAccess();
            int memAccess = d.getMemAccess();
            DatasetMetadata dm = new DatasetMetadata(id, size, inMem, estimatedCreationCost, diskAccess, memAccess);
            // Classify them as old (not used by this stage) and new (used by this stage)
            if (rankedDatasets.contains(id)) {
                oldDatasets.add(dm);
            }
            else {
                newDatasets.add(dm);
            }
            // Then also add those (repeated reference) that were used by this stage
            if (usedSet.contains(id)) {
                usedDatasets.add(dm);
            }
        }
        double availableMemory = bufferPool.getPercAvailableMemory();
        DatasetMetadataPackage dmp = new DatasetMetadataPackage(oldDatasets, newDatasets, usedDatasets,
                availableMemory, __time_freeDatasets);
        return dmp;
    }

    public OBuffer _manageNewDataReferenceBackupOnDisk(DataReference dataRef) {
        int id = dataRef.getId();
        Dataset newDataset = null;
        if (!catalogue.containsKey(id)) {
            LOG.info("Start managing new DataReference, id -> {}", id);
            catalogue.put(id, dataRef); // TODO: will become more complex...
            newDataset = Dataset.newDatasetOnDisk(dataRef, bufferPool, this);
            //newDataset = new Dataset(dataRef, bufferPool, this);
            datasets.put(id, newDataset);
        }
        else {
            LOG.warn("Attempt to register an already existent DataReference, id -> {}", id);
        }
        return newDataset;
    }

    public OBuffer manageNewDataReference(DataReference dataRef) {
        int id = dataRef.getId();
        Dataset newDataset = null;
        if (!catalogue.containsKey(id)) {
            LOG.info("Start managing new DataReference, id -> {}", id);
            catalogue.put(id, dataRef); // TODO: will become more complex...
            newDataset = new Dataset(dataRef, bufferPool, this);
            datasets.put(id, newDataset);
        }
        else {
            LOG.warn("Attempt to register an already existent DataReference, id -> {}", id);
        }
        return newDataset;
    }

    public boolean registerDataReferenceInCatalogue(DataReference dr) {
        int drId = dr.getId();
        if (!catalogue.containsKey(drId)) {
            catalogue.put(drId, dr);
            LOG.info("DataReference id -> {} registered in DRM", drId);
            return true;
        }
        return false;
    }
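    // Illustrative sketch (assumption, not part of this class): the difference between the two
    // registration paths above. `dr` stands for a DataReference the worker has been asked to manage.
    //
    //   OBuffer buffer = drm.manageNewDataReference(dr);          // catalogue entry + backing Dataset to write into
    //   boolean fresh = drm.registerDataReferenceInCatalogue(dr); // catalogue entry only, no Dataset is created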
    public DataReference doesManageDataReference(int dataRefId) {
        return catalogue.get(dataRefId);
    }

    // FIXME: temporary method
    public void serveDataSet(CoreOutput coreOutput, DataReference dr, DataEndPoint dep) {
        Connection c = new Connection(dep);
        OBuffer buffer = coreOutput.getBuffers().get(dr.getId());
        OutgoingConnectionRequest ocr = new OutgoingConnectionRequest(c, buffer);
        DataStoreType type = dr.getDataStore().type();
        DataStoreSelector dss = getSelectorOfType(type);
        switch (type) {
        case NETWORK:
            Set<OutgoingConnectionRequest> conns = new HashSet<>();
            conns.add(ocr);
            ((NetworkSelector) dss).configureOutgoingConnection(conns);
            break;
        default:
            break;
        }
    }

    public void setDataStoreSelectors(List<DataStoreSelector> dataStoreSelectors) {
        this.dataStoreSelectors = dataStoreSelectors;
    }

    private DataStoreSelector getSelectorOfType(DataStoreType type) {
        for (DataStoreSelector dss : dataStoreSelectors) {
            if (dss.type() == type) return dss;
        }
        return null;
    }

    public String createDatasetOnDisk(int datasetId) {
        LOG.info("Creating Dataset on disk, id -> {}", datasetId);
        String name = cacher.createDatasetOnDisk(datasetId);
        LOG.info("Finished caching Dataset to disk, id -> {}", datasetId);
        return name;
    }

    public long sendDatasetToDisk(int datasetId) throws IOException {
        LOG.info("Caching Dataset to disk, id -> {}", datasetId);
        long freedMemory = cacher.cacheToDisk(datasets.get(datasetId));
        LOG.info("Cached to disk, id -> {}, freedMemory -> {}", datasetId, freedMemory);
        return freedMemory;
    }

    public void retrieveDatasetFromDisk(int datasetId) {
        // Safety check: is there enough memory available?
        long memRequired = datasets.get(datasetId).size() + bufferPool.getMinimumBufferSize();
        boolean enoughMem = bufferPool.isThereXMemAvailable(memRequired);
        if (!enoughMem) {
            LOG.error("Impossible to load to memory: Not enough mem available");
            return;
        }
        try {
            LOG.info("Returning cached Dataset to memory, id -> {}", datasetId);
            try {
                cacher.retrieveFromDisk(datasets.get(datasetId));
            }
            catch (FileNotFoundException e) {
                LOG.error("Cached file for Dataset {} not found on disk", datasetId);
                e.printStackTrace();
            }
        }
        finally {
            LOG.info("Finished returning cached Dataset to memory, id -> {}", datasetId);
        }
    }
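    // Illustrative sketch (assumption): the spill/restore cycle exposed by the methods around this
    // point. `datasetId` is assumed to be an id already managed by this DRM.
    //
    //   long freed = drm.sendDatasetToDisk(datasetId);   // may throw IOException
    //   if (!drm.datasetIsInMem(datasetId)) {
    //       drm.retrieveDatasetFromDisk(datasetId);      // logs an error and returns if memory is still short
    //   }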
    public boolean datasetIsInMem(int datasetId) {
        return cacher.inMem(datasets.get(datasetId));
    }

    public IBuffer getInputBufferFor(DataReference dr) {
        // Sanity check
        if (doesManageDataReference(dr.getId()) == null) {
            // TODO: throw error
            LOG.error("Asked to retrieve dataset, but dataset not managed here!");
            System.exit(0);
        }
        return datasets.get(dr.getId());
    }

    public IBuffer getSyntheticDataset(DataReference dr, long sizeOfDataToGenerate) {
        Dataset d = new Dataset(dr.getId(), dr, bufferPool, this);
        // Store the dataset already, in case it needs to be spilled to disk while writing
        // in the lines below
        datasets.put(dr.getId(), d);
        Schema s = dr.getDataStore().getSchema();
        // byte[] tuple = OTuple.create(s, s.names(), s.randomValues());
        int size = s.sizeOfTuple();
        byte[] tuple = OTuple.createUnsafe(s.fields(), s.randomValues(), size);
        int tupleSizeWithOverhead = tuple.length + TupleInfo.TUPLE_SIZE_OVERHEAD;
        // Filling dataset with data (may or may not spill to disk)
        long numTuples = sizeOfDataToGenerate / tupleSizeWithOverhead;
        long totalWritten = 0;
        OTuple o = new OTuple(s);
        for (long i = 0; i < numTuples; i++) {
            // byte[] srcData = OTuple.create(s, s.names(), s.randomValues());
            // byte[] srcData = OTuple.createUnsafe(s.fields(), s.randomValues(), size);
            o.setValues(s.defaultValues()); // s.randomValues()
            totalWritten += o.getTupleSize() + TupleInfo.TUPLE_SIZE_OVERHEAD;
            d.write(o, null);
            // d.write(srcData, null);
        }
        LOG.info("Synthetic dataset with {} tuples, size: {}", numTuples, totalWritten);
        d.prepareSyntheticDatasetForRead();
        // d.prepareDatasetForFutureRead();
        return d;
    }

    public IBuffer _getSyntheticDataset(DataReference dr, int sizeOfDataToGenerate) {
        ByteBuffer d = ByteBuffer.allocate(sizeOfDataToGenerate);
        // Generate synthetic data
        Schema s = dr.getDataStore().getSchema();
        int totalWritten = 0;
        boolean goOn = true;
        int totalTuples = 0;
        while (goOn) {
            byte[] tuple = OTuple.create(s, s.names(), s.randomValues());
            if (d.position() + tuple.length + TupleInfo.TUPLE_SIZE_OVERHEAD <= d.capacity()) {
                d.putInt(tuple.length);
                d.put(tuple);
                totalWritten = totalWritten + TupleInfo.TUPLE_SIZE_OVERHEAD + tuple.length;
                totalTuples++;
            }
            else {
                // Stop when no more data fits
                goOn = false;
            }
        }
        // Copy only the written bytes
        byte[] dataToForward = new byte[totalWritten];
        System.arraycopy(d.array(), 0, dataToForward, 0, totalWritten);
        LOG.info("Synthetic dataset with {} tuples, size: {}", totalTuples, totalWritten);
        // Store synthetic data in synthetic dataset
        Dataset synthetic = new Dataset(syntheticDatasetGenerator, dataToForward, dr, bufferPool);
        // Store in catalogue and return it for use
        datasets.put(syntheticDatasetGenerator, synthetic);
        return synthetic;
    }

    public long spillDatasetsToDisk(Integer datasetId) {
        LOG.info("Worker node runs out of memory while writing to dataset: {}", datasetId);
        long freedMemory = 0;
        try {
            if (rankedDatasets != null) {
                // We find the first dataset in the list that is in memory and send it to disk
                // TODO: is one enough? how to know?
                for (Integer i : rankedDatasets) {
                    if (this.datasetIsInMem(i)) {
                        freedMemory = sendDatasetToDisk(i);
                        if (freedMemory > 0) {
                            return freedMemory;
                        }
                    }
                }
            }
            // Fall back to spilling the dataset that triggered the request
            if (datasetId != null) {
                freedMemory = sendDatasetToDisk(datasetId);
            }
        }
        catch (IOException io) {
            LOG.error("While trying to spill dataset: {} to disk", datasetId);
            io.printStackTrace();
        }
        return freedMemory;
    }
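    // Illustrative sketch (assumption): how a writer under memory pressure might use the method
    // above. `currentDatasetId` is hypothetical; the eviction order is whatever ranking the master
    // last pushed via updateRankedDatasets.
    //
    //   long freed = drm.spillDatasetsToDisk(currentDatasetId);
    //   if (freed == 0) {
    //       // nothing could be spilled; the caller has to fail or wait for memory
    //   }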
    public void printCatalogue() {
        for (Entry<Integer, DataReference> entry : catalogue.entrySet()) {
            System.out.println("id: " + entry.getKey() + " val: " + entry.getValue().getPartitionId());
        }
    }
}