/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ibm.bi.dml.runtime.controlprogram.caching;

import java.io.IOException;
import java.lang.ref.SoftReference;

import org.apache.commons.lang.mutable.MutableBoolean;

import com.ibm.bi.dml.api.DMLScript;
import com.ibm.bi.dml.api.DMLScript.RUNTIME_PLATFORM;
import com.ibm.bi.dml.hops.OptimizerUtils;
import com.ibm.bi.dml.lops.Lop;
import com.ibm.bi.dml.parser.DMLTranslator;
import com.ibm.bi.dml.parser.Expression.DataType;
import com.ibm.bi.dml.parser.Expression.ValueType;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
import com.ibm.bi.dml.runtime.controlprogram.context.SparkExecutionContext;
import com.ibm.bi.dml.runtime.instructions.spark.data.BroadcastObject;
import com.ibm.bi.dml.runtime.instructions.spark.data.RDDObject;
import com.ibm.bi.dml.runtime.instructions.spark.data.RDDProperties;
import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics;
import com.ibm.bi.dml.runtime.matrix.MatrixDimensionsMetaData;
import com.ibm.bi.dml.runtime.matrix.MatrixFormatMetaData;
import com.ibm.bi.dml.runtime.matrix.MetaData;
import com.ibm.bi.dml.runtime.matrix.data.FileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.InputInfo;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.NumItemsByEachReducerMetaData;
import com.ibm.bi.dml.runtime.matrix.data.OutputInfo;
import com.ibm.bi.dml.runtime.util.DataConverter;
import com.ibm.bi.dml.runtime.util.IndexRange;
import com.ibm.bi.dml.runtime.util.MapReduceTool;

/**
 * Represents a matrix in the control program. This class contains methods to
 * read matrices from HDFS and convert them to a specific format/representation.
 * It is also able to write several formats/representations of matrices to HDFS.
 *
 * IMPORTANT: Preserve the one-to-one correspondence between {@link MatrixObject}
 * and {@link MatrixBlock} objects, for caching purposes. Do not change a
 * {@link MatrixBlock} object without informing its {@link MatrixObject} object.
 */
public class MatrixObject extends CacheableData
{
	private static final long serialVersionUID = 6374712373206495637L;
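	/*
	 * Illustrative usage sketch (an assumed example, not part of this class):
	 * the caching protocol pairs every acquire with a release. Method names
	 * below are the real ones from this class; the access pattern and the
	 * variable "mo" are assumptions.
	 *
	 *   MatrixBlock mb = mo.acquireRead();    //pin; restores/reads data if needed
	 *   double v = mb.quickGetValue(0, 0);    //read-only access while pinned
	 *   mo.release();                         //unpin; block becomes evictable again
	 */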
	/**
	 * Current total size of pinned variables, required for guarded collect.
	 */
	private static ThreadLocal<Long> sizePinned = new ThreadLocal<Long>() {
		@Override
		protected Long initialValue() {
			return 0L;
		}
	};

	/**
	 * Cache for actual data, evicted by garbage collector.
	 */
	private SoftReference<MatrixBlock> _cache = null;

	/**
	 * Container object that holds the actual data.
	 */
	private MatrixBlock _data = null;

	/**
	 * The name of the HDFS file in which the data is backed up.
	 */
	private String _hdfsFileName = null; //file name and path

	/**
	 * Flag that indicates whether or not the HDFS file exists. It is used to
	 * improve the performance of the "rmvar" instruction: when its value is
	 * <code>false</code>, one can skip invocations of potentially expensive
	 * utility functions such as MapReduceTool.deleteFileIfExistOnHDFS().
	 */
	private boolean _hdfsFileExists = false;

	/**
	 * <code>true</code> if the in-memory or evicted matrix may be different from
	 * the matrix located at {@link #_hdfsFileName}; <code>false</code> if the two
	 * matrices are supposed to be the same.
	 */
	private boolean _dirtyFlag = false;

	/**
	 * Object that holds the metadata associated with the matrix, which includes:
	 * 1) matrix dimensions, if available,
	 * 2) number of non-zeros, if available,
	 * 3) block dimensions, if applicable,
	 * 4) InputInfo -- subsequent operations that use this matrix expect it to be
	 *    in this format.
	 *
	 * When the matrix is written to HDFS (and the local file system), one must
	 * use the OutputInfo that matches the InputInfo stored inside _metaData.
	 */
	private MetaData _metaData = null;

	//additional names and flags
	private String _varName = ""; //plan variable name
	private String _cacheFileName = null; //local eviction file name
	private boolean _requiresLocalWrite = false; //flag if local write for read obj
	private boolean _isAcquireFromEmpty = false; //flag if read from status empty
	private boolean _cleanupFlag = true; //flag if obj unpinned (cleanup enabled)
	private boolean _updateInPlaceFlag = false; //flag if in-place update

	//spark-specific handles
	//note: we use the abstraction of LineageObjects for two reasons: (1) to keep track of cleanup
	//for lazily evaluated RDDs, and (2) as an abstraction for environments that do not necessarily
	//have Spark libraries available
	private RDDObject _rddHandle = null; //RDD handle
	private BroadcastObject _bcHandle = null; //broadcast handle
	private RDDProperties _rddProperties = null;

	/**
	 * Information relevant to partitioned matrices.
	 */
	private boolean _partitioned = false; //indicates if obj partitioned
	private PDataPartitionFormat _partitionFormat = null; //indicates how obj partitioned
	private int _partitionSize = -1; //indicates n for BLOCKWISE_N
	private String _partitionCacheName = null; //name of cache block
	private MatrixBlock _partitionInMemory = null;

	/**
	 * Information relevant to specific external file formats.
	 */
	FileFormatProperties _formatProperties = null;

	public RDDProperties getRddProperties() {
		return _rddProperties;
	}

	public void setRddProperties(RDDProperties rddProperties) {
		_rddProperties = rddProperties;
	}

	/**
	 * Constructor that takes only the HDFS filename.
	 */
	public MatrixObject( ValueType vt, String file ) {
		this(vt, file, null); //HDFS file path
	}

	/**
	 * Constructor that takes both the HDFS filename and associated metadata.
	 */
	public MatrixObject( ValueType vt, String file, MetaData mtd ) {
		super(DataType.MATRIX, vt);
		_metaData = mtd;
		_hdfsFileName = file;
		_cache = null;
		_data = null;
	}
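	/*
	 * Illustrative construction sketch (file path and dimensions are assumed):
	 * a matrix object backed by a binary-block file on HDFS is typically
	 * created with matching metadata, using the same 5-arg
	 * MatrixCharacteristics constructor employed elsewhere in this file.
	 *
	 *   MatrixCharacteristics mc = new MatrixCharacteristics(1000, 1000, 1000, 1000, -1);
	 *   MetaData md = new MatrixFormatMetaData(mc,
	 *       OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo);
	 *   MatrixObject mo = new MatrixObject(ValueType.DOUBLE, "hdfs:/tmp/X", md);
	 */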
	/**
	 * Copy constructor that copies metadata but NO data.
	 *
	 * @param mo matrix object to copy
	 */
	public MatrixObject( MatrixObject mo ) {
		super(mo.getDataType(), mo.getValueType());

		_hdfsFileName = mo._hdfsFileName;
		_hdfsFileExists = mo._hdfsFileExists;

		MatrixFormatMetaData metaOld = (MatrixFormatMetaData) mo.getMetaData();
		_metaData = new MatrixFormatMetaData(
				new MatrixCharacteristics(metaOld.getMatrixCharacteristics()),
				metaOld.getOutputInfo(), metaOld.getInputInfo());

		_varName = mo._varName;
		_cleanupFlag = mo._cleanupFlag;
		_updateInPlaceFlag = mo._updateInPlaceFlag;
		_partitioned = mo._partitioned;
		_partitionFormat = mo._partitionFormat;
		_partitionSize = mo._partitionSize;
		_partitionCacheName = mo._partitionCacheName;
	}

	public void setVarName(String s) {
		_varName = s;
	}

	public String getVarName() {
		return _varName;
	}

	@Override
	public void setMetaData(MetaData md) {
		_metaData = md;
	}

	@Override
	public MetaData getMetaData() {
		return _metaData;
	}

	@Override
	public void removeMetaData() {
		_metaData = null;
	}

	@Override
	public void updateMatrixCharacteristics( MatrixCharacteristics mc ) {
		((MatrixDimensionsMetaData) _metaData).setMatrixCharacteristics(mc);
	}

	/**
	 * Makes the matrix metadata consistent with the in-memory matrix data.
	 *
	 * @throws CacheException if there is no data or no metadata
	 */
	public void refreshMetaData()
		throws CacheException
	{
		if( _data == null || _metaData == null ) //refresh only for existing data
			throw new CacheException("Cannot refresh meta data because there is no data or meta data. ");
			//we need to throw an exception, otherwise the input/output format cannot be inferred

		MatrixCharacteristics mc = ((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics();
		mc.setDimension( _data.getNumRows(), _data.getNumColumns() );
		mc.setNonZeros( _data.getNonZeros() );
	}

	public void setFileFormatProperties(FileFormatProperties formatProperties) {
		_formatProperties = formatProperties;
	}

	public FileFormatProperties getFileFormatProperties() {
		return _formatProperties;
	}

	public boolean isFileExists() {
		return _hdfsFileExists;
	}

	public void setFileExists( boolean flag ) {
		_hdfsFileExists = flag;
	}

	public String getFileName() {
		return _hdfsFileName;
	}

	public synchronized void setFileName( String file ) {
		if( !_hdfsFileName.equals(file) ) {
			_hdfsFileName = file;
			if( !isEmpty(true) )
				_dirtyFlag = true;
		}
	}
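	/*
	 * Note on metadata consistency (behavior derived from the methods above):
	 * refreshMetaData() copies dimensions and non-zeros from the in-memory
	 * block into the MatrixCharacteristics; e.g., after a modified 100x10
	 * block is released, getNumRows() returns 100 and getNnz() reflects the
	 * block's current non-zero count.
	 */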
	public long getNumRows() {
		MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData;
		MatrixCharacteristics mc = meta.getMatrixCharacteristics();
		return mc.getRows();
	}

	public long getNumColumns() {
		MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData;
		MatrixCharacteristics mc = meta.getMatrixCharacteristics();
		return mc.getCols();
	}

	public long getNumRowsPerBlock() {
		MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData;
		MatrixCharacteristics mc = meta.getMatrixCharacteristics();
		return mc.getRowsPerBlock();
	}

	public long getNumColumnsPerBlock() {
		MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData;
		MatrixCharacteristics mc = meta.getMatrixCharacteristics();
		return mc.getColsPerBlock();
	}

	public long getNnz() {
		MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData;
		MatrixCharacteristics mc = meta.getMatrixCharacteristics();
		return mc.getNonZeros();
	}

	public double getSparsity() {
		MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData;
		MatrixCharacteristics mc = meta.getMatrixCharacteristics();
		return ((double) mc.getNonZeros()) / mc.getRows() / mc.getCols();
	}

	public MatrixCharacteristics getMatrixCharacteristics() {
		MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData;
		return meta.getMatrixCharacteristics();
	}

	/**
	 * @return <code>true</code> if the in-memory or evicted matrix may be different from
	 * the matrix located at {@link #_hdfsFileName}; <code>false</code> if the two
	 * matrices are supposed to be the same.
	 */
	public boolean isDirty() {
		return _dirtyFlag;
	}

	@Override
	public String toString() {
		StringBuilder str = new StringBuilder();
		str.append("Matrix: ");
		str.append(_hdfsFileName + ", ");
		if( _metaData instanceof NumItemsByEachReducerMetaData ) {
			str.append("NumItemsByEachReducerMetaData");
		}
		else {
			try {
				MatrixFormatMetaData md = (MatrixFormatMetaData) _metaData;
				if( md != null ) {
					MatrixCharacteristics mc = ((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics();
					str.append(mc.toString());
					InputInfo ii = md.getInputInfo();
					if( ii == null )
						str.append("null");
					else {
						str.append(", ");
						str.append(InputInfo.inputInfoToString(ii));
					}
				}
				else {
					str.append("null, null");
				}
			}
			catch(Exception ex) {
				LOG.error(ex);
			}
		}
		str.append(", ");
		str.append(isDirty() ? "dirty" : "not-dirty");

		return str.toString();
	}

	public RDDObject getRDDHandle() {
		return _rddHandle;
	}

	public void setRDDHandle( RDDObject rdd ) {
		//cleanup potential old back reference
		if( _rddHandle != null )
			_rddHandle.setBackReference(null);

		//add new rdd handle
		_rddHandle = rdd;
		if( _rddHandle != null )
			rdd.setBackReference(this);
	}

	public BroadcastObject getBroadcastHandle() {
		return _bcHandle;
	}

	public void setBroadcastHandle( BroadcastObject bc ) {
		//cleanup potential old back reference
		if( _bcHandle != null )
			_bcHandle.setBackReference(null);

		//add new broadcast handle
		_bcHandle = bc;
		if( _bcHandle != null )
			bc.setBackReference(this);
	}
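	/*
	 * Note on lineage handles (behavior derived from the setters above):
	 * setRDDHandle/setBroadcastHandle maintain a back reference from the
	 * handle to this MatrixObject so that cleanup of lazily evaluated RDDs
	 * and broadcasts can be coordinated; replacing a handle first clears the
	 * old back reference to avoid dangling references.
	 */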
	// *********************************************
	// ***                                       ***
	// ***    HIGH-LEVEL METHODS THAT SPECIFY    ***
	// ***   THE LOCKING AND CACHING INTERFACE   ***
	// ***                                       ***
	// *********************************************

	/**
	 * Acquires a shared "read-only" lock, produces the reference to the matrix data,
	 * restores the matrix to main memory, and reads from HDFS if needed.
	 *
	 * Synchronized because there might be parallel threads (parfor local) that
	 * access the same MatrixObject (in case it was created before the loop).
	 *
	 * In-Status:  EMPTY, EVICTABLE, EVICTED, READ;
	 * Out-Status: READ(+1).
	 *
	 * @return the matrix data reference
	 * @throws CacheException
	 */
	public synchronized MatrixBlock acquireRead()
		throws CacheException
	{
		if( LOG.isTraceEnabled() )
			LOG.trace("Acquire read "+_varName);
		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

		if( !isAvailableToRead() )
			throw new CacheStatusException("MatrixObject not available to read.");

		//get object from cache
		if( _data == null )
			getCache();

		//read data from HDFS/RDD if required
		//(probe data for cache_nowrite / jvm_reuse)
		if( isEmpty(true) && _data == null )
		{
			try {
				if( DMLScript.STATISTICS )
					CacheStatistics.incrementHDFSHits();

				if( getRDDHandle() == null || getRDDHandle().allowsShortCircuitRead() ) {
					//check filename
					if( _hdfsFileName == null )
						throw new CacheException("Cannot read matrix for empty filename.");

					//read matrix from hdfs
					_data = readMatrixFromHDFS( _hdfsFileName );

					//mark for initial local write despite read operation
					_requiresLocalWrite = CACHING_WRITE_CACHE_ON_READ;
				}
				else {
					//read matrix from rdd (incl execute pending rdd operations)
					MutableBoolean writeStatus = new MutableBoolean();
					_data = readMatrixFromRDD( getRDDHandle(), writeStatus );

					//mark for initial local write (prevent repeated execution of rdd operations)
					if( writeStatus.booleanValue() )
						_requiresLocalWrite = CACHING_WRITE_CACHE_ON_READ;
					else
						_requiresLocalWrite = true;
				}

				_dirtyFlag = false;
			}
			catch(IOException e) {
				throw new CacheIOException("Reading of " + _hdfsFileName + " ("+_varName+") failed.", e);
			}

			_isAcquireFromEmpty = true;
		}
		else if( DMLScript.STATISTICS ) {
			if( _data != null )
				CacheStatistics.incrementMemHits();
		}

		//cache status maintenance
		super.acquire( false, _data==null );
		updateStatusPinned(true);

		if( DMLScript.STATISTICS ) {
			long t1 = System.nanoTime();
			CacheStatistics.incrementAcquireRTime(t1-t0);
		}

		return _data;
	}

	/**
	 * Acquires the exclusive "write" lock for a thread that wants to change matrix
	 * cell values. Produces the reference to the matrix data, restores the matrix
	 * to main memory, and reads from HDFS if needed.
	 *
	 * In-Status:  EMPTY, EVICTABLE, EVICTED;
	 * Out-Status: MODIFY.
	 *
	 * @return the matrix data reference
	 * @throws CacheException
	 */
	public synchronized MatrixBlock acquireModify()
		throws CacheException
	{
		if( LOG.isTraceEnabled() )
			LOG.trace("Acquire modify "+_varName);
		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

		if( !isAvailableToModify() )
			throw new CacheStatusException("MatrixObject not available to modify.");

		//get object from cache
		if( _data == null )
			getCache();

		//read data from HDFS if required
		if( isEmpty(true) && _data == null )
		{
			//check filename
			if( _hdfsFileName == null )
				throw new CacheException("Cannot read matrix for empty filename.");

			//load data
			try {
				_data = readMatrixFromHDFS( _hdfsFileName );
			}
			catch(IOException e) {
				throw new CacheIOException("Reading of " + _hdfsFileName + " ("+_varName+") failed.", e);
			}
		}

		//cache status maintenance
		super.acquire( true, _data==null );
		updateStatusPinned(true);
		_dirtyFlag = true;
		_isAcquireFromEmpty = false;

		if( DMLScript.STATISTICS ) {
			long t1 = System.nanoTime();
			CacheStatistics.incrementAcquireMTime(t1-t0);
		}

		return _data;
	}
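	/*
	 * Illustrative modify sketch (dimensions and values are assumed): writers
	 * pair acquireModify with release, which marks the object dirty and
	 * refreshes its metadata from the in-memory block.
	 *
	 *   MatrixBlock mb = mo.acquireModify(new MatrixBlock(10, 10, false));
	 *   mb.quickSetValue(0, 0, 7d);  //modify while exclusively pinned
	 *   mo.release();                //refreshes dims/nnz, may evict to cache
	 */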
	/**
	 * Acquires the exclusive "write" lock for a thread that wants to throw away the
	 * old matrix data and link up with new matrix data. Abandons the old matrix data
	 * without reading it and sets the new matrix data reference.
	 *
	 * In-Status:  EMPTY, EVICTABLE, EVICTED;
	 * Out-Status: MODIFY.
	 *
	 * @param newData the new matrix data reference
	 * @return the matrix data reference, which is the same as the argument
	 * @throws CacheException
	 */
	public synchronized MatrixBlock acquireModify(MatrixBlock newData)
		throws CacheException
	{
		if( LOG.isTraceEnabled() )
			LOG.trace("Acquire modify newdata "+_varName);
		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

		if( !isAvailableToModify() )
			throw new CacheStatusException("MatrixObject not available to modify.");

		//clear old data
		clearData();

		//cache status maintenance
		super.acquire(true, false); //no need to load evicted matrix
		_dirtyFlag = true;
		_isAcquireFromEmpty = false;

		//set references to new data
		if( newData == null )
			throw new CacheException("acquireModify with empty matrix block.");
		_data = newData;
		updateStatusPinned(true);

		if( DMLScript.STATISTICS ) {
			long t1 = System.nanoTime();
			CacheStatistics.incrementAcquireMTime(t1-t0);
		}

		return _data;
	}

	/**
	 * Releases the shared ("read-only") or exclusive ("write") lock. Updates
	 * the matrix size, last-access time, metadata, etc.
	 *
	 * Synchronized because there might be parallel threads (parfor local) that
	 * access the same MatrixObject (in case it was created before the loop).
	 *
	 * In-Status:  READ, MODIFY;
	 * Out-Status: READ(-1), EVICTABLE, EMPTY.
	 *
	 * @throws CacheException
	 */
	public synchronized void release()
		throws CacheException
	{
		if( LOG.isTraceEnabled() )
			LOG.trace("Release "+_varName);
		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

		boolean write = false;
		if( isModify() )
		{
			//set flags for write
			write = true;
			_dirtyFlag = true;

			//update meta data
			refreshMetaData();
		}

		//compact empty in-memory block
		if( _data.isEmptyBlock(false) && _data.isAllocated() )
			_data.cleanupBlock(true, true);

		//cache status maintenance (pass cacheNoWrite flag)
		super.release(_isAcquireFromEmpty && !_requiresLocalWrite);
		updateStatusPinned(false);

		if( isCachingActive() //only if caching is enabled (otherwise keep everything in mem)
			&& isCached(true) //not empty and not read/modify
			&& !isUpdateInPlace() //not a pinned (update-in-place) result variable
			&& !isBelowCachingThreshold() ) //min size for caching
		{
			if( write || _requiresLocalWrite ) {
				//evict blob
				String filePath = getCacheFilePathAndName();
				try {
					writeMatrix(filePath);
				}
				catch(Exception e) {
					throw new CacheException("Eviction to local path " + filePath + " ("+_varName+") failed.", e);
				}
				_requiresLocalWrite = false;
			}

			//create cache
			createCache();
			_data = null;
		}
		else if( LOG.isTraceEnabled() ) {
			LOG.trace("Var "+_varName+" not subject to caching: rows="+_data.getNumRows()
				+", cols="+_data.getNumColumns()+", state="+getStatusAsString());
		}

		if( DMLScript.STATISTICS ) {
			long t1 = System.nanoTime();
			CacheStatistics.incrementReleaseTime(t1-t0);
		}
	}

	/**
	 * Sets the matrix data reference to <code>null</code>, abandons the old matrix,
	 * and makes the "envelope" empty. Run it to finalize the matrix (otherwise the
	 * evicted matrix file may remain undeleted).
	 *
	 * In-Status:  EMPTY, EVICTABLE, EVICTED;
	 * Out-Status: EMPTY.
	 *
	 * @throws CacheException
	 */
	public synchronized void clearData()
		throws CacheException
	{
		if( LOG.isTraceEnabled() )
			LOG.trace("Clear data "+_varName);

		// check if cleanup enabled and possible
		if( !_cleanupFlag )
			return; // do nothing
		if( !isAvailableToModify() )
			throw new CacheStatusException("MatrixObject (" + this.getDebugName()
				+ ") not available to modify. Status = " + this.getStatusAsString() + ".");

		// clear existing WB / FS representation (but prevent unnecessary probes)
		if( !(isEmpty(true) || (_data != null && isBelowCachingThreshold())
			|| (_data != null && !isCachingActive()) )) //additional condition for JMLC
			freeEvictedBlob();

		// clear the in-memory data
		_data = null;
		clearCache();

		// clear rdd/broadcast back refs
		if( _rddHandle != null )
			_rddHandle.setBackReference(null);
		if( _bcHandle != null )
			_bcHandle.setBackReference(null);

		// change object state EMPTY
		_dirtyFlag = false;
		setEmpty();
	}
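	/*
	 * Illustrative cleanup sketch (the scenario is assumed; method names are
	 * real): callers can guard intermediate results against removal.
	 *
	 *   mo.enableCleanup(false);  //pin variable across a loop body
	 *   mo.clearData();           //no-op while cleanup is disabled
	 *   mo.enableCleanup(true);
	 *   mo.clearData();           //frees in-memory and evicted representations
	 */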
Status = " + this.getStatusAsString() + "."); // clear existing WB / FS representation (but prevent unnecessary probes) if( !(isEmpty(true)||(_data!=null && isBelowCachingThreshold()) ||(_data!=null && !isCachingActive()) )) //additional condition for JMLC freeEvictedBlob(); // clear the in-memory data _data = null; clearCache(); // clear rdd/broadcast back refs if( _rddHandle != null ) _rddHandle.setBackReference(null); if( _bcHandle != null ) _bcHandle.setBackReference(null); // change object state EMPTY _dirtyFlag = false; setEmpty(); } public synchronized void exportData() throws CacheException { exportData( -1 ); } /** * Writes, or flushes, the matrix data to HDFS. * * In-Status: EMPTY, EVICTABLE, EVICTED, READ; * Out-Status: EMPTY, EVICTABLE, EVICTED, READ. * * @throws CacheException */ public synchronized void exportData( int replication ) throws CacheException { exportData(_hdfsFileName, null, replication, null); _hdfsFileExists = true; } /** * * @param fName * @param outputFormat * @param formatProperties * @throws CacheException */ public synchronized void exportData (String fName, String outputFormat, FileFormatProperties formatProperties) throws CacheException { exportData(fName, outputFormat, -1, formatProperties); } /** * * @param fName * @param outputFormat * @throws CacheException */ public synchronized void exportData (String fName, String outputFormat) throws CacheException { exportData(fName, outputFormat, -1, null); } /** * Synchronized because there might be parallel threads (parfor local) that * access the same MatrixObjectNew object (in case it was created before the loop). * If all threads export the same data object concurrently it results in errors * because they all write to the same file. Efficiency for loops and parallel threads * is achieved by checking if the in-memory matrix block is dirty. * * NOTE: MB: we do not use dfs copy from local (evicted) to HDFS because this would ignore * the output format and most importantly would bypass reblocking during write (which effects the * potential degree of parallelism). However, we copy files on HDFS if certain criteria are given. * * @param fName * @param outputFormat * @throws CacheException */ public synchronized void exportData (String fName, String outputFormat, int replication, FileFormatProperties formatProperties) throws CacheException { if( LOG.isTraceEnabled() ) LOG.trace("Export data "+_varName+" "+fName); long t0 = DMLScript.STATISTICS ? 
	/**
	 * Synchronized because there might be parallel threads (parfor local) that
	 * access the same MatrixObject (in case it was created before the loop).
	 * If all threads exported the same data object concurrently it would result
	 * in errors because they would all write to the same file. Efficiency for
	 * loops and parallel threads is achieved by checking if the in-memory
	 * matrix block is dirty.
	 *
	 * NOTE: MB: we do not use dfs copy from local (evicted) to HDFS because this would ignore
	 * the output format and, most importantly, would bypass reblocking during write (which affects
	 * the potential degree of parallelism). However, we copy files on HDFS if certain criteria are met.
	 *
	 * @param fName
	 * @param outputFormat
	 * @throws CacheException
	 */
	public synchronized void exportData(String fName, String outputFormat, int replication, FileFormatProperties formatProperties)
		throws CacheException
	{
		if( LOG.isTraceEnabled() )
			LOG.trace("Export data "+_varName+" "+fName);
		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

		//prevent concurrent modifications
		if( !isAvailableToRead() )
			throw new CacheStatusException("MatrixObject not available to read.");

		LOG.trace("Exporting " + this.getDebugName() + " to " + fName + " in format " + outputFormat);

		boolean pWrite = false; //persistent write flag
		if( fName.equals(_hdfsFileName) ) {
			_hdfsFileExists = true;
			pWrite = false;
		}
		else {
			pWrite = true; // i.e., export is called from a "write" instruction
		}

		//actual export (note: no direct transfer of local copy in order to ensure blocking (and hence, parallelism))
		if( isDirty() || //use dirty for skipping parallel exports
			(pWrite && !isEqualOutputFormat(outputFormat)) )
		{
			// CASE 1: dirty in-mem matrix or pWrite w/ different format (write matrix to fname; load into memory if evicted)

			// a) get the matrix
			if( isEmpty(true) )
			{
				//read data from HDFS if required (never read before); this applies only to pWrite w/ different output formats
				//note: for large rdd outputs, we compile dedicated write SPInstructions (no need to handle this here)
				try {
					if( getRDDHandle() == null || getRDDHandle().allowsShortCircuitRead() )
						_data = readMatrixFromHDFS( _hdfsFileName );
					else
						_data = readMatrixFromRDD( getRDDHandle(), new MutableBoolean() );
					_dirtyFlag = false;
				}
				catch(IOException e) {
					throw new CacheIOException("Reading of " + _hdfsFileName + " ("+_varName+") failed.", e);
				}
			}
			//get object from cache
			if( _data == null )
				getCache();
			super.acquire( false, _data==null ); //incl. read matrix if evicted

			// b) write the matrix
			try {
				writeMetaData( fName, outputFormat, formatProperties );
				writeMatrixToHDFS( fName, outputFormat, replication, formatProperties );
				if( !pWrite )
					_dirtyFlag = false;
			}
			catch(Exception e) {
				throw new CacheIOException("Export to " + fName + " failed.", e);
			}
			finally {
				release();
			}
		}
		else if( pWrite ) // pwrite with same output format
		{
			// CASE 2: matrix already in same format but different file on hdfs (copy matrix to fname)
			try {
				MapReduceTool.deleteFileIfExistOnHDFS(fName);
				MapReduceTool.deleteFileIfExistOnHDFS(fName+".mtd");
				if( getRDDHandle() == null || getRDDHandle().allowsShortCircuitRead() )
					MapReduceTool.copyFileOnHDFS( _hdfsFileName, fName );
				else //write might trigger rdd operations and nnz maintenance
					writeMatrixFromRDDtoHDFS(getRDDHandle(), fName, outputFormat);
				writeMetaData( fName, outputFormat, formatProperties );
			}
			catch(Exception e) {
				throw new CacheIOException("Export to " + fName + " failed.", e);
			}
		}
		else if( getRDDHandle() != null //pending rdd operation
			&& !getRDDHandle().allowsShortCircuitRead() )
		{
			// CASE 3: pending rdd operation (other than checkpoints)
			try {
				writeMatrixFromRDDtoHDFS(getRDDHandle(), fName, outputFormat);
				writeMetaData( fName, outputFormat, formatProperties );
			}
			catch(Exception e) {
				throw new CacheIOException("Export to " + fName + " failed.", e);
			}
		}
		else {
			// CASE 4: data already in hdfs (do nothing, no need for export)
			LOG.trace(this.getDebugName() + ": Skip export to hdfs since data already exists.");
		}

		if( DMLScript.STATISTICS ) {
			long t1 = System.nanoTime();
			CacheStatistics.incrementExportTime(t1-t0);
		}
	}
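	/*
	 * Summary of the export cases above: (1) dirty data or persistent write
	 * with a different format -> materialize and write; (2) same format but
	 * different file -> copy on HDFS; (3) pending RDD operations -> write via
	 * Spark; (4) otherwise the HDFS file is already current and nothing is
	 * written.
	 */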
	/**
	 * Moves the data to the given target file on HDFS, either by export or rename.
	 *
	 * @param fName
	 * @param outputFormat
	 * @return true if the data was exported or renamed
	 * @throws CacheIOException
	 */
	public synchronized boolean moveData(String fName, String outputFormat)
		throws CacheIOException
	{
		boolean ret = false;

		try {
			//ensure input file is persistent on hdfs (pending RDD operations),
			//file might have been written during export or collect via write/read
			if( getRDDHandle() != null && !MapReduceTool.existsFileOnHDFS(_hdfsFileName) ) {
				writeMatrixFromRDDtoHDFS(getRDDHandle(), _hdfsFileName, outputFormat);
			}

			//export or rename to target file on hdfs
			if( isDirty() || (!isEqualOutputFormat(outputFormat) && isEmpty(true)) )
			{
				exportData(fName, outputFormat);
				ret = true;
			}
			else if( isEqualOutputFormat(outputFormat) )
			{
				MapReduceTool.deleteFileIfExistOnHDFS(fName);
				MapReduceTool.deleteFileIfExistOnHDFS(fName+".mtd");
				MapReduceTool.renameFileOnHDFS( _hdfsFileName, fName );
				writeMetaData( fName, outputFormat, null );
				ret = true;
			}
		}
		catch(Exception e) {
			throw new CacheIOException("Move to " + fName + " failed.", e);
		}

		return ret;
	}

	// *********************************************
	// ***                                       ***
	// ***       HIGH-LEVEL PUBLIC METHODS       ***
	// ***     FOR PARTITIONED MATRIX ACCESS     ***
	// ***    (all other methods still usable)   ***
	// ***                                       ***
	// *********************************************

	public void setPartitioned( PDataPartitionFormat format, int n ) {
		_partitioned = true;
		_partitionFormat = format;
		_partitionSize = n;
	}

	public void unsetPartitioned() {
		_partitioned = false;
		_partitionFormat = null;
		_partitionSize = -1;
	}

	public boolean isPartitioned() {
		return _partitioned;
	}

	public PDataPartitionFormat getPartitionFormat() {
		return _partitionFormat;
	}

	public int getPartitionSize() {
		return _partitionSize;
	}

	public synchronized void setInMemoryPartition(MatrixBlock block) {
		_partitionInMemory = block;
	}
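	/*
	 * Illustrative partitioned-read sketch (row index and dimensions are
	 * assumed): after parfor data partitioning, a single row can be read
	 * without loading the full matrix; IndexRange bounds are 1-based.
	 *
	 *   mo.setPartitioned(PDataPartitionFormat.ROW_WISE, -1);
	 *   MatrixBlock row7 = mo.readMatrixPartition(
	 *       new IndexRange(7, 7, 1, mo.getNumColumns()));
	 */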
	/**
	 * NOTE: for reading matrix partitions, we could cache (in its real sense) the read block
	 * with soft references (no need for eviction, as partitioning is only applied to read-only
	 * matrices). However, since we currently only support row- and column-wise partitioning,
	 * caching is not applied yet. This could be changed once we also support column-block-wise
	 * and row-block-wise. Furthermore, as we reject to partition vectors and support only full
	 * row or column indexing, no metadata (apart from the partition flag) is required.
	 *
	 * @param pred index range of the requested partition
	 * @return the read matrix partition
	 * @throws CacheException
	 */
	public synchronized MatrixBlock readMatrixPartition( IndexRange pred )
		throws CacheException
	{
		if( LOG.isTraceEnabled() )
			LOG.trace("Acquire partition "+_varName+" "+pred);
		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

		if( !_partitioned )
			throw new CacheStatusException("MatrixObject not available to indexed read.");

		//return static partition if set from outside of the program
		if( _partitionInMemory != null )
			return _partitionInMemory;

		MatrixBlock mb = null;
		try {
			boolean blockwise = (_partitionFormat == PDataPartitionFormat.ROW_BLOCK_WISE
				|| _partitionFormat == PDataPartitionFormat.COLUMN_BLOCK_WISE);

			//preparations for block wise access
			MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
			MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
			int brlen = mc.getRowsPerBlock();
			int bclen = mc.getColsPerBlock();

			//get filename depending on format
			String fname = getPartitionFileName( pred, brlen, bclen );

			//probe cache
			if( blockwise && _partitionCacheName != null && _partitionCacheName.equals(fname) ) {
				mb = _cache.get(); //try getting block from cache
			}

			if( mb == null ) //block not in cache
			{
				//get rows and cols
				long rows = -1;
				long cols = -1;
				switch( _partitionFormat )
				{
					case ROW_WISE:
						rows = 1;
						cols = mc.getCols();
						break;
					case ROW_BLOCK_WISE:
						rows = brlen;
						cols = mc.getCols();
						break;
					case COLUMN_WISE:
						rows = mc.getRows();
						cols = 1;
						break;
					case COLUMN_BLOCK_WISE:
						rows = mc.getRows();
						cols = bclen;
						break;
					default:
						throw new CacheException("Unsupported partition format: "+_partitionFormat);
				}

				//read the partition from HDFS (or create an empty block if the file does not exist)
				if( MapReduceTool.existsFileOnHDFS(fname) )
					mb = readMatrixFromHDFS( fname, rows, cols );
				else {
					mb = new MatrixBlock((int)rows, (int)cols, true);
					LOG.warn("Reading empty matrix partition "+fname);
				}
			}

			//post processing
			if( blockwise )
			{
				//put block into cache
				_partitionCacheName = fname;
				_cache = new SoftReference<MatrixBlock>(mb);

				if( _partitionFormat == PDataPartitionFormat.ROW_BLOCK_WISE ) {
					int rix = (int)((pred.rowStart-1)%brlen);
					mb = mb.sliceOperations(rix, rix, (int)(pred.colStart-1), (int)(pred.colEnd-1), new MatrixBlock());
				}
				if( _partitionFormat == PDataPartitionFormat.COLUMN_BLOCK_WISE ) {
					int cix = (int)((pred.colStart-1)%bclen);
					mb = mb.sliceOperations((int)(pred.rowStart-1), (int)(pred.rowEnd-1), cix, cix, new MatrixBlock());
				}
			}

			//NOTE: currently no special treatment of non-existing partitions necessary
			//      because empty blocks are written anyway
		}
		catch(Exception ex) {
			throw new CacheException(ex);
		}

		if( DMLScript.STATISTICS ) {
			long t1 = System.nanoTime();
			CacheStatistics.incrementAcquireRTime(t1-t0);
		}

		return mb;
	}

	/**
	 * Constructs the file name of a single partition, depending on the partition format.
	 *
	 * @param pred
	 * @return the partition file name
	 * @throws CacheStatusException
	 */
	public String getPartitionFileName( IndexRange pred, int brlen, int bclen )
		throws CacheStatusException
	{
		if( !_partitioned )
			throw new CacheStatusException("MatrixObject not available to indexed read.");

		StringBuilder sb = new StringBuilder();
		sb.append(_hdfsFileName);

		switch( _partitionFormat )
		{
			case ROW_WISE:
				sb.append(Lop.FILE_SEPARATOR);
				sb.append(pred.rowStart);
				break;
			case ROW_BLOCK_WISE:
				sb.append(Lop.FILE_SEPARATOR);
				sb.append((pred.rowStart-1)/brlen+1);
				break;
			case COLUMN_WISE:
				sb.append(Lop.FILE_SEPARATOR);
				sb.append(pred.colStart);
				break;
			case COLUMN_BLOCK_WISE:
				sb.append(Lop.FILE_SEPARATOR);
				sb.append((pred.colStart-1)/bclen+1);
				break;
			default:
				throw new CacheStatusException("MatrixObject not available to indexed read.");
		}

		return sb.toString();
	}

	// *********************************************
	// ***                                       ***
	// ***      LOW-LEVEL PROTECTED METHODS      ***
	// ***        EXTEND CACHEABLE DATA          ***
	// ***     ONLY CALLED BY THE SUPERCLASS     ***
	// ***                                       ***
	// *********************************************

	@Override
	protected boolean isBlobPresent() {
		return (_data != null);
	}
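	/*
	 * Illustrative naming sketch (file name assumed): for ROW_WISE
	 * partitioning of "hdfs:/tmp/X", getPartitionFileName resolves row 7 to
	 * "hdfs:/tmp/X" + Lop.FILE_SEPARATOR + "7"; for ROW_BLOCK_WISE with
	 * brlen=1000, row 1500 maps to the block file suffix (1500-1)/1000+1 = 2.
	 */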
	@Override
	protected void evictBlobFromMemory( MatrixBlock mb )
		throws CacheIOException
	{
		throw new CacheIOException("Redundant explicit eviction.");
	}

	@Override
	protected void restoreBlobIntoMemory()
		throws CacheIOException
	{
		long begin = 0;
		if( LOG.isTraceEnabled() ) {
			LOG.trace("RESTORE of Matrix "+_varName+", "+_hdfsFileName);
			begin = System.currentTimeMillis();
		}

		String filePath = getCacheFilePathAndName();
		if( LOG.isTraceEnabled() )
			LOG.trace("CACHE: Restoring matrix... " + _varName + " HDFS path: "
				+ (_hdfsFileName == null ? "null" : _hdfsFileName) + ", Restore from path: " + filePath);

		if( _data != null )
			throw new CacheIOException(filePath + " : Cannot restore on top of existing in-memory data.");

		try {
			_data = readMatrix(filePath);
		}
		catch(IOException e) {
			throw new CacheIOException(filePath + " : Restore failed.", e);
		}

		//check for success
		if( _data == null )
			throw new CacheIOException(filePath + " : Restore failed.");

		if( LOG.isTraceEnabled() )
			LOG.trace("Restoring matrix - COMPLETED ... " + (System.currentTimeMillis()-begin) + " msec.");
	}

	@Override
	protected void freeEvictedBlob() {
		String cacheFilePathAndName = getCacheFilePathAndName();
		long begin = 0;
		if( LOG.isTraceEnabled() ) {
			LOG.trace("CACHE: Freeing evicted matrix... " + _varName + " HDFS path: "
				+ (_hdfsFileName == null ? "null" : _hdfsFileName) + " Eviction path: " + cacheFilePathAndName);
			begin = System.currentTimeMillis();
		}

		LazyWriteBuffer.deleteMatrix(cacheFilePathAndName);

		if( LOG.isTraceEnabled() )
			LOG.trace("Freeing evicted matrix - COMPLETED ... " + (System.currentTimeMillis()-begin) + " msec.");
	}

	@Override
	protected boolean isBelowCachingThreshold() {
		long rlen = _data.getNumRows();
		long clen = _data.getNumColumns();
		long nnz = _data.getNonZeros();

		//get in-memory size (assume dense, if nnz unknown)
		double sparsity = OptimizerUtils.getSparsity( rlen, clen, nnz );
		double size = MatrixBlock.estimateSizeInMemory( rlen, clen, sparsity );

		return ( !_data.isAllocated() || size <= CACHING_THRESHOLD );
	}

	// *******************************************
	// ***                                     ***
	// ***      LOW-LEVEL PRIVATE METHODS      ***
	// ***           FOR MATRIX I/O            ***
	// ***                                     ***
	// *******************************************

	private boolean isUpdateInPlace() {
		return _updateInPlaceFlag;
	}

	private String getCacheFilePathAndName() {
		if( _cacheFileName == null ) {
			StringBuilder sb = new StringBuilder();
			sb.append(CacheableData.cacheEvictionLocalFilePath);
			sb.append(CacheableData.cacheEvictionLocalFilePrefix);
			sb.append(String.format("%09d", getUniqueCacheID()));
			sb.append(CacheableData.cacheEvictionLocalFileExtension);
			_cacheFileName = sb.toString();
		}
		return _cacheFileName;
	}

	private MatrixBlock readMatrix(String filePathAndName)
		throws IOException
	{
		return LazyWriteBuffer.readMatrix(filePathAndName);
	}

	private MatrixBlock readMatrixFromHDFS(String filePathAndName)
		throws IOException
	{
		MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
		MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
		return readMatrixFromHDFS( filePathAndName, mc.getRows(), mc.getCols() );
	}
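	/*
	 * Note on guarded collect (the numbers are an assumed example): before
	 * collecting an RDD, readMatrixFromRDD below checks whether the result
	 * plus the currently pinned data fits the local memory budget; a dense
	 * 10,000 x 10,000 matrix needs about 10,000 * 10,000 * 8 bytes = 800MB,
	 * so under a smaller budget the RDD is written to HDFS and re-read
	 * instead of collected.
	 */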
	/**
	 * Reads a matrix block from the given RDD handle. Note: the read of a matrix
	 * block from an RDD might trigger lazy evaluation of pending transformations.
	 *
	 * @param rdd
	 * @param writeStatus output parameter, set to true if the RDD was written to HDFS
	 * @return the collected matrix block
	 * @throws IOException
	 */
	private MatrixBlock readMatrixFromRDD(RDDObject rdd, MutableBoolean writeStatus)
		throws IOException
	{
		RDDObject lrdd = rdd;

		//prepare return status (by default only collect)
		writeStatus.setValue(false);

		MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
		MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
		MatrixBlock mb = null;
		try {
			//prevent unnecessary collect through rdd checkpoint
			if( rdd.allowsShortCircuitCollect() ) {
				lrdd = (RDDObject) rdd.getLineageChilds().get(0);
			}

			//obtain matrix block from RDD
			int rlen = (int) mc.getRows();
			int clen = (int) mc.getCols();
			int brlen = (int) mc.getRowsPerBlock();
			int bclen = (int) mc.getColsPerBlock();
			long nnz = mc.getNonZeros();

			//guarded rdd collect
			if( !OptimizerUtils.checkSparkCollectMemoryBudget(rlen, clen, brlen, bclen, nnz, sizePinned.get()) ) {
				//write RDD to hdfs and read to prevent invalid collect mem consumption
				//note: lazy, partition-at-a-time collect (toLocalIterator) was significantly slower
				if( !MapReduceTool.existsFileOnHDFS(_hdfsFileName) ) { //prevent overwrite existing file
					long newnnz = SparkExecutionContext.writeRDDtoHDFS(lrdd, _hdfsFileName, iimd.getOutputInfo());
					((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics().setNonZeros(newnnz);
					((RDDObject) rdd).setHDFSFile(true); //mark rdd as hdfs file (for restore)
					writeStatus.setValue(true); //mark for no cache-write on read
				}
				mb = readMatrixFromHDFS(_hdfsFileName);
			}
			else {
				//collect matrix block from RDD
				mb = SparkExecutionContext.toMatrixBlock(lrdd, rlen, clen, brlen, bclen, nnz);
			}
		}
		catch(DMLRuntimeException ex) {
			throw new IOException(ex);
		}

		//sanity check correct output
		if( mb == null ) {
			throw new IOException("Unable to load matrix from rdd: "+lrdd.getVarName());
		}

		return mb;
	}

	/**
	 * Writes an RDD to HDFS. Note: the write of an RDD to HDFS might trigger
	 * lazy evaluation of pending transformations.
	 *
	 * @param rdd
	 * @param fname
	 * @param outputFormat
	 * @throws DMLRuntimeException
	 */
	private void writeMatrixFromRDDtoHDFS(RDDObject rdd, String fname, String outputFormat)
		throws DMLRuntimeException
	{
		//prepare output info
		MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
		OutputInfo oinfo = (outputFormat != null ?
			OutputInfo.stringToOutputInfo(outputFormat) :
			InputInfo.getMatchingOutputInfo(iimd.getInputInfo()));

		long newnnz = SparkExecutionContext.writeRDDtoHDFS(rdd, fname, oinfo);
		((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics().setNonZeros(newnnz);
	}
	private MatrixBlock readMatrixFromHDFS(String filePathAndName, long rlen, long clen)
		throws IOException
	{
		long begin = 0;
		MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
		MatrixCharacteristics mc = iimd.getMatrixCharacteristics();

		if( LOG.isTraceEnabled() ) {
			LOG.trace("Reading matrix from HDFS... " + _varName + " Path: " + filePathAndName
				+ ", dimensions: [" + mc.getRows() + ", " + mc.getCols() + ", " + mc.getNonZeros() + "]");
			begin = System.currentTimeMillis();
		}

		//expected sparsity (assume dense, if nnz unknown)
		double sparsity = ( mc.getNonZeros() >= 0 ?
			((double) mc.getNonZeros()) / (mc.getRows() * mc.getCols()) : 1.0d );

		MatrixBlock newData = DataConverter.readMatrixFromHDFS(filePathAndName, iimd.getInputInfo(),
			rlen, clen, mc.getRowsPerBlock(), mc.getColsPerBlock(), sparsity, _formatProperties);

		//sanity check correct output
		if( newData == null ) {
			throw new IOException("Unable to load matrix from file: "+filePathAndName);
		}

		if( LOG.isTraceEnabled() )
			LOG.trace("Reading Completed: " + (System.currentTimeMillis()-begin) + " msec.");

		return newData;
	}

	private void writeMatrix(String filePathAndName)
		throws DMLRuntimeException, IOException
	{
		LazyWriteBuffer.writeMatrix(filePathAndName, _data);
	}

	/**
	 * Writes the in-memory matrix to HDFS in the specified format.
	 *
	 * @throws DMLRuntimeException
	 * @throws IOException
	 */
	private void writeMatrixToHDFS(String filePathAndName, String outputFormat, int replication, FileFormatProperties formatProperties)
		throws DMLRuntimeException, IOException
	{
		long begin = 0;
		if( LOG.isTraceEnabled() ) {
			LOG.trace("Writing matrix to HDFS... " + _varName + " Path: " + filePathAndName
				+ ", Format: " + (outputFormat != null ? outputFormat : "inferred from metadata"));
			begin = System.currentTimeMillis();
		}

		MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;

		if( _data != null )
		{
			// Get the dimension information from the metadata stored within MatrixObject
			MatrixCharacteristics mc = iimd.getMatrixCharacteristics();

			// Write the matrix to HDFS in requested format
			OutputInfo oinfo = (outputFormat != null ?
				OutputInfo.stringToOutputInfo(outputFormat) :
				InputInfo.getMatchingOutputInfo(iimd.getInputInfo()));

			// when outputFormat is binaryblock, make sure that matrixCharacteristics has correct blocking dimensions
			// note: this is only required for singlenode (due to binarycell default)
			if( oinfo == OutputInfo.BinaryBlockOutputInfo
				&& DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE
				&& (mc.getRowsPerBlock() != DMLTranslator.DMLBlockSize || mc.getColsPerBlock() != DMLTranslator.DMLBlockSize) )
			{
				DataConverter.writeMatrixToHDFS(_data, filePathAndName, oinfo,
					new MatrixCharacteristics(mc.getRows(), mc.getCols(),
						DMLTranslator.DMLBlockSize, DMLTranslator.DMLBlockSize, mc.getNonZeros()),
					replication, formatProperties);
			}
			else {
				DataConverter.writeMatrixToHDFS(_data, filePathAndName, oinfo, mc, replication, formatProperties);
			}

			if( LOG.isTraceEnabled() )
				LOG.trace("Writing matrix to HDFS ("+filePathAndName+") - COMPLETED... "
					+ (System.currentTimeMillis()-begin) + " msec.");
		}
		else if( LOG.isTraceEnabled() ) {
			LOG.trace("Writing matrix to HDFS ("+filePathAndName+") - NOTHING TO WRITE (_data == null).");
		}

		if( DMLScript.STATISTICS )
			CacheStatistics.incrementHDFSWrites();
	}
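	/*
	 * Note on blocking (the block size value is the usual default, assumed
	 * here): in SINGLE_NODE mode, binary-block output must carry the standard
	 * block size; e.g., a matrix with non-standard brlen/bclen is rewritten
	 * with DMLTranslator.DMLBlockSize (typically 1000) so that downstream jobs
	 * see consistent blocking. The same fix-up is applied in writeMatrixToHDFS
	 * above and to the .mtd file in writeMetaData below.
	 */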
	private void writeMetaData(String filePathAndName, String outputFormat, FileFormatProperties formatProperties)
		throws DMLRuntimeException, IOException
	{
		MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;

		if( iimd != null )
		{
			// Write the matrix to HDFS in requested format
			OutputInfo oinfo = (outputFormat != null ?
				OutputInfo.stringToOutputInfo(outputFormat) :
				InputInfo.getMatchingOutputInfo(iimd.getInputInfo()));

			if( oinfo != OutputInfo.MatrixMarketOutputInfo ) {
				// Get the dimension information from the metadata stored within MatrixObject
				MatrixCharacteristics mc = iimd.getMatrixCharacteristics();

				// when outputFormat is binaryblock, make sure that matrixCharacteristics has correct blocking dimensions
				// note: this is only required for singlenode (due to binarycell default)
				if( oinfo == OutputInfo.BinaryBlockOutputInfo
					&& DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE
					&& (mc.getRowsPerBlock() != DMLTranslator.DMLBlockSize || mc.getColsPerBlock() != DMLTranslator.DMLBlockSize) )
				{
					mc = new MatrixCharacteristics(mc.getRows(), mc.getCols(),
						DMLTranslator.DMLBlockSize, DMLTranslator.DMLBlockSize, mc.getNonZeros());
				}

				MapReduceTool.writeMetaDataFile(filePathAndName + ".mtd", valueType, mc, oinfo, formatProperties);
			}
		}
		else {
			throw new DMLRuntimeException("Unexpected error while writing mtd file ("
				+ filePathAndName + ") -- metadata is null.");
		}
	}

	/**
	 * Checks if the given output format equals the format of the backing HDFS file.
	 *
	 * @param outputFormat
	 * @return true if the formats match (or outputFormat is null)
	 */
	private boolean isEqualOutputFormat( String outputFormat ) {
		boolean ret = true;
		if( outputFormat != null ) {
			try {
				MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
				OutputInfo oi1 = InputInfo.getMatchingOutputInfo( iimd.getInputInfo() );
				OutputInfo oi2 = OutputInfo.stringToOutputInfo( outputFormat );
				if( oi1 != oi2 ) {
					ret = false;
				}
			}
			catch(Exception ex) {
				ret = false;
			}
		}
		return ret;
	}

	@Override
	public synchronized String getDebugName() {
		int maxLength = 23;
		String debugNameEnding = (_hdfsFileName == null ? "null"
			: (_hdfsFileName.length() < maxLength ? _hdfsFileName
				: "..." + _hdfsFileName.substring(_hdfsFileName.length() - maxLength + 3)));
		return _varName + " " + debugNameEnding;
	}

	// *******************************************
	// ***                                     ***
	// ***      LOW-LEVEL PRIVATE METHODS      ***
	// ***       FOR SOFTREFERENCE CACHE       ***
	// ***                                     ***
	// *******************************************

	private void createCache() {
		_cache = new SoftReference<MatrixBlock>( _data );
	}

	private void getCache() {
		if( _cache != null ) {
			_data = _cache.get();
			clearCache();
		}
	}

	private void clearCache() {
		if( _cache != null ) {
			_cache.clear();
			_cache = null;
		}
	}

	/**
	 * Adds or subtracts the in-memory size of the current block from the
	 * thread-local pinned-size counter (used for guarded collect).
	 */
	private void updateStatusPinned(boolean add) {
		if( _data != null ) { //data should never be null
			long size = sizePinned.get();
			size += (add ? 1 : -1) * _data.getSizeInMemory();
			sizePinned.set( Math.max(size, 0) );
		}
	}

	/**
	 * Enables or disables cleanup; see clearData().
	 */
	public void enableCleanup(boolean flag) {
		_cleanupFlag = flag;
	}

	/**
	 * Indicates if cleanup is enabled; see clearData().
	 */
	public boolean isCleanupEnabled() {
		return _cleanupFlag;
	}

	public void enableUpdateInPlace(boolean flag) {
		_updateInPlaceFlag = flag;
	}

	public boolean isUpdateInPlaceEnabled() {
		return _updateInPlaceFlag;
	}

	public void setEmptyStatus() {
		setEmpty();
	}
}