package com.dedupeer.dao.operation;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import me.prettyprint.cassandra.connection.HConnectionManager;
import me.prettyprint.cassandra.model.QuorumAllConsistencyLevelPolicy;
import me.prettyprint.cassandra.serializers.BytesArraySerializer;
import me.prettyprint.cassandra.serializers.StringSerializer;
import me.prettyprint.cassandra.service.CassandraHostConfigurator;
import me.prettyprint.cassandra.service.ColumnSliceIterator;
import me.prettyprint.cassandra.service.FailoverPolicy;
import me.prettyprint.cassandra.service.KeyspaceService;
import me.prettyprint.cassandra.service.KeyspaceServiceImpl;
import me.prettyprint.hector.api.Cluster;
import me.prettyprint.hector.api.Keyspace;
import me.prettyprint.hector.api.beans.HColumn;
import me.prettyprint.hector.api.beans.HSuperColumn;
import me.prettyprint.hector.api.beans.SuperSlice;
import me.prettyprint.hector.api.exceptions.HectorException;
import me.prettyprint.hector.api.factory.HFactory;
import me.prettyprint.hector.api.mutation.Mutator;
import me.prettyprint.hector.api.query.QueryResult;
import me.prettyprint.hector.api.query.SliceQuery;
import me.prettyprint.hector.api.query.SuperColumnQuery;
import me.prettyprint.hector.api.query.SuperSliceQuery;

import org.apache.log4j.Logger;

import com.dedupeer.backup.StoredFileFeedback;
import com.dedupeer.thrift.Chunk;
import com.dedupeer.thrift.ChunkIDs;
import com.dedupeer.utils.Range;

/**
 * Data access operations on the Chunks super column family, using the Hector client.
 * @author Paulo Fernando (pf@paulofernando.net.br)
 */
public class ChunksDaoOperations {

    private static final Logger log = Logger.getLogger(ChunksDaoOperations.class);

    private Cluster cluster;
    private Keyspace keyspaceOperator;
    private static StringSerializer stringSerializer = StringSerializer.get();
    private StoredFileFeedback feedback;
    KeyspaceService keyspace;

    /**
     * Creates an object to manipulate the operations on the Chunks Column Family
     * @param clusterName The cluster name of the Cassandra instance
     * @param keyspaceName The Keyspace name where the Chunks Column Family was created
     */
    public ChunksDaoOperations(String clusterName, String keyspaceName) {
        cluster = HFactory.getOrCreateCluster(clusterName, "localhost:9160");
        keyspaceOperator = HFactory.createKeyspace(keyspaceName, cluster);
        HConnectionManager connectionManager = new HConnectionManager(clusterName,
                new CassandraHostConfigurator("localhost:9160"));
        keyspace = new KeyspaceServiceImpl(keyspaceName, new QuorumAllConsistencyLevelPolicy(),
                connectionManager, FailoverPolicy.ON_FAIL_TRY_ALL_AVAILABLE);
    }

    /**
     * Creates an object to manipulate the operations on the Chunks Column Family
     * @param clusterName The cluster name of the Cassandra instance
     * @param keyspaceName The Keyspace name where the Chunks Column Family was created
     * @param feedback StoredFileFeedback used to report the current progress
     */
    public ChunksDaoOperations(String clusterName, String keyspaceName, StoredFileFeedback feedback) {
        this(clusterName, keyspaceName);
        this.feedback = feedback;
    }

    /** Inserts a single chunk, storing each attribute as a sub-column of the super column named chunk_num */
    @SuppressWarnings("unchecked")
    public void insertRow(String fileID, String chunk_num, String strongHash, String weakHash,
            String index, String length, byte[] content) {
        try {
            Mutator<String> mutator = HFactory.createMutator(keyspaceOperator, stringSerializer);
            mutator.insert(fileID, "Chunks", HFactory.createSuperColumn(chunk_num,
                    Arrays.asList(HFactory.createStringColumn("strongHash", strongHash)),
                    stringSerializer, stringSerializer, stringSerializer));
            mutator.insert(fileID, "Chunks", HFactory.createSuperColumn(chunk_num,
                    Arrays.asList(HFactory.createStringColumn("weakHash", weakHash)),
                    stringSerializer, stringSerializer, stringSerializer));
            mutator.insert(fileID, "Chunks", HFactory.createSuperColumn(chunk_num,
                    Arrays.asList(HFactory.createStringColumn("index", index)),
                    stringSerializer, stringSerializer, stringSerializer));
            mutator.insert(fileID, "Chunks", HFactory.createSuperColumn(chunk_num,
                    Arrays.asList(HFactory.createStringColumn("length", length)),
                    stringSerializer, stringSerializer, stringSerializer));
            mutator.insert(fileID, "Chunks", HFactory.createSuperColumn(chunk_num,
                    Arrays.asList(HFactory.createColumn("content", content)),
                    stringSerializer, stringSerializer, BytesArraySerializer.get()));
        } catch (HectorException e) {
            log.error("Data was not inserted");
            e.printStackTrace();
        }
    }

    /** Inserts a single chunk; a deduplicated chunk (pfile != null) stores only index, pfile and pchunk */
    @SuppressWarnings("unchecked")
    public void insertRow(Chunk chunk) {
        try {
            Mutator<String> mutator = HFactory.createMutator(keyspaceOperator, stringSerializer);
            if (chunk.pfile == null) {
                mutator.insert(chunk.fileID, "Chunks", HFactory.createSuperColumn(chunk.chunkNumber,
                        Arrays.asList(HFactory.createStringColumn("strongHash", chunk.strongHash)),
                        stringSerializer, stringSerializer, stringSerializer));
                mutator.insert(chunk.fileID, "Chunks", HFactory.createSuperColumn(chunk.chunkNumber,
                        Arrays.asList(HFactory.createStringColumn("weakHash", chunk.weakHash)),
                        stringSerializer, stringSerializer, stringSerializer));
                mutator.insert(chunk.fileID, "Chunks", HFactory.createSuperColumn(chunk.chunkNumber,
                        Arrays.asList(HFactory.createStringColumn("index", chunk.index)),
                        stringSerializer, stringSerializer, stringSerializer));
                mutator.insert(chunk.fileID, "Chunks", HFactory.createSuperColumn(chunk.chunkNumber,
                        Arrays.asList(HFactory.createStringColumn("length", chunk.length)),
                        stringSerializer, stringSerializer, stringSerializer));
                mutator.insert(chunk.fileID, "Chunks", HFactory.createSuperColumn(chunk.chunkNumber,
                        Arrays.asList(HFactory.createColumn("content", chunk.content.array())),
                        stringSerializer, stringSerializer, BytesArraySerializer.get()));
            } else { // Deduplicated chunk: references a chunk of another file instead of storing content
                mutator.insert(chunk.fileID, "Chunks", HFactory.createSuperColumn(chunk.chunkNumber,
                        Arrays.asList(HFactory.createStringColumn("index", chunk.index)),
                        stringSerializer, stringSerializer, stringSerializer));
                mutator.insert(chunk.fileID, "Chunks", HFactory.createSuperColumn(chunk.chunkNumber,
                        Arrays.asList(HFactory.createStringColumn("pfile", chunk.pfile)),
                        stringSerializer, stringSerializer, stringSerializer));
                mutator.insert(chunk.fileID, "Chunks", HFactory.createSuperColumn(chunk.chunkNumber,
                        Arrays.asList(HFactory.createStringColumn("pchunk", chunk.pchunk)),
                        stringSerializer, stringSerializer, stringSerializer));
            }
        } catch (HectorException e) {
            log.error("Data was not inserted");
            e.printStackTrace();
        }
    }

    /** Inserts a collection of chunks on the Chunks Column Family */
    @SuppressWarnings("unchecked")
    public void insertRows(ArrayList<Chunk> chunks, int initialChunk) {
        int chunk_number = initialChunk;
        if (chunks.get(0) != null) {
            log.info("Chunk: " + chunks.get(0).chunkNumber);
        }
        for (Chunk c : chunks) {
            try {
                String chunk_num = String.valueOf(chunk_number);
                Mutator<String> mutator = HFactory.createMutator(keyspaceOperator, stringSerializer);
                if (c.pfile == null) {
                    mutator.insert(c.fileID, "Chunks", HFactory.createSuperColumn(chunk_num,
                            Arrays.asList(HFactory.createStringColumn("strongHash", c.strongHash)),
                            stringSerializer, stringSerializer, stringSerializer));
                    mutator.insert(c.fileID, "Chunks", HFactory.createSuperColumn(chunk_num,
                            Arrays.asList(HFactory.createStringColumn("weakHash", c.weakHash)),
                            stringSerializer, stringSerializer, stringSerializer));
                    mutator.insert(c.fileID, "Chunks", HFactory.createSuperColumn(chunk_num,
                            Arrays.asList(HFactory.createStringColumn("index", c.index)),
                            stringSerializer, stringSerializer, stringSerializer));
                    mutator.insert(c.fileID, "Chunks", HFactory.createSuperColumn(chunk_num,
                            Arrays.asList(HFactory.createStringColumn("length", c.length)),
                            stringSerializer, stringSerializer, stringSerializer));
                    mutator.insert(c.fileID, "Chunks", HFactory.createSuperColumn(chunk_num,
                            Arrays.asList(HFactory.createColumn("content", c.content.array())),
                            stringSerializer, stringSerializer, BytesArraySerializer.get()));
                } else { // Deduplicated chunk: stores only the reference to a chunk of another file
                    mutator.insert(c.fileID, "Chunks", HFactory.createSuperColumn(chunk_num,
                            Arrays.asList(HFactory.createStringColumn("index", c.index)),
                            stringSerializer, stringSerializer, stringSerializer));
                    mutator.insert(c.fileID, "Chunks", HFactory.createSuperColumn(chunk_num,
                            Arrays.asList(HFactory.createStringColumn("pfile", c.pfile)),
                            stringSerializer, stringSerializer, stringSerializer));
                    mutator.insert(c.fileID, "Chunks", HFactory.createSuperColumn(chunk_num,
                            Arrays.asList(HFactory.createStringColumn("pchunk", c.pchunk)),
                            stringSerializer, stringSerializer, stringSerializer));
                }
                chunk_number++;
                if (feedback != null) {
                    feedback.updateProgress((int) Math.floor((((double) (chunk_number - initialChunk)) * 100) / chunks.size()));
                }
            } catch (HectorException e) {
                log.error("Data was not inserted");
                e.printStackTrace();
            }
        }
    }

    /**
     * Retrieves the super column with the row key and super column name specified
     * @param file_id The row key
     * @param chunk_number The super column name (chunk number) to get the values from
     * @return The SuperColumn with the parameters specified
     */
    public QueryResult<HSuperColumn<String, String, String>> getValues(String file_id, String chunk_number) {
        SuperColumnQuery<String, String, String, String> superColumnQuery =
                HFactory.createSuperColumnQuery(keyspaceOperator, stringSerializer, stringSerializer,
                        stringSerializer, stringSerializer);
        superColumnQuery.setColumnFamily("Chunks").setKey(file_id).setSuperName(chunk_number);
        QueryResult<HSuperColumn<String, String, String>> result = superColumnQuery.execute();
        return result;
    }

    /**
     * Loads the hashes of the chunks of a file.
     * @param file_id File ID
     * @param amountChunks Amount of chunks to retrieve the information
     * @return Map of weakHash to a map of strongHash to ChunkIDs (chunk number and file ID)
     */
    public Map<Integer, Map<String, ChunkIDs>> getHashesOfAFile(String file_id, int amountChunks) {
        SuperSliceQuery<String, String, String, String> query =
                HFactory.createSuperSliceQuery(keyspaceOperator, stringSerializer, stringSerializer,
                        stringSerializer, stringSerializer);
        query.setColumnFamily("Chunks");
        query.setKey(file_id);

        int loadByTime = 10;
        Map<Integer, Map<String, ChunkIDs>> chunksLoaded = new HashMap<Integer, Map<String, ChunkIDs>>();
        long loaded = 0;
        while (loaded < amountChunks) {
            // Loads the chunks in batches of at most loadByTime super columns,
            // whose names are the sequential chunk numbers
            int batchSize = (int) Math.min(loadByTime, amountChunks - loaded);
            String[] columnNames = new String[batchSize];
            for (int k = 0; k < batchSize; k++) {
                columnNames[k] = String.valueOf(loaded + k);
            }
            query.setColumnNames(columnNames);

            QueryResult<SuperSlice<String, String, String>> result = query.execute();
            for (HSuperColumn<String, String, String> column : result.get().getSuperColumns()) {
                // The map is keyed by the numeric weak hash, so lookups and inserts must use the same Integer type
                Integer weakHash = Integer.valueOf(column.getSubColumnByName("weakHash").getValue());
                if (!chunksLoaded.containsKey(weakHash)) {
                    Map<String, ChunkIDs> chunkInfo = new HashMap<String, ChunkIDs>();
                    ChunkIDs ids = new ChunkIDs();
                    ids.setChunkID(String.valueOf(loaded));
                    ids.setFileID(file_id);
                    chunkInfo.put(column.getSubColumnByName("strongHash").getValue(), ids);
                    chunksLoaded.put(weakHash, chunkInfo);
                } else {
                    Map<String, ChunkIDs> strongHashSet = chunksLoaded.get(weakHash);
                    ChunkIDs ids = new ChunkIDs();
                    ids.setChunkID(String.valueOf(loaded));
                    ids.setFileID(file_id);
                    strongHashSet.put(column.getSubColumnByName("strongHash").getValue(), ids);
                }
                loaded++;
            }
            log.info("Last chunk loaded: " + loaded);
        }
        return chunksLoaded;
    }

    /** Iterates over the whole row of the file and logs each column value (debug helper) */
    public void getAllChunks(String file_id) {
        SliceQuery<String, String, String> query = HFactory.createSliceQuery(keyspaceOperator,
                StringSerializer.get(), StringSerializer.get(), StringSerializer.get())
                .setKey(file_id).setColumnFamily("Chunks");
        ColumnSliceIterator<String, String, String> iterator =
                new ColumnSliceIterator<String, String, String>(query, null, "\uFFFF", false);
        while (iterator.hasNext()) {
            log.debug(iterator.next().getValue());
        }
    }

    /** Returns the amount of bytes actually stored for the file, i.e. the sum of the lengths of the chunks that have content */
    public long getSpaceOccupiedByTheFile(String owner, String filename) {
        // --------------- retrieving the id ---------------
        UserFilesDaoOperations ufdo = new UserFilesDaoOperations("TestCluster", "Dedupeer");
        HColumn<String, String> columnFileID = ufdo.getValues(owner, filename).get().getSubColumnByName("file_id");
        String fileID = columnFileID.getValue();
        // --------------------------------------------------

        SuperSliceQuery<String, String, String, String> query =
                HFactory.createSuperSliceQuery(keyspaceOperator, stringSerializer, stringSerializer,
                        stringSerializer, stringSerializer);
        query.setColumnFamily("Chunks");
        query.setKey(fileID);

        long chunksLoaded = 0, bytesStored = 0, i = 0, count = ufdo.getChunksCount(owner, filename);
        QueryResult<SuperSlice<String, String, String>> result;
        do {
            // Queries the next 20 chunk numbers; i is advanced past the last name queried,
            // so consecutive batches do not overlap
            String[] columnNames = new String[20];
            for (int k = 0; k < columnNames.length; k++) {
                columnNames[k] = String.valueOf(i++);
            }
            query.setColumnNames(columnNames);
            result = query.execute();
            for (HSuperColumn<String, String, String> chunk : result.get().getSuperColumns()) {
                if (chunk.getSubColumnByName("content") != null) { // chunk with content
                    bytesStored += Integer.parseInt(chunk.getSubColumnByName("length").getValue());
                }
                if (feedback != null) {
                    feedback.updateProgress((int) Math.ceil((((double) i) * 100) / count));
                }
                chunksLoaded++;
            }
            log.info("Calculating economy of \"" + filename + "\": " + chunksLoaded);
        } while (result.get().getSuperColumns().size() != 0);
        return bytesStored;
    }

    /** Retrieves the byte ranges of the file that are stored with their own content (not deduplicated), merged into contiguous areas */
    public List<Range> getAreasModified(String owner, String filename) {
        // --------------- retrieving the id ---------------
        UserFilesDaoOperations ufdo = new UserFilesDaoOperations("TestCluster", "Dedupeer");
        HColumn<String, String> columnFileID = ufdo.getValues(owner, filename).get().getSubColumnByName("file_id");
        String fileID = columnFileID.getValue();
        // --------------------------------------------------

        ArrayList<Range> chunksPosition = new ArrayList<Range>();
        SuperSliceQuery<String, String, String, String> query =
                HFactory.createSuperSliceQuery(keyspaceOperator, stringSerializer, stringSerializer,
                        stringSerializer, stringSerializer);
        query.setColumnFamily("Chunks");
        query.setKey(fileID);

        long i = 0, chunksLoaded = 0, index = 0, count = ufdo.getChunksCount(owner, filename);
        int percent = 0;
        QueryResult<SuperSlice<String, String, String>> result;
        do {
            // Queries the next 20 chunk numbers; i is advanced past the last name queried,
            // so consecutive batches do not overlap
            String[] columnNames = new String[20];
            for (int k = 0; k < columnNames.length; k++) {
                columnNames[k] = String.valueOf(i++);
            }
            query.setColumnNames(columnNames);
            result = query.execute();
            for (HSuperColumn<String, String, String> chunk : result.get().getSuperColumns()) {
                if (chunk.getSubColumnByName("length") != null) { // chunk with content
                    index = Long.parseLong(chunk.getSubColumnByName("index").getValue());
                    chunksPosition.add(new Range(index,
                            index + Long.parseLong(chunk.getSubColumnByName("length").getValue())));
                }
                chunksLoaded++;
            }
            if (feedback != null) {
                percent = (int) Math.ceil((((double) chunksLoaded) * 100) / count);
                feedback.updateProgress(percent);
            }
            log.info("Analyzing " + filename + ": " + chunksLoaded);
        } while (result.get().getSuperColumns().size() != 0);
        return mergeAreas(chunksPosition);
    }

    /**
     * Joins adjacent areas, merging every range whose initial index equals the final index of the
     * previous range. For instance, the ranges (0,10) and (10,14) become (0,14).
     * @param ranges Chunk positions to merge
     * @return Merged areas
     */
    private static List<Range> mergeAreas(List<Range> ranges) {
        // TODO Change to an interval tree for better performance
        List<Range> result = new ArrayList<Range>();
        if ((ranges != null) && (ranges.size() > 0)) {
            Collections.sort(ranges);
            result.add(new Range(ranges.get(0).getInitialValue(), ranges.get(0).getFinalValue()));
            for (int i = 1; i < ranges.size(); i++) {
                if (result.get(result.size() - 1).getFinalValue() == ranges.get(i).getInitialValue()) {
                    result.get(result.size() - 1).setFinalValue(ranges.get(i).getFinalValue());
                } else {
                    result.add(new Range(ranges.get(i).getInitialValue(), ranges.get(i).getFinalValue()));
                }
            }
        }
        return result;
    }

    /** Retrieves every super column (chunk) of the file that stores its own content */
    public Vector<QueryResult<HSuperColumn<String, String, String>>> getAllValuesWithContent(String owner, String filename) {
        SuperColumnQuery<String, String, String, String> superColumnQuery =
                HFactory.createSuperColumnQuery(keyspaceOperator, stringSerializer, stringSerializer,
                        stringSerializer, stringSerializer);
        Vector<QueryResult<HSuperColumn<String, String, String>>> result =
                new Vector<QueryResult<HSuperColumn<String, String, String>>>();

        // --------------- retrieving the id ---------------
        UserFilesDaoOperations ufdo = new UserFilesDaoOperations("TestCluster", "Dedupeer");
        HColumn<String, String> columnFileID = ufdo.getValues(owner, filename).get().getSubColumnByName("file_id");
        String fileID = columnFileID.getValue();
        // --------------------------------------------------

        long count = ufdo.getChunksCount(owner, filename);
        for (int i = 0; i < count; i++) {
            superColumnQuery.setColumnFamily("Chunks").setKey(fileID).setSuperName(String.valueOf(i));
            QueryResult<HSuperColumn<String, String, String>> column = superColumnQuery.execute();
            if (column.get().getSubColumnByName("content") != null) {
                result.add(column);
            }
            if (feedback != null) {
                feedback.updateProgress((int) (Math.ceil((((double) i) * 100) / count)));
            }
        }
        return result;
    }

    /**
     * Retrieves super columns (chunks) that store their own content, scanning from initialChunk
     * past deduplicated chunks until amountOfChunks chunks with content have been collected.
     */
    public Vector<QueryResult<HSuperColumn<String, String, String>>> getValuesWithContent(String owner,
            String fileID, long chunksCount, long initialChunk, long amountOfChunks) {
        SuperColumnQuery<String, String, String, String> superColumnQuery =
                HFactory.createSuperColumnQuery(keyspaceOperator, stringSerializer, stringSerializer,
                        stringSerializer, stringSerializer);
        Vector<QueryResult<HSuperColumn<String, String, String>>> result =
                new Vector<QueryResult<HSuperColumn<String, String, String>>>();

        long i = initialChunk;
        // chunksCount bounds the scan so that it stops at the last chunk of the file
        while (result.size() < amountOfChunks && i < chunksCount) {
            superColumnQuery.setColumnFamily("Chunks").setKey(fileID).setSuperName(String.valueOf(i));
            QueryResult<HSuperColumn<String, String, String>> column = superColumnQuery.execute();
            if (column.get().getSubColumnByName("content") != null) {
                result.add(column);
            }
            i++;
        }
        return result;
    }

    /** Retrieves a slice of up to 20 super columns (chunks) of the file, starting at initialChunk */
    public QueryResult<SuperSlice<String, String, String>> getChunksByRange(String owner, String fileID,
            long chunksCount, long initialChunk) {
        SuperSliceQuery<String, String, String, String> query =
                HFactory.createSuperSliceQuery(keyspaceOperator, stringSerializer, stringSerializer,
                        stringSerializer, stringSerializer);
        query.setColumnFamily("Chunks");
        query.setKey(fileID);

        long i = initialChunk;
        String[] columnNames = new String[20];
        for (int k = 0; k < columnNames.length; k++) {
            columnNames[k] = String.valueOf(i++);
        }
        query.setColumnNames(columnNames);

        QueryResult<SuperSlice<String, String, String>> result = query.execute();
        return result;
    }

    /**
     * Retrieves the super columns (chunks) of the file that do not store content, i.e. deduplicated
     * chunks that only reference a chunk of another file through the pfile/pchunk sub-columns
     * @param owner The owner of the file
     * @param filename The name of the file
     * @param initialChunk The chunk number where the scan starts
     * @param amountOfChunks The amount of chunks without content to retrieve
     * @return The SuperColumns without the content column
     */
    public Vector<QueryResult<HSuperColumn<String, String, String>>> getValuesWithoutContent(String owner,
            String filename, long initialChunk, long amountOfChunks) {
        SuperColumnQuery<String, String, String, String> superColumnQuery =
                HFactory.createSuperColumnQuery(keyspaceOperator, stringSerializer, stringSerializer,
                        stringSerializer, stringSerializer);
        Vector<QueryResult<HSuperColumn<String, String, String>>> result =
                new Vector<QueryResult<HSuperColumn<String, String, String>>>();

        // --- retrieving the id ----
        UserFilesDaoOperations ufdo = new UserFilesDaoOperations("TestCluster", "Dedupeer");
        HColumn<String, String> columnFileID = ufdo.getValues(owner, filename).get().getSubColumnByName("file_id");
        String fileID = columnFileID.getValue();
        // --------------------------

        long i = initialChunk;
        while (result.size() < amountOfChunks) {
            superColumnQuery.setColumnFamily("Chunks").setKey(fileID).setSuperName(String.valueOf(i));
            QueryResult<HSuperColumn<String, String, String>> column = superColumnQuery.execute();
            if (column.get().getSubColumnByName("pfile") != null) {
                result.add(column);
            }
            i++;
        }
        return result;
    }

    /** Closes the connection with the cluster */
    public void close() {
        cluster.getConnectionManager().shutdown();
    }
}
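/*
 * Minimal usage sketch, assuming a Cassandra node at localhost:9160 with the "Dedupeer" keyspace
 * and the "Chunks"/"UserFiles" column families already created; the IDs, hashes and file names
 * below are hypothetical, and the Thrift-generated Chunk struct is filled through its fields,
 * since this class reads those fields directly:
 *
 *   ChunksDaoOperations chunksDao = new ChunksDaoOperations("TestCluster", "Dedupeer");
 *
 *   Chunk chunk = new Chunk();
 *   chunk.fileID = "42";
 *   chunk.chunkNumber = "0";
 *   chunk.strongHash = "a94a8fe5ccb19ba61c4c0873d391e987";
 *   chunk.weakHash = "123456";
 *   chunk.index = "0";
 *   chunk.length = "1024";
 *   chunk.content = java.nio.ByteBuffer.wrap(new byte[1024]);
 *   chunksDao.insertRow(chunk);
 *
 *   long bytesStored = chunksDao.getSpaceOccupiedByTheFile("owner", "example.txt");
 *   chunksDao.close();
 */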