package ch.usi.da.smr; /* * Copyright (c) 2013 Università della Svizzera italiana (USI) * * This file is part of URingPaxos. * * URingPaxos is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * URingPaxos is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with URingPaxos. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.net.InetAddress; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.SortedMap; import java.util.TreeMap; import org.apache.log4j.Logger; import ch.usi.da.paxos.Util; import ch.usi.da.paxos.message.Control; import ch.usi.da.paxos.message.ControlType; import ch.usi.da.smr.message.Command; import ch.usi.da.smr.message.CommandType; import ch.usi.da.smr.message.Message; import ch.usi.da.smr.recovery.DfsRecovery; import ch.usi.da.smr.recovery.RecoveryInterface; import ch.usi.da.smr.recovery.SnapshotWriter; import ch.usi.da.smr.transport.ABListener; import ch.usi.da.smr.transport.ABSender; import ch.usi.da.smr.transport.Receiver; import ch.usi.da.smr.transport.UDPSender; /** * Name: Replica<br> * Description: <br> * * Creation date: Mar 12, 2013<br> * $Id$ * * @author Samuel Benz benz@geoid.ch */ public class Replica implements Receiver { static { // get hostname and pid for log file name String host = "localhost"; try { Process proc = Runtime.getRuntime().exec("hostname"); BufferedInputStream in = new BufferedInputStream(proc.getInputStream()); proc.waitFor(); byte [] b = new byte[in.available()]; in.read(b); in.close(); host = new String(b).replace("\n",""); } catch (IOException | InterruptedException e) { } int pid = 0; try { pid = Integer.parseInt((new File("/proc/self")).getCanonicalFile().getName()); } catch (NumberFormatException | IOException e) { } System.setProperty("logfilename", "L" + host + "-" + pid + ".log"); } private final static Logger logger = Logger.getLogger(Replica.class); public final int nodeID; public String token; private final PartitionManager partitions; private volatile int min_token; private volatile int max_token; // if min_token > max_token : replica serves whole key space private final UDPSender udp; private final ABListener ab; private final InetAddress ip = Util.getHostAddress(); private volatile SortedMap<String,byte[]> db; private volatile Map<Integer,Long> exec_instance = new HashMap<Integer,Long>(); private long exec_cmd = 0; private int snapshot_modulo = 0; // disabled private final boolean use_thrift = false; RecoveryInterface stable_storage; private volatile boolean recovery = false; private volatile boolean active_snapshot = false; public Replica(String token, int ringID, int nodeID, int snapshot_modulo, String zoo_host) throws Exception { this.nodeID = nodeID; this.token = token; this.snapshot_modulo = snapshot_modulo; this.partitions = new PartitionManager(zoo_host); partitions.init(); setPartition(partitions.register(nodeID, ringID, ip, token)); udp = new UDPSender(); if(use_thrift){ ab = partitions.getThriftABListener(ringID,nodeID); }else{ ab = partitions.getRawABListener(ringID,nodeID); } db = new TreeMap<String,byte[]>(); //stable_storage = new HttpRecovery(partitions); stable_storage = new DfsRecovery(nodeID,token,"/tmp/smr",partitions); } public void setPartition(Partition partition){ logger.info("Replica update partition " + partition); min_token = partition.getLow(); max_token = partition.getHigh(); } public void start(){ partitions.registerPartitionChangeNotifier(this); // install old state //FIXME: disabled recovery! exec_instance = load(); // start listening ab.registerReceiver(this); if(min_token > max_token){ logger.info("Replica start serving partition " + token + ": whole key space"); }else{ logger.info("Replica start serving partition " + token + ": " + min_token + "->" + max_token); } Thread t = new Thread((Runnable) ab); t.setName("ABListener"); t.start(); /*Thread c = new Thread("Experiemnt controller"){ @Override public void run(){ try { if(nodeID == 5){ //re-partitioning: split group Thread.sleep(35000); //!!!!!!!!!!!! Edit also RawABListener to set correct replication group !!!!!!!!!!!! int oldRing = 1; int newRing = 2; int groupID = 2; token = "7FFFFFFF"; // add ring 2 ABSender old_sender = partitions.getThriftABSender(oldRing,2); ABSender new_sender = partitions.getThriftABSender(newRing,2); //String c = "s," + groupID + "," + newRing; Control c = new Control(1,ControlType.Subscribe,groupID,newRing); Message m = new Message(1,ip + ";" + 8000,"",null); m.setControl(c); old_sender.abroadcast(m); new_sender.abroadcast(m); Thread.sleep(5000); // register new partition setPartition(partitions.register(nodeID, newRing, ip, token)); //Thread.sleep(4000); // remove ring 1 c = new Control(2,ControlType.Unsubscribe,groupID,oldRing); m = new Message(1,ip + ";" + 8000,"",null); m.setControl(c); new_sender.abroadcast(m); } } catch (Exception e) { } } }; c.start();*/ } public void close(){ ab.close(); stable_storage.close(); partitions.deregister(nodeID,token); } @Override public void receive(Message m) { logger.debug("Replica received ring " + m.getRing() + " instnace " + m.getInstnce() + " (" + m + ")"); // skip already executed commands /*FIXME: disabled recovery!if(m.getInstnce() <= exec_instance.get(m.getRing())){ return; }*/ if(m.isSkip() || m.isSetControl()){ // skip skip-instances exec_instance.put(m.getRing(),m.getInstnce()); return; } List<Command> cmds = new ArrayList<Command>(); // recover if a not ascending instance arrives /*FIXME: disabled recovery! if(m.getInstnce()-1 != exec_instance.get(m.getRing())){ while(m.getInstnce()-1 > exec_instance.get(m.getRing())){ logger.info("Replica start recovery: " + exec_instance.get(m.getRing()) + " to " + (m.getInstnce()-1)); exec_instance = load(); } }*/ // write snapshot exec_cmd++; if(snapshot_modulo > 0 && exec_cmd % snapshot_modulo == 0){ async_checkpoint(); } synchronized(db){ byte[] data; for(Command c : m.getCommands()){ switch(c.getType()){ case PUT: db.put(c.getKey(),c.getValue()); if(db.containsKey(c.getKey())){ Command cmd = new Command(c.getID(),CommandType.RESPONSE,c.getKey(),"OK".getBytes()); cmds.add(cmd); }else{ Command cmd = new Command(c.getID(),CommandType.RESPONSE,c.getKey(),"FAIL".getBytes()); cmds.add(cmd); } break; case DELETE: if(db.remove(c.getKey()) != null){ Command cmd = new Command(c.getID(),CommandType.RESPONSE,c.getKey(),"OK".getBytes()); cmds.add(cmd); }else{ Command cmd = new Command(c.getID(),CommandType.RESPONSE,c.getKey(),"FAIL".getBytes()); cmds.add(cmd); } break; case GET: data = db.get(c.getKey()); if(data != null){ Command cmd = new Command(c.getID(),CommandType.RESPONSE,c.getKey(),data); cmds.add(cmd); }else{ Command cmd = new Command(c.getID(),CommandType.RESPONSE,c.getKey(),null); cmds.add(cmd); } break; case GETRANGE: // key range (token range not implemented) /* Inspired by the Cassandra API: The semantics of start keys and tokens are slightly different. Keys are start-inclusive; tokens are start-exclusive. Token ranges may also wrap -- that is, the end token may be less than the start one. Thus, a range from keyX to keyX is a one-element range, but a range from tokenY to tokenY is the full ring (one exception is if keyX is mapped to the minimum token, then the range from keyX to keyX is the full ring). Attribute Description start_key The first key in the inclusive KeyRange. end_key The last key in the inclusive KeyRange. start_token The first token in the exclusive KeyRange. end_token The last token in the exclusive KeyRange. count The total number of keys to permit in the KeyRange. */ String start_key = c.getKey(); String end_key = new String(c.getValue()).split(";")[0]; int count = c.getCount(); logger.info("getrange " + start_key + " -> " + end_key + " (" + MurmurHash.hash32(start_key) + "->" + MurmurHash.hash32(end_key) + ")"); //logger.debug("tailMap:" + db.tailMap(start_key).keySet() + " count:" + count); int msg = 0; int msg_size = 0; for(Entry<String,byte[]> e : db.tailMap(start_key).entrySet()){ if(msg >= count || (!end_key.isEmpty() && e.getKey().compareTo(end_key) > 0)){ break; } if(msg_size >= 50000){ break; } // send by UDP Command cmd = new Command(c.getID(),CommandType.RESPONSE,e.getKey(),e.getValue()); msg_size += e.getValue().length; cmds.add(cmd); msg++; } if(msg == 0){ Command cmd = new Command(c.getID(),CommandType.RESPONSE,"",null); cmds.add(cmd); } // signal partitions.singal(token,c); // wait until signal from every involved partition boolean ret = partitions.waitSignal(c); if(ret != true){ cmds.clear(); } break; default: System.err.println("Receive RESPONSE as Command!"); break; } } } exec_instance.put(m.getRing(),m.getInstnce()); int msg_id = MurmurHash.hash32(m.getInstnce() + "-" + token); Message msg = new Message(msg_id,token,m.getFrom(),cmds); //logger.debug("Send UDP: " + msg); udp.send(msg); } public Map<Integer,Long> load(){ try{ return stable_storage.installState(token,db); }catch(Exception e){ if(!exec_instance.isEmpty()){ return exec_instance; }else{ // init to 0 Map<Integer,Long> instances = new HashMap<Integer,Long>(); instances.put(partitions.getGlobalRing(),0L); for(Partition p : partitions.getPartitions()){ instances.put(p.getRing(),0L); } return instances; } } } public boolean sync_checkpoint(){ if(stable_storage.storeState(exec_instance,db)){ try { for(Entry<Integer, Long> e : exec_instance.entrySet()){ ab.safe(e.getKey(),e.getValue()); } logger.info("Replica checkpointed up to instance " + exec_instance); return true; } catch (Exception e) { logger.error(e); } } return false; } public boolean async_checkpoint(){ if(!active_snapshot){ active_snapshot = true; // shallow copy Map<Integer,Long> old_exec_instance = new HashMap<Integer,Long>(exec_instance); SortedMap<String,byte[]> old_db = new TreeMap<String,byte[]>(db); // deep copy /* Map<Integer,Long> old_exec_instance = new HashMap<Integer,Long>(); for(Entry<Integer,Long> e : exec_instance.entrySet()){ old_exec_instance.put(new Integer(e.getKey()),new Long(e.getValue())); } SortedMap<String,byte[]> old_db = new TreeMap<String,byte[]>(); for(Entry<String,byte[]> e : db.entrySet()){ old_db.put(new String(e.getKey()),Arrays.copyOf(e.getValue(),e.getValue().length)); } old_db.putAll(db); */ Thread t = new Thread(new SnapshotWriter(this,old_exec_instance,old_db,stable_storage,ab)); t.start(); }else{ logger.info("Async checkpoint supressed since other active!"); } return true; } public void setActiveSnapshot(boolean b){ active_snapshot = b; } public boolean getRecovery(){ return recovery; } /** * Do not accept commands until you know you have recovered! * * The commands are queued in the learner itself. * */ @Override public boolean is_ready(Integer ring, Long instance) { if(instance <= exec_instance.get(ring)+1){ if(recovery == true){ recovery = false; logger.info("Recovery set false."); } return true; } if(recovery == false){ recovery = true; Thread t = new Thread(){ @Override public void run() { logger.info("Replica starts recovery thread."); while(getRecovery()){ exec_instance = load(); try { Thread.sleep(10000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); break; } } logger.info("Recovery thread stopped."); } }; t.setName("Recovery"); t.start(); } return false; } public static boolean newerState(Map<Integer, Long> nstate, Map<Integer, Long> state) { for(Entry<Integer, Long> e : state.entrySet()){ long i = e.getValue(); if(i > 0){ long ni = nstate.get(e.getKey()); if(ni > i){ return true; } } } return false; } /** * @param args */ public static void main(String[] args) { String zoo_host = "127.0.0.1:2181"; int snapshot = 0; if (args.length > 2) { zoo_host = args[2]; } if (args.length > 1) { snapshot = Integer.parseInt(args[1]); } if (args.length < 1) { System.err.println("Plese use \"Replica\" \"ringID,nodeID,Token\" [snapshot_modulo] [zookeeper host]"); } else { String[] arg = args[0].split(","); final int nodeID = Integer.parseInt(arg[1]); final int ringID = Integer.parseInt(arg[0]); final String token = arg[2]; try { final Replica replica = new Replica(token,ringID,nodeID,snapshot,zoo_host); Runtime.getRuntime().addShutdownHook(new Thread("ShutdownHook"){ @Override public void run(){ replica.close(); } }); replica.start(); BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); in.readLine(); replica.close(); } catch (Exception e) { e.printStackTrace(); System.exit(1); } } } }