package er.extensions.concurrency; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FilenameFilter; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.util.Hashtable; import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import er.extensions.foundation.ERXConfigurationManager; import er.extensions.foundation.ERXProperties; public class ERXJobLoadBalancer { /* * This class solves the following problem: we have a set of jobs (identified by an Id) waiting to be processed * Several worker processes are competing for jobs and we need to way to efficiently parcel out those jobs out * We want to avoid as much as possible several workers attempting to grab the same jobs and locking it * * This class will let Workers threads or processes ask for a 'JobSet', which is basically int1 modulo int2 * The worker should then process jobs whose id = int1 module int2 * * This is NOT a substitute for proper locking, as there may be temporary situations where several workers attempt * to grab the same job. Rather it ensures that, as steady state, jobs are handed out equally and adapts automatically * when either new workers are added or some workers die * * This class assumes access to a shared file system. This is where the state will be shared. * It is also able to detect crashed or deadlocked workers and will adjust accordingly * * The expected pattern is sth like: * * WorkerIdentification wid=new WorkerIdentification("ProcessInvoices", 3); * while(true) { * jobLoadBalancer().heartbeat(wid); * jobSet=jobLoadBalancer().idSpace(wid); * .. fetch N jobs whose primary key == jobset.index mod jobset.module * for (i=0; i<N; i++_) { * .. process Job i * jobLoadBalancer().heartbeat(wid); * } * * * TO DO: we probably would be able to automatically infer the Id of a given worker based on thread id + pid + host ip */ private final static String SHARED_ROOT_LOCATION = "er.extensions.ERXJobLoadBalancer.RootLocation"; private static final Logger log = LoggerFactory.getLogger(ERXJobLoadBalancer.class); /* * How old an entry in the shared state has to be before we consider its author dead */ private final static String DEFAULT_DEAD_TIMEOUT_MILLIS = "er.extensions.ERXJobLoadBalancer.DefaultDeadTimeoutMillis"; /** * Describes which jobs (index mod modulo) the worker should attempt to process */ public static class JobSet { /* * Indicate to the worke what id space (index mod modulo) they should be attempting to process */ public int _index; public int _modulo; public JobSet(int i, int m) { _index=i; _modulo=m; } @Override public String toString() { return index()+" mod "+modulo(); } public int index() { return _index; } public int modulo() { return _modulo; } } private static ERXJobLoadBalancer _instance; public static ERXJobLoadBalancer jobLoadBalancer() { if (_instance==null) { _instance=new ERXJobLoadBalancer(); } return _instance; } /** * Identifies a worker to the load balancer */ public static class WorkerIdentification { String _type; String _id; public WorkerIdentification(String t, String i) { _type = t; _id=i; } @Override public String toString() { return type()+"-"+id(); } public String id() { return _id; } public String type() { return _type; } } private String _sharedRootLocation; /** * @return the shared path where the state of the workers is stored */ public String sharedRootLocation() { if (_sharedRootLocation == null) { _sharedRootLocation = ERXProperties.stringForKeyWithDefault(SHARED_ROOT_LOCATION, ERXProperties.stringForKeyWithDefault("java.io.tmpdir", "/tmp")+"/ERXJobLoadBalancer"); } return _sharedRootLocation; } private File _sharedRoot; protected File sharedRoot() { if (_sharedRoot == null) { _sharedRoot= new File(sharedRootLocation()); } return _sharedRoot; } private Map<String, Long> _ttlsPerType = new Hashtable<>(); /** * @param type * @return the ttl for a given worker type. An instance that has not called heartbeat for more than this TTL * will be considered dead by the other instances */ public long ttlForWorkerType(String type) { // to do specify TTL per type? Long result = _ttlsPerType.get(type); if (result==null) { result = Long.valueOf(ERXProperties.longForKeyWithDefault(DEFAULT_DEAD_TIMEOUT_MILLIS, 60000L)); // 1mn by default _ttlsPerType.put(type, result); } return result.longValue(); } /** * Sets the timeout for a given worker type * @param type * @param ttl (in milliseconds) */ public void setTtlForWorkerType(String type, long ttl) { _ttlsPerType.put(type, Long.valueOf(ttl)); } protected String pathForWorkerIdentification(WorkerIdentification workerId) { return sharedRootLocation()+"/"+workerId.type()+"-"+workerId.id(); } /** * Signals to the load balancer that the worker identified is alive * Clients should call this periodically, and certainly more often than the timeout * * @param workerId which worker is alive */ public void heartbeat(WorkerIdentification workerId) { /* method used to indicate the worker # workerId (process or thread) is alive */ String pathForEntry = pathForWorkerIdentification(workerId); File entryFile = new File(pathForEntry); final File tempFile = new File(pathForEntry + "." + System.currentTimeMillis()); log.debug("Writing Entry at {}: {}", tempFile, workerId); ObjectOutputStream out=null; try { // First make sure we have a directory File parentDir=tempFile.getParentFile(); if (!parentDir.exists()) parentDir.mkdirs(); out = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(tempFile))); // First we write when the entry expires long now=System.currentTimeMillis(); // 1. now out.writeLong(now); // 2. write my Id out.writeUTF(workerId.id()); log.debug("Wrote to {}", tempFile); out.close(); out=null; tempFile.renameTo(entryFile); } catch (FileNotFoundException e) { log.error("Writing to {}", tempFile, e); } catch (IOException e2) { log.error("Writing to {}", tempFile, e2); } finally { if (out!=null) try { out.close(); } catch (IOException e) {} } } /** * @param workerId * @return the JobSet that the worker should attempt to process * Given a worker looks at the shared state and determine the id space (index mod module) they should be processing */ public JobSet idSpace(final WorkerIdentification workerId) { // heartbeat for ourselves // this will ensure we don't overcount ERXJobLoadBalancer.jobLoadBalancer().heartbeat(workerId); FilenameFilter friendsFilter=new FilenameFilter() { public boolean accept(File dir, String name) { return name.indexOf(workerId.type())==0; } }; File[] friends = sharedRoot().listFiles(friendsFilter); // now check if each is alive int aliveFriendsCount=0; int aliveFriendsWithLowerIdFound=0; long now=System.currentTimeMillis(); long ttl=ttlForWorkerType(workerId.type()); for (int i=0; i<friends.length; i++) { File friend=friends[i]; ObjectInputStream in=null; try { in = new ObjectInputStream(new FileInputStream(friend)); long entryCreationTime = in.readLong(); String friendId = in.readUTF(); if ((now-entryCreationTime)<ttl) { aliveFriendsCount++; if (friendId.compareTo(workerId.id())<0) { aliveFriendsWithLowerIdFound++; } } else { // we found a dead worker - remove his entry to keep the shared directory clean if (!friend.delete()) { log.info("Could not delete dead worker entry: {}", friend); } } } catch (FileNotFoundException e) { } catch (IOException e2) { } finally { if (in!=null) try { in.close(); } catch (IOException e) {} } } // if we end up here with 0, we must have had a pb with the file system // in this case, just count ourselves and try to process everything if (aliveFriendsCount==0) { aliveFriendsCount=1; aliveFriendsWithLowerIdFound=0; } return new JobSet(aliveFriendsWithLowerIdFound, aliveFriendsCount); } /** * @return a String suitable to identify this particular worker instance * !! this string is not MT safe */ public String workerInstanceIdentification() { return ERXConfigurationManager.defaultManager().hostName()+"-"+System.getProperty("com.webobjects.pid"); } /* * ------------------------------------------------------------------------------------------------------------- */ /* * A simple test */ public static void main(String[] args) { if (args.length<2) { usage(); System.exit(1); } String arg1=args[0]; if (arg1.equals("-createJobs")) { createJobs(args[1], Integer.parseInt(args[2])); } else if (arg1.equals("-processJobs")) { processJobs(args[1],Integer.parseInt(args[2])); } else { System.out.println("Unrecognized: "+arg1); usage(); } } public static void usage() { System.out.println("Usage -createJobs <workerType> <numberOfJobs> | -processJobs <workerType> <workerNumber>"); } public static void createJobs(String workerType, int n) { System.out.println("Creating "+n+" jobs for "+workerType); jobLoadBalancer().sharedRoot().mkdirs(); try { for (int i=0; i<n; i++) { File f=new File(jobLoadBalancer().sharedRootLocation(), "Job-"+i); f.createNewFile(); } } catch (IOException e) { System.out.println("Could not create jobs: "+e); } } public static void processJobs(String workerType, int workerNumber) { WorkerIdentification wid=new WorkerIdentification(workerType,""+workerNumber); jobLoadBalancer().setTtlForWorkerType(workerType, 20000); // 20s while(true) { jobLoadBalancer().heartbeat(wid); System.out.println("Worker number "+workerNumber+" processing jobs #"+jobLoadBalancer().idSpace(wid)); try { Thread.sleep(5000); } catch (Exception e) { System.out.println("ERXJobLoadBalancer.processJobs: " + e); } } } }