package com.github.martinprillard.shavadoop.master;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.net.URLDecoder;
import java.net.UnknownHostException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import com.github.martinprillard.shavadoop.master.tasktracker.TaskTracker;
import com.github.martinprillard.shavadoop.network.SSHManager;
import com.github.martinprillard.shavadoop.slave.Slave;
import com.github.martinprillard.shavadoop.util.Constant;
import com.github.martinprillard.shavadoop.util.Pair;
import com.github.martinprillard.shavadoop.util.PropReader;
import com.github.martinprillard.shavadoop.util.Util;

/**
 * @author martin prillard
 */
public class Master {

    private int portMasterDictionary;
    private int portTaskTracker;
    private int nbWorker;
    private SSHManager sm;
    private double startTime;
    private String fileToTreat;
    private List<String> workersCores;
    private double stShavadoop = 0;
    private double st = 0;
    private double totalTime;

    // dictionaries
    Map<String, HashSet<Pair>> dictionaryMapping; // worker, (host, UM_Wx file) -> to shuffling
    Map<String, String> dictionaryReducing; // idWorker, host -> to get all RM files

    /**
     * Clean and initialize the MapReduce process
     */
    public void initialize() {

        System.out.println(Constant.MODE_DEBUG);
        if (Constant.MODE_DEBUG) {
            System.out.println();
            System.out.println("Shavadoop program " + Constant.APP_VERSION);
            System.out.println();
            stShavadoop = System.currentTimeMillis();
            System.out.println(Constant.APP_DEBUG_BLOC + " Initialize and clean " + Constant.APP_DEBUG_BLOC);
            st = System.currentTimeMillis();
        }

        // initialize the SSH manager
        String hostFullMaster = null;
        try {
            hostFullMaster = InetAddress.getLocalHost().getCanonicalHostName();
        } catch (UnknownHostException e1) {
            e1.printStackTrace();
        }
        sm = new SSHManager(hostFullMaster);
        sm.initialize();

        // create / clean the res directory
        Util.initializeResDirectory(Constant.PATH_REPO_RES, true);

        // get the network's IP address
        PropReader prop = new PropReader();
        String ipFileString = prop.getPropValues(PropReader.FILE_IP_ADRESS);
        File ipFile = new File(ipFileString);
        // if no IP file is given
        if (!ipFile.exists()) {
            if (Constant.MODE_DEBUG)
                System.out.println("Generate network's IP address file : ");
            sm.generateNetworkIpAdress(prop.getPropValues(PropReader.NETWORK_IP_REGEX));
        } else {
            Constant.PATH_NETWORK_IP_FILE = ipFileString;
        }

        // get values from the properties file
        fileToTreat = prop.getPropValues(PropReader.FILE_INPUT);
        int nbWorkerMax = Integer.parseInt(prop.getPropValues(PropReader.WORKER_MAX));
        portMasterDictionary = Integer.parseInt(prop.getPropValues(PropReader.PORT_MASTER_DICTIONARY));
        portTaskTracker = Integer.parseInt(prop.getPropValues(PropReader.PORT_TASK_TRACKER));

        if (Constant.MODE_DEBUG) {
            System.out.println("Variables initialized");
            System.out.println("Get worker cores alive : ");
        }

        try {
            Constant.PATH_JAR_MASTER = URLDecoder.decode(Constant.PATH_JAR_MASTER_TODECODE, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }

        // get the workers
        workersCores = sm.getHostAliveCores(nbWorkerMax, false, true);
        nbWorker = workersCores.size();

        if (Constant.MODE_DEBUG) {
            System.out.println("Worker cores : " + workersCores);
            totalTime = (double) ((System.currentTimeMillis() - st) / 1000.0) % 60;
            System.out.println();
            System.out.println(Constant.APP_DEBUG_BLOC + " Initialize and clean in " + totalTime + " seconds " + Constant.APP_DEBUG_BLOC);
            System.out.println();
            System.out.println();
        }
    }
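
    /*
     * Pipeline overview: launchMapReduce() chains four stages:
     *   1. inputSplitting            - the master splits the input file by line or by bloc
     *   2. launchSplitMappingThreads - slaves map each split; a DictionaryManager listener
     *                                  collects dictionaryMapping from the mappers
     *   3. launchShufflingMapThreads - slaves shuffle/reduce their key groups; dictionaryReducing
     *                                  records which host holds each RM file
     *   4. assemblingFinalMaps       - the master merges every RM file into the final result file
     */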

    /**
     * Launch the MapReduce process
     */
    public void launchMapReduce() {

        if (Constant.MODE_DEBUG) {
            System.out.println(Constant.APP_DEBUG_BLOC + " MapReduce process " + Constant.APP_DEBUG_BLOC);
            startTime = System.currentTimeMillis();
            System.out.println();
        }

        // split the file : master
        if (Constant.MODE_DEBUG) {
            System.out.println(Constant.APP_DEBUG_TITLE + " Input splitting on " + fileToTreat);
            st = System.currentTimeMillis();
        }
        // split the input data
        List<String> filesToMap = inputSplitting(workersCores, fileToTreat);
        if (Constant.MODE_DEBUG) {
            totalTime = (double) ((System.currentTimeMillis() - st) / 1000.0) % 60;
            System.out.println(Constant.APP_DEBUG_TITLE + " done in " + totalTime + " seconds");
            System.out.println();
        }

        // launch the map process : master & slaves
        if (Constant.MODE_DEBUG) {
            System.out.println(Constant.APP_DEBUG_TITLE + " Split mapping");
            st = System.currentTimeMillis();
        }
        dictionaryMapping = launchSplitMappingThreads(workersCores, filesToMap);
        if (Constant.MODE_DEBUG) {
            totalTime = (double) ((System.currentTimeMillis() - st) / 1000.0) % 60;
            System.out.println("Mapping dictionary's size : " + dictionaryMapping.size());
            System.out.println(Constant.APP_DEBUG_TITLE + " done in " + totalTime + " seconds");
            System.out.println();
        }

        // launch the shuffling map process : master & slaves
        if (Constant.MODE_DEBUG) {
            System.out.println(Constant.APP_DEBUG_TITLE + " Shuffling map");
            st = System.currentTimeMillis();
        }
        try {
            dictionaryReducing = launchShufflingMapThreads(workersCores);
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (Constant.MODE_DEBUG) {
            totalTime = (double) ((System.currentTimeMillis() - st) / 1000.0) % 60;
            System.out.println(Constant.APP_DEBUG_TITLE + " done in " + totalTime + " seconds");
            System.out.println();
        }

        // assemble the final maps : master
        if (Constant.MODE_DEBUG) {
            System.out.println(Constant.APP_DEBUG_TITLE + " Assembling final maps");
            st = System.currentTimeMillis();
        }
        assemblingFinalMaps();
        if (Constant.MODE_DEBUG) {
            totalTime = (double) ((System.currentTimeMillis() - st) / 1000.0) % 60;
            System.out.println(Constant.APP_DEBUG_TITLE + " done in " + totalTime + " seconds");
            System.out.println();
            totalTime = (double) ((System.currentTimeMillis() - startTime) / 1000.0) % 60;
            System.out.println(Constant.APP_DEBUG_BLOC + " MapReduce process done in " + totalTime + " seconds " + Constant.APP_DEBUG_BLOC);
            System.out.println();
            totalTime = (double) ((System.currentTimeMillis() - stShavadoop) / 1000.0) % 60;
            System.out.println("Shavadoop program done in " + totalTime + " seconds ");
            System.out.println();
        }
    }

    /**
     * Split the original file
     * 
     * @param workers
     * @param fileToTreat
     * @return list of split files
     */
    private List<String> inputSplitting(List<String> workers, String fileToTreat) {
        List<String> filesToMap;
        if (Constant.MODE_DEBUG)
            System.out.println("Nb workers mappers : " + nbWorker + " " + workers);

        long sizeFileToTreat = new File(fileToTreat).length();
        int totalBloc;
        // split by line
        if (sizeFileToTreat < Constant.BLOC_SIZE_MIN) {
            totalBloc = Util.getFileNumberLine(fileToTreat);
        // split by bloc
        } else {
            totalBloc = (int) Math.ceil((double) sizeFileToTreat / (double) Constant.BLOC_SIZE_MIN);
        }

        // if more workers are available than blocs for the map process
        if (nbWorker > totalBloc) {
            nbWorker = totalBloc;
        }

        // split by line
        if (sizeFileToTreat < Constant.BLOC_SIZE_MIN) {
            // the rest of the division goes to the last host
            int restBlocByHost = totalBloc % nbWorker;
            // calculate the number of lines for each host
            int nbBlocByHost = (totalBloc - restBlocByHost) / nbWorker;
            if (Constant.MODE_DEBUG)
                System.out.println("Nb lines by host mapper : " + nbBlocByHost);
            if (Constant.MODE_DEBUG)
                System.out.println("Nb lines for the last host mapper : " + restBlocByHost);
            filesToMap = Util.splitByLineFile(fileToTreat, nbBlocByHost, restBlocByHost, nbWorker);
            if (Constant.MODE_DEBUG)
                System.out.println("Nb lines to treat : " + filesToMap.size());
        } else {
            // split by bloc
            filesToMap = Util.splitLargeFile(fileToTreat);
            if (Constant.MODE_DEBUG)
                System.out.println("Nb blocs (" + Constant.BLOC_SIZE_MIN + " MB) to treat : " + filesToMap.size());
        }
        return filesToMap;
    }

    /**
     * Launch a thread to execute a map on each distant computer
     * 
     * @param workersMapperCores
     * @param filesToMap
     * @return dictionary mapping
     */
    private Map<String, HashSet<Pair>> launchSplitMappingThreads(List<String> workersMapperCores, List<String> filesToMap) {
        // object to synchronize the threads
        ExecutorService es = Executors.newCachedThreadPool();
        TaskTracker ts = new TaskTracker(sm, es, portTaskTracker, String.valueOf(nbWorker), null);
        es.execute(ts);

        int sizeFilesToMap = filesToMap.size();
        if (Constant.MODE_DEBUG)
            System.out.println("Nb workers mappers : " + nbWorker);
        if (Constant.MODE_DEBUG)
            System.out.println("Nb files split : " + sizeFilesToMap);

        // dictionary
        ConcurrentHashMap<String, HashSet<Pair>> dicoMapping = new ConcurrentHashMap<String, HashSet<Pair>>();
        // listener to get the partial dictionaries from the worker mappers
        es.execute(new DictionaryManager(portMasterDictionary, sizeFilesToMap, dicoMapping));

        int idWorkerMapperCore = 0;
        // for each file to map
        for (int i = 0; i < sizeFilesToMap; i++) {
            int id = i;
            if (nbWorker <= sizeFilesToMap && id >= nbWorker) {
                // for blocs, it's sequential
                id = idWorkerMapperCore % nbWorker;
            } else {
                id = i;
            }
            String worker = workersMapperCores.get(id);
            Thread smt = new LaunchSplitMapping(sm, String.valueOf(nbWorker), worker, filesToMap.get(i), sm.getHostFull(),
                    Integer.toString(idWorkerMapperCore));
            es.execute(smt);
            ts.addTask(smt, worker, Integer.toString(idWorkerMapperCore), Slave.SPLIT_MAPPING_FUNCTION, filesToMap.get(i), null);
            ++idWorkerMapperCore;
        }

        try {
            es.awaitTermination(Constant.THREAD_MAX_LIFETIME, TimeUnit.MINUTES);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        return dicoMapping;
    }

    /**
     * Launch a thread to execute a shuffling map on each distant computer
     * 
     * @param workersCores
     * @return dictionary reducing
     * @throws IOException
     */
    private Map<String, String> launchShufflingMapThreads(List<String> workersCores) throws IOException {
        // hosts that have a reduce file to assemble
        ConcurrentHashMap<String, String> dicoReducing = new ConcurrentHashMap<String, String>();

        // object to synchronize the threads
        ExecutorService es = Executors.newCachedThreadPool();
        TaskTracker ts = new TaskTracker(sm, es, portTaskTracker, String.valueOf(nbWorker), dicoReducing);
        es.execute(ts);

        // for each key and its files, launch a shuffling map
        for (Entry<String, HashSet<Pair>> e : dictionaryMapping.entrySet()) {
            int idWorkerReducerCore = Integer.valueOf(e.getKey());
            String workerReducer = workersCores.get(idWorkerReducerCore);

            // output file
            String shufflingDictionaryFile = Constant.PATH_F_SHUFFLING_DICTIONARY + Constant.SEP_NAME_FILE + idWorkerReducerCore;
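            // list the (host, UM_Wx file) pairs belonging to this reducer, one pair per line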
            FileWriter fw = new FileWriter(shufflingDictionaryFile);
            BufferedWriter bw = new BufferedWriter(fw);
            PrintWriter write = new PrintWriter(bw);
            for (Pair p : e.getValue()) {
                write.println(p.getVal1() + Constant.SEP_CONTAINS_FILE + p.getVal2());
            }
            write.close();

            dicoReducing.put(Integer.toString(idWorkerReducerCore), workerReducer);

            // launch the shuffling map process
            Thread smt = new LaunchShufflingMap(sm, String.valueOf(nbWorker), workerReducer, shufflingDictionaryFile, sm.getHostFull(),
                    Integer.toString(idWorkerReducerCore));
            es.execute(smt);
            ts.addTask(smt, workerReducer, Integer.toString(idWorkerReducerCore), Slave.SHUFFLING_MAP_FUNCTION, shufflingDictionaryFile,
                    e.getKey());
        }

        try {
            es.awaitTermination(Constant.THREAD_MAX_LIFETIME, TimeUnit.MINUTES);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        return dicoReducing;
    }

    /**
     * Concatenate the final maps together into one result file
     */
    private void assemblingFinalMaps() {
        // final file to reduce
        String fileFinalResult = Constant.PATH_F_FINAL_RESULT;

        // get the list of files
        Set<String> listFiles = new HashSet<String>();
        for (Entry<String, String> e : dictionaryReducing.entrySet()) {
            String idWorker = e.getKey();
            String worker = e.getValue();
            String nameFileToMerge = Constant.PATH_F_REDUCING + Constant.SEP_NAME_FILE + idWorker // id worker
                    + Constant.SEP_NAME_FILE + worker; // hostname
            listFiles.add(nameFileToMerge);
        }
        if (Constant.MODE_DEBUG)
            System.out.println("Nb files to merge : " + listFiles.size());

        // concatenate the data of each file into one map
        ConcurrentHashMap<String, Integer> finalResult = new ConcurrentHashMap<String, Integer>();
        ExecutorService es = Executors.newCachedThreadPool();
        // for each file
        for (Iterator<String> it = listFiles.iterator(); it.hasNext();) {
            final String file = it.next();
            // merge the file into the final hashmap
            es.execute(new LaunchMergeFile(file, finalResult));
        }
        es.shutdown();
        try {
            es.awaitTermination(Constant.THREAD_MAX_LIFETIME, TimeUnit.MINUTES);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }

        // write the final result
        Util.writeFileFromMap(fileFinalResult, finalResult);
    }
}
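
// A minimal usage sketch, assuming a separate launcher class drives the master
// (the project's actual entry point may differ):
//
//     Master master = new Master();
//     master.initialize();       // read properties, clean the res directory, find worker cores over SSH
//     master.launchMapReduce();  // split -> map -> shuffle -> assemble the final result file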