/* * The contents of this file are subject to the terms * of the Common Development and Distribution License * (the License). You may not use this file except in * compliance with the License. * * You can obtain a copy of the License at * http://www.sun.com/cddl/cddl.html or * install_dir/legal/LICENSE * See the License for the specific language governing * permission and limitations under the License. * * When distributing Covered Code, include this CDDL * Header Notice in each file and include the License file * at install_dir/legal/LICENSE. * If applicable, add the following below the CDDL Header, * with the fields enclosed by brackets [] replaced by * your own identifying information: * "Portions Copyrighted [year] [name of copyright owner]" * * Copyright 2009 Sun Microsystems Inc. All Rights Reserved */ package com.sun.hadoop.harness; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Calendar; import java.util.Iterator; import java.util.List; import java.util.StringTokenizer; import java.util.Vector; import java.util.logging.Logger; /** * * @author Damien Cooke * The intention here is to read a line from the log and pop it into one of these objects then have the object * put into a container class for later reteieval. */ public class MRLogParser { static Logger logger = Logger.getLogger(MRLogParser.class.getName()); String runID = null; String hostname = null; /* * method processes a single directory by retrieving each logfile and disseminating the lines into descrete objects for processing by specialised classes by type. * @param baseDirectory, path to the base directory to locate the log files in. * @param runID, FABAN supplied run identifier so we put the results in the right place * @return collection of TimeCapture objects or null if there was an error */ public Vector<TimeCapture> processDirctory(final String baseDirectory, final String runID) { Vector<TimeCapture> capturedStats = new Vector<TimeCapture>(); Calendar startTime = Calendar.getInstance(); String logLocation = null; String runName = null; if((baseDirectory != null)&&(baseDirectory.length() > 0)) { StringTokenizer st = new StringTokenizer(baseDirectory); if(st.hasMoreTokens()) { hostname = st.nextToken(); } logLocation = baseDirectory; }else { logger.warning("MRLogParser.processDirctory() Bogus baseDirectory passed in" + baseDirectory); Vector<TimeCapture> returnTime = null; return returnTime; } if((runID != null)&&(runID.length() > 0)) { runName = runID; }else { logger.warning("MRLogParser.processDirctory() Bogus runID passed in" + runID); Vector<TimeCapture> returnTime = null; return returnTime; } File logsDirectory = new File(baseDirectory); String[] taskIDList = null; if(logsDirectory.isDirectory()) { //we now have a list of the tasks to be examined. taskIDList = logsDirectory.list(); }else { logger.warning("MRLogParser.processDirctory() logsDirectory passed in is not a directory: "+ baseDirectory); Vector<TimeCapture> returnTime = null; return returnTime; } Arrays.sort(taskIDList); List<String> taskList = Arrays.asList(taskIDList); //now create two arrays one for the reducers and one for the mappers ArrayList<String> mapperList = new ArrayList<String>(); ArrayList<String> reducerList = new ArrayList<String>(); //only capture the files that are for this run. //taskList.trimToSize(); String lastDirectory; //if it is the first time it has been run since reboot we need to grab the first one //logger.warning("taskList size = "+ taskList.size()); if(taskList.size() == 1) { lastDirectory = taskList.get(taskList.size()); }else //else we get the last but 1 { lastDirectory = taskList.get(taskList.size() -1); } String[] directoryPattern = lastDirectory.split("_"); lastDirectory = directoryPattern[1]; directoryPattern = null; //logger.warning("Directory patten found = "+directoryPattern[1]); String mapredTask; for(Iterator<String> sit = taskList.iterator(); sit.hasNext();) { while(sit.hasNext()) { mapredTask = sit.next(); if(mapredTask.contains(lastDirectory) == true )//only include items from the last run { if(mapredTask.contains("_m_") == true ) { //put the mappers in their own container mapperList.add(mapredTask); }else if(mapredTask.contains("_r_") == true ) { //put the reducers in their own container reducerList.add(mapredTask); } } } } taskList = null; mapredTask = null; logger.info("Mapper list contains "+mapperList.size()); logger.info("Reducer List contains "+reducerList.size()); TimeCapture timeConsumedinMap = calculateMRTime(mapperList, baseDirectory); TimeCapture timeConsumedinRed = calculateMRTime(reducerList, baseDirectory); TimeCapture timeConsumedinShuffle = calculateShuffleTime(reducerList, baseDirectory); TimeCapture timeConsumedinReduceWrite = calculateWriteTime(reducerList, baseDirectory); logger.info("Mappers combined time taken = "+timeConsumedinMap.getTotalAcumulatedTime()/1000 + " seconds Starting @" +timeConsumedinMap.getFirstRecord().getTime().toString()); logger.info("Reducers combined time taken = "+timeConsumedinRed.getTotalAcumulatedTime()/1000 + " seconds Starting @" +timeConsumedinRed.getFirstRecord().getTime().toString()); logger.info("Shuffle combined time taken = "+timeConsumedinShuffle.getTotalAcumulatedTime() + " milliseconds Starting @" +timeConsumedinShuffle.getFirstRecord().getTime().toString()); logger.info(" Reduce write combined time taken = "+timeConsumedinReduceWrite.getTotalAcumulatedTime() + " miliseconds Starting @" +timeConsumedinReduceWrite.getFirstRecord().getTime().toString()); Calendar finishTime = Calendar.getInstance(); logger.info("Time taken to process logs = " + (finishTime.getTimeInMillis() - startTime.getTimeInMillis()) + " milliseconds"); //set the tags so we do not care for the order they are written into the vector timeConsumedinMap.setTag("MAP"); timeConsumedinRed.setTag("REDUCE"); timeConsumedinShuffle.setTag("SHUFFLE"); timeConsumedinReduceWrite.setTag("WRITE"); //copy the results into the vector for returning. capturedStats.add((TimeCapture)timeConsumedinMap); capturedStats.add((TimeCapture)timeConsumedinRed); capturedStats.add((TimeCapture)timeConsumedinShuffle); capturedStats.add((TimeCapture)timeConsumedinReduceWrite); return capturedStats; } /* * Reades a file of log entries into a vector of LogEntry objects * @param fileToRead file whoese contents need to read into the Vector * @return collection of type LogEntry for further processing */ public Vector<LogEntry> jobLogFileReader(final String fileToRead) { Vector<LogEntry> logEntries = new Vector<LogEntry>(); try { BufferedReader logReader = new BufferedReader(new FileReader(fileToRead)); String line = logReader.readLine(); LogEntry entry; while((line != null)&&(line.length() != 0)) { entry = parseLine(line); if(entry != null) { logEntries.add((LogEntry)entry); } line = logReader.readLine(); } logReader.close(); line = null; }catch(IOException ioe) { logger.warning("Some file IO error in jobLogFileReader occured with file fileToRead : "+ioe.getMessage()); Vector<LogEntry>returnVector = null; return returnVector; } return (Vector<LogEntry>)logEntries; } /* * Calculates the time spent writing in the reduce phase of the process * @param reduceList an ArrayList of directories holdiing files to process * @param baseDirectory holds the location of the base directory for these directories * @return TimeCapture object encapsulating processing time data for further processing * */ public TimeCapture calculateWriteTime(final ArrayList<String> reduceList, final String baseDirectory) { boolean isFirst = true; TimeCapture timeConsumed = new TimeCapture(); for(Iterator<String> it = reduceList.iterator();it.hasNext();) { Vector<LogEntry> reduceLogEntries = new Vector<LogEntry>(); reduceLogEntries = (Vector<LogEntry>)jobLogFileReader(baseDirectory+"/"+it.next()+"/syslog"); if(reduceLogEntries == null) //something failed here { logger.warning("taskLogEntries artificially empty in calculateWriteTime"); TimeCapture returnTime = null; return returnTime; }else { Calendar start = null; Calendar end = null; for(Iterator<LogEntry> itle = reduceLogEntries.iterator();itle.hasNext();) { LogEntry logentry = itle.next(); if(logentry.getDate() == null) { logger.info("date is null"); TimeCapture returnTime = null; return returnTime; }else { //logger.info(logentry.getMessage()); if(logentry.getMessage().contains("commiting") == true) { start = logentry.getDate(); if(isFirst) { timeConsumed.setFirstRecord(start); isFirst = false; } //(logentry.getSource().compareTo("org.apache.hadoop.mapred.FileOutputCommitter") == 0) && }else if(logentry.getMessage().contains("Saved output of task") == true) { end = logentry.getDate(); timeConsumed.setTotalAcumulatedTime((end.getTimeInMillis() - start.getTimeInMillis() )); end = null; start = null; } } } } } return timeConsumed; } /* * Calculates the time spent shuffling in the reduce phase of the process * @param reduceList an ArrayList of directories holdiing files to process * @param baseDirectory holds the location of the base directory for these directories * @return TimeCapture object encapsulating processing time data for further processing * */ public TimeCapture calculateShuffleTime(final ArrayList<String> reduceList, final String baseDirectory) { TimeCapture timeConsumed = new TimeCapture(); boolean isFirst = true; for(Iterator<String> it = reduceList.iterator();it.hasNext();) { Vector<LogEntry> reduceLogEntries = new Vector<LogEntry>(); reduceLogEntries = (Vector<LogEntry>)jobLogFileReader(baseDirectory+"/"+it.next()+"/syslog"); if(reduceLogEntries == null) //something failed here { logger.warning("reduceLogEntries artificially empty in calculateShuffleTime"); TimeCapture returnTime = null; return returnTime; }else { Calendar start = null; Calendar end = null; for(Iterator<LogEntry> itle = reduceLogEntries.iterator();itle.hasNext();) { LogEntry logentry = itle.next(); if(logentry.getDate() == null) { logger.info("date is null"); TimeCapture returnTime = null; return returnTime; }else { if(logentry.getMessage().contains("Shuffling") == true) { start = logentry.getDate(); if(isFirst) { timeConsumed.setFirstRecord(start); isFirst = false; } }else if(logentry.getMessage().contains("Rec #") == true) { end = logentry.getDate(); if((start != null) && (end != null)) { timeConsumed.setTotalAcumulatedTime((end.getTimeInMillis() - start.getTimeInMillis() )); }else { //logger.info("either start or end == null"); timeConsumed.setTotalAcumulatedTime(0); } //timeConsumed = timeConsumed + (end.getTimeInMillis() - start.getTimeInMillis() ); end = null; start = null; } } } } } return timeConsumed; } /* * Calculates the map or reduce time depending on the list of files supplied * @param mrList an ArrayList of directories holdiing files to process * @param baseDirectory holds the location of the base directory for these directories * @return TimeCapture object encapsulating processing time data for further processing * */ public TimeCapture calculateMRTime(final ArrayList<String> mrList, final String baseDirectory) { boolean isFirst = true; TimeCapture timeConsumed = new TimeCapture(); for(Iterator<String> it = mrList.iterator();it.hasNext();) { Vector<LogEntry> taskLogEntries = new Vector<LogEntry>(); String currentLogDir = it.next(); taskLogEntries = (Vector<LogEntry>) jobLogFileReader(baseDirectory+"/"+currentLogDir+"/syslog"); if(taskLogEntries == null) //something failed here { logger.warning("taskLogEntries artificially empty in calculateMRTime"); //TimeCapture returnTime = null; //return returnTime; }else { for(Iterator<LogEntry> itle = taskLogEntries.iterator();itle.hasNext();) { LogEntry logentry = itle.next(); if(logentry.getDate() == null) { logger.info("date is null"); TimeCapture returnTime = null; return returnTime; } } if(taskLogEntries.isEmpty()) { logger.warning("Log file is empty: "+baseDirectory+"/"+currentLogDir+"/syslog"); }else { Calendar start = ((LogEntry)taskLogEntries.firstElement()).getDate(); if(isFirst) { timeConsumed.setFirstRecord(start); isFirst = false; } Calendar end = ((LogEntry)taskLogEntries.lastElement()).getDate(); timeConsumed.setTotalAcumulatedTime((end.getTimeInMillis() - start.getTimeInMillis() )); } } } return timeConsumed; } /* * Parse a line passed to the method looking for significant log entries * @param line Line to parse. * @return LogEntry for further processing */ public LogEntry parseLine(final String line) throws IOException { LogEntry result = null; //logger.warning("Line found = "+line); if( (line != null)&&(line.length() != 0)) { String date; String time; String loglevel; String classname; String message; String[] processedLine = line.split(" "); String[] parsedForMessages = line.split(":"); try { if(processedLine[3] == null) { logger.warning("processedLine[3] == null"); logger.warning("error located Line passed in was: "+line); } }catch(ArrayIndexOutOfBoundsException aiob) { result = null; return result; } date = processedLine[0]; time = processedLine[1]; loglevel = processedLine[2]; classname = processedLine[3]; //had to do a second split to get the messages. if(parsedForMessages.length > 3) { StringBuffer sb = new StringBuffer(); for(int i = 3; i < parsedForMessages.length; i++) { sb.append(parsedForMessages[i]); sb.append(":"); } message = sb.toString(); }else { logger.warning("There is a problem with your hadoop system!"); logger.warning("We have detected log entries that are not supposed to be there."); logger.warning("We could just ignore these but it is better that you know."); logger.warning("exception Line passed in was: "+line); logger.warning("Line examined = "+line); message = parsedForMessages[3]; } //date needs further processing String[] dateString = date.split("-"); Calendar timeStamp = Calendar.getInstance(); timeStamp.set(Calendar.YEAR,Integer.parseInt(dateString[0])); timeStamp.set(Calendar.MONTH, Integer.parseInt(dateString[1])); timeStamp.set(Calendar.DATE, Integer.parseInt(dateString[2])); String[] sec_milisecString = time.split(","); String[] timeString = sec_milisecString[0].split(":"); timeStamp.set(Calendar.HOUR_OF_DAY, Integer.parseInt(timeString[0])); timeStamp.set(Calendar.MINUTE, Integer.parseInt(timeString[1])); timeStamp.set(Calendar.SECOND, Integer.parseInt(timeString[2])); timeStamp.set(Calendar.MILLISECOND, Integer.parseInt(sec_milisecString[1])); //now create a new LogEntry to store the entry result = new LogEntry(); //we do not have the hosname or runid info at this point. result.setDate(timeStamp); result.setLogLevel(loglevel); result.setSource(classname); result.setMessage(message); if(result.getDate() == null) { logger.warning("Error in getParser date is null"); } }else { result = null; return result; } return result; } }