/* * This file or a portion of this file is licensed under the terms of * the Globus Toolkit Public License, found in file GTPL, or at * http://www.globus.org/toolkit/download/license.html. This notice must * appear in redistributions of this file, with or without modification. * * Redistributions of this Software, with or without modification, must * reproduce the GTPL in: (1) the Software, or (2) the Documentation or * some other similar material which is provided with the Software (if * any). * * Copyright 1999-2004 University of Chicago and The University of * Southern California. All rights reserved. */ package org.griphyn.vdl.diagnozer; import java.io.*; import java.util.*; import java.util.regex.*; import org.griphyn.vdl.directive.*; import org.griphyn.vdl.toolkit.FriendlyNudge; /** * Inspects various files in a run directory to help diagnoze an error. * * @author Jin Soon Chang * @version $Revision$ */ public class Diagnozer { /** * Contains the base directory in and underneath which all files reside. */ private File m_basedir; /** * Names the .dag file */ private File m_dagfile; /** * Maps a job ID to its submit file. */ private Map m_job; /** * Remembers job IDs from jobs that were done (successfully). */ private Set m_done; /** * Maps a job ID to a JobInfo record. */ private Map m_jobRecord; /** * Maps a parent job ID to all dependent children. */ private Map m_parents; /** * Maps a child job ID to all dependent parents. */ private Map m_children; /** * work directory */ String m_workdir; /** * any signal from user */ String m_signal; public void parseDAG( File dag ) throws IOException { String line, parent, child; StringTokenizer st = null; LineNumberReader lnr = new LineNumberReader( new FileReader(dag) ); while ( (line = lnr.readLine()) != null ) { String lower = line.toLowerCase().trim(); if ( lower.startsWith("job ") ) { // JOB ID000001 00/00/ID000001.sub [ DONE ] st = new StringTokenizer(line.trim()); st.nextToken(); // JOB String jobid = st.nextToken(); // jobid String subfn = st.nextToken(); // submit file m_job.put( jobid, subfn ); /*JobInfo job=new JobInfo(); job.m_id=jobid; m_jobRecord.put(jobid,job);*/ if ( m_done != null && st.hasMoreTokens() && st.nextToken().toLowerCase().equals("done") ) m_done.add(jobid); /* else{ JobInfos jobs=new JobInfos (jobid); JobInfo job=new JobInfo(); job.setId(jobid); job.setRetry("0"); jobs.addJobInfo(job,"0"); m_jobRecord.put(jobid,jobs); } */ } else if ( lower.startsWith("parent ") ) { // PARENT ID000093 CHILD ID000094 TreeSet parents = new TreeSet(); TreeSet children = new TreeSet(); st = new StringTokenizer(line.trim()); st.nextToken(); // PARENT do { parent = st.nextToken(); if ( parent.toLowerCase().equals("child") ) break; parents.add(parent); } while ( st.hasMoreTokens() ); while ( st.hasMoreTokens() ) { children.add( st.nextToken() ); } for ( Iterator i=parents.iterator(); i.hasNext(); ) { parent = (String) i.next(); if ( ! m_parents.containsKey(parent) ) m_parents.put( parent, new TreeSet() ); ((Set) m_parents.get(parent)).addAll(children); } for ( Iterator i=children.iterator(); i.hasNext(); ) { child = (String) i.next(); if ( ! m_children.containsKey(child) ) m_children.put( child, new TreeSet() ); ((Set) m_children.get(child)).addAll(parents); } } } lnr.close(); } /** * c'tor. */ public Diagnozer( String basedir ) throws IOException { m_basedir = new File(basedir); if ( ! m_basedir.isDirectory() ) throw new IOException( basedir + " is not a directory" ); // post-condition: itsa dir File[] dagfiles = m_basedir.listFiles(new FindTheFile(".dag")); // File[] dagfiles = m_basedir.listFiles(new FindTheRegex("\\.dag$")); if ( dagfiles.length != 1 ) throw new RuntimeException( "too many dag files in " + basedir ); m_dagfile = dagfiles[0]; m_workdir=basedir; m_job = new TreeMap(); m_parents = new TreeMap(); m_children = new TreeMap(); m_jobRecord = new HashMap(); m_signal=null; // find the rescue dag File rescuedag = new File( m_dagfile.getPath() + ".rescue" ); if ( rescuedag.exists() ) { // parse rescue dag instead m_done = new TreeSet(); parseDAG(rescuedag); } else { // parse regular dag file m_done = null; parseDAG(m_dagfile); } for ( Iterator i=m_job.keySet().iterator(); i.hasNext(); ) { String jobid = (String) i.next(); String subfn = (String) m_job.get(jobid); if(allParentsDone(jobid) && !m_done.contains(jobid)){ JobInfos jobs=new JobInfos (jobid); JobInfo job=new JobInfo(); job.setId(jobid); job.setRetry("0"); jobs.addJobInfo(job,"0"); m_jobRecord.put(jobid,jobs); } } } /** * Dumps knowledge about the DAG for debugging purposes. */ public void dump() { for ( Iterator i=m_job.keySet().iterator(); i.hasNext(); ) { String jobid = (String) i.next(); String subfn = (String) m_job.get(jobid); String done = m_done == null ? "" : ( m_done.contains(jobid) ? "is done" : "NOT done" ); System.out.println( jobid + " -> " + subfn + ": " + done ); } for ( Iterator i=m_parents.keySet().iterator(); i.hasNext(); ) { String parent = (String) i.next(); System.out.println( "PARENT " + parent + " CHILD " + m_parents.get(parent).toString() ); } } public void parseDebug( String dbgfile ) { String line; File dbg = new File(dbgfile); LineNumberReader lnr = null; try { lnr = new LineNumberReader( new FileReader(dbg) ); while ( (line = lnr.readLine()) != null ) { } lnr.close(); } catch ( IOException ioe ) { System.err.println( "Warning: Unable to read " + dbgfile ); } } public void getDebugInfo() { for ( Iterator i=m_job.keySet().iterator(); i.hasNext(); ) { String jobid = (String) i.next(); String subfn = (String) m_job.get(jobid); if(!m_done.contains(jobid) && allParentsDone(jobid)){ String dbgfile=subfn.replaceAll(".sub",""); ParseDbg(m_dagfile.getParent()+"/"+dbgfile.trim()+".dbg",jobid); } } } public void ParseDbg(String dbgfile,String jobid){ try { File dbg=null; LineNumberReader lnr=null; try { dbg=new File(dbgfile); lnr = new LineNumberReader( new FileReader(dbg) ); } catch ( FileNotFoundException fne ) { System.err.println(dbgfile+" doesn't exists"); return; } int retries=-1; String line; //20041010T140006.218 [16939] PRE: chose site "term" Pattern site = Pattern. compile(".*chose\\ssite\\s(.*)"); //20040901T184209.513 [8299] PRE: starting /home/changjs/vds/contrib/Euryale/prescript.pl Pattern retry=Pattern. compile(".*pre:\\sstarting\\s.*prescript.*"); //20041028T143022.783 [4579] PRE: server gsiftp://gainly.uchicago.edu problem: connect: Connection refused Pattern badsite = Pattern. compile(".*pre:\\s(.*)problem:\\s(.*)"); //Unable to stage-in "fmri.1129-5_anonymized.img": no replicas found at /home/changjs/vds-1.3.2/contrib/Euryale/prescript.pl line 338. Pattern fe= Pattern. compile("\\s*([a-zA-Z].*)"); //20041029T014559.358 [8844] PRE: [transfer|T2] # [0x00004002] 1 1/0 2.477s "fmri.3472-5_anonymized.warp" error: the server sent an error response: 530 530 No local mapping for Globus ID //20040901T173703.111 [30512] PRE: stage-in exit code 42, trying to replan Pattern tr=Pattern. compile(".*\\[transfer\\].*\\s(.*)\\serror:\\s(.*)|.*\\[t2\\].*\\s(.*)\\serror:\\s(.*)"); //20041104T100447.069 [4466] PRE: [T2] error: globus_ftp_client: the server responded with an error Pattern tr2=Pattern. compile(".*\\[t2\\]\\serror:\\s(.*)|.*\\[transfer\\]\\serror:\\s(.*)"); String currentSite=null; while ( (line = lnr.readLine()) != null ) { String lower = line.toLowerCase().trim(); Matcher siteM=site.matcher(lower); Matcher badsiteM=badsite.matcher(lower); Matcher feM=fe.matcher(lower); Matcher trM=tr.matcher(lower); Matcher tr2M=tr2.matcher(lower); Matcher retryM=retry.matcher(lower); if(retryM.matches()){ retries=retries+1; } if(trM.matches()){ String filename=trM.group(1); String error=trM.group(2); String currentRetry=Integer.toString(retries); String errorS="Transfer Error: "+filename+" "+error+" "+ currentSite; //((JobInfo) m_jobRecord.get(jobid)).m_fatalErrorMessages.add(errorS); ((JobInfo)((JobInfos) m_jobRecord.get(jobid)).getJobInfo(currentRetry)) .addFatalErrorMessage(errorS); } if(tr2M.matches()){ String error=tr2M.group(1); String currentRetry=Integer.toString(retries); String errorS="Transfer Error: "+error+" "+ currentSite; ((JobInfo)((JobInfos) m_jobRecord.get(jobid)).getJobInfo(currentRetry)) .addFatalErrorMessage(errorS); } if ( badsiteM.matches() ) { String server=badsiteM.group(1); String error=badsiteM.group(2); String currentRetry=Integer.toString(retries);; error=server+" " + error; //System.out.println(error); ((JobInfo)((JobInfos) m_jobRecord.get(jobid)).getJobInfo(currentRetry)) .addFatalErrorMessage(error); } else if ( feM.matches() ) { String currentRetry=Integer.toString(retries); String feMes=feM.group(1); if(m_jobRecord.get(jobid)==null){ System.out.println("dsadsad"); } if((JobInfo)((JobInfos) m_jobRecord.get(jobid)).getJobInfo(currentRetry)==null) { System.out.println("dsadsa"+currentRetry); } ((JobInfo)((JobInfos) m_jobRecord.get(jobid)).getJobInfo(currentRetry)) .addFatalErrorMessage(feMes); } else if ( siteM.matches() ) { String currentRetry=Integer.toString(retries);; currentSite=siteM.group(1); ((JobInfo)((JobInfos) m_jobRecord.get(jobid)).getJobInfo(currentRetry)) .setPool(currentSite); } else if ( lower.matches(".*out\\sof\\ssite\\scandidates.*") ) { String currentRetry=Integer.toString(retries); //20041028T143028.143 [4605] PRE: out of site candidates, giving up! ((JobInfo)((JobInfos) m_jobRecord.get(jobid)).getJobInfo(currentRetry)) .setPool(currentSite); } } } catch ( Exception e ) { e.printStackTrace(); } } public void parseDagmanOut() { try { String line; File dagmanout=new File(m_dagfile+".dagman.out"); //10/10 13:47:43 PRE Script of Job ID000003 failed with status 1 Map retries=new HashMap(); for ( Iterator i=m_jobRecord.keySet().iterator(); i.hasNext(); ) { retries.put(((String) i.next()),"0"); } Pattern exit = Pattern. compile(".+\\s(.*)\\sscript\\sof\\sjob\\s(.*)failed\\swith\\sstatus\\s(.*)"); //PRE Script of Job ID000001 completed successfully. Pattern success= Pattern. compile(".+\\s(.*)\\sscript\\sof\\sjob\\s(.*)\\scompleted\\ssuccessfully.*"); LineNumberReader lnr = new LineNumberReader( new FileReader(dagmanout) ); //10/29 01:48:02 Retrying node ID000001 (retry #1 of 5)... Pattern retry=Pattern. compile(".+\\sretrying\\snode\\s(.*)\\s\\(retry\\s#(.*)\\sof.*"); //compile(".+\\sretrying.*"); //11/4 10:33:42 Received SIGUSR1 Pattern sig= Pattern. compile(".*\\sreceived\\s(.*).*"); //10/29 01:46:17 Event: ULOG_GLOBUS_SUBMIT for Condor Job ID000001 (50119.0.0) Pattern stage=Pattern. compile(".*event:\\s(.*)\\sfor\\scondor\\sjob\\s(.*)\\s.*"); while ( (line = lnr.readLine()) != null ) { String lower = line.toLowerCase().trim(); Matcher preExitM = exit.matcher(lower); Matcher successM = success.matcher(lower); Matcher stageM=stage.matcher(lower); Matcher sigM=sig.matcher(lower); Matcher retryM=retry.matcher(lower); if(retryM.matches()){ //System.out.println(lower); String id=retryM.group(1); String retryN=retryM.group(2); if(m_jobRecord.containsKey(id.toUpperCase().trim())){ retries.put(id.trim().toUpperCase(),retryN.trim()); JobInfo j=new JobInfo(); j.setId(id.toUpperCase().trim()); j.setRetry(retryN); ((JobInfos)m_jobRecord.get(id.toUpperCase().trim())).addJobInfo(j,retryN); } } if(sigM.matches()){ String signal=sigM.group(1); m_signal=signal.toUpperCase().trim(); } if(preExitM.matches()){ String prepost=preExitM.group(1); String ID=preExitM.group(2); String exitCode=preExitM.group(3); String currentRetry=(String) retries.get(ID.toUpperCase().trim()); if(m_jobRecord.containsKey(ID.toUpperCase().trim())){ if ( prepost.equals("pre") ){ ((JobInfo)((JobInfos) m_jobRecord.get(ID.toUpperCase().trim())).getJobInfo(currentRetry)) .setPrescriptErrorCode(exitCode); ((JobInfo)((JobInfos) m_jobRecord.get(ID.toUpperCase().trim())).getJobInfo(currentRetry)) .setPostcriptErrorCode("N/A"); } if ( prepost.equals("post") ){ ((JobInfo)((JobInfos) m_jobRecord.get(ID.toUpperCase().trim())).getJobInfo(currentRetry)) .setPostcriptErrorCode(exitCode); String subfn = (String) m_job.get(ID.toUpperCase().trim()); File subDir=(new File(m_dagfile.getParent()+"/"+subfn)).getParentFile(); File outFile=new File(subDir.getPath()+ "/" +ID.toUpperCase().trim()+ ".out."+currentRetry); File errFile=new File(subDir.getPath()+ "/"+ID.toUpperCase().trim()+ ".err."+currentRetry); if(outFile==null){ System.out.println(subDir.getPath()+ ".out."+currentRetry+" doesn't exits"); } if(errFile==null){ System.out.println(subDir.getPath()+ ".err."+currentRetry+" doestn't exites"); } ParseOut(outFile,ID.toUpperCase().trim(), currentRetry); ParseError(errFile,ID.toUpperCase().trim(), currentRetry); } } } if(successM.matches()){ String prepost=successM.group(1); String jid=successM.group(2); String currentRetry=(String) retries.get(jid.trim().toUpperCase()); prepost.trim(); if(m_jobRecord.containsKey(jid.toUpperCase().trim())){ if ( prepost.equals("pre") ){ ((JobInfo)((JobInfos) m_jobRecord.get(jid.toUpperCase().trim())).getJobInfo(currentRetry)) .setPrescriptErrorCode("0"); } if ( prepost.equals("post") ){ ((JobInfo)((JobInfos) m_jobRecord.get(jid.toUpperCase().trim())).getJobInfo(currentRetry)) .setPostcriptErrorCode("0"); } } } if(stageM.matches()){ String jobID=stageM.group(2); String stageS=stageM.group(1); String currentRetry=(String)retries.get(jobID.trim().toUpperCase()); if(m_jobRecord.containsKey(jobID.toUpperCase().trim())){ ((JobInfo)((JobInfos) m_jobRecord.get(jobID.toUpperCase().trim())).getJobInfo(currentRetry)) .updateLastStage(stageS); } } } } catch ( Exception e ) { e.printStackTrace(); } } public void dumpJobRecords( PrintWriter pw ) throws IOException{ if(m_signal!=null) pw.println("THIS JOB WAS TERMINATED BY SIGNAL "+m_signal); pw.flush(); for ( Iterator i=m_jobRecord.keySet().iterator(); i.hasNext(); ) { String ID=(String) i.next() ; ((JobInfos) m_jobRecord.get(ID)).dump(pw); } pw.flush(); } public void ParseError(File errFile, String jobid, String currentRetry) { try { LineNumberReader lnr=null; String line; try { lnr = new LineNumberReader( new FileReader(errFile) ); } catch ( FileNotFoundException fne ) { //System.err.println(errFile.getName()+" doesn't exists"); return; } while ( (line = lnr.readLine()) != null ) { JobInfo job= (JobInfo) m_jobRecord.get(jobid); ((JobInfo)((JobInfos) m_jobRecord.get(jobid)).getJobInfo(currentRetry)) .addFatalErrorMessage(line); } } catch ( Exception e ) { e.printStackTrace(); } } public void ParseOut(File outFile, String jobid,String currentRetry) { try { LineNumberReader lnr=null; String line; try { lnr = new LineNumberReader( new FileReader(outFile) ); } catch ( FileNotFoundException fne ) { System.err.println(outFile.getName()+"is missing"); return; } //<data>/home/changjs/vdldemo/bin/align_warp dsadsa dsadsa dasdsa -m 12 -q //do_align_warp.c: 157: problem with file dsadsa //The specified file does not exist. (AIR_NO_FILE_READ_ERROR) //</data> ParseKickstart pks = new ParseKickstart(); String filename=outFile.getPath(); try{ ((JobInfo)((JobInfos) m_jobRecord.get(jobid)).getJobInfo(currentRetry)) .setOutfileExit(pks.parseFile(filename)); }catch (FriendlyNudge fn){ fn.toString(); }catch (ClassCastException cce){ System.out.println("class cast exception"); cce.printStackTrace(); } /* Pattern data = Pattern. compile("<data>(.*)"); //<data>/home/changjs/vdldemo/bin/align_warp dsadsa dsadsa dasdsa -m 12 -q //do_align_warp.c: 157: problem with file dsadsa //The specified file does not exist. (AIR_NO_FILE_READ_ERROR) //</data> Pattern endData=Pattern.compile(".*</data>.*"); while ( (line = lnr.readLine()) != null ) { //String lower = line.toLowerCase().trim(); Matcher dataM = data.matcher(line.trim()); Matcher endM=endData.matcher(line.trim()); if ( dataM.matches() ) { String dataS=dataM.group(1); //System.out.println(dataS); do{ //System.out.println(line); ((JobInfo)((JobInfos) m_jobRecord.get(jobid)).getJobInfo(currentRetry)) .setOutfileExit(errorS); line = lnr.readLine(); line=line.trim(); }while ( !endM.matches()) ; } }*/ } catch ( Exception e ) { e.printStackTrace(); } } /* public void getOutFileInfo() { for ( Iterator i=m_job.keySet().iterator(); i.hasNext(); ) { String jobid = (String) i.next(); String subfn = (String) m_job.get(jobid); if ( ! m_done.contains(jobid)&& ((JobInfo)m_jobRecord.get(jobid)).getLastStage()!=null) { File subDir=(new File(m_dagfile.getParent()+"/"+subfn)).getParentFile(); File[] outFiles=subDir.listFiles(new FindTheRegex(jobid+"\\.out.*")); //System.out.println("the sub "+ subDir.getPath()); if(outFiles==null) System.out.println("dsadsa"); System.out.println("out files:"+outFiles.length); for(int j=0;j<outFiles.length;++j){ System.out.println( outFiles[j].getName()); ParseOut(outFiles[j],jobid); } } } } */ private boolean allParentsDone(String cid){ TreeSet parents=new TreeSet(); parents=(TreeSet) m_children.get(cid); if(parents==null) return true; for ( Iterator j=parents.iterator(); j.hasNext(); ){ if(!m_done.contains((String)j.next())){ return false; } } return true; } /* public void getErrorFileInfo() { for ( Iterator i=m_job.keySet().iterator(); i.hasNext(); ) { String jobid = (String) i.next(); String subfn = (String) m_job.get(jobid); if ( !m_done.contains(jobid) && ((JobInfo)m_jobRecord.get(jobid)).getLastStage()!=null) { //String dbgfile=subfn.replaceAll(".sub",""); File subDir=(new File(m_dagfile.getParent()+"/"+subfn)).getParentFile(); File[] errFiles=subDir.listFiles(new FindTheRegex(jobid+"\\.err.*")); System.out.println("the sub "+ subDir.getPath()); if(errFiles==null) System.out.println("dsadsa"); System.out.println("err files:"+errFiles.length); for(int j=0;j<errFiles.length;++j){ System.out.println( errFiles[j].getName()); ParseError(errFiles[j],jobid); } } } } private String getSignal(){ return m_signal; } */ public static void main( String args[] ) { Diagnozer me = null; int result = 0; if ( args.length != 1 ) { System.err.println( "Need the base directory" ); System.exit(1); } try { me = new Diagnozer(args[0]); me.parseDagmanOut(); me.getDebugInfo(); // me.getErrorFileInfo(); // me.getOutFileInfo(); me.dumpJobRecords(new PrintWriter( System.out) ); //me.dump(); } catch ( IOException ioe ) { System.err.println( "ERROR: " + ioe.getMessage() ); result = 1; } catch ( RuntimeException rte ) { System.err.println( "RTE: " + rte.getMessage() ); rte.printStackTrace(System.err); result = 1; } catch ( Exception e ) { System.err.println( "FATAL: " + e.getMessage() ); e.printStackTrace(System.err); result = 2; } if ( result != 0 ) System.exit(result); } }