add two scripts for easier monitoring status of crab jobs #146

Open · wants to merge 3 commits into base: UL_master
358 changes: 358 additions & 0 deletions multilep/test/scripts/monitorCrabJobs.py
@@ -0,0 +1,358 @@
#!/usr/bin/env python

##############################################################
# monitor crab jobs and put the results in a summary webpage #
##############################################################

# copied and modified from Kirill
# basic how to use:
# - follow the steps here if not done so before: https://t2bwiki.iihe.ac.be/PublicWebpages
# - run this script with 'python monitorCrabJobs.py --crabdir <crab log folder>'.
#   the crab log folder should be the folder grouping all crab logs within a heavyNeutrino release;
#   it is typically given by <some CMSSW version>/src/heavyNeutrino/multilep/test/crab.
# - it might take a long time to run if there are many samples
#   (since the crab status command is rather slow),
#   so it could be a good idea to run it in a screen terminal.
# - the result is stored in the form of an html document in ~/public_html/crab_status/index.html;
#   you can view it at http://homepage.iihe.ac.be/~YOURUSERNAME/crab_status
# further notes on usage:
# - run with 'python monitorCrabJobs.py -h' for a list of all available options.
# - there are two prerequisites:
#   - this script requires a valid grid certificate (for the crab status command to work properly).
#     you can create one using 'voms-proxy-init --voms cms'.
#     optionally, you can set the location of your proxy using the '--proxy <path to your proxy>' argument.
#   - this script requires a CMSSW environment (for the crab status command to work).
#     go to a recent CMSSW_<version>/src directory and run 'cmsenv' before running this script.
# - to check that both prerequisites are satisfied, it might be a good idea to run a crab status command
#   on a single sample of your choice and see that it works properly, before running this script.
# - the default webpage name is 'crab_status' (see above), but it can be modified
#   with the '--webpage <name of webpage>' argument.
# - this script can be used to resubmit failed jobs while doing the monitoring.
#   for this, you need to add the '--resubmit True' argument (the default is False).
# - in general, crab jobs can be resubmitted up to 3 weeks after first submission (?),
#   and their status can be retrieved up to 4 weeks after first submission (?).
#   make sure to have stored a copy of the status before that deadline,
#   since after that deadline the status is no longer retrievable
#   and this script will output 'finished 0%' for each sample.
# TO DO: implement some sort of check to avoid overwriting with 'finished 0%'?
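
# a minimal example session, as a sketch only (the CMSSW version and paths are placeholders,
# adapt them to your own setup):
#   cd CMSSW_10_6_X/src && cmsenv
#   voms-proxy-init --voms cms
#   crab status -d heavyNeutrino/multilep/test/crab/<production>/<sample>/<version>   # quick sanity check
#   screen
#   python monitorCrabJobs.py --crabdir heavyNeutrino/multilep/test/crab --resubmit True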


import os, sys, glob, subprocess, pexpect, json
from datetime import datetime
from optparse import OptionParser


def style():
    ### define a fixed style string for the web page
    # only meant for internal use in web function

    s = '<style>'

    s += 'body {\n'
    s += 'margin: 0;\n'
    s += 'padding: 0;\n'
    s += 'width: 100%;\n'
    s += '}\n'

    s += 'h1 {\n'
    s += 'width: 100%;\n'
    s += 'text-align: center;\n'
    s += 'font-size:20px;\n'
    s += 'margin:0;\n'
    s += 'padding:0;\n'
    s += 'background: red;\n'
    s += 'color: #FFF;\n'
    s += 'display: inline-block;\n'
    s += '}\n'

    s += 'h3 {\n'
    s += 'width: 100%;\n'
    s += 'text-align: center;\n'
    s += 'font-size:15px;\n'
    s += 'background: #EAEDED;\n'
    s += 'margin:0;\n'
    s += 'padding:0;\n'
    s += 'display: inline-block;\n'
    s += '}\n'

    s += '.divide tr td { width:60%; }\n'

    s += '#progress {\n'
    s += 'width: 500px;\n'
    s += 'border: 1px solid black;\n'
    s += 'position: relative;\n'
    s += 'padding: 3px;\n'
    s += '}\n'

    s += '#percent {\n'
    s += 'position: absolute;\n'
    s += 'left: 10%;\n'
    s += '}\n'

    s += '#bar {\n'
    s += 'height: 20px;\n'
    s += 'background-color: green;\n'
    s += 'width: 30%;\n'
    s += '}\n'

    s += '</style>\n'

    return s


def web( data, webpath ):
    ### convert sample completion info into a html document for web display.
    # input arguments:
    # - data: a dictionary as generated by the main section.
    #   it should contain the key 'samples' and optionally the key 'meta';
    #   the value for the 'meta' key is a str->str dict with meta-information
    #   to be displayed at the top of the page,
    #   the value for the 'samples' key is a dict matching sample names to status dicts.
    #   the sample names are assumed to be production/sample/version,
    #   and the status dicts are assumed to be str->str with status to fraction matching.
    #   example: data = {'samples': {
    #     'singlelepton_MC_2017_ULv5/
    #      WWG_TuneCP5_13TeV-amcatnlo-pythia8/
    #      crab_RunIISummer20UL17MiniAOD-106X_mc2017_realistic_v6-v2_singlelepton_MC_2017_ULv5':
    #     {'running': '13.3%', 'finished': '73.3%', 'idle': '13.3%'}}}
    # - webpath: directory where the resulting index.html file should be stored.
    #   if it does not exist yet, it will be created;
    #   if it already exists and contains an index.html file, that file will be overwritten.

    # initializations
    now = datetime.now()
    if not os.path.exists(webpath): os.makedirs(webpath)

    # make the page layout and header
    page = '<html>\n'
    page += '<head>\n'+style()+'</head>\n'
    page += '<body>\n'
    page += '<table style="background-color:#2C3E50;color:#EAECEE;'
    page += 'font-size:40px;width:100%;text-align: center;">'
    page += '<tr><td>Status of ntuple production</td></tr>'
    page += '<tr><td style="font-size:15px;">Last update: '+now.strftime("%d/%m/%Y %H:%M:%S")+'</td></tr>'
    page += '</table>\n'

    # print some meta information
    page += '<div id="meta-info"><h1>Meta-info</h1></div>\n'
    if 'meta' in data.keys():
        meta = data['meta']
        print(meta)
        for key,val in meta.items():
            page += '<table class="divide" cellpadding="5px" cellspacing="0">\n'
            page += '<tr>\n'
            page += '<td style="width:30%">'+key+'</td>'
            page += '<td style="width:70%">'+val+'</td>\n'
            page += '</tr>\n'
            page += '</table>\n'
    else:
        page += '<table class="divide" cellpadding="5px" cellspacing="0">\n'
        page += '<tr>\n'
        page += '<td>(nothing to display)</td>'
        page += '</tr>\n'
        page += '</table>\n'

    # get the sample data
    sampledata = data['samples']

    # sort the sample list
    samples = sorted(list(sampledata.keys()),key=lambda x:x.lower())

    # loop over samples
    page += '<div id="samples"><h1>Samples</h1></div>\n'
    for sample in samples:

        # format sample name
        production = sample.split('/')[0]
        samplename = sample.split('/')[1]
        versionname = sample.split('/')[2]

        # get the data for this sample
        sample_status = sampledata[sample]
        status_str = ', '.join('{}: {}'.format(key,val) for key,val in sample_status.items())
        finished_fraction = '0%'
        if 'finished' in sample_status.keys(): finished_fraction = sample_status['finished']

        # format the webpage entry
        page += '<table class="divide" cellpadding="5px" cellspacing="0">\n'
        page += '<tr>\n'
        #page += '<td style="width:20%">'+production+'</td>'
        page += '<td>'+samplename+'</td>'
        page += '<td title='+versionname+'>version</td>'
        page += '<td> <div id="progress">'
        page += '<div id="percent" style="width:100%">'+status_str+'</div>'
        page += '<div id="bar" style="width:'+finished_fraction+'"></div>'
        page += '</div></td>\n'
        page += '</tr>\n'

    page += '</table>\n'
    page += '</body>\n'
    page += '</html>'

    wfile = open(os.path.join(webpath,'index.html'), 'w')
    wfile.write(page)
    wfile.close()
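
# a small illustrative example of calling web() directly with hand-made input
# (hypothetical sample name and fractions, only meant to show the expected structure):
#   example_data = {'meta': {'generating script': 'monitorCrabJobs.py'},
#                   'samples': {'prod_2017/SomeSample/crab_some_version':
#                               {'finished': '80%', 'running': '20%'}}}
#   web(example_data, os.path.join(os.path.expanduser('~'), 'public_html', 'crab_status'))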


if __name__ == '__main__':

    # build command line parser
    usage = "Usage: %prog [options]\n Monitoring script"
    parser = OptionParser(usage)
    parser.add_option('--crabdir', dest='crabdir',
                      help='main crab folder containing log files for all samples')
    parser.add_option('--resubmit', dest='resubmit', default=False,
                      help='do resubmission of failed jobs (default: False, only monitor)')
    parser.add_option('--proxy', dest='proxy', default=None,
                      help='path to your proxy (default: do not export proxy explicitly)')
    parser.add_option('--webpage', dest='webpage', default='crab_status',
                      help='name of the webpage where the results will be displayed')
    parser.add_option('--istest', dest='istest', default=False,
                      help='run in test mode, process only a few samples (default: False)')
    (options, args) = parser.parse_args(sys.argv[1:])

    # print command line arguments
    print('Running monitorCrabJobs.py with following command line arguments:')
    for opt in vars(options):
        print(' - {}: {}'.format(opt, getattr(options,opt)))

    # check command line arguments
    if not os.path.isdir(options.crabdir):
        raise Exception('ERROR: crab directory {} does not exist'.format(options.crabdir))
    if options.proxy is not None:
        if not os.path.exists(options.proxy):
            raise Exception('ERROR: provided proxy {} does not exist'.format(options.proxy))

    # export the proxy if requested
    # (set it in the environment of this process so that child processes such as crab inherit it;
    #  a plain os.system('export ...') would only affect a temporary subshell)
    if options.proxy is not None:
        os.environ['X509_USER_PROXY'] = options.proxy

    # parse the web path
    home = os.path.expanduser("~")
    webpath = os.path.join(home, 'public_html', options.webpage)

    # initializations
    data = {'meta': {'generating script': os.path.abspath(__file__),
                     'command-line arguments': str(options)},
            'samples': {}}
    wdir = os.getcwd()
    passp = ['Enter GRID pass phrase for this identity:', pexpect.EOF]
    # note: once a proxy is created the password should not be needed anymore.
    # still, T2 asks the password sometimes, in which case simply '\n' should suffice
    # (see below)

    # move to crab directory and find all sample folders
    os.chdir(options.crabdir)
    fproc = glob.glob('*/*/*')
    nfproc = len(fproc)

    # only for testing: subselect samples
    if options.istest:
        ntest = min(3, nfproc)
        print('WARNING: running in test mode, will only process'
              +' {} out of {} samples'.format(ntest, nfproc))
        fproc = fproc[:ntest]

    # initialize all samples to 0% finished
    for fidx, f in enumerate(fproc):
        data['samples'][f] = {'finished':'0%'}

    # loop over samples
    for fidx, f in enumerate(fproc):
        print('Now processing sample {} of {}'.format(fidx+1,len(fproc)))
        print('({})'.format(f))

        # run crab status command and write the output to a log file
        success = False
        attempt = 0
        while (attempt<5 and not success):
            ch = pexpect.spawn('crab status -d '+f)
            ch.timeout = 180 # in seconds, put large enough so the process finishes before limit
            ch.logfile = open('monitor_tmp_log', 'w')
            passpindex = ch.expect(passp)
            if passpindex==0: ch.sendline('\n')
            ch.read()
            ch.close()
            # check the output
            with open('monitor_tmp_log', 'r') as fin:
                outlines = fin.read().splitlines()
            if len(outlines)==0:
                print('Crab status seems to have failed, retrying...')
                attempt += 1
            else: success = True
        if not success:
            print('Crab status seems to have failed, skipping this sample.')
            data['samples'][f] = {'crab status': 'failed'}
            continue
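
        # the parsing below assumes 'crab status' output lines of roughly this form
        # (illustrative example only; the exact formatting may differ between crab versions):
        #   Jobs status:   finished      73.3% (22/30)
        #                  running       13.3% ( 4/30)
        #                  idle          13.3% ( 4/30)
        # i.e. after stripping the 'Jobs status:' prefix, words[0] holds the status name
        # and words[2] the percentage.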

        # read the log file
        jobsfailed = False
        statuscompleted = False
        with open('monitor_tmp_log', 'r') as fin:
            outlines = fin.read().splitlines()
        for line in outlines:
            line = line.replace('Jobs status:','')
            words = line.split()
            if len(words)==0: continue
            # check for known job statuses
            for status in (['finished', 'running', 'transferring',
                            'failed', 'killed', 'idle','unsubmitted']):
                if status in words[0]:
                    try: frac = words[2]
                    except: frac = '<none>'
                    # save to dict
                    data['samples'][f][status] = frac
                    # check if jobs failed for this sample
                    if( status=='failed' ): jobsfailed = True
                    print('Percentage '+status+': '+frac)

            # check if job is complete
            if 'Status on the scheduler' in line:
                if 'COMPLETED' in line:
                    if not os.path.isfile(f+'/results/processedLumis.json'):
                        statuscompleted = True

        # handle case where failed jobs were found
        if jobsfailed:
            if options.resubmit:
                print('Found failed jobs, now resubmitting...')
                ch = pexpect.spawn('crab resubmit -d '+f)
                ch.timeout = 10000
                ch.expect(passp)
                ch.sendline('\n')
                ch.expect(passp)
                ch.sendline('\n')
                ch.read()
                ch.close()
                print('Done')

        # handle case where job is complete
        if statuscompleted:
            print('This task is completed...')
            # below is commented: no report, and no purging.
            '''print 'Get the report...'
            ch = pexpect.spawn('crab report -d '+f)
            ch.expect(passp)
            ch.sendline('\n')
            ch.expect(passp)
            ch.sendline('\n')
            ch.timeout = 10000
            ch.read()
            ch.close()
            print 'Report obtained, run purge ..'
            ch = pexpect.spawn('crab purge -d '+f)
            ch.expect(passp)
            ch.sendline('\n')
            ch.timeout = 10000
            ch.read()
            ch.close()
            print 'Done' '''

    # make web interface for gathered completion data
    os.chdir(wdir)
    print('Loop over all samples completed.')
    print('Retrieved following data:')
    print(data)
    web(data,webpath)
    print('Sample status written to {}.'.format(webpath))
    print('Done.')