add two scripts for easier monitoring status of crab jobs #146

Open · wants to merge 3 commits into base: UL_master
358 changes: 358 additions & 0 deletions multilep/test/scripts/monitorCrabJobs.py
@@ -0,0 +1,358 @@
#!/usr/bin/env python

##############################################################
# monitor crab jobs and put the results in a summary webpage #
##############################################################

# copied and modified from Kirill
# basic how to use:
# - follow the steps here if not done so before: https://t2bwiki.iihe.ac.be/PublicWebpages
# - run this script with 'python monitorCrabJobs.py --crabdir <crab log folder>'.
#   the crab log folder should be the folder grouping all crab logs within a heavyNeutrino release;
#   it is typically given by <some CMSSW version>/src/heavyNeutrino/multilep/test/crab.
# - it might take a long time to run if there are many samples
#   (since the crab status command is rather slow),
#   so it could be a good idea to run it in a screen terminal.
# - the result is stored in the form of an html document in ~/public_html/crab_status/index.html;
#   you can view it at http://homepage.iihe.ac.be/~YOURUSERNAME/crab_status
# further notes on usage:
# - run with 'python monitorCrabJobs.py -h' for a list of all available options.
# - there are two prerequisites:
#   - this script requires a valid grid certificate (for the crab status command to work properly).
#     you can create one using 'voms-proxy-init --voms cms'.
#     optionally, you can set the location of your proxy using the '--proxy <path to your proxy>' argument.
#   - this script requires a CMSSW environment (for the crab status command to work).
#     go to a recent CMSSW_<version>/src directory and run 'cmsenv' before running this script.
# - to check that both prerequisites are satisfied, it might be a good idea to run a crab status command
#   on a single sample of your choice and see that it works properly, before running this script.
# - the default webpage name is 'crab_status' (see above), but it can be modified
#   with the '--webpage <name of webpage>' argument.
# - this script can be used to resubmit failed jobs while doing the monitoring.
#   for this, you need to add the '--resubmit True' argument (the default is False).
# - in general, crab jobs can be resubmitted up to 3 weeks after first submission (?),
#   and their status can be retrieved up to 4 weeks after first submission (?).
#   make sure to have stored a copy of the status before that deadline,
#   since after that deadline the status is no longer retrievable
#   and this script will output 'finished 0%' for each sample.
# TO DO: implement some sort of check to avoid overwriting with 'finished 0%'?
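
# a minimal example session, as a sketch only (the CMSSW version and paths are placeholders,
# adapt them to your own setup):
#   cd CMSSW_10_6_X/src && cmsenv
#   voms-proxy-init --voms cms
#   crab status -d heavyNeutrino/multilep/test/crab/<production>/<sample>/<version>   # quick sanity check
#   screen
#   python monitorCrabJobs.py --crabdir heavyNeutrino/multilep/test/crab --resubmit True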


import os, sys, glob, subprocess, pexpect, json
from datetime import datetime
from optparse import OptionParser


def style():
    ### define a fixed style string for the web page
    # only meant for internal use in web function

    s = '<style>'

    s += 'body {\n'
    s += 'margin: 0;\n'
    s += 'padding: 0;\n'
    s += 'width: 100%;\n'
    s += '}\n'

    s += 'h1 {\n'
    s += 'width: 100%;\n'
    s += 'text-align: center;\n'
    s += 'font-size:20px;\n'
    s += 'margin:0;\n'
    s += 'padding:0;\n'
    s += 'background: red;\n'
    s += 'color: #FFF;\n'
    s += 'display: inline-block;\n'
    s += '}\n'

    s += 'h3 {\n'
    s += 'width: 100%;\n'
    s += 'text-align: center;\n'
    s += 'font-size:15px;\n'
    s += 'background: #EAEDED;\n'
    s += 'margin:0;\n'
    s += 'padding:0;\n'
    s += 'display: inline-block;\n'
    s += '}\n'

    s += '.divide tr td { width:60%; }\n'

    s += '#progress {\n'
    s += 'width: 500px;\n'
    s += 'border: 1px solid black;\n'
    s += 'position: relative;\n'
    s += 'padding: 3px;\n'
    s += '}\n'

    s += '#percent {\n'
    s += 'position: absolute;\n'
    s += 'left: 10%;\n'
    s += '}\n'

    s += '#bar {\n'
    s += 'height: 20px;\n'
    s += 'background-color: green;\n'
    s += 'width: 30%;\n'
    s += '}\n'

    s += '</style>\n'

    return s


def web( data, webpath ):
    ### convert sample completion info into a html document for web display.
    # input arguments:
    # - data: a dictionary as generated by the main section.
    #   it should contain the key 'samples' and optionally the key 'meta';
    #   the value for the 'meta' key is a str->str dict with meta-information
    #   to be displayed at the top of the page,
    #   the value for the 'samples' key is a dict matching sample names to status dicts.
    #   the sample names are assumed to be production/sample/version,
    #   and the status dicts are assumed to be str->str with status to fraction matching.
    #   example: data = {'samples': {
    #     'singlelepton_MC_2017_ULv5/
    #      WWG_TuneCP5_13TeV-amcatnlo-pythia8/
    #      crab_RunIISummer20UL17MiniAOD-106X_mc2017_realistic_v6-v2_singlelepton_MC_2017_ULv5':
    #     {'running': '13.3%', 'finished': '73.3%', 'idle': '13.3%'}}}
    # - webpath: directory where the resulting index.html file should be stored.
    #   if it does not exist yet, it will be created;
    #   if it already exists and contains an index.html file, that file will be overwritten.

    # initializations
    now = datetime.now()
    if not os.path.exists(webpath): os.makedirs(webpath)

    # make the page layout and header
    page = '<html>\n'
    page += '<head>\n'+style()+'</head>\n'
    page += '<body>\n'
    page += '<table style="background-color:#2C3E50;color:#EAECEE;'
    page += 'font-size:40px;width:100%;text-align: center;">'
    page += '<tr><td>Status of ntuple production</td></tr>'
    page += '<tr><td style="font-size:15px;">Last update: '+now.strftime("%d/%m/%Y %H:%M:%S")+'</td></tr>'
    page += '</table>\n'

    # print some meta information
    page += '<div id="meta-info"><h1>Meta-info</h1></div>\n'
    if 'meta' in data.keys():
        meta = data['meta']
        print(meta)
        for key,val in meta.items():
            page += '<table class="divide" cellpadding="5px" cellspacing="0">\n'
            page += '<tr>\n'
            page += '<td style="width:30%">'+key+'</td>'
            page += '<td style="width:70%">'+val+'</td>\n'
            page += '</tr>\n'
            page += '</table>\n'
    else:
        page += '<table class="divide" cellpadding="5px" cellspacing="0">\n'
        page += '<tr>\n'
        page += '<td>(nothing to display)</td>'
        page += '</tr>\n'
        page += '</table>\n'

    # get the sample data
    sampledata = data['samples']

    # sort the sample list
    samples = sorted(list(sampledata.keys()),key=lambda x:x.lower())

    # loop over samples
    page += '<div id="samples"><h1>Samples</h1></div>\n'
    for sample in samples:

        # format sample name
        production = sample.split('/')[0]
        samplename = sample.split('/')[1]
        versionname = sample.split('/')[2]

        # get the data for this sample
        sample_status = sampledata[sample]
        status_str = ', '.join('{}: {}'.format(key,val) for key,val in sample_status.items())
        finished_fraction = '0%'
        if 'finished' in sample_status.keys(): finished_fraction = sample_status['finished']

        # format the webpage entry
        page += '<table class="divide" cellpadding="5px" cellspacing="0">\n'
        page += '<tr>\n'
        #page += '<td style="width:20%">'+production+'</td>'
        page += '<td>'+samplename+'</td>'
        page += '<td title='+versionname+'>version</td>'
        page += '<td> <div id="progress">'
        page += '<div id="percent" style="width:100%">'+status_str+'</div>'
        page += '<div id="bar" style="width:'+finished_fraction+'"></div>'
        page += '</div></td>\n'
        page += '</tr>\n'

    page += '</table>\n'
    page += '</body>\n'
    page += '</html>'

    wfile = open(os.path.join(webpath,'index.html'), 'w')
    wfile.write(page)
    wfile.close()
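
# a small illustrative example of calling web() directly with hand-made input
# (hypothetical sample name and fractions, only meant to show the expected structure):
#   example_data = {'meta': {'generating script': 'monitorCrabJobs.py'},
#                   'samples': {'prod_2017/SomeSample/crab_some_version':
#                               {'finished': '80%', 'running': '20%'}}}
#   web(example_data, os.path.join(os.path.expanduser('~'), 'public_html', 'crab_status'))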


if __name__ == '__main__':

    # build command line parser
    usage = "Usage: %prog [options]\n Monitoring script"
    parser = OptionParser(usage)
    parser.add_option('--crabdir', dest='crabdir',
                      help='main crab folder containing log files for all samples')
    parser.add_option('--resubmit', dest='resubmit', default=False,
                      help='do resubmission of failed jobs (default: False, only monitor)')
    parser.add_option('--proxy', dest='proxy', default=None,
                      help='path to your proxy (default: do not export proxy explicitly)')
    parser.add_option('--webpage', dest='webpage', default='crab_status',
                      help='name of the webpage where the results will be displayed')
    parser.add_option('--istest', dest='istest', default=False,
                      help='run in test mode, process only a few samples (default: False)')
    (options, args) = parser.parse_args(sys.argv[1:])

    # print command line arguments
    print('Running monitorCrabJobs.py with following command line arguments:')
    for opt in vars(options):
        print(' - {}: {}'.format(opt, getattr(options,opt)))

    # check command line arguments
    if not os.path.isdir(options.crabdir):
        raise Exception('ERROR: crab directory {} does not exist'.format(options.crabdir))
    if options.proxy is not None:
        if not os.path.exists(options.proxy):
            raise Exception('ERROR: provided proxy {} does not exist'.format(options.proxy))

    # export the proxy if requested
    # (set it in the environment of this process so that child processes such as crab inherit it;
    #  a plain os.system('export ...') would only affect a temporary subshell)
    if options.proxy is not None:
        os.environ['X509_USER_PROXY'] = options.proxy

    # parse the web path
    home = os.path.expanduser("~")
    webpath = os.path.join(home, 'public_html', options.webpage)

    # initializations
    data = {'meta': {'generating script': os.path.abspath(__file__),
                     'command-line arguments': str(options)},
            'samples': {}}
    wdir = os.getcwd()
    passp = ['Enter GRID pass phrase for this identity:', pexpect.EOF]
    # note: once a proxy is created the password should not be needed anymore.
    # still, T2 asks the password sometimes, in which case simply '\n' should suffice
    # (see below)

    # move to crab directory and find all sample folders
    os.chdir(options.crabdir)
    fproc = glob.glob('*/*/*')
    nfproc = len(fproc)

    # only for testing: subselect samples
    if options.istest:
        ntest = min(3, nfproc)
        print('WARNING: running in test mode, will only process'
              +' {} out of {} samples'.format(ntest, nfproc))
        fproc = fproc[:ntest]

    # initialize all samples to 0% finished
    for fidx, f in enumerate(fproc):
        data['samples'][f] = {'finished':'0%'}

    # loop over samples
    for fidx, f in enumerate(fproc):
        print('Now processing sample {} of {}'.format(fidx+1,len(fproc)))
        print('({})'.format(f))

        # run crab status command and write the output to a log file
        success = False
        attempt = 0
        while (attempt<5 and not success):
            ch = pexpect.spawn('crab status -d '+f)
            ch.timeout = 180 # in seconds, put large enough so the process finishes before limit
            ch.logfile = open('monitor_tmp_log', 'w')
            passpindex = ch.expect(passp)
            if passpindex==0: ch.sendline('\n')
            ch.read()
            ch.close()
            # check the output
            with open('monitor_tmp_log', 'r') as fin:
                outlines = fin.read().splitlines()
            if len(outlines)==0:
                print('Crab status seems to have failed, retrying...')
                attempt += 1
            else: success = True
        if not success:
            print('Crab status seems to have failed, skipping this sample.')
            data['samples'][f] = {'crab status': 'failed'}
            continue
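
        # the parsing below assumes 'crab status' output lines of roughly this form
        # (illustrative example only; the exact formatting may differ between crab versions):
        #   Jobs status:   finished      73.3% (22/30)
        #                  running       13.3% ( 4/30)
        #                  idle          13.3% ( 4/30)
        # i.e. after stripping the 'Jobs status:' prefix, words[0] holds the status name
        # and words[2] the percentage.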

        # read the log file
        jobsfailed = False
        statuscompleted = False
        with open('monitor_tmp_log', 'r') as fin:
            outlines = fin.read().splitlines()
        for line in outlines:
            line = line.replace('Jobs status:','')
            words = line.split()
            if len(words)==0: continue
            # check for known job statuses
            for status in (['finished', 'running', 'transferring',
                            'failed', 'killed', 'idle','unsubmitted']):
                if status in words[0]:
                    try: frac = words[2]
                    except: frac = '<none>'
                    # save to dict
                    data['samples'][f][status] = frac
                    # check if jobs failed for this sample
                    if( status=='failed' ): jobsfailed = True
                    print('Percentage '+status+': '+frac)

            # check if job is complete
            if 'Status on the scheduler' in line:
                if 'COMPLETED' in line:
                    if not os.path.isfile(f+'/results/processedLumis.json'):
                        statuscompleted = True

        # handle case where failed jobs were found
        if jobsfailed:
            if options.resubmit:
                print('Found failed jobs, now resubmitting...')
                ch = pexpect.spawn('crab resubmit -d '+f)
                ch.timeout = 10000
                ch.expect(passp)
                ch.sendline('\n')
                ch.expect(passp)
                ch.sendline('\n')
                ch.read()
                ch.close()
                print('Done')

        # handle case where job is complete
        if statuscompleted:
            print('This task is completed...')
            # below is commented: no report, and no purging.
            '''print 'Get the report...'
            ch = pexpect.spawn('crab report -d '+f)
            ch.expect(passp)
            ch.sendline('\n')
            ch.expect(passp)
            ch.sendline('\n')
            ch.timeout = 10000
            ch.read()
            ch.close()
            print 'Report obtained, run purge ..'
            ch = pexpect.spawn('crab purge -d '+f)
            ch.expect(passp)
            ch.sendline('\n')
            ch.timeout = 10000
            ch.read()
            ch.close()
            print 'Done' '''

    # make web interface for gathered completion data
    os.chdir(wdir)
    print('Loop over all samples completed.')
    print('Retrieved following data:')
    print(data)
    web(data,webpath)
    print('Sample status written to {}.'.format(webpath))
    print('Done.')