# NOTE(review): this script appears to have been damaged in extraction: its
# line breaks are collapsed and the markup inside the summary string literals
# looks stripped, so it does not parse as Python in this form. The code text
# below is left unchanged; restore the original layout before editing logic.
#
# Purpose (from the visible code): loop until the stop file exists. Each pass
# asks LCG() for its CE list, drops every CE whose name does not contain 'uk',
# walks jobtree.getjobs(), counts completed and failed jobs, sets
# online_dict[ce] = True when num_comp >= numComplete and
# num_fail <= maxFailed, and appends status cells to sum_file_text.
#
# Issues to address once the file is restored:
# - The loop condition hard-codes "/home/mws/blacklist/stopfile" instead of
#   using the stopFile variable defined just above it.
# - Entries are deleted from ce_dict while iterating ce_dict.keys(). That is
#   only safe where keys() returns a list (Python 2); with a Python 3 dict view
#   it raises RuntimeError. Iterating over list(ce_dict) avoids this.
# - The "kill old jobs" test uses activeSize (12 h), runs before the
#   "remove older jobs" test that uses archiveSize (7 * 24 h), and ends in
#   `continue`. Any job old enough to be removed has therefore already been
#   killed and skipped, so the remove branch looks unreachable -- TODO confirm
#   the intended order/thresholds.
# - Several `"%s | " % (...)` expressions pass two or three values to a format
#   with a single %s, which raises TypeError as written; presumably the other
#   placeholders were inside the stripped markup -- verify against the original.
# - online_dict[ce] is read right after a conditional assignment. If no default
#   is set in the part of the script not visible here, this raises KeyError for
#   a CE that has never qualified -- TODO confirm.
# ---------------------------------------- # Automated blacklist service based on Ganga import os, time from Ganga.GPI import jobs from Ganga.GPI import jobtree from Ganga.GPI import tasks # start the monitoring from Ganga.Core import monitoring_component monitoring_component.enableMonitoring() # variables htmlSummary = "/home/mws/public_web/summary.html" # html Summary page whitelistFile = "/home/mws/public_web/whitelist.txt" # white list stopFile = "/home/mws/blacklist/stopfile" # File checked for stopping the service archiveSize = 7 * 24 # number of hours backlog of jobs to keep around activeSize = 12 # number of hours that are checked for complete/failed jobs numComplete = 3 # number of completed jobs in last X hours before whitelisting maxFailed = 1 # maximum number of failed jobs before blacklisting loopTime = 30 # number of mins per loop online_dict = {} # Main loop while not os.path.exists("/home/mws/blacklist/stopfile"): iter_time = time.time() + loopTime * 60 # First, find the list of CEs available to this VO b = LCG() ce_dict = b.get_ce_list() # (restrict to UK sites) for ce in ce_dict.keys(): if ce.find('uk') == -1: del(ce_dict[ce]) # create header for summary sum_file_text = """
%s | " % ce # loop over the jobs in this dir for j in jobtree.getjobs(): # get creation time ( I prefer working in unix time :)) newtime = time.mktime( j.time.timestamps['new'].timetuple() ) # kill old jobs if newtime < (time.time() - activeSize*60*60): j.kill() continue # remove older jobs if newtime < (time.time() - archiveSize*60*60): j.remove() continue # total up failed/completed jobs if newtime > (time.time() - activeSize*60*60): if j.status == "completed": num_comp += 1 elif j.status == "failed": num_fail += 1 col = "44" else: col = "AA" # add summary if j.status == "failed": sum_file_text += "%s | " % (col, col, j.status[0].upper()) elif j.status == "completed": sum_file_text += "%s | " % (col, col, j.status[0].upper()) elif j.status == "submitted": sum_file_text += "%s | " % (col, j.status[0].upper()) elif j.status == "running": sum_file_text += "%s | " % (col, col, j.status[0].upper()) else: sum_file_text += "%s | " % j.status[0].upper() # check for black/whitelist if num_comp >= numComplete and num_fail <= maxFailed: online_dict[ce] = True if online_dict[ce]: sum_file_text += "ONLINE | OFFLINE | " # write summary sum_file_text += """