# ----------------------------------------
# Automated blacklist service based on Ganga
#
# Periodically submits a probe job to every UK LCG Computing Element (CE).
# For each CE it counts how many probes completed / failed inside a sliding
# "active" window, publishes an HTML summary page, and writes a whitelist
# file listing the CEs considered online.
#
# NOTE(review): this file reached review with its line structure and the
# HTML markup inside the summary strings stripped by an extraction step.
# The control flow below is reconstructed from the surviving statements;
# the HTML fragments are minimal placeholder-count-correct replacements --
# confirm against the original page markup before deploying.
#
# NOTE(review): LCG, LCGRequirements and Job are not imported here; they
# are presumably injected by the Ganga GPI session namespace -- verify.

import os
import time

from Ganga.GPI import jobs
from Ganga.GPI import jobtree
from Ganga.GPI import tasks

# start the monitoring
from Ganga.Core import monitoring_component
monitoring_component.enableMonitoring()

# configuration
htmlSummary = "/home/mws/public_web/summary.html"    # html summary page
whitelistFile = "/home/mws/public_web/whitelist.txt" # white list
stopFile = "/home/mws/blacklist/stopfile"            # file checked for stopping the service
archiveSize = 7 * 24   # number of hours backlog of jobs to keep around
activeSize = 12        # number of hours that are checked for complete/failed jobs
numComplete = 3        # number of completed jobs in last X hours before whitelisting
maxFailed = 1          # maximum number of failed jobs before blacklisting
loopTime = 30          # number of mins per loop

online_dict = {}       # CE name -> True when the CE is considered online

# Main loop -- runs until the stop file appears.
# CONSISTENCY FIX: use the stopFile variable instead of repeating the
# hard-coded path (the original inlined the literal in both loops).
while not os.path.exists(stopFile):

    iter_time = time.time() + loopTime * 60

    # First, find the list of CEs available to this VO
    b = LCG()
    ce_dict = b.get_ce_list()

    # (restrict to UK sites)
    # list(...) so we can delete entries while iterating safely
    for ce in list(ce_dict.keys()):
        if ce.find('uk') == -1:
            del ce_dict[ce]

    # create header for summary
    # NOTE(review): original markup lost; minimal table used as placeholder.
    sum_file_text = """<html><body>
<h1>Summary of Blacklisting</h1>
<table>
"""

    # create jobtree entry for each CE
    for ce in ce_dict.keys():

        jobtree.cd('/')
        if not jobtree.exists(ce.replace("/", "_")):
            jobtree.mkdir(ce.replace("/", "_"))
        jobtree.cd(ce.replace("/", "_"))

        # submit a probe job pinned to this CE
        j = Job(backend=LCG(requirements=LCGRequirements(allowedCEs=ce)))
        jobtree.add(j)
        try:
            j.submit()
        except Exception:
            # deliberate best-effort: a failed submission just means no
            # new probe this iteration; the CE will simply accrue no
            # completed jobs and stay off the whitelist
            pass

        num_comp = 0
        num_fail = 0
        online_dict[ce] = False

        # start the summary row for this CE
        sum_file_text += "<tr><td>%s</td>" % ce

        # loop over the jobs in this dir
        for j in jobtree.getjobs():

            # get creation time (I prefer working in unix time :))
            newtime = time.mktime(j.time.timestamps['new'].timetuple())

            # remove jobs older than the archive window.
            # BUGFIX: this check must run BEFORE the kill check.  Since
            # archiveSize (168h) > activeSize (12h), the original order
            # sent every archive-old job into the kill branch (which
            # 'continue'd), so the remove branch was unreachable and the
            # backlog grew without bound.
            if newtime < (time.time() - archiveSize * 60 * 60):
                j.remove()
                continue

            # kill jobs that have aged out of the active window
            if newtime < (time.time() - activeSize * 60 * 60):
                j.kill()
                continue

            # total up failed/completed jobs inside the active window;
            # col is a hex colour component used below ("44" = recent,
            # "AA" = stale).
            # NOTE(review): given the kill+continue above, the stale
            # branch can never fire; kept to preserve the original shape.
            if newtime > (time.time() - activeSize * 60 * 60):
                if j.status == "completed":
                    num_comp += 1
                elif j.status == "failed":
                    num_fail += 1
                col = "44"
            else:
                col = "AA"

            # add a one-letter status cell, colour-coded by status.
            # NOTE(review): original bgcolor strings lost; these keep the
            # original %-placeholder counts (3/3/2/3/1).
            if j.status == "failed":
                sum_file_text += '<td bgcolor="#FF%s%s">%s</td>' % (col, col, j.status[0].upper())
            elif j.status == "completed":
                sum_file_text += '<td bgcolor="#%sFF%s">%s</td>' % (col, col, j.status[0].upper())
            elif j.status == "submitted":
                sum_file_text += '<td bgcolor="#FFFF%s">%s</td>' % (col, j.status[0].upper())
            elif j.status == "running":
                sum_file_text += '<td bgcolor="#%s%sFF">%s</td>' % (col, col, j.status[0].upper())
            else:
                sum_file_text += '<td>%s</td>' % j.status[0].upper()

        # check for black/whitelist: enough recent completions and few
        # enough failures means the CE is online
        if num_comp >= numComplete and num_fail <= maxFailed:
            online_dict[ce] = True

        if online_dict[ce]:
            sum_file_text += '<td>ONLINE</td></tr>\n'
        else:
            sum_file_text += '<td>OFFLINE</td></tr>\n'

    # close out the summary page
    # NOTE(review): original footer markup lost; minimal closing tags used.
    sum_file_text += """</table>
</body></html>
"""

    # write summary (with-block so the handle is closed promptly)
    with open(htmlSummary, "w") as sum_file:
        sum_file.write(sum_file_text)

    # write the list of online sites to the whitelist file.
    # BUGFIX: the original appended EVERY probed CE regardless of its
    # online flag, so offline CEs ended up whitelisted too.
    online_ces = [ce for ce in online_dict.keys() if online_dict[ce]]
    with open(whitelistFile, "w") as wl_file:
        wl_file.write('\n'.join(online_ces))

    # wait for next iteration (wake every minute to notice the stop file)
    while (time.time() < iter_time) and not os.path.exists(stopFile):
        time.sleep(60)