Skip to content
Snippets Groups Projects
Commit d2c9a9b6 authored by David Verelst's avatar David Verelst
Browse files

add cpu_user_queue to launch.py to avoid creating too big queues

parent 20d07ccf
No related branches found
No related tags found
No related merge requests found
Pipeline #
...@@ -330,7 +330,8 @@ class Scheduler: ...@@ -330,7 +330,8 @@ class Scheduler:
def __init__(self, nr_cpus, path_pbs_files=None, search_crit_re=r'\.p$', def __init__(self, nr_cpus, path_pbs_files=None, search_crit_re=r'\.p$',
dryrun=False, tsleep=5.00, logfile=None, cache=False, dryrun=False, tsleep=5.00, logfile=None, cache=False,
cpu_free=48, qsub_cmd='qsub %s', sort=False, debug=False): cpu_free=48, qsub_cmd='qsub %s', sort=False, debug=False,
cpu_user_queue=500):
""" """
Regular expression examples: Regular expression examples:
--re .veer-0\\.89_. --re .veer-0\\.89_.
...@@ -366,6 +367,8 @@ class Scheduler: ...@@ -366,6 +367,8 @@ class Scheduler:
more jobs being launched even if the user occupies less than the more jobs being launched even if the user occupies less than the
number of cpu's defined with nr_cpus. number of cpu's defined with nr_cpus.
cpu_user_queue : int, default=500
    No more new jobs are launched when the user already has this many
    jobs waiting in the queue. Prevents building up an excessively
    large queue while still aiming for a high cpu_free target.
qsub_cmd : str, default='qsub %s' qsub_cmd : str, default='qsub %s'
When launching from a node on Gorm, ssh to g-000 to bypass the When launching from a node on Gorm, ssh to g-000 to bypass the
lack of permissions to launch jobs from a node. lack of permissions to launch jobs from a node.
...@@ -380,6 +383,7 @@ class Scheduler: ...@@ -380,6 +383,7 @@ class Scheduler:
print ' nr_cpus ; %3i' % nr_cpus print ' nr_cpus ; %3i' % nr_cpus
print ' cpu_free ; %3i' % cpu_free print ' cpu_free ; %3i' % cpu_free
print 'cpu_user_queue ; %3i' % cpu_user_queue
print 'path_pbs_files ; %s' % path_pbs_files print 'path_pbs_files ; %s' % path_pbs_files
print 'search_crit_re ; %s' % search_crit_re print 'search_crit_re ; %s' % search_crit_re
print ' dryrun ; %s' % dryrun print ' dryrun ; %s' % dryrun
...@@ -456,12 +460,12 @@ class Scheduler: ...@@ -456,12 +460,12 @@ class Scheduler:
cpu_user = 0 cpu_user = 0
# add queued jobs as well, if any # add queued jobs as well, if any
try: try:
cpu_user += users[uid]['Q'] cpu_user_queue = users[uid]['Q']
except KeyError: except KeyError:
pass cpu_user_queue = 0
cpu_free, nodeSum = pbswrap.count_cpus(users, host, pbsnodes) cpu_free, nodeSum = pbswrap.count_cpus(users, host, pbsnodes)
return cpu_free, cpu_user return cpu_free, cpu_user, cpu_user_queue
def launch(self): def launch(self):
""" """
...@@ -499,11 +503,12 @@ class Scheduler: ...@@ -499,11 +503,12 @@ class Scheduler:
return return
while True: while True:
cpu_free, cpu_user = self.check_nodes(self.uid) cpu_free, cpu_user, cpu_user_queue = self.check_nodes(self.uid)
# we only launch a new job when we are not using more than our # we only launch a new job when we are not using more than our
# quota (nr_cpus), or when there is still some room for others # quota (nr_cpus), or when there is still some room for others
# to breath # to breath
if self.nr_cpus > cpu_user and cpu_free > self.cpu_free: if (self.nr_cpus > cpu_user) and (cpu_free > self.cpu_free) and \
(cpu_user_queue < self.cpu_user_queue):
fname = self.pbsflist[ii] fname = self.pbsflist[ii]
# read the PBS file # read the PBS file
f = open(fname) f = open(fname)
...@@ -541,6 +546,8 @@ class Scheduler: ...@@ -541,6 +546,8 @@ class Scheduler:
print ' nr_cpus: %4i, cpu_user: %4i' % rpl print ' nr_cpus: %4i, cpu_user: %4i' % rpl
rpl = (cpu_free, self.cpu_free) rpl = (cpu_free, self.cpu_free)
print 'cpu_free: %4i, cpu_free_req: %4i' % rpl print 'cpu_free: %4i, cpu_free_req: %4i' % rpl
rpl = (cpu_user_queue, self.cpu_user_queue)
print ' u queue: %4i, u queue max: %4i' % rpl
time.sleep(self.tsleep) time.sleep(self.tsleep)
# stop when we handled all the files # stop when we handled all the files
...@@ -697,6 +704,12 @@ if __name__ == '__main__': ...@@ -697,6 +704,12 @@ if __name__ == '__main__':
'amount of cpus free. This will make sure there is ' 'amount of cpus free. This will make sure there is '
'room for others on the cluster, but might mean less ' 'room for others on the cluster, but might mean less '
'cpus available for you. Default=48') 'cpus available for you. Default=48')
parser.add_argument('--cpu_user_queue', action='store', dest='cpu_user_queue',
                    type=int, default=500, help='No more jobs will be '
                    'launched after having cpu_user_queue number of jobs '
                    'in the queue. This prevents users from filling the '
                    'queue, while still allowing to aim for a high cpu_free '
                    'target.')
parser.add_argument('--qsub_cmd', action='store', dest='qsub_cmd', parser.add_argument('--qsub_cmd', action='store', dest='qsub_cmd',
default='qsub %s', default='qsub %s',
help='Is set automatically by --node flag') help='Is set automatically by --node flag')
...@@ -768,5 +781,5 @@ if __name__ == '__main__': ...@@ -768,5 +781,5 @@ if __name__ == '__main__':
tsleep=options.tsleep, logfile=path_log, tsleep=options.tsleep, logfile=path_log,
cache=options.cache, cpu_free=options.cpu_free, cache=options.cache, cpu_free=options.cpu_free,
qsub_cmd=options.qsub_cmd, sort=options.sort, qsub_cmd=options.qsub_cmd, sort=options.sort,
debug=options.debug) debug=options.debug, cpu_user_queue=options.cpu_user_queue)
ss(options.depend) ss(options.depend)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment