From b9fda18ef5ffc73c03469617f9415861b9fa55c0 Mon Sep 17 00:00:00 2001 From: "Mads M. Pedersen" <mmpe@dtu.dk> Date: Thu, 24 Nov 2016 10:35:19 +0100 Subject: [PATCH] improved error handling in simulation.py --- wetb/hawc2/simulation.py | 60 +++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/wetb/hawc2/simulation.py b/wetb/hawc2/simulation.py index 5501f146..5b93e7dc 100755 --- a/wetb/hawc2/simulation.py +++ b/wetb/hawc2/simulation.py @@ -235,12 +235,18 @@ class Simulation(object): return dst output_patterns = [fmt(dst) for dst in self.htcFile.output_files() + ([], self.htcFile.turbulence_files())[self.copy_turbulence] + [self.stdout_filename]] output_files = set([f for pattern in output_patterns for f in self.host.glob(unix_path(os.path.join(self.tmp_modelpath, pattern)))]) - self.host._finish_simulation(output_files) - self.set_id(self.filename) - if self.status != ERROR: - self.status = CLEANED - self.logFile.reset() - self.htcFile.reset() + try: + self.host._finish_simulation(output_files) + if self.status != ERROR: + self.status = CLEANED + except Exception as e: + self.errors.append(str(e)) + raise + + finally: + self.set_id(self.filename) + self.logFile.reset() + self.htcFile.reset() @@ -418,20 +424,26 @@ class LocalSimulationHost(SimulationResource): def _finish_simulation(self, output_files): + missing_result_files = [] for src_file in output_files: dst_file = os.path.join(self.modelpath, os.path.relpath(src_file, self.tmp_modelpath)) # exist_ok does not exist in Python27 - if not os.path.isdir(os.path.dirname(dst_file)): - os.makedirs(os.path.dirname(dst_file)) #, exist_ok=True) - if not os.path.isfile(dst_file) or os.path.getmtime(dst_file) != os.path.getmtime(src_file): - shutil.copy(src_file, dst_file) + try: + if not os.path.isdir(os.path.dirname(dst_file)): + os.makedirs(os.path.dirname(dst_file)) #, exist_ok=True) + if not os.path.isfile(dst_file) or os.path.getmtime(dst_file) != os.path.getmtime(src_file): + shutil.copy(src_file, dst_file) + except: + missing_result_files.append(dst_file) self.logFile.filename = os.path.join(self.modelpath, self.log_filename) - + if missing_result_files: + raise Warning("Failed to copy %s from %s"%(",".join(missing_result_files), self.host)) try: shutil.rmtree(self.tmp_modelpath) except (PermissionError, OSError) as e: - raise Warning(str(e)) + raise Warning("Fail to remove temporary files and folders on %s\n%s"%(self.host, str(e))) + def update_logFile_status(self): self.logFile.update_status() @@ -535,19 +547,24 @@ class PBSClusterSimulationHost(SimulationResource, SSHClient): def _finish_simulation(self, output_files): with self: + download_failed = [] for src_file in output_files: try: dst_file = os.path.join(self.modelpath, os.path.relpath(src_file, self.tmp_modelpath)) os.makedirs(os.path.dirname(dst_file), exist_ok=True) - self.download(src_file, dst_file, retry=3) + self.download(src_file, dst_file, retry=10) except Exception as e: - print (self.modelpath, src_file, self.tmp_modelpath) - raise e - try: - self.execute('rm -r .hawc2launcher/%s' % self.simulation_id) - self.execute('rm .hawc2launcher/status_%s' % self.simulation_id) - except: - pass + download_failed.append(dst_file) + if download_failed: + raise Warning("Failed to download %s from %s"%(",".join(download_failed), self.host)) + else: + try: + self.execute('rm -r .hawc2launcher/%s' % self.simulation_id) + finally: + try: + self.execute('rm .hawc2launcher/status_%s' % self.simulation_id) + except: + raise Warning("Fail to remove temporary files and folders on %s"%self.host) def _simulate(self): @@ -668,6 +685,3 @@ cd /scratch/ ### rm -r $PBS_JOBID exit""" % (self.simulation_id, self.stdout_filename, self.modelpath, self.htcFile.filename, self.resource.python_cmd, rel_htcfilename, self.resource.wine_cmd, self.hawc2exe, cp_back) - - - -- GitLab