            the output (that was optionally defined in ch_sel), and the value
            is the dataframe containing the statistical values for all the
            different selected cases.
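
        Examples
        --------
        A minimal, hypothetical call (``cc`` being the populated cases
        object this method belongs to; tag names are project-specific)::

            df_stats = cc.statistics(tags=['[case_id]', '[sim_id]'])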

        """

        def add_df_row(df_dict, **kwargs):
            """
            add a new channel to the df_dict format of ch_df
            """
            for col, value in kwargs.items():
                df_dict[col].append(value)
            for col in (self.res.cols - set(kwargs.keys())):
                df_dict[col].append('')
            return df_dict

        # in case the output changes, remember the original ch_sel
        if ch_sel is not None:
            ch_sel_init = ch_sel.copy()
        else:
            ch_sel_init = None

        if ch_fatigue is None:
            ch_fatigue_init = None
        else:
            ch_fatigue_init = ch_fatigue

        # TODO: should the default tags not be all the tags in the cases dict?
        tag_default = ['[case_id]', '[sim_id]']
        tag_chan = 'channel'
        # merge default with other tags
        for tag in tag_default:
            if tag not in tags:
                tags.append(tag)

        # tags must be unique; if the same tag appears twice
        # it will break the DataFrame creation
        if len(tags) != len(set(tags)):
            raise ValueError('tags can only contain unique entries')

        # get some basic parameters required to calculate statistics
        try:
            case = list(self.cases.keys())[0]
        except IndexError:
            print('no cases to select so no statistics, aborting ...')
            return None

        post_dir = self.cases[case]['[post_dir]']
        if not new_sim_id:
            # select the sim_id from a random case
            sim_id = self.cases[case]['[sim_id]']
        else:
            sim_id = new_sim_id

        if not silent:
            nrcases = len(self.cases)
            print('='*79)
            print('statistics for %s, nr cases: %i' % (sim_id, nrcases))

        df_dict = None
        add_stats = True
        # for finding [] tags
        regex = re.compile(r'(\[.*?\])')
        for ii, (cname, case) in enumerate(self.cases.items()):

            # build the basic df_dict if not defined
            if df_dict is None:
                # the dictionary that will be used to create a pandas dataframe
                df_dict = { tag:[] for tag in tags }
                df_dict[tag_chan] = []
                # add more columns that will help with IDing the channel
                df_dict['channel_name'] = []
                df_dict['channel_units'] = []
                df_dict['channel_nr'] = []
                df_dict['channel_desc'] = []
                add_stats = True

            if not silent:
                pc = '%6.2f' % (float(ii)*100.0/float(nrcases))
                pc += ' %'
                print('stats progress: %4i/%i %s | %s' % (ii, nrcases, pc, cname))

            # make sure the selected tags exist
            if len(tags) != len(set(case) & set(tags)):
                raise KeyError('    not all selected tags exist in cases')

            self.load_result_file(case)
            ch_dict_new = {}
            # this is really messy, now we are also in parallel using the
            # channel DataFrame structure
            ch_df_new = {col:[] for col in self.res.cols}
            ch_df_new['ch_name'] = []
            # calculate the statistics values
#            stats = self.res.calc_stats(self.sig, i0=i0, i1=i1)
            i_new_chans = self.sig.shape[1] # self.Nch
            sig_size = self.res.N  # len(self.sig[i0:i1,0])
            new_sigs = np.ndarray((sig_size, 0))

            for name, expr in add_sigs.items():
                channel_tags = regex.findall(expr)
                # replace all sensor names with expressions
                template = "self.sig[:,self.res.ch_dict['{}']['chi']]"
                for chan in channel_tags:
                    # first remove the [] from the tag
                    # FIXME: fails when the same channel occurs more than once
                    expr = expr.replace(chan, chan[1:-1])
                    expr = expr.replace(chan[1:-1], template.format(chan[1:-1]))
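                # e.g. for add_sigs={'sigsum': '[chA]+[chB]'} (chA/chB being
                # hypothetical channel names), expr is rewritten to:
                #   self.sig[:,self.res.ch_dict['chA']['chi']]
                #     + self.sig[:,self.res.ch_dict['chB']['chi']]
                # which eval() below evaluates to the element-wise sum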

                sig_add = np.ndarray((len(self.sig[:,0]), 1))
                sig_add[:,0] = eval(expr)

                ch_dict_new[name] = {}
                ch_dict_new[name]['chi'] = i_new_chans
                ch_df_new = add_df_row(ch_df_new, **{'chi':i_new_chans,
                                                   'ch_name':name})
                i_new_chans += 1
                new_sigs = np.append(new_sigs, sig_add, axis=1)

            if add_sensor is not None:
                chi1 = self.res.ch_dict[add_sensor['ch1_name']]['chi']
                chi2 = self.res.ch_dict[add_sensor['ch2_name']]['chi']
                name = add_sensor['ch_name_add']
                factor = add_sensor['factor']
                operator = add_sensor['operator']
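                # expected add_sensor format (chA/chB being hypothetical
                # channel names):
                # add_sensor = {'ch1_name': 'chA', 'ch2_name': 'chB',
                #               'ch_name_add': 'chA-chB-combined',
                #               'factor': 1.0, 'operator': '*'}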

                p1 = self.sig[:,chi1]
                p2 = self.sig[:,chi2]
                sig_add = np.ndarray((len(p1), 1))
                if operator == '*':
                    sig_add[:,0] = p1*p2*factor
                elif operator == '/':
                    sig_add[:,0] = factor*p1/p2
                else:
                    raise ValueError('Operator needs to be either * or /')
#                add_stats = self.res.calc_stats(sig_add)
#                add_stats_i = stats['max'].shape[0]
                # add a new channel description for the mechanical power
                ch_dict_new[name] = {}
                ch_dict_new[name]['chi'] = i_new_chans
                ch_df_new = add_df_row(ch_df_new, **{'chi':i_new_chans,
                                                   'ch_name':name})
                i_new_chans += 1
                new_sigs = np.append(new_sigs, sig_add, axis=1)
#                # and append to all the statistics types
#                for key, stats_arr in stats.iteritems():
#                    stats[key] = np.append(stats_arr, add_stats[key])

            # calculate the resultants
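            # chs_resultant holds groups of channels, e.g. (hypothetical
            # names) [['blade1-mx', 'blade1-my']] yields one new channel
            # 'blade1-mxmy' holding the resultant sqrt(mx**2 + my**2)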
            sig_resultants = np.ndarray((sig_size, len(chs_resultant)))
            inc = []
            for j, chs in enumerate(chs_resultant):
                sig_res = np.ndarray((sig_size, len(chs)))
                lab = ''
                no_channel = False
                for i, ch in enumerate(chs):
                    # if the channel does not exist, flag the resultant as
                    # incomplete
                    try:
                        chi = self.res.ch_dict[ch]['chi']
                        sig_res[:,i] = self.sig[:,chi]
                    except KeyError:
                        no_channel = True
                    lab += ch.split('-')[-1]
                name = '-'.join(ch.split('-')[:-1] + [lab])
                # when one of the components does not exist, we can not
                # calculate the resultant!
                if no_channel:
                    rpl = (name, cname)
                    print('    missing channel, no resultant for: %s, %s' % rpl)
                    continue
                inc.append(j)
                sig_resultants[:,j] = np.sqrt((sig_res*sig_res).sum(axis=1))
#                resultant = np.sqrt(sig_resultants[:,j].reshape(self.res.N, 1))
#                add_stats = self.res.calc_stats(resultant)
#                add_stats_i = stats['max'].shape[0]
                # add a new channel description for this resultant
                ch_dict_new[name] = {}
                ch_dict_new[name]['chi'] = i_new_chans
                ch_df_new = add_df_row(ch_df_new, **{'chi':i_new_chans,
                                                   'ch_name':name})
                i_new_chans += 1
                # and append to all the statistics types
#                for key, stats_arr in stats.iteritems():
#                    stats[key] = np.append(stats_arr, add_stats[key])
            if len(chs_resultant) > 0:
                # but only take the channels that were not missing
                new_sigs = np.append(new_sigs, sig_resultants[:,inc], axis=1)

            # calculate mechanical power first before deriving statistics
            # from it
            if calc_mech_power:
                name = 'stats-shaft-power'
                sig_pmech = np.ndarray((sig_size, 1))
                sig_pmech[:,0] = self.shaft_power()
#                P_mech_stats = self.res.calc_stats(sig_pmech)
#                mech_stats_i = stats['max'].shape[0]
                # add a new channel description for the mechanical power
                ch_dict_new[name] = {}
                ch_dict_new[name]['chi'] = i_new_chans
                ch_df_new = add_df_row(ch_df_new, **{'chi':i_new_chans,
                                                   'ch_name':name})
                i_new_chans += 1
                new_sigs = np.append(new_sigs, sig_pmech, axis=1)

                # and C_p_mech
                if A is not None:
                    name = 'stats-cp-mech'
                    if ch_wind is None:
                        chiwind = self.res.ch_dict[self.find_windchan_hub()]['chi']
                    else:
                        chiwind = self.res.ch_dict[ch_wind]['chi']
                    wind = self.res.sig[:,chiwind]
                    cp = np.ndarray((sig_size, 1))
                    cp[:,0] = self.cp(-sig_pmech[:,0], wind, A)
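                    # assumption: self.cp implements the power coefficient
                    # Cp = P / (0.5*rho*A*V**3); the minus sign presumably
                    # compensates the sign convention of shaft_power()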
                    # add a new channel description for the power coefficient
                    ch_dict_new[name] = {}
                    ch_dict_new[name]['chi'] = i_new_chans
                    ch_df_new = add_df_row(ch_df_new, **{'chi':i_new_chans,
                                                       'ch_name':name})
                    i_new_chans += 1
                    new_sigs = np.append(new_sigs, cp, axis=1)

                    try:
                        try:
                            nn_shaft = self.config['nn_shaft']
                        except KeyError:
                            nn_shaft = 4

                        chan_t = 'shaft_nonrotate-shaft-node-%3.3i-forcevec-z'%nn_shaft
                        i = self.res.ch_dict[chan_t]['chi']
                        thrust = self.res.sig[:,i]
                        name = 'stats-ct'
                        ct = np.ndarray((sig_size, 1))
                        ct[:,0] = self.ct(thrust, wind, A)
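                        # assumption: self.ct implements the thrust
                        # coefficient Ct = T / (0.5*rho*A*V**2)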
                        ch_dict_new[name] = {}
                        ch_dict_new[name]['chi'] = i_new_chans
                        ch_df_new = add_df_row(ch_df_new, **{'chi':i_new_chans,
                                                           'ch_name':name})
                        i_new_chans += 1
                        new_sigs = np.append(new_sigs, ct, axis=1)
                    except KeyError:
                        print('    can not calculate CT')

                # and append to all the statistics types
#                for key, stats_arr in stats.iteritems():
#                    stats[key] = np.append(stats_arr, P_mech_stats[key])

            if save_new_sigs and new_sigs.shape[1] > 0:
                chis, keys = [], []
                for key, value in ch_dict_new.items():
                    chis.append(value['chi'])
                    keys.append(key)
                # sort on channel number, so it agrees with the new_sigs array
                isort = np.array(chis).argsort()
                keys = np.array(keys)[isort].tolist()
                df_new_sigs = pd.DataFrame(new_sigs, columns=keys)
                respath = os.path.join(case['[run_dir]'], case['[res_dir]'])
                resfile = case['[case_id]']
                fname = os.path.join(respath, resfile + '_postres.csv')
                print('    saving post-processed res: %s...' % fname, end='')
                df_new_sigs.to_csv(fname)
                print('done!')
                del df_new_sigs

            ch_dict = self.res.ch_dict.copy()
            ch_dict.update(ch_dict_new)

#            ch_df = pd.concat([self.res.ch_df, pd.DataFrame(ch_df_new)])

            # put all the extra channels into the results if we want to also
            # be able to calculate the fatigue loads on them.
            self.sig = np.append(self.sig, new_sigs, axis=1)

            # calculate the statistics values
            stats = self.res.calc_stats(self.sig, i0=i0, i1=i1)

            # Because each channel is a new row, it doesn't matter how many
            # data channels each case has, and this approach does not break
            # when different cases have a different number of output channels
            # By default, just take all channels in the result file.
            if ch_sel_init is None:
                ch_sel = list(ch_dict.keys())
#                ch_sel = ch_df.unique_ch_name.tolist()
#                ch_sel = [str(k) for k in ch_sel]
                print('    selecting all channels for statistics')

            # calculate the fatigue properties from selected channels
            fatigue, tags_fatigue = {}, []
            if ch_fatigue_init is None:
                ch_fatigue = ch_sel
                print('    selecting all channels for fatigue')
            else:
                ch_fatigue = ch_fatigue_init

            for ch_id in ch_fatigue:
                chi = ch_dict[ch_id]['chi']
                signal = self.sig[:,chi]
                if neq is None:
                    neq_ = float(case['[duration]'])
                else:
                    neq_ = neq
                eq = self.res.calc_fatigue(signal, no_bins=no_bins, neq=neq_,
                                           m=m)
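                # eq holds one damage equivalent load per Woehler exponent
                # in m; as a sketch of the standard definition:
                #   S_eq(m) = ( sum_i(n_i * S_i**m) / neq_ )**(1/m)
                # with n_i the rainflow cycle count in amplitude bin S_i
                # (no_bins bins); the exact implementation lives in
                # calc_fatigue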

                # save in the fatigue results
                fatigue[ch_id] = {}
                fatigue[ch_id]['neq'] = neq_
                # when calc_fatigue succeeds, we should have as many items
                # as in m
                if len(eq) == len(m):
                    for eq_, m_ in zip(eq, m):
                        fatigue[ch_id]['m=%2.01f' % m_] = eq_
                # when it fails, we get an empty list back
                else:
                    for m_ in m:
                        fatigue[ch_id]['m=%2.01f' % m_] = np.nan

            # build the fatigue tags
            for m_ in m:
                tag = 'm=%2.01f' % m_
                tags_fatigue.append(tag)
            tags_fatigue.append('neq')

            # -----------------------------------------------------------------
            # define the pandas data frame dict on first run
            # -----------------------------------------------------------------
            # Only build the ch_sel collection once. By definition, the
            # statistics, fatigue and htc tags will not change
            if add_stats:
                # statistical parameters
                for statparam in list(stats.keys()):
                    df_dict[statparam] = []
#                # additional tags
#                for tag in tags:
#                    df_dict[tag] = []
                # fatigue data
                for tag in tags_fatigue:
                    df_dict[tag] = []
                add_stats = False

            for ch_id in ch_sel:

                chi = ch_dict[ch_id]['chi']
                # ch_name is not unique anymore, this doesn't work obviously!
                # use the channel index instead, that is unique
#                chi = ch_df[ch_df.unique_ch_name==ch_id].chi.values[0]

                # sig_stat = [(0=value,1=index),statistic parameter, channel]
                # stat params = 0 max, 1 min, 2 mean, 3 std, 4 range, 5 abs max
                # note that min, mean, std, and range are not relevant for index
                # values. Set to zero there.

                # -------------------------------------------------------------
                # Fill in all the values for the current data entry
                # -------------------------------------------------------------

                # the auxiliary columns
                try:
                    name = self.res.ch_details[chi,0]
                    unit = self.res.ch_details[chi,1]
                    desc = self.res.ch_details[chi,2]
                # the new channels from new_sigs are not in here
                except (IndexError, AttributeError):
                    name = ch_id
                    desc = ''
                    unit = ''
                df_dict['channel_name'].append(name)
                df_dict['channel_units'].append(unit)
                df_dict['channel_desc'].append(desc)
                df_dict['channel_nr'].append(chi)

                # each df line is a channel of case that needs to be id-eed
                df_dict[tag_chan].append(ch_id)

                # for all the statistics keys, save the values for the
                # current channel
                for statparam in list(stats.keys()):
                    df_dict[statparam].append(stats[statparam][chi])
                # and save the tags from the input htc file in order to
                # label each different case properly
                for tag in tags:
                    df_dict[tag].append(case[tag])
                # append any fatigue channels if applicable, otherwise nan
                if ch_id in fatigue:
                    for m_fatigue, eq_ in fatigue[ch_id].items():
                        df_dict[m_fatigue].append(eq_)
                else:
                    for tag in tags_fatigue:
                        # TODO: or should this be NaN?
                        df_dict[tag].append(np.nan)
            # when dealing with a lot of cases, save the stats data at
            # intermediate points to avoid memory issues
            if math.fmod(ii+1, saveinterval) == 0.0:
                df_dict2 = self._df_dict_check_datatypes(df_dict)
                # convert, save/update
                if isinstance(suffix, str):
                    ext = suffix
                elif suffix is True:
                    ext = '_%06i' % (ii+1)
                else:
                    ext = ''
#                dfs = self._df_dict_save(df_dict2, post_dir, sim_id, save=save,
#                                         update=update, csv=csv, suffix=ext)
                # TODO: test this first
                fname = os.path.join(post_dir, sim_id + '_statistics' + ext)
                dfs = misc.dict2df(df_dict2, fname, save=save, update=update,
                                   csv=csv, xlsx=xlsx, check_datatypes=False,
                                   complib=self.complib)

                df_dict2 = None
                df_dict = None
                add_stats = True

        # only save again when there is actual data in df_dict
        if df_dict is not None:
            # make consistent data types
            df_dict2 = self._df_dict_check_datatypes(df_dict)
            # convert, save/update
            if isinstance(suffix, str):
                ext = suffix
            elif suffix is True:
                ext = '_%06i' % ii
            else:
                ext = ''
#            dfs = self._df_dict_save(df_dict2, post_dir, sim_id, save=save,
#                                     update=update, csv=csv, suffix=ext)
            # TODO: test this first
            fname = os.path.join(post_dir, sim_id + '_statistics' + ext)
            dfs = misc.dict2df(df_dict2, fname, save=save, update=update,
                               csv=csv, xlsx=xlsx, check_datatypes=False,
                               complib=self.complib)

        return dfs

    def _add2newsigs(self, ch_dict, name, i_new_chans, new_sigs, addendum):

        ch_dict[name] = {}
        ch_dict[name]['chi'] = i_new_chans
        i_new_chans += 1
        return ch_dict, np.append(new_sigs, addendum, axis=1)

    # TODO: use the version in misc instead.
    def _df_dict_save(self, df_dict2, post_dir, sim_id, save=True,
                      update=False, csv=True, suffix=None):
        """
        Convert the df_dict to df and save/update.

        DEPRECATED, use misc.dict2df instead
        """
        if isinstance(suffix, str):
            fpath = os.path.join(post_dir, sim_id + '_statistics' + suffix)
        else:
            fpath = os.path.join(post_dir, sim_id + '_statistics')

        # in case converting to dataframe fails, fall back
        try:
            dfs = pd.DataFrame(df_dict2)
        except Exception as e:

            with open(fpath + '.pkl', 'wb') as FILE:
                pickle.dump(df_dict2, FILE, protocol=2)
            # check what went wrong
            misc.check_df_dict(df_dict2)
            print('failed to convert to data frame, saved as dict')
            raise e

#        # apply categoricals to objects
#        for column_name, column_dtype in dfs.dtypes.iteritems():
#            # applying categoricals mostly makes sense for objects
#            # we ignore all others
#            if column_dtype.name == 'object':
#                dfs[column_name] = dfs[column_name].astype('category')

        # and save/update the statistics database
        if save:
            if update:
                print('updating statistics: %s ...' % (post_dir + sim_id), end='')
                try:
                    dfs.to_hdf('%s.h5' % fpath, 'table', mode='r+', append=True,
                               format='table', complevel=9, complib=self.complib)
                except IOError:
                    print('Can not update, file does not exist. Saving instead'
                          '...', end='')
                    dfs.to_hdf('%s.h5' % fpath, 'table', mode='w',
                               format='table', complevel=9, complib=self.complib)
            else:
                print('saving statistics: %s ...' % (post_dir + sim_id), end='')
                if csv:
                    dfs.to_csv('%s.csv' % fpath)
                dfs.to_hdf('%s.h5' % fpath, 'table', mode='w',
                           format='table', complevel=9, complib=self.complib)

            print('DONE!!\n')

        return dfs

    # TODO: use the version in misc instead.
    def _df_dict_check_datatypes(self, df_dict):
        """
        there might be a mix of strings and numbers now, see if we can have
        the same data type throughout a column
        nasty hack: because of the unicode -> string conversion we might not
        overwrite the same key in the dict.

        DEPRECATED, use misc.df_dict_check_datatypes instead
        """
        # FIXME: this approach will result in twice the memory usage though...
        # we can not pop/delete items from a dict while iterating over it
        df_dict2 = {}
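        # the cast cascade below tries int32 first for each column, widens to
        # int64/float64 on OverflowError, and falls back to float64 or str on
        # ValueError/TypeError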
        for colkey, col in df_dict.items():
            # if we have a list, convert to string
            if type(col[0]).__name__ == 'list':
                for ii, item in enumerate(col):
                    col[ii] = '**'.join(item)
            # if we already have an array (statistics) or a list of numbers
            # do not try to cast into another data type, because downcasting
            # in that case will not raise any exception
            elif type(col[0]).__name__[:3] in ['flo', 'int', 'nda']:
                df_dict2[str(colkey)] = np.array(col)
                continue
            # in case we have unicodes instead of strings, we need to convert
            # to strings otherwise the saved .h5 file will have pickled elements
            try:
                df_dict2[str(colkey)] = np.array(col, dtype=np.int32)
            except OverflowError:
                try:
                    df_dict2[str(colkey)] = np.array(col, dtype=np.int64)
                except OverflowError:
                    df_dict2[str(colkey)] = np.array(col, dtype=np.float64)
            except ValueError:
                try:
                    df_dict2[str(colkey)] = np.array(col, dtype=np.float64)
                except ValueError:
                    df_dict2[str(colkey)] = np.array(col, dtype=str)
            except TypeError:
                # in all other cases, make sure we have converted them to
                # strings and NOT unicode
                df_dict2[str(colkey)] = np.array(col, dtype=str)
            except Exception as e:
                print('failed to convert column %s to single data type' % colkey)
                raise e
        return df_dict2

    def fatigue_lifetime(self, dfs, neq_life, res_dir='res/', fh_lst=None,
                         dlc_folder="dlc%s_iec61400-1ed3/", extra_cols=[],
                         save=False, update=False, csv=False, new_sim_id=False,
                         xlsx=False, years=20.0, silent=False):
        """
        Calculate the fatigue over a selection of cases and indicate how many
        hours each case contributes to its life time.

        This approach can only work reliably if the common DLC folder
        structure is followed. This also means that a 'dlc_config.xlsx' Excel
        file is required in the HAWC2 root directory (as defined in the
        [run_dir] tag).

        Parameters
        ----------

        dfs : DataFrame
            Statistics Pandas DataFrame. When extra_cols is not defined, it
            should only hold the results of one standard organized DLC (one
            turbine, one inflow case).

        neq_life : float
            Reference number of cycles. Usually, neq is either set to 10e6,
            10e7 or 10e8.

        res_dir : str, default='res/'
            Base directory of the results. Results would be located in
            res/dlc_folder/*.sel. Only relevant when fh_lst is None.

        dlc_folder : str, default="dlc%s_iec61400-1ed3/"
            String with the DLC subfolder names. One string substitution is
            required (%s), and should represent the DLC number (without comma
            or point). Not relevant when fh_lst is defined.

        extra_cols : list, default=[]
            The included columns are the material constants, and each row is
            a channel. When multiple DLC cases are included in dfs, the user
            has to define additional columns in order to distinguish between
            the DLC cases.

        fh_lst : list, default=None
            Number of hours for each case over its life time. Format:
            [(filename, hours),...] where filename is the name of the file
            (can be a full path, but only the base path is considered), hours
            is the number of hours over the life time. When fh_lst is set,
            years, res_dir, dlc_folder and dlc_name are not used.

        years : float, default=20
            Total life time expressed in years, only relevant when fh_lst is
            None.

        Returns
        -------

        df_Leq : DataFrame
            Pandas DataFrame with the life time equivalent load for the given
            neq, all the channels, and a range of material parameters m.
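
        Examples
        --------
        A minimal, hypothetical call, with df_stats as returned by the
        statistics method (``cc`` being the populated cases object this
        method belongs to)::

            df_Leq = cc.fatigue_lifetime(df_stats, 1e7, years=20.0)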
        """
        if not silent:
            print('Calculating life time fatigue load')
        if not isinstance(neq_life, float):
            neq_type = type(neq_life).__name__
            msg = 'neq_life (reference nr of cycles for life time fatigue '
            msg += 'load) should be a float instead of %s' % neq_type
            raise ValueError(msg)

        # get some basic parameters required to calculate statistics
        try:
            case = list(self.cases.keys())[0]
        except IndexError:
            if not silent:
                print('no cases to select so no statistics, aborting ...')
            return None
        post_dir = self.cases[case]['[post_dir]']
        if not new_sim_id:
            # select the sim_id from a random case
            sim_id = self.cases[case]['[sim_id]']
        else:
            sim_id = new_sim_id

        # FIXME: for backward compatibility, the column name of the unique
        # channel name has been changed in the past....
        if 'unique_ch_name' in dfs.columns:
            chan_col_name  = 'unique_ch_name'
        else:
            chan_col_name  = 'channel'

        if fh_lst is None:
            # FIXME: wb has overlap with dlc_config.xlsx, and shape_k doesn't
            # seem to be used by DLCHighLevel
            wb = WeibullParameters()
            if 'Weibull' in self.config:
                for key in self.config['Weibull']:
                    setattr(wb, key, self.config['Weibull'][key])

            # we assume the run_dir (root) is the same everywhere
            run_dir = self.cases[case]['[run_dir]']
            fname = os.path.join(run_dir, 'dlc_config.xlsx')
            dlc_cfg = dlc.DLCHighLevel(fname, shape_k=wb.shape_k,
                                       fail_on_resfile_not_found=True)
            # if you need all DLCs, make sure to have %s in the file name
            dlc_cfg.res_folder = os.path.join(run_dir, res_dir, dlc_folder)
            # no need to build a list of result files, we already have it from
            # the statistics analysis
            # TODO: could be faster if working with df directly, but how to
            # assure your res_dir always ends with the path separator?
            # only take the values from 1 channel, not all of them!!
            # FIXME: breaks when not all channels are present for all cases !
            # solution: set channel "Time" as a minimum required channel!
            val = dfs[chan_col_name].values[0]
            sel = dfs[dfs[chan_col_name]==val]
            p1, p2 = sel['[res_dir]'].values, sel['[case_id]'].values
            files = [os.path.join(q1, q2) + '.sel' for q1, q2 in zip(p1, p2)]
            fh_lst = dlc_cfg.file_hour_lst(years=years, files=files)
        # now we have a full path to the result files, but we only need the
        # case_id to identify the corresponding entry from the statistics
        # DataFrame (excluding the .sel extension)
        case_ids = [os.path.basename(k[0].replace('.sel', '')) for k in fh_lst]
        hours = [k[1] for k in fh_lst]

        # save how many hours each case is active over its life time, for
        # debugging and inspection purposes.
        # FIXME: this should be somewhere in its own method or something,
        # and duplication with what is in AEP should be removed
        fname = os.path.join(post_dir, sim_id + '_Leq_hourlist')
        dict_Leq_h = {'case_id':case_ids, 'hours':hours}
        df_Leq_h = misc.dict2df(dict_Leq_h, fname, update=update, csv=csv,
                                save=save, check_datatypes=True, xlsx=xlsx,
                                complib=self.complib)

        # ---------------------------------------------------------------------
        # column definitions
        # ---------------------------------------------------------------------
        # available material constants
        ms, cols = [], []
        for key in dfs:
            if key[:2] == 'm=':
                ms.append(key)
        # when multiple DLC cases are included, add extra cols to identify each
        # DLC group. Make a copy, because extra_cols does not get re-initiated
        # when defined as an optional keyword argument
        extra_cols_ = copy.copy(extra_cols + [chan_col_name])
        cols = copy.copy(ms)
        cols.extend(extra_cols_)
        # ---------------------------------------------------------------------

        # Build the DataFrame; we do not have a unique channel index
        dict_Leq = {col:[] for col in cols}
        # index on case_id on the original DataFrame so we can select accordingly
        dfs = dfs.set_index('[case_id]')
        # select for each channel all the cases
        for grname, gr in dfs.groupby(dfs[chan_col_name]):
            # if one m has any nan's, assume none of them are good and throw
            # away
#            if np.isnan(gr[ms[0]].values).any():
#                sel_rows.pop(grname)
#                continue
            # select the cases in the same order as the corresponding hours
            try:
                sel_sort = gr.loc[case_ids]
            except KeyError:
                if not silent:
                    print('    ignore sensor for Leq:', grname)
                continue
            for col in extra_cols_:
                # at this stage we should only have one case left, so its
                # identifiers should be unique as well
                val_unique = sel_sort[col].unique()
                if len(val_unique) > 1:
                    print('found %i sets instead of 1:' % len(val_unique))
                    print(val_unique)
                    raise ValueError('For Leq load, the given DataFrame can '
                                     'only hold one complete DLC set.')
                # values of the identifier columns for each case. We do this
                # in case the original dfs holds multiple DLC cases.
                dict_Leq[col].append(sel_sort[col].unique()[0])

            # R_eq is assumed to be expressed as the 1Hz equivalent load
            # where neq is set to the simulation length
#            neq_1hz = sel_sort['neq'].values
            for m in ms:
                m_ = float(m.split('=')[1])
                # sel_sort[m] holds the equivalent loads for each of the DLC
                # cases: e.g. all the different wind speeds for dlc1.2
                # do not multiply out neq_1hz from R_eq
                R_eq_mod = np.power(sel_sort[m].values, m_)
                # R_eq_mod will have to be scaled from its simulation length
                # to 1 hour (the hour distribution is in hours...). Since the
                # simulation time has not been multiplied out of R_eq_mod yet,
                # we can just multiply with 3600 (instead of doing 3600/neq)
                tmp = (R_eq_mod * np.array(hours) * 3600).sum()
                # the effective Leq for each of the material constants
                dict_Leq[m].append(math.pow(tmp/neq_life, 1.0/m_))
                # the following is twice as slow:
                # [i*j for (i,j) in zip(sel_sort[m].values.tolist(),hours)]
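        # as a sketch of the formula implemented above: with R_eq_j the 1Hz
        # equivalent load of case j (neq equal to the simulation length in
        # seconds) and h_j its life time hours:
        #   Leq(m) = ( sum_j( R_eq_j**m * h_j * 3600 ) / neq_life )**(1/m)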

#        collens = misc.check_df_dict(dict_Leq)
        # make consistent data types, and convert to DataFrame
        fname = os.path.join(post_dir, sim_id + '_Leq')
        df_Leq = misc.dict2df(dict_Leq, fname, save=save, update=update,
                              csv=csv, check_datatypes=True, xlsx=xlsx,
                              complib=self.complib)

        # only keep the ones that do not have nan's (only works with index)
        return df_Leq

    def AEP(self, dfs, fh_lst=None, ch_powe='DLL-2-inpvec-2', extra_cols=[],
            res_dir='res/', dlc_folder="dlc%s_iec61400-1ed3/", csv=False,
            new_sim_id=False, save=False, years=20.0, update=False, xlsx=False):

        """
        Calculate the Annual Energy Production (AEP) for DLC1.2 cases.

        Parameters
        ----------

        dfs : DataFrame
            Statistics Pandas DataFrame. When extra_cols is not defined, it
            should only hold the results of one standard organized DLC (one
            turbine, one inflow case).

        fh_lst : list, default=None
            Number of hours for each case over its life time. Format:
            [(filename, hours),...] where filename is the name of the file
            (can be a full path, but only the base path is considered), hours
            is the number of hours over the life time. When fh_lst is set,
            dlc_folder and dlc_name are not used.

        ch_powe : string, default='DLL-2-inpvec-2'
            Name of the power output channel used for the AEP calculation.

        extra_cols : list, default=[]
            The included column is just the AEP, and each row is
            a channel. When multiple DLC cases are included in dfs, the user
            has to define additional columns in order to distinguish between
            the DLC cases.

        res_dir : str, default='res/'
            Base directory of the results. Results would be located in
            res/dlc_folder/*.sel

        dlc_folder : str, default="dlc%s_iec61400-1ed3/"
            String with the DLC subfolder names. One string substitution is
            required (%s), and should represent the DLC number (without comma
            or point). Not relevant when fh_lst is defined.
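
        Examples
        --------
        Hypothetical usage, with df_stats as returned by the statistics
        method::

            df_AEP = cc.AEP(df_stats, ch_powe='DLL-2-inpvec-2')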
        """

        # get some basic parameters required to calculate statistics
        try:
            case = list(self.cases.keys())[0]
        except IndexError:
            print('no cases to select so no statistics, aborting ...')
            return None
        post_dir = self.cases[case]['[post_dir]']
        if not new_sim_id:
            # select the sim_id from a random case
            sim_id = self.cases[case]['[sim_id]']
        else:
            sim_id = new_sim_id

        # FIXME: for backward compatibility, the column name of the unique
        # channel name has been changed in the past....
        if 'unique_ch_name' in dfs.columns:
            chan_col_name  = 'unique_ch_name'
        else:
            chan_col_name  = 'channel'

        if fh_lst is None:
            wb = WeibullParameters()
            if 'Weibull' in self.config:
                for key in self.config['Weibull']:
                    setattr(wb, key, self.config['Weibull'][key])

            # we assume the run_dir (root) is the same everywhere
            run_dir = self.cases[list(self.cases.keys())[0]]['[run_dir]']
            fname = os.path.join(run_dir, 'dlc_config.xlsx')
            dlc_cfg = dlc.DLCHighLevel(fname, shape_k=wb.shape_k)
            # if you need all DLCs, make sure to have %s in the file name
            dlc_cfg.res_folder = os.path.join(run_dir, res_dir, dlc_folder)
            # TODO: could be faster if working with df directly, but how to
            # assure your res_dir always ends with the path separator?
            # FIXME: breaks when not all channels are present for all cases !
            # solution: set channel "Time" as a minimum required channel!
            val = dfs[chan_col_name].values[0]
            sel = dfs[dfs[chan_col_name]==val]
            p1, p2 = sel['[res_dir]'].values, sel['[case_id]'].values
            files = [os.path.join(q1, q2) + '.sel' for q1, q2 in zip(p1, p2)]
            fh_lst = dlc_cfg.file_hour_lst(years=1.0, files=files)
        # now we have a full path to the result files, but we only need the
        # case_id to identify the corresponding entry from the statistics
        # DataFrame (excluding the .sel extension)
        def basename(k):
            return os.path.basename(k[0].replace('.sel', ''))
        fh_lst_basename = [(basename(k), k[1]) for k in fh_lst]
        # only take dlc12 for power production
        case_ids = [k[0] for k in fh_lst_basename if k[0][:5]=='dlc12']
        hours = [k[1] for k in fh_lst_basename if k[0][:5]=='dlc12']

        # save how many hours each case is active for the AEP calculation, for
        # debugging and inspection purposes.
        # FIXME: this should be somewhere in its own method or something,
        # and duplication with what is in fatigue_lifetime should be removed
        fname = os.path.join(post_dir, sim_id + '_AEP_hourlist')
        dict_AEP_h = {'case_id':case_ids, 'hours':hours}
        df_AEP_h = misc.dict2df(dict_AEP_h, fname, update=update, csv=csv,
                                save=save, check_datatypes=True, xlsx=xlsx,
                                complib=self.complib)

        # and select only the power channels
        dfs_powe = dfs[dfs[chan_col_name]==ch_powe]

        # by default we have AEP as a column
        cols = ['AEP']
        cols.extend(extra_cols)
        # Build the DataFrame; we do not have a unique channel index
        dict_AEP = {col:[] for col in cols}
        # index on case_id on the original DataFrame so we can select accordingly
        dfs_powe = dfs_powe.set_index('[case_id]')

        # select the cases in the same order as the corresponding hours
        sel_sort = dfs_powe.loc[case_ids]
        for col in extra_cols:
            # at this stage we should only have one case left, so its
            # identifiers should be unique as well
            val_unique = sel_sort[col].unique()
            if len(val_unique) > 1:
                print('found %i sets instead of 1:' % len(val_unique))
                print(val_unique)
                raise ValueError('For AEP, the given DataFrame can only hold '
                                 'one complete DLC set. Make sure to identify '
                                 'the proper extra_cols to identify the '
                                 'different DLC sets.')
            # values of the identifier columns for each case. We do this
            # in case the original dfs holds multiple DLC cases.
            dict_AEP[col].append(sel_sort[col].unique()[0])

        # and the AEP: take the average, multiply with the duration
#        duration = sel_sort['[duration]'].values
#        power_mean = sel_sort['mean'].values
        AEP = (sel_sort['mean'].values * np.array(hours)).sum()
        dict_AEP['AEP'].append(AEP)
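        # i.e. AEP = sum_j( P_mean_j * h_j ), with h_j the hours per year for
        # case j following from the Weibull distribution (years=1.0 above);
        # the energy unit follows directly from the power channel unit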

        # make consistent data types, and convert to DataFrame
        fname = os.path.join(post_dir, sim_id + '_AEP')
        df_AEP = misc.dict2df(dict_AEP, fname, update=update, csv=csv,
                              save=save, check_datatypes=True, xlsx=xlsx,
                              complib=self.complib)

        return df_AEP

    def stats2dataframe(self, ch_sel=None, tags=['[seed]','[windspeed]']):
        """
        Convert the archaic statistics dictionary of a group of cases to
        a more convenient pandas dataframe format.

        DEPRECATED, use statistics instead!!

        Parameters
        ----------

        ch_sel : dict, default=None
            Map short names to the channel id's defined in ch_dict in order to
            have more human readable column names in the pandas dataframe. By
            default, if ch_sel is None, a dataframe for each channel in the
            ch_dict (so in the HAWC2 output) will be created. When ch_sel is
            defined, only those channels are considered.
            ch_sel[short name] = full ch_dict identifier

        tags : list, default=['[seed]','[windspeed]']
            Select which tag values from cases should be included in the
            dataframes. This will help in selecting and identifying the
            different cases.

        Returns
        -------

        dfs : dict
            Dictionary of dataframes, where the key is the channel name of
            the output (that was optionally defined in ch_sel), and the value
            is the dataframe containing the statistical values for all the
            different selected cases.
        """

        df_dict = {}

        for cname, case in self.cases.items():

            # make sure the selected tags exist
            if len(tags) != len(set(case) & set(tags)):
                raise KeyError('not all selected tags exist in cases')

            sig_stats = self.stats_dict[cname]['sig_stats']
            ch_dict = self.stats_dict[cname]['ch_dict']

            if ch_sel is None:
                ch_sel = {i: i for i in ch_dict}
            for ch_short, ch_name in ch_sel.items():

                chi = ch_dict[ch_name]['chi']
                # sig_stat = [(0=value,1=index),statistic parameter, channel]
                # stat params = 0 max, 1 min, 2 mean, 3 std, 4 range, 5 abs max
                # note that min, mean, std, and range are not relevant for index
                # values. Set to zero there.
                try:
                    df_dict[ch_short]['case name'].append(cname)
                    df_dict[ch_short]['max'].append(   sig_stats[0,0,chi])
                    df_dict[ch_short]['min'].append(   sig_stats[0,1,chi])
                    df_dict[ch_short]['mean'].append(  sig_stats[0,2,chi])
                    df_dict[ch_short]['std'].append(   sig_stats[0,3,chi])
                    df_dict[ch_short]['range'].append( sig_stats[0,4,chi])
                    df_dict[ch_short]['absmax'].append(sig_stats[0,5,chi])
                    for tag in tags:
                        df_dict[ch_short][tag].append(case[tag])
                except KeyError:
                    df_dict[ch_short] = {'case name' : [cname]}
                    df_dict[ch_short]['max']    = [sig_stats[0,0,chi]]
                    df_dict[ch_short]['min']    = [sig_stats[0,1,chi]]
                    df_dict[ch_short]['mean']   = [sig_stats[0,2,chi]]
                    df_dict[ch_short]['std']    = [sig_stats[0,3,chi]]
                    df_dict[ch_short]['range']  = [sig_stats[0,4,chi]]
                    df_dict[ch_short]['absmax'] = [sig_stats[0,5,chi]]
                    for tag in tags:
                        df_dict[ch_short][tag] = [ case[tag] ]

        # and create for each channel a dataframe
        dfs = {}
        for ch_short, df_values in df_dict.items():
            dfs[ch_short] = pd.DataFrame(df_values)

        return dfs

    def load_azimuth(self, azi, load, sectors=360):
        """
        Establish load dependency on rotor azimuth angle
        """

        # sort on azimuth angle
        isort = np.argsort(azi)
        azi = azi[isort]
        load = load[isort]

        azi_sel = np.linspace(0, 360, num=sectors)
        load_sel = np.interp(azi_sel, azi, load)

        return azi_sel, load_sel

    def find_windchan_hub(self):
        """
        """
        # if we sort we'll get the largest absolute coordinate last
        for ch in sorted(self.res.ch_dict.keys()):
            if ch[:29] == 'windspeed-global-Vy-0.00-0.00':
                chan_found = ch