Skip to content
Snippets Groups Projects
misc.py 40.1 KiB
Newer Older
                loc=0, scale=scale), x_hist, y_hist)

    pdf_fit =  stats.lognorm.pdf(xnew, shape_out, loc=0, scale=scale_out)
    # normalize
    width = np.diff(x_hist).mean()
    pdf_fit = pdf_fit / (pdf_fit * width).sum()
    return shape_out, scale_out, pdf_fit


def histfit_arbritrary(edges, pdf, edges_new, resolution=100):
    """Re-bin based on the CDF of a PDF. Assume normal distribution within
    a bin to transform the CDF to higher resolution.

    Parameters
    ----------

    edges : ndarray(n+1)
        edges of the bins, inlcuding most left and right edges.

    pdf : ndarray(n)
        probability of the bins

    edges_new : ndarray(m+1)
        edges of the new bins

    resolution : int
        resolution of the intermediate CDF used for re-fitting.


    Returns
    -------

    centers_new : ndarray(m)

    widths_new : ndarray(m)

    pdf_new : ndarray(m)

    """

    x_hd = np.ndarray((0,))
    cdf_hd = np.ndarray((0,))
    binw = np.ndarray((0,))

    for i in range(len(pdf)):
        # HD grid for x
        x_inc = np.linspace(edges[i], edges[i+1], num=resolution)

        # FIXME: let the distribution in a bin be a user configurable input
        # define a distribution within the bin: norm
        shape = 2.5
        scale = shape*2/10
        x_inc = np.linspace(0, scale*10, num=resolution)
        cdf_inc = stats.norm.cdf(x_inc, shape, scale=scale)

        # scale cdf_inc and x-coordinates
        cdf_inc_scale = pdf[i] * cdf_inc / cdf_inc[-1]
        binw = edges[i+1] - edges[i]
        x_inc_scale = edges[i] + (binw * x_inc / x_inc[-1])

        # add to the new hd corodinates and cdf
        x_hd = np.append(x_hd, x_inc_scale)
        if i == 0:
            cdf_i  = 0
        else:
            cdf_i = cdf_hd[-1]
        cdf_hd = np.append(cdf_hd, cdf_inc_scale + cdf_i)

#        plt.plot(x_inc, cdf_inc)
#        plt.plot(x_inc_scale, cdf_inc_scale)

    cdf_new = interp(x_hd, cdf_hd, edges_new)
    # last point includes everything that comes after
    cdf_new[-1] = 1
    pdf_new = np.diff(cdf_new)
    widths_new = np.diff(edges_new)
    centers_new = widths_new + edges[0]
    # the first bin also includes everything that came before
    pdf_new[0] += cdf_new[0]
    pdf_new /= pdf_new.sum()

#    plt.plot(x_hd, cdf_hd)
#    plt.plot(edges_new, cdf_new, 'rs')
#
#    plt.bar(edges_new[:-1], pdf_new, width=widths_new, color='b')
#    plt.bar(edges[:-1], pdf, width=np.diff(edges), color='r', alpha=0.7)

    return centers_new, widths_new, pdf_new


def hist_centers2edges(centers):
    """Given the centers of bins, return its edges and bin widths.
    """

    binw_c = centers[1:] - centers[:-1]
    edges = np.ndarray((len(centers)+1,))
    edges[0] = centers[0] - binw_c[0]/2.0
    edges[-1] = centers[-1] + binw_c[-1]/2.0
    edges[1:-1] = centers[1:] - binw_c/2.0
    binw_e = edges[1:] - edges[:-1]

    return edges, binw_e


def df_dict_check_datatypes(df_dict):
    """
    there might be a mix of strings and numbers now, see if we can have
    the same data type throughout a column
    nasty hack: because of the unicode -> string conversion we might not
    overwrite the same key in the dict.
    """
    # FIXME: this approach will result in twice the memory useage though...
    # we can not pop/delete items from a dict while iterating over it
    df_dict2 = {}
    for colkey, col in df_dict.items():
        # if we have a list, convert to string
        if type(col[0]).__name__ == 'list':
            for ii, item in enumerate(col):
                col[ii] = '**'.join(item)
        # if we already have an array (statistics) or a list of numbers
        # do not try to cast into another data type, because downcasting
        # in that case will not raise any exception
        elif type(col[0]).__name__[:3] in ['flo', 'int', 'nda']:
            df_dict2[str(colkey)] = np.array(col)
            continue
        # in case we have unicodes instead of strings, we need to convert
        # to strings otherwise the saved .h5 file will have pickled elements
        try:
            df_dict2[str(colkey)] = np.array(col, dtype=np.int32)
        except OverflowError:
            try:
                df_dict2[str(colkey)] = np.array(col, dtype=np.int64)
            except OverflowError:
                df_dict2[str(colkey)] = np.array(col, dtype=np.float64)
        except ValueError:
            try:
                df_dict2[str(colkey)] = np.array(col, dtype=np.float64)
            except ValueError:
                df_dict2[str(colkey)] = np.array(col, dtype=np.str)
        except TypeError:
            # in all other cases, make sure we have converted them to
            # strings and NOT unicode
            df_dict2[str(colkey)] = np.array(col, dtype=np.str)
        except Exception as e:
            print('failed to convert column %s to single data type' % colkey)
            raise(e)
    return df_dict2


def dict2df(df_dict, fname, save=True, update=False, csv=False, colsort=None,
            check_datatypes=False, rowsort=None, csv_index=False, xlsx=False,
            complib='blosc'):
        """
        Convert the df_dict to df and save/update if required. If converting
        to df fails, pickle the object. Optionally save as csv too.

        Parameters
        ----------

        df_dict : dict
            Dictionary that will be converted to a DataFrame

        fname : str
            File name excluding the extension. .pkl, .h5 and/or .csv will be
            added.
        """
        if check_datatypes:
            df_dict = df_dict_check_datatypes(df_dict)

        # in case converting to dataframe fails, fall back
        try:
            if colsort is not None:
                dfs = pd.DataFrame(df_dict)[colsort]
#                try:
#                    dfs = dfs[colsort]
#                except KeyError as e:
#                    print('Re-ordering the columns failed. colsort cols are:')
#                    print(sorted(colsort))
#                    print('Actual columns:')
#                    print(sorted(dfs.keys()))
#                    print('&', set(colsort) & set(dfs.keys()))
#                    print('-', set(colsort) - set(dfs.keys()))
#                    raise e
            else:
                dfs = pd.DataFrame(df_dict)
        except Exception as e:
            print('failed to convert to data frame', end='')
            if fname is not None:
                with open(fname + '.pkl', 'wb') as f:
                    pickle.dump(df_dict, f, protocol=2)
                print(', saved as dict')
            else:
                print('')
            print('df_dict has following columns and corresponding nr of rows')
            check_df_dict(df_dict)
            raise(e)

        if rowsort is not None:
            dfs.sort(columns=rowsort, inplace=True)

#        # apply categoricals to objects: reduce in memory footprint. In theory
#        # when using compression on a saved hdf5 object, this shouldn't make
#        # any difference.
#        for column_name, column_dtype in dfs.dtypes.iteritems():
#            # applying categoricals mostly makes sense for objects
#            # we ignore all others
#            if column_dtype.name == 'object':
#                dfs[column_name] = dfs[column_name].astype('category')

        # and save/update the statistics database
        if save and fname is not None:
            if update:
                print('updating: %s ...' % (fname), end='')
                try:
                    dfs.to_hdf('%s.h5' % fname, 'table', mode='r+', append=True,
                               format='table', complevel=9, complib=complib)
                except IOError:
                    print('Can not update, file does not exist. Saving instead'
                          '...', end='')
                    dfs.to_hdf('%s.h5' % fname, 'table', mode='w',
                               format='table', complevel=9, complib=complib)
            else:
                print('saving: %s ...' % (fname), end='')
                if csv:
                    dfs.to_csv('%s.csv' % fname, index=csv_index)
                if xlsx:
                    dfs.to_excel('%s.xlsx' % fname, index=csv_index)
                dfs.to_hdf('%s.h5' % fname, 'table', mode='w',
                           format='table', complevel=9, complib=complib)

            print('DONE!!\n')

        return dfs


class Tests(unittest.TestCase):

    def setUp(self):
        pass

    def test_rebin1(self):
        hist = np.array([2,5,5,9,2,6])
        bins = np.arange(7)
        nrbins = 3
        hist_, bins_ = rebin(hist, bins, nrbins)
        answer = np.array([7, 14, 8])
        self.assertTrue(np.allclose(answer, hist_))
        binanswer = np.array([0.0, 2.0, 4.0, 6.0])
        self.assertTrue(np.allclose(binanswer, bins_))

    def test_rebin2(self):
        hist = np.array([2,5,5,9,2,6])
        bins = np.arange(7)
        nrbins = 1
        hist_, bins_ = rebin(hist, bins, nrbins)
        answer = np.array([hist.sum()])
        self.assertTrue(np.allclose(answer, hist_))
        binanswer = np.array([bins[0],bins[-1]])
        self.assertTrue(np.allclose(binanswer, bins_))

    def test_rebin3(self):
        hist = np.array([1,1,1])
        bins = np.arange(4)
        nrbins = 2
        hist_, bins_ = rebin(hist, bins, nrbins)
        answer = np.array([1.5, 1.5])
        self.assertTrue(np.allclose(answer, hist_))
        binanswer = np.array([0, 1.5, 3.0])
        self.assertTrue(np.allclose(binanswer, bins_))

    def test_rebin4(self):
        hist = np.array([1,1,1])
        bins = np.arange(2, 14, 3)
        nrbins = 2
        hist_, bins_ = rebin(hist, bins, nrbins)
        answer = np.array([1.5, 1.5])
        self.assertTrue(np.allclose(answer, hist_))
        binanswer = np.array([2, 6.5, 11.0])
        self.assertTrue(np.allclose(binanswer, bins_))

    def test_rebin5(self):
        hist = np.array([1,4,2,5,6,11,9,10,8,0.5])
        bins = np.linspace(-2, 10, 11)
        nrbins = 8
        hist_, bins_ = rebin(hist, bins, nrbins)
        answer = np.array([2, 4, 4.75, 7.25, 13.25, 11.75, 11, 2.5])
        self.assertTrue(np.allclose(answer, hist_))
        binanswer = np.array([-2., -0.5, 1., 2.5, 4., 5.5, 7., 8.5, 10.0])
        self.assertTrue(np.allclose(binanswer, bins_))


if __name__ == '__main__':
    unittest.main()