def df_dict_check_datatypes(df_dict):
    """
    there might be a mix of strings and numbers now, see if we can have
    the same data type throughout a column
    nasty hack: because of the unicode -> string conversion we might not
    overwrite the same key in the dict.
    """
    # FIXME: this approach will result in twice the memory useage though...
    # we can not pop/delete items from a dict while iterating over it
    df_dict2 = {}
    for colkey, col in df_dict.items():
        # if we have a list, convert it to a single '**' joined string
        if isinstance(col[0], list):
            for ii, item in enumerate(col):
                col[ii] = '**'.join(item)
        # if we already have an array (statistics) or a list of numbers
        # do not try to cast into another data type, because downcasting
        # in that case will not raise any exception
        elif type(col[0]).__name__[:3] in ['flo', 'int', 'nda']:
            df_dict2[str(colkey)] = np.array(col)
            continue
        # in case we have unicodes instead of strings, we need to convert
        # to strings otherwise the saved .h5 file will have pickled elements
        try:
            df_dict2[str(colkey)] = np.array(col, dtype=np.int32)
        except OverflowError:
            try:
                df_dict2[str(colkey)] = np.array(col, dtype=np.int64)
            except OverflowError:
                df_dict2[str(colkey)] = np.array(col, dtype=np.float64)
        except ValueError:
            try:
                df_dict2[str(colkey)] = np.array(col, dtype=np.float64)
            except ValueError:
                df_dict2[str(colkey)] = np.array(col, dtype=str)
        except TypeError:
            # in all other cases, make sure we have converted them to
            # strings and NOT unicode
            df_dict2[str(colkey)] = np.array(col, dtype=str)
        except Exception as e:
            print('failed to convert column %s to single data type' % colkey)
            raise
    return df_dict2
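

def _example_df_dict_check_datatypes():
    # Illustrative sketch only (hypothetical data): mixed unicode/number
    # columns are cast to homogeneous numpy arrays and list valued cells are
    # flattened into a single '**' joined string, so the dict can later be
    # saved as an HDF5 table without pickled object columns.
    df_dict = {u'case': [u'a01', u'a02'],
               u'tags': [['yaw', 'coarse'], ['yaw', 'fine']],
               u'seed': [1, 2]}
    return df_dict_check_datatypes(df_dict)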


def dict2df(df_dict, fname, save=True, update=False, csv=False, colsort=None,
            check_datatypes=False, rowsort=None, csv_index=False, xlsx=False,
            complib='blosc'):
        """
        Convert the df_dict to df and save/update if required. If converting
        to df fails, pickle the object. Optionally save as csv too.

        Parameters
        ----------

        df_dict : dict
            Dictionary that will be converted to a DataFrame

        fname : str
            File name excluding the extension. .pkl, .h5 and/or .csv will be
            added.
        """
        if check_datatypes:
            df_dict = df_dict_check_datatypes(df_dict)

        # in case converting to dataframe fails, fall back
        try:
            if colsort is not None:
                dfs = pd.DataFrame(df_dict)[colsort]
#                try:
#                    dfs = dfs[colsort]
#                except KeyError as e:
#                    print('Re-ordering the columns failed. colsort cols are:')
#                    print(sorted(colsort))
#                    print('Actual columns:')
#                    print(sorted(dfs.keys()))
#                    print('&', set(colsort) & set(dfs.keys()))
#                    print('-', set(colsort) - set(dfs.keys()))
#                    raise e
            else:
                dfs = pd.DataFrame(df_dict)
        except Exception as e:
            print('failed to convert to data frame', end='')
            if fname is not None:
                with open(fname + '.pkl', 'wb') as f:
                    pickle.dump(df_dict, f, protocol=2)
                print(', saved as dict')
            else:
                print('')
            print('df_dict has following columns and corresponding nr of rows')
            check_df_dict(df_dict)
            raise

        if rowsort is not None:
            dfs.sort_values(by=rowsort, inplace=True)

#        # apply categoricals to objects: reduce in memory footprint. In theory
#        # when using compression on a saved hdf5 object, this shouldn't make
#        # any difference.
#        for column_name, column_dtype in dfs.dtypes.iteritems():
#            # applying categoricals mostly makes sense for objects
#            # we ignore all others
#            if column_dtype.name == 'object':
#                dfs[column_name] = dfs[column_name].astype('category')

        # and save/update the statistics database
        if save and fname is not None:
            if update:
                print('updating: %s ...' % (fname), end='')
                try:
                    dfs.to_hdf('%s.h5' % fname, 'table', mode='r+', append=True,
                               format='table', complevel=9, complib=complib)
                except IOError:
                    print('Cannot update, file does not exist. Saving instead'
                          ' ...', end='')
                    dfs.to_hdf('%s.h5' % fname, 'table', mode='w',
                               format='table', complevel=9, complib=complib)
            else:
                print('saving: %s ...' % (fname), end='')
                if csv:
                    dfs.to_csv('%s.csv' % fname, index=csv_index)
                if xlsx:
                    dfs.to_excel('%s.xlsx' % fname, index=csv_index)
                dfs.to_hdf('%s.h5' % fname, 'table', mode='w',
                           format='table', complevel=9, complib=complib)

            print('DONE!!\n')

        return dfs
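

def _example_dict2df():
    # Illustrative sketch only (hypothetical file name 'example_stats'):
    # build a dict of equal-length columns, let dict2df coerce the column
    # dtypes and write an HDF5 table, with a csv copy next to it.
    df_dict = {u'case': [u'a01', u'a02', u'a03'],
               u'windspeed': [4.0, 6.0, 8.0],
               u'seed': [1, 2, 3]}
    dfs = dict2df(df_dict, 'example_stats', save=True, csv=True,
                  check_datatypes=True,
                  colsort=['case', 'seed', 'windspeed'])
    return dfs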


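# NOTE: rebin() itself is defined earlier in misc.py and is not part of this
# excerpt. The helper below is only an illustrative sketch of the
# overlap-weighted rebinning behaviour the tests expect (old counts are
# redistributed over nrbins equally spaced bins in proportion to the overlap
# between old and new bin edges); it is not the actual implementation.
def _rebin_sketch(hist, bins, nrbins):
    bins_ = np.linspace(bins[0], bins[-1], nrbins + 1)
    hist_ = np.zeros(nrbins)
    for i in range(len(hist)):
        lo, hi = bins[i], bins[i + 1]
        for j in range(nrbins):
            # fraction of the old bin [lo, hi] that falls in new bin j
            overlap = min(hi, bins_[j + 1]) - max(lo, bins_[j])
            if overlap > 0:
                hist_[j] += hist[i] * overlap / (hi - lo)
    return hist_, bins_

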
class Tests(unittest.TestCase):

    def setUp(self):
        pass

    def test_rebin1(self):
        hist = np.array([2,5,5,9,2,6])
        bins = np.arange(7)
        nrbins = 3
        hist_, bins_ = rebin(hist, bins, nrbins)
        answer = np.array([7, 14, 8])
        self.assertTrue(np.allclose(answer, hist_))
        binanswer = np.array([0.0, 2.0, 4.0, 6.0])
        self.assertTrue(np.allclose(binanswer, bins_))

    def test_rebin2(self):
        hist = np.array([2,5,5,9,2,6])
        bins = np.arange(7)
        nrbins = 1
        hist_, bins_ = rebin(hist, bins, nrbins)
        answer = np.array([hist.sum()])
        self.assertTrue(np.allclose(answer, hist_))
        binanswer = np.array([bins[0],bins[-1]])
        self.assertTrue(np.allclose(binanswer, bins_))

    def test_rebin3(self):
        hist = np.array([1,1,1])
        bins = np.arange(4)
        nrbins = 2
        hist_, bins_ = rebin(hist, bins, nrbins)
        answer = np.array([1.5, 1.5])
        self.assertTrue(np.allclose(answer, hist_))
        binanswer = np.array([0, 1.5, 3.0])
        self.assertTrue(np.allclose(binanswer, bins_))

    def test_rebin4(self):
        hist = np.array([1,1,1])
        bins = np.arange(2, 14, 3)
        nrbins = 2
        hist_, bins_ = rebin(hist, bins, nrbins)
        answer = np.array([1.5, 1.5])
        self.assertTrue(np.allclose(answer, hist_))
        binanswer = np.array([2, 6.5, 11.0])
        self.assertTrue(np.allclose(binanswer, bins_))

    def test_rebin5(self):
        hist = np.array([1,4,2,5,6,11,9,10,8,0.5])
        bins = np.linspace(-2, 10, 11)
        nrbins = 8
        hist_, bins_ = rebin(hist, bins, nrbins)
        answer = np.array([2, 4, 4.75, 7.25, 13.25, 11.75, 11, 2.5])
        self.assertTrue(np.allclose(answer, hist_))
        binanswer = np.array([-2., -0.5, 1., 2.5, 4., 5.5, 7., 8.5, 10.0])
        self.assertTrue(np.allclose(binanswer, bins_))


if __name__ == '__main__':
    unittest.main()