from __future__ import division, print_function, absolute_import, unicode_literals import warnings from wetb.gtsdf.unix_time import from_unix try: import h5py except ImportError as e: raise ImportError("HDF5 library cannot be loaded. Windows XP is a known cause of this problem\n%s" % e) import os import numpy as np import numpy.ma as ma block_name_fmt = "block%04d" def load(filename, dtype=None): """Load a 'General Time Series Data Format'-hdf5 datafile Parameters ---------- filename : str or h5py.File filename or open file object dtype: data type, optional type of returned data array, e.g. float16, float32 or float64. If None(default) the type of the returned data depends on the type of the file data Returns ------- time : ndarray(dtype=float64), shape (no_observations,) time data : ndarray(dtype=dtype), shape (no_observations, no_attributes) data info : dict info containing: - type: "General Time Series Data Format" - name: name of dataset or filename if not present in file - no_attributes: Number of attributes - no_blocks: Number of datablocks - [description]: description of dataset or "" if not present in file - [attribute_names]: list of attribute names - [attribute_units]: list of attribute units - [attribute_descriptions]: list of attribute descriptions See Also -------- gtsdf, save Examples -------- >>> import gtsdf >>> data = np.arange(6).reshape(3,2) >>> gtsdf.save('test.hdf5', data) >>> time, data, info = gtsdf.load('test.hdf5') >>> print time [ 0. 1. 2.] >>> print data [[ 0. 1.] [ 2. 3.] [ 4. 5.]] >>> print info {'no_blocks': 1, 'type': 'General time series data format', 'name': 'test', 'no_attributes': 2, 'description': ''} >>> gtsdf.save('test.hdf5', data, name='MyDataset', description='MyDatasetDescription', attribute_names=['Att1', 'Att2'], attribute_units=['m', "m/s"], attribute_descriptions=['Att1Desc', 'Att2Desc'], time = np.array([0,1,4]), time_start = 10, time_step=2, dtype=np.float64) >>> time, data, info = gtsdf.load('test.hdf5') >>> print time [ 10. 12. 18.] >>> print data [[ 0. 1.] [ 2. 3.] [ 4. 5.]] >>> print info {'attribute_names': array(['Att1', 'Att2'], dtype='|S4'), 'attribute_units': array(['m', 'm/s'], dtype='|S3'), 'attribute_descriptions': array(['Att1Desc', 'Att2Desc'], dtype='|S8'), 'name': 'MyDataset', 'no_attributes': 2, 'no_blocks': 1, 'type': 'General time series data format', 'description': 'MyDatasetDescription'} """ if isinstance(filename, h5py.File): f = filename filename = f.filename else: assert os.path.isfile(filename), "File, %s, does not exists" % filename f = h5py.File(filename, 'r') try: def decode(v): if isinstance(v, bytes): return v.decode('latin1') return v info = {k: decode(v) for k, v in f.attrs.items()} check_type(f) if (block_name_fmt % 0) not in f: raise ValueError("HDF5 file must contain a group named '%s'" % (block_name_fmt % 0)) block0 = f[block_name_fmt % 0] if 'data' not in block0: raise ValueError("group %s must contain a dataset called 'data'" % (block_name_fmt % 0)) _, no_attributes = block0['data'].shape if 'name' not in info: info['name'] = os.path.splitext(os.path.basename(filename))[0] if 'attribute_names' in f: info['attribute_names'] = [v.decode('latin1') for v in f['attribute_names']] if 'attribute_units' in f: info['attribute_units'] = [v.decode('latin1') for v in f['attribute_units']] if 'attribute_descriptions' in f: info['attribute_descriptions'] = [v.decode('latin1') for v in f['attribute_descriptions']] no_blocks = f.attrs['no_blocks'] if dtype is None: file_dtype = f[block_name_fmt % 0]['data'].dtype if "float" in str(file_dtype): dtype = file_dtype elif file_dtype in [np.int8, np.uint8, np.int16, np.uint16]: dtype = np.float32 else: dtype = np.float64 data = np.empty((0, no_attributes), dtype=dtype) time = np.empty((0), dtype=np.float64) for i in range(no_blocks): block = f[block_name_fmt % i] no_observations, no_attributes = block['data'].shape block_time = (block.get('time', np.arange(no_observations))[:]).astype(np.float64) if 'time_step' in block.attrs: block_time *= block.attrs['time_step'] if 'time_start' in block.attrs: block_time += block.attrs['time_start'] time = np.append(time, block_time) block_data = block['data'][:].astype(dtype) if "int" in str(block['data'].dtype): block_data[block_data == np.iinfo(block['data'].dtype).max] = np.nan if 'gains' in block: block_data *= block['gains'][:] if 'offsets' in block: block_data += block['offsets'][:] data = np.append(data, block_data, 0) f.close() return time, data.astype(dtype), info except (ValueError, AssertionError): f.close() raise def save(filename, data, **kwargs): """Save a 'General Time Series Data Format'-hdf5 datafile Additional datablocks can be appended later using gtsdf.append_block Parameters ---------- filename : str data : array_like, shape (no_observations, no_attributes) name : str, optional Name of dataset description : str, optional Description of dataset attribute_names : array_like, shape (no_attributes,), optional Names of attributes attribute_units : array_like, shape (no_attributes,), optinoal Units of attributes attribute_descriptions : array_like, shape(no_attributes,), optional Descriptions of attributes time : array_like, shape (no_observations, ), optional Time, default is [0..no_observations-1] time_start : int or float, optional Time offset (e.g. start time in seconds since 1/1/1970), default is 0, see notes time_step : int or float, optional Time scale factor (e.g. 1/sample frequency), default is 1, see notes dtype : data-type, optional Data type of saved data array, default uint16.\n Recommended choices: - uint16: Data is compressed into 2 byte integers using a gain and offset factor for each attribute - float64: Data is stored with high precision using 8 byte floats Notes ----- Time can be specified by either - time (one value for each observation). Required inhomogeneous time distributions - time_start and/or time_step (one or two values), Recommended for homogeneous time distributions - time and time_start and/or time_step (one value for each observation + one or two values) When reading the file, the returned time-array is calculated as time * time_step + time_start See Also -------- gtsdf, append_block, load Examples -------- >>> import gtsdf >>> data = np.arange(12).reshape(6,2) >>> gtsdf.save('test.hdf5', data) >>> gtsdf.save('test.hdf5', data, name='MyDataset', description='MyDatasetDescription', attribute_names=['Att1', 'Att2'], attribute_units=['m', "m/s"], attribute_descriptions=['Att1Desc', 'Att2Desc'], time = np.array([0,1,2,6,7,8]), time_start = 10, time_step=2, dtype=np.float64) """ if not filename.lower().endswith('.hdf5'): filename += ".hdf5" os.makedirs(os.path.dirname(os.path.abspath(filename)), exist_ok=True) f = h5py.File(filename, "w") try: f.attrs["type"] = "General time series data format" no_observations, no_attributes = data.shape if 'name' in kwargs: f.attrs['name'] = kwargs['name'] if 'description' in kwargs: f.attrs['description'] = kwargs['description'] f.attrs['no_attributes'] = no_attributes if 'attribute_names' in kwargs: assert len(kwargs['attribute_names']) == no_attributes, "len(attribute_names)=%d but data shape is %s" % (len(kwargs['attribute_names']), data.shape) f.create_dataset("attribute_names", data=np.array([v.encode('utf-8') for v in kwargs['attribute_names']])) if 'attribute_units' in kwargs: assert(len(kwargs['attribute_units']) == no_attributes) f.create_dataset("attribute_units", data=np.array([v.encode('utf-8') for v in kwargs['attribute_units']])) if 'attribute_descriptions' in kwargs: assert(len(kwargs['attribute_descriptions']) == no_attributes) f.create_dataset("attribute_descriptions", data=np.array([v.encode('utf-8') for v in kwargs['attribute_descriptions']])) f.attrs['no_blocks'] = 0 except Exception: raise finally: f.close() append_block(filename, data, **kwargs) def append_block(filename, data, **kwargs): """Append a data block and corresponding time data to already existing file Parameters ---------- filename : str data : array_like, shape (no_observations, no_attributes) time : array_like, shape (no_observations, ), optional Time, default is [0..no_observations-1] time_start : int or float, optional Time offset (e.g. start time in seconds since 1/1/1970), default is 0, see notes time_step : int or float, optional Time scale factor (e.g. 1/sample frequency), default is 1, see notes dtype : data-type, optional Data type of saved data array, default uint16.\n Recommended choices: - uint16: Data is compressed into 2 byte integers using a gain and offset factor for each attribute - float64: Data is stored with high precision using 8 byte floats Notes ----- Time can be specified by either - time (one value for each observation). Required inhomogeneous time distributions - time_start and/or time_step (one or two values), Recommended for homogeneous time distributions - time and time_start and/or time_step (one value for each observation + one or two values) When reading the file, the returned time-array is calculated as time * time_step + time_start See Also -------- gtsdf, save Examples -------- >>> import gtsdf >>> data = np.arange(12).reshape(6,2) >>> gtsdf.save('test.hdf5', data) >>> gtsdf.append_block('test.hdf5', data+6) >>> time, data, info = gtsdf.load('test.hdf5') >>> print time [ 0. 1. 2. 3. 4. 5.] >>> print data [[ 0. 1.] [ 2. 3.] [ 4. 5.] [ 6. 7.] [ 8. 9.] [ 10. 11.]] >>> print info {'no_blocks': 2, 'type': 'General time series data format', 'name': 'test', 'no_attributes': 2} """ try: f = h5py.File(filename, "a") check_type(f) no_observations, no_attributes = data.shape assert(no_attributes == f.attrs['no_attributes']) blocknr = f.attrs['no_blocks'] if blocknr == 0: dtype = kwargs.get('dtype', np.uint16) else: dtype = f[block_name_fmt % 0]['data'].dtype block = f.create_group(block_name_fmt % blocknr) if 'time' in kwargs: assert(len(kwargs['time']) == no_observations) block.create_dataset('time', data=kwargs['time']) if 'time_step' in kwargs: time_step = kwargs['time_step'] block.attrs['time_step'] = time_step if 'time_start' in kwargs: block.attrs['time_start'] = kwargs['time_start'] pct_res = np.array([1]) if "int" in str(dtype): if np.any(np.isinf(data)): f.close() raise ValueError ("Int compression does not support 'inf'\nConsider removing outliers or use float datatype") nan = np.isnan(data) non_nan_data = ma.masked_array(data, nan) offsets = np.min(non_nan_data, 0) try: data = np.copy(data).astype(np.float64) except MemoryError: data = np.copy(data) data -= offsets with warnings.catch_warnings(): warnings.simplefilter("ignore") # ignore warning caused by abs(nan) and np.nanmax(nan) pct_res = (np.percentile(data[~np.isnan(data)], 75, 0) - np.percentile(data[~np.isnan(data)], 25, 0)) / np.nanmax(np.abs(data), 0) # percent of resolution for middle half of data gains = np.max(non_nan_data - offsets, 0).astype(np.float64) / (np.iinfo(dtype).max - 1) #-1 to save value for NaN not0 = np.where(gains != 0) data[:, not0] /= gains[not0] data = data.astype(dtype) data[nan] = np.iinfo(dtype).max block.create_dataset('gains', data=gains) block.create_dataset('offsets', data=offsets) block.create_dataset("data", data=data.astype(dtype)) f.attrs['no_blocks'] = blocknr + 1 f.close() if "int" in str(dtype): int_res = (np.iinfo(dtype).max - np.iinfo(dtype).min) with np.errstate(invalid='ignore'): if min(pct_res[pct_res > 0]) * int_res < 256: raise Warning("Less than 256 values are used to represent 50%% of the values in column(s): %s\nConsider removing outliers or use float datatype" % np.where(pct_res[pct_res > 0] * int_res < 256)[0]) except Exception: try: f.close() except: pass raise def load_pandas(filename, dtype=None): import pandas as pd time, data, info = load(filename, dtype) df = pd.DataFrame() df["Time"] = time df["Date"] = [from_unix(t) for t in time] for n, d in zip(info['attribute_names'], data.T): df[n] = d return df def check_type(f): if 'type' not in f.attrs or \ (f.attrs['type'].lower() != "general time series data format" and f.attrs['type'].lower() != b"general time series data format"): raise ValueError("HDF5 file must contain a 'type'-attribute with the value 'General time series data format'") if 'no_blocks' not in f.attrs: raise ValueError("HDF5 file must contain an attribute named 'no_blocks'")