From f181b12ca922f82e1bd11eba39449bdc1e2ddfc7 Mon Sep 17 00:00:00 2001 From: "Mads M. Pedersen" <mmpe@dtu.dk> Date: Thu, 13 Jul 2017 11:17:01 +0200 Subject: [PATCH] implemented statistics in gtsdf --- wetb/gtsdf/__init__.py | 3 + wetb/gtsdf/gtsdf.py | 210 ++++++++++++++++++---------- wetb/gtsdf/tests/test_gtsdf_stat.py | 64 +++++++++ 3 files changed, 204 insertions(+), 73 deletions(-) create mode 100644 wetb/gtsdf/tests/test_gtsdf_stat.py diff --git a/wetb/gtsdf/__init__.py b/wetb/gtsdf/__init__.py index 90236478..56c191e1 100644 --- a/wetb/gtsdf/__init__.py +++ b/wetb/gtsdf/__init__.py @@ -38,6 +38,9 @@ from .gtsdf import save from .gtsdf import load from .gtsdf import append_block from .gtsdf import load_pandas +from .gtsdf import add_statistic +from .gtsdf import load_statistic +from .gtsdf import compress2statistics class Dataset(object): def __init__(self, filename): diff --git a/wetb/gtsdf/gtsdf.py b/wetb/gtsdf/gtsdf.py index 6ec004be..859545bc 100644 --- a/wetb/gtsdf/gtsdf.py +++ b/wetb/gtsdf/gtsdf.py @@ -3,6 +3,7 @@ from builtins import zip from builtins import range from builtins import str from future import standard_library +from wetb.fatigue_tools.fatigue import eq_load standard_library.install_aliases() import warnings from wetb.gtsdf.unix_time import from_unix @@ -13,6 +14,7 @@ except ImportError as e: import os import numpy as np import numpy.ma as ma +import pandas as pd block_name_fmt = "block%04d" def load(filename, dtype=None): @@ -89,80 +91,95 @@ def load(filename, dtype=None): 'type': 'General time series data format', 'description': 'MyDatasetDescription'} """ + f = _open_h5py_file(filename) + try: + info = _load_info(f) + time, data = _load_timedata(f,dtype) + return time, data, info + finally: + try: + f.close() + except: + pass + +def _open_h5py_file(filename): if isinstance(filename, h5py.File): f = filename filename = f.filename else: assert os.path.isfile(filename), "File, %s, does not exists" % filename f = h5py.File(filename, 'r') - try: - def decode(v): - if isinstance(v, bytes): - return v.decode('latin1') - return v - - - info = {k: decode(v) for k, v in f.attrs.items()} - check_type(f) - if (block_name_fmt % 0) not in f: - raise ValueError("HDF5 file must contain a group named '%s'" % (block_name_fmt % 0)) - block0 = f[block_name_fmt % 0] - if 'data' not in block0: - raise ValueError("group %s must contain a dataset called 'data'" % (block_name_fmt % 0)) - _, no_attributes = block0['data'].shape - if 'name' not in info: - info['name'] = os.path.splitext(os.path.basename(filename))[0] - if 'attribute_names' in f: - info['attribute_names'] = [v.decode('latin1') for v in f['attribute_names']] - if 'attribute_units' in f: - info['attribute_units'] = [v.decode('latin1') for v in f['attribute_units']] - if 'attribute_descriptions' in f: - info['attribute_descriptions'] = [v.decode('latin1') for v in f['attribute_descriptions']] - no_blocks = f.attrs['no_blocks'] - - if dtype is None: - file_dtype = f[block_name_fmt % 0]['data'].dtype - if "float" in str(file_dtype): - dtype = file_dtype - elif file_dtype in [np.int8, np.uint8, np.int16, np.uint16]: - dtype = np.float32 - else: - dtype = np.float64 - time = [] - data = [] - for i in range(no_blocks): - - try: - block = f[block_name_fmt % i] - except KeyError: - continue - no_observations, no_attributes = block['data'].shape - block_time = (block.get('time', np.arange(no_observations))[:]).astype(np.float64) - if 'time_step' in block.attrs: - block_time *= block.attrs['time_step'] - if 'time_start' in block.attrs: - block_time += block.attrs['time_start'] - time.extend(block_time) - - block_data = block['data'][:].astype(dtype) - if "int" in str(block['data'].dtype): - block_data[block_data == np.iinfo(block['data'].dtype).max] = np.nan - - if 'gains' in block: - block_data *= block['gains'][:] - if 'offsets' in block: - block_data += block['offsets'][:] - data.append(block_data) - - f.close() - if no_blocks > 0: - data = np.vstack(data) - return np.array(time).astype(np.float64), np.array(data).astype(dtype), info - except (ValueError, AssertionError): - f.close() - raise - + return f + +def decode(v): + if isinstance(v, bytes): + return v.decode('latin1') + elif hasattr(v,'len'): + return [decode(v_) for v_ in v] + return v + +def _load_info(f): + + + info = {k: decode(v) for k, v in f.attrs.items()} + check_type(f) + if 'name' not in info: + info['name'] = os.path.splitext(os.path.basename(f.filename))[0] + if 'attribute_names' in f: + info['attribute_names'] = [v.decode('latin1') for v in f['attribute_names']] + if 'attribute_units' in f: + info['attribute_units'] = [v.decode('latin1') for v in f['attribute_units']] + if 'attribute_descriptions' in f: + info['attribute_descriptions'] = [v.decode('latin1') for v in f['attribute_descriptions']] + return info + +def _load_timedata(f, dtype): + no_blocks = f.attrs['no_blocks'] + if (block_name_fmt % 0) not in f: + raise ValueError("HDF5 file must contain a group named '%s'" % (block_name_fmt % 0)) + block0 = f[block_name_fmt % 0] + if 'data' not in block0: + raise ValueError("group %s must contain a dataset called 'data'" % (block_name_fmt % 0)) + _, no_attributes = block0['data'].shape + + + if dtype is None: + file_dtype = f[block_name_fmt % 0]['data'].dtype + if "float" in str(file_dtype): + dtype = file_dtype + elif file_dtype in [np.int8, np.uint8, np.int16, np.uint16]: + dtype = np.float32 + else: + dtype = np.float64 + time = [] + data = [] + for i in range(no_blocks): + try: + block = f[block_name_fmt % i] + except KeyError: + continue + no_observations, no_attributes = block['data'].shape + block_time = (block.get('time', np.arange(no_observations))[:]).astype(np.float64) + if 'time_step' in block.attrs: + block_time *= block.attrs['time_step'] + if 'time_start' in block.attrs: + block_time += block.attrs['time_start'] + time.extend(block_time) + + block_data = block['data'][:].astype(dtype) + if "int" in str(block['data'].dtype): + block_data[block_data == np.iinfo(block['data'].dtype).max] = np.nan + + if 'gains' in block: + block_data *= block['gains'][:] + if 'offsets' in block: + block_data += block['offsets'][:] + data.append(block_data) + + if no_blocks > 0: + data = np.vstack(data) + return np.array(time).astype(np.float64), np.array(data).astype(dtype) def save(filename, data, **kwargs): """Save a 'General Time Series Data Format'-hdf5 datafile @@ -226,36 +243,44 @@ def save(filename, data, **kwargs): time_step=2, dtype=np.float64) """ - if not filename.lower().endswith('.hdf5'): filename += ".hdf5" # exist_ok does not exist in Python27 if not os.path.exists(os.path.dirname(os.path.abspath(filename))): os.makedirs(os.path.dirname(os.path.abspath(filename))) #, exist_ok=True) + _save_info(filename, data.shape, **kwargs) + append_block(filename, data, **kwargs) + +def _save_info(filename, data_shape, **kwargs): + f = h5py.File(filename, "w") try: f.attrs["type"] = "General time series data format" - no_observations, no_attributes = data.shape + no_observations, no_attributes = data_shape + if 'name' in kwargs: f.attrs['name'] = kwargs['name'] if 'description' in kwargs: f.attrs['description'] = kwargs['description'] f.attrs['no_attributes'] = no_attributes if 'attribute_names' in kwargs: - assert len(kwargs['attribute_names']) == no_attributes, "len(attribute_names)=%d but data shape is %s" % (len(kwargs['attribute_names']), data.shape) + if no_attributes: + assert len(kwargs['attribute_names']) == no_attributes, "len(attribute_names)=%d but data shape is %s" % (len(kwargs['attribute_names']), data_shape) f.create_dataset("attribute_names", data=np.array([v.encode('utf-8') for v in kwargs['attribute_names']])) if 'attribute_units' in kwargs: - assert(len(kwargs['attribute_units']) == no_attributes) + if no_attributes: + assert(len(kwargs['attribute_units']) == no_attributes) f.create_dataset("attribute_units", data=np.array([v.encode('utf-8') for v in kwargs['attribute_units']])) if 'attribute_descriptions' in kwargs: - assert(len(kwargs['attribute_descriptions']) == no_attributes) + if no_attributes: + assert(len(kwargs['attribute_descriptions']) == no_attributes) f.create_dataset("attribute_descriptions", data=np.array([v.encode('utf-8') for v in kwargs['attribute_descriptions']])) f.attrs['no_blocks'] = 0 except Exception: raise finally: f.close() - append_block(filename, data, **kwargs) + def append_block(filename, data, **kwargs): """Append a data block and corresponding time data to already existing file @@ -398,3 +423,42 @@ def check_type(f): raise ValueError("HDF5 file must contain a 'type'-attribute with the value 'General time series data format'") if 'no_blocks' not in f.attrs: raise ValueError("HDF5 file must contain an attribute named 'no_blocks'") + + +def _get_statistic(time, data, statistics=['min','mean','max','std','eq3','eq4','eq6','eq8','eq10','eq12']): + def get_stat(stat): + if hasattr(np, stat): + return getattr(np,stat)(data,0) + elif (stat.startswith("eq") and stat[2:].isdigit()): + m = float(stat[2:]) + return [eq_load(sensor, 46, m, time[-1]-time[0]+time[1]-time[0])[0][0] for sensor in data.T] + return np.array([get_stat(stat) for stat in statistics]).T + +def _add_statistic_data(file, stat_data, statistics=['min','mean','max','std','eq3','eq4','eq6','eq8','eq10','eq12']): + f = h5py.File(file, "a") + stat_grp = f.create_group("Statistic") + stat_grp.create_dataset("statistic_names", data=np.array([v.encode('utf-8') for v in statistics])) + stat_grp.create_dataset("statistic_data", data=stat_data.astype(np.float)) + f.close() + +def add_statistic(file, statistics=['min','mean','max','std','eq3','eq4','eq6','eq8','eq10','eq12']): + time, data, info = load(file) + stat_data = _get_statistic(time, data, statistics) + _add_statistic_data(file, stat_data, statistics) + + +def load_statistic(filename): + f = _open_h5py_file(filename) + info = _load_info(f) + names = decode(f['Statistic']['statistic_names']) + data =np.array(f['Statistic']['statistic_data']) + return pd.DataFrame(data, columns=names), info + +def compress2statistics(filename, statistics=['min','mean','max','std','eq3','eq4','eq6','eq8','eq10','eq12']): + time, data, info = load(filename) + stat_data = _get_statistic(time, data, statistics) + _save_info(filename, data.shape, **info) + _add_statistic_data(filename, stat_data, statistics) + + + diff --git a/wetb/gtsdf/tests/test_gtsdf_stat.py b/wetb/gtsdf/tests/test_gtsdf_stat.py new file mode 100644 index 00000000..350ef593 --- /dev/null +++ b/wetb/gtsdf/tests/test_gtsdf_stat.py @@ -0,0 +1,64 @@ +''' +Created on 12/09/2013 + +@author: mmpe +''' +from __future__ import division +from __future__ import unicode_literals +from __future__ import print_function +from __future__ import absolute_import +from builtins import super +from builtins import range +from future import standard_library +standard_library.install_aliases() + +import h5py +import numpy as np +from wetb import gtsdf + +import unittest +import os + +tmp_path = os.path.dirname(__file__) + "/tmp/" +tfp = os.path.dirname(os.path.abspath(__file__)) + "/test_files/" + +class Test_gsdf(unittest.TestCase): + def setUp(self): + unittest.TestCase.setUp(self) + if not os.path.isdir(tmp_path): + os.makedirs(tmp_path) + + @classmethod + def tearDownClass(cls): + super(Test_gsdf, cls).tearDownClass() + #shutil.rmtree(tmp_path) + + + def test_gtsdf_stat(self): + time, data, info = gtsdf.load(tfp+'test.hdf5') + print (data.shape) + fn = tmp_path + "test_stat.hdf5" + gtsdf.save(fn, data, time=time, **info) + gtsdf.add_statistic(fn) + stat_data,info = gtsdf.load_statistic(fn) + self.assertEqual(data[:,0].min(), stat_data.values[0,0]) + self.assertEqual(stat_data.shape, (49,10)) + + def test_gtsdf_compress2stat(self): + time, data, info = gtsdf.load(tfp+'test.hdf5') + fn = tmp_path + "test_compress2stat.hdf5" + gtsdf.save(fn, data, time=time, **info) + gtsdf.save(tmp_path + "test_compress2stat2.hdf5", data, time=time, dtype=np.float, **info) + gtsdf.compress2statistics(fn) + self.assertLess(os.path.getsize(fn)*50, os.path.getsize(tfp+'test.hdf5')) + + + + + + + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() -- GitLab