implemented statistics in gtsdf

f181b12c · Mads M. Pedersen · a759fea8 · f181b12c · f181b12c · f181b12c
Commit f181b12c authored 7 years ago by Mads M. Pedersen
--- a/wetb/gtsdf/__init__.py
+++ b/wetb/gtsdf/__init__.py
@@ -38,6 +38,9 @@ from .gtsdf import save
 from .gtsdf import load
 from .gtsdf import append_block
 from .gtsdf import load_pandas
+from .gtsdf import add_statistic
+from .gtsdf import load_statistic
+from .gtsdf import compress2statistics

 class Dataset(object):
    def __init__(self, filename):

--- a/wetb/gtsdf/gtsdf.py
+++ b/wetb/gtsdf/gtsdf.py
@@ -3,6 +3,7 @@ from builtins import zip
 from builtins import range
 from builtins import str
 from future import standard_library
+from wetb.fatigue_tools.fatigue import eq_load
 standard_library.install_aliases()
 import warnings
 from wetb.gtsdf.unix_time import from_unix
@@ -13,6 +14,7 @@ except ImportError as e:
 import os
 import numpy as np
 import numpy.ma as ma
+import pandas as pd
 block_name_fmt = "block%04d"

 def load(filename, dtype=None):
@@ -89,80 +91,95 @@ def load(filename, dtype=None):
     'type': 'General time series data format',
     'description': 'MyDatasetDescription'}
    """
+    f = _open_h5py_file(filename)
+    try:
+        info = _load_info(f)
+        time, data = _load_timedata(f,dtype)
+        return time, data, info
+    finally:
+        try:
+            f.close()
+        except:
+            pass
+
+def _open_h5py_file(filename):
    if isinstance(filename, h5py.File):
        f = filename
        filename = f.filename
    else:
        assert os.path.isfile(filename), "File, %s, does not exists" % filename
        f = h5py.File(filename, 'r')
-    try:
-        def decode(v):
-            if isinstance(v, bytes):
-                return v.decode('latin1')
-            return v
-
-
-        info = {k: decode(v) for k, v in f.attrs.items()}
-        check_type(f)
-        if (block_name_fmt % 0) not in f:
-            raise ValueError("HDF5 file must contain a group named '%s'" % (block_name_fmt % 0))
-        block0 = f[block_name_fmt % 0]
-        if 'data' not in block0:
-            raise ValueError("group %s must contain a dataset called 'data'" % (block_name_fmt % 0))
-        _, no_attributes = block0['data'].shape
-        if 'name' not in info:
-            info['name'] = os.path.splitext(os.path.basename(filename))[0]
-        if 'attribute_names' in f:
-            info['attribute_names'] = [v.decode('latin1') for v in f['attribute_names']]
-        if 'attribute_units' in f:
-            info['attribute_units'] = [v.decode('latin1') for v in f['attribute_units']]
-        if 'attribute_descriptions' in f:
-            info['attribute_descriptions'] = [v.decode('latin1') for v in f['attribute_descriptions']]
-        no_blocks = f.attrs['no_blocks']
-
-        if dtype is None:
-            file_dtype = f[block_name_fmt % 0]['data'].dtype
-            if "float" in str(file_dtype):
-                dtype = file_dtype
-            elif file_dtype in [np.int8, np.uint8, np.int16, np.uint16]:
-                dtype = np.float32
-            else:
-                dtype = np.float64
-        time = []
-        data = []
-        for i in range(no_blocks):
-
-            try:
-                block = f[block_name_fmt % i]
-            except KeyError:
-                continue
-            no_observations, no_attributes = block['data'].shape
-            block_time = (block.get('time', np.arange(no_observations))[:]).astype(np.float64)
-            if 'time_step' in block.attrs:
-                block_time *= block.attrs['time_step']
-            if 'time_start' in block.attrs:
-                block_time += block.attrs['time_start']
-            time.extend(block_time)
-
-            block_data = block['data'][:].astype(dtype)
-            if "int" in str(block['data'].dtype):
-                block_data[block_data == np.iinfo(block['data'].dtype).max] = np.nan
-
-            if 'gains' in block:
-                block_data *= block['gains'][:]
-            if 'offsets' in block:
-                block_data += block['offsets'][:]
-            data.append(block_data)
-
-        f.close()
-        if no_blocks > 0:
-            data = np.vstack(data)
-        return np.array(time).astype(np.float64), np.array(data).astype(dtype), info
-    except (ValueError, AssertionError):
-        f.close()
-        raise
-
+    return f
+
+def decode(v):
+        if isinstance(v, bytes):
+            return v.decode('latin1')
+        elif hasattr(v,'len'):
+            return [decode(v_) for v_ in v]
+        return v
+
+def _load_info(f):
+    
+
+    info = {k: decode(v) for k, v in f.attrs.items()}
+    check_type(f)
+    if 'name' not in info:
+        info['name'] = os.path.splitext(os.path.basename(f.filename))[0]
+    if 'attribute_names' in f:
+        info['attribute_names'] = [v.decode('latin1') for v in f['attribute_names']]
+    if 'attribute_units' in f:
+        info['attribute_units'] = [v.decode('latin1') for v in f['attribute_units']]
+    if 'attribute_descriptions' in f:
+        info['attribute_descriptions'] = [v.decode('latin1') for v in f['attribute_descriptions']]
+    return info
+    
+def _load_timedata(f, dtype):
+    no_blocks = f.attrs['no_blocks']
+    if (block_name_fmt % 0) not in f:
+        raise ValueError("HDF5 file must contain a group named '%s'" % (block_name_fmt % 0))
+    block0 = f[block_name_fmt % 0]
+    if 'data' not in block0:
+        raise ValueError("group %s must contain a dataset called 'data'" % (block_name_fmt % 0))
+    _, no_attributes = block0['data'].shape
+    
+
+    if dtype is None:
+        file_dtype = f[block_name_fmt % 0]['data'].dtype
+        if "float" in str(file_dtype):
+            dtype = file_dtype
+        elif file_dtype in [np.int8, np.uint8, np.int16, np.uint16]:
+            dtype = np.float32
+        else:
+            dtype = np.float64
+    time = []
+    data = []
+    for i in range(no_blocks):

+        try:
+            block = f[block_name_fmt % i]
+        except KeyError:
+            continue
+        no_observations, no_attributes = block['data'].shape
+        block_time = (block.get('time', np.arange(no_observations))[:]).astype(np.float64)
+        if 'time_step' in block.attrs:
+            block_time *= block.attrs['time_step']
+        if 'time_start' in block.attrs:
+            block_time += block.attrs['time_start']
+        time.extend(block_time)
+
+        block_data = block['data'][:].astype(dtype)
+        if "int" in str(block['data'].dtype):
+            block_data[block_data == np.iinfo(block['data'].dtype).max] = np.nan
+
+        if 'gains' in block:
+            block_data *= block['gains'][:]
+        if 'offsets' in block:
+            block_data += block['offsets'][:]
+        data.append(block_data)
+
+    if no_blocks > 0:
+        data = np.vstack(data)
+    return np.array(time).astype(np.float64), np.array(data).astype(dtype)    

 def save(filename, data, **kwargs):
    """Save a 'General Time Series Data Format'-hdf5 datafile
@@ -226,36 +243,44 @@ def save(filename, data, **kwargs):
                                      time_step=2,
                                      dtype=np.float64)
    """
-
    if not filename.lower().endswith('.hdf5'):
        filename += ".hdf5"
    # exist_ok does not exist in Python27
    if not os.path.exists(os.path.dirname(os.path.abspath(filename))):
        os.makedirs(os.path.dirname(os.path.abspath(filename)))  #, exist_ok=True)
+    _save_info(filename, data.shape, **kwargs)
+    append_block(filename, data, **kwargs)
+
+def _save_info(filename, data_shape, **kwargs):
+
    f = h5py.File(filename, "w")
    try:
        f.attrs["type"] = "General time series data format"
-        no_observations, no_attributes = data.shape
+        no_observations, no_attributes = data_shape
+        
        if 'name' in kwargs:
            f.attrs['name'] = kwargs['name']
        if 'description' in kwargs:
            f.attrs['description'] = kwargs['description']
        f.attrs['no_attributes'] = no_attributes
        if 'attribute_names' in kwargs:
-            assert len(kwargs['attribute_names']) == no_attributes, "len(attribute_names)=%d but data shape is %s" % (len(kwargs['attribute_names']), data.shape)
+            if no_attributes:
+                assert len(kwargs['attribute_names']) == no_attributes, "len(attribute_names)=%d but data shape is %s" % (len(kwargs['attribute_names']), data_shape)
            f.create_dataset("attribute_names", data=np.array([v.encode('utf-8') for v in kwargs['attribute_names']]))
        if 'attribute_units' in kwargs:
-            assert(len(kwargs['attribute_units']) == no_attributes)
+            if no_attributes:
+                assert(len(kwargs['attribute_units']) == no_attributes)
            f.create_dataset("attribute_units", data=np.array([v.encode('utf-8') for v in kwargs['attribute_units']]))
        if 'attribute_descriptions' in kwargs:
-            assert(len(kwargs['attribute_descriptions']) == no_attributes)
+            if no_attributes:
+                assert(len(kwargs['attribute_descriptions']) == no_attributes)
            f.create_dataset("attribute_descriptions", data=np.array([v.encode('utf-8') for v in kwargs['attribute_descriptions']]))
        f.attrs['no_blocks'] = 0
    except Exception:
        raise
    finally:
        f.close()
-    append_block(filename, data, **kwargs)
+

 def append_block(filename, data, **kwargs):
    """Append a data block and corresponding time data to already existing file
@@ -398,3 +423,42 @@ def check_type(f):
        raise ValueError("HDF5 file must contain a 'type'-attribute with the value 'General time series data format'")
    if 'no_blocks' not in f.attrs:
        raise ValueError("HDF5 file must contain an attribute named 'no_blocks'")
+    
+    
+def _get_statistic(time, data, statistics=['min','mean','max','std','eq3','eq4','eq6','eq8','eq10','eq12']):
+    def get_stat(stat):
+        if hasattr(np, stat):
+            return getattr(np,stat)(data,0)
+        elif (stat.startswith("eq") and stat[2:].isdigit()):
+            m = float(stat[2:])
+            return [eq_load(sensor, 46, m, time[-1]-time[0]+time[1]-time[0])[0][0] for sensor in data.T]
+    return np.array([get_stat(stat) for stat in statistics]).T
+    
+def _add_statistic_data(file, stat_data, statistics=['min','mean','max','std','eq3','eq4','eq6','eq8','eq10','eq12']):
+    f = h5py.File(file, "a")
+    stat_grp = f.create_group("Statistic")
+    stat_grp.create_dataset("statistic_names", data=np.array([v.encode('utf-8') for v in statistics]))
+    stat_grp.create_dataset("statistic_data", data=stat_data.astype(np.float))
+    f.close()
+    
+def add_statistic(file, statistics=['min','mean','max','std','eq3','eq4','eq6','eq8','eq10','eq12']):
+    time, data, info = load(file)
+    stat_data = _get_statistic(time, data, statistics)
+    _add_statistic_data(file, stat_data, statistics)
+
+    
+def load_statistic(filename):
+    f = _open_h5py_file(filename)
+    info = _load_info(f)
+    names = decode(f['Statistic']['statistic_names'])
+    data =np.array(f['Statistic']['statistic_data'])
+    return pd.DataFrame(data, columns=names), info
+
+def compress2statistics(filename, statistics=['min','mean','max','std','eq3','eq4','eq6','eq8','eq10','eq12']):
+    time, data, info = load(filename)
+    stat_data = _get_statistic(time, data, statistics)
+    _save_info(filename, data.shape, **info)
+    _add_statistic_data(filename, stat_data, statistics)
+    
+    
+    
--- a/wetb/gtsdf/tests/test_gtsdf_stat.py
+++ b/wetb/gtsdf/tests/test_gtsdf_stat.py
+'''
+Created on 12/09/2013
+
+@author: mmpe
+'''
+from __future__ import division
+from __future__ import unicode_literals
+from __future__ import print_function
+from __future__ import absolute_import
+from builtins import super
+from builtins import range
+from future import standard_library
+standard_library.install_aliases()
+
+import h5py
+import numpy as np
+from wetb import gtsdf
+
+import unittest
+import os
+
+tmp_path = os.path.dirname(__file__) + "/tmp/"
+tfp = os.path.dirname(os.path.abspath(__file__)) + "/test_files/"
+
+class Test_gsdf(unittest.TestCase):
+    def setUp(self):
+        unittest.TestCase.setUp(self)
+        if not os.path.isdir(tmp_path):
+            os.makedirs(tmp_path)
+
+    @classmethod
+    def tearDownClass(cls):
+        super(Test_gsdf, cls).tearDownClass()
+        #shutil.rmtree(tmp_path)
+
+    
+    def test_gtsdf_stat(self):
+        time, data, info = gtsdf.load(tfp+'test.hdf5')
+        print (data.shape)
+        fn = tmp_path + "test_stat.hdf5"
+        gtsdf.save(fn, data, time=time, **info)
+        gtsdf.add_statistic(fn)
+        stat_data,info = gtsdf.load_statistic(fn)
+        self.assertEqual(data[:,0].min(), stat_data.values[0,0])
+        self.assertEqual(stat_data.shape, (49,10))        
+        
+    def test_gtsdf_compress2stat(self):
+        time, data, info = gtsdf.load(tfp+'test.hdf5')
+        fn = tmp_path + "test_compress2stat.hdf5"
+        gtsdf.save(fn, data, time=time, **info)
+        gtsdf.save(tmp_path + "test_compress2stat2.hdf5", data, time=time, dtype=np.float, **info)
+        gtsdf.compress2statistics(fn)
+        self.assertLess(os.path.getsize(fn)*50, os.path.getsize(tfp+'test.hdf5'))
+        
+        
+
+
+
+
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()