Skip to content
Snippets Groups Projects
Commit f181b12c authored by Mads M. Pedersen's avatar Mads M. Pedersen
Browse files

implemented statistics in gtsdf

parent a759fea8
No related branches found
No related tags found
No related merge requests found
...@@ -38,6 +38,9 @@ from .gtsdf import save ...@@ -38,6 +38,9 @@ from .gtsdf import save
from .gtsdf import load from .gtsdf import load
from .gtsdf import append_block from .gtsdf import append_block
from .gtsdf import load_pandas from .gtsdf import load_pandas
from .gtsdf import add_statistic
from .gtsdf import load_statistic
from .gtsdf import compress2statistics
class Dataset(object): class Dataset(object):
def __init__(self, filename): def __init__(self, filename):
......
...@@ -3,6 +3,7 @@ from builtins import zip ...@@ -3,6 +3,7 @@ from builtins import zip
from builtins import range from builtins import range
from builtins import str from builtins import str
from future import standard_library from future import standard_library
from wetb.fatigue_tools.fatigue import eq_load
standard_library.install_aliases() standard_library.install_aliases()
import warnings import warnings
from wetb.gtsdf.unix_time import from_unix from wetb.gtsdf.unix_time import from_unix
...@@ -13,6 +14,7 @@ except ImportError as e: ...@@ -13,6 +14,7 @@ except ImportError as e:
import os import os
import numpy as np import numpy as np
import numpy.ma as ma import numpy.ma as ma
import pandas as pd
block_name_fmt = "block%04d" block_name_fmt = "block%04d"
def load(filename, dtype=None): def load(filename, dtype=None):
...@@ -89,80 +91,95 @@ def load(filename, dtype=None): ...@@ -89,80 +91,95 @@ def load(filename, dtype=None):
'type': 'General time series data format', 'type': 'General time series data format',
'description': 'MyDatasetDescription'} 'description': 'MyDatasetDescription'}
""" """
f = _open_h5py_file(filename)
try:
info = _load_info(f)
time, data = _load_timedata(f,dtype)
return time, data, info
finally:
try:
f.close()
except:
pass
def _open_h5py_file(filename):
if isinstance(filename, h5py.File): if isinstance(filename, h5py.File):
f = filename f = filename
filename = f.filename filename = f.filename
else: else:
assert os.path.isfile(filename), "File, %s, does not exists" % filename assert os.path.isfile(filename), "File, %s, does not exists" % filename
f = h5py.File(filename, 'r') f = h5py.File(filename, 'r')
try: return f
def decode(v):
if isinstance(v, bytes): def decode(v):
return v.decode('latin1') if isinstance(v, bytes):
return v return v.decode('latin1')
elif hasattr(v,'len'):
return [decode(v_) for v_ in v]
info = {k: decode(v) for k, v in f.attrs.items()} return v
check_type(f)
if (block_name_fmt % 0) not in f: def _load_info(f):
raise ValueError("HDF5 file must contain a group named '%s'" % (block_name_fmt % 0))
block0 = f[block_name_fmt % 0]
if 'data' not in block0: info = {k: decode(v) for k, v in f.attrs.items()}
raise ValueError("group %s must contain a dataset called 'data'" % (block_name_fmt % 0)) check_type(f)
_, no_attributes = block0['data'].shape if 'name' not in info:
if 'name' not in info: info['name'] = os.path.splitext(os.path.basename(f.filename))[0]
info['name'] = os.path.splitext(os.path.basename(filename))[0] if 'attribute_names' in f:
if 'attribute_names' in f: info['attribute_names'] = [v.decode('latin1') for v in f['attribute_names']]
info['attribute_names'] = [v.decode('latin1') for v in f['attribute_names']] if 'attribute_units' in f:
if 'attribute_units' in f: info['attribute_units'] = [v.decode('latin1') for v in f['attribute_units']]
info['attribute_units'] = [v.decode('latin1') for v in f['attribute_units']] if 'attribute_descriptions' in f:
if 'attribute_descriptions' in f: info['attribute_descriptions'] = [v.decode('latin1') for v in f['attribute_descriptions']]
info['attribute_descriptions'] = [v.decode('latin1') for v in f['attribute_descriptions']] return info
no_blocks = f.attrs['no_blocks']
def _load_timedata(f, dtype):
if dtype is None: no_blocks = f.attrs['no_blocks']
file_dtype = f[block_name_fmt % 0]['data'].dtype if (block_name_fmt % 0) not in f:
if "float" in str(file_dtype): raise ValueError("HDF5 file must contain a group named '%s'" % (block_name_fmt % 0))
dtype = file_dtype block0 = f[block_name_fmt % 0]
elif file_dtype in [np.int8, np.uint8, np.int16, np.uint16]: if 'data' not in block0:
dtype = np.float32 raise ValueError("group %s must contain a dataset called 'data'" % (block_name_fmt % 0))
else: _, no_attributes = block0['data'].shape
dtype = np.float64
time = []
data = [] if dtype is None:
for i in range(no_blocks): file_dtype = f[block_name_fmt % 0]['data'].dtype
if "float" in str(file_dtype):
try: dtype = file_dtype
block = f[block_name_fmt % i] elif file_dtype in [np.int8, np.uint8, np.int16, np.uint16]:
except KeyError: dtype = np.float32
continue else:
no_observations, no_attributes = block['data'].shape dtype = np.float64
block_time = (block.get('time', np.arange(no_observations))[:]).astype(np.float64) time = []
if 'time_step' in block.attrs: data = []
block_time *= block.attrs['time_step'] for i in range(no_blocks):
if 'time_start' in block.attrs:
block_time += block.attrs['time_start']
time.extend(block_time)
block_data = block['data'][:].astype(dtype)
if "int" in str(block['data'].dtype):
block_data[block_data == np.iinfo(block['data'].dtype).max] = np.nan
if 'gains' in block:
block_data *= block['gains'][:]
if 'offsets' in block:
block_data += block['offsets'][:]
data.append(block_data)
f.close()
if no_blocks > 0:
data = np.vstack(data)
return np.array(time).astype(np.float64), np.array(data).astype(dtype), info
except (ValueError, AssertionError):
f.close()
raise
try:
block = f[block_name_fmt % i]
except KeyError:
continue
no_observations, no_attributes = block['data'].shape
block_time = (block.get('time', np.arange(no_observations))[:]).astype(np.float64)
if 'time_step' in block.attrs:
block_time *= block.attrs['time_step']
if 'time_start' in block.attrs:
block_time += block.attrs['time_start']
time.extend(block_time)
block_data = block['data'][:].astype(dtype)
if "int" in str(block['data'].dtype):
block_data[block_data == np.iinfo(block['data'].dtype).max] = np.nan
if 'gains' in block:
block_data *= block['gains'][:]
if 'offsets' in block:
block_data += block['offsets'][:]
data.append(block_data)
if no_blocks > 0:
data = np.vstack(data)
return np.array(time).astype(np.float64), np.array(data).astype(dtype)
def save(filename, data, **kwargs): def save(filename, data, **kwargs):
"""Save a 'General Time Series Data Format'-hdf5 datafile """Save a 'General Time Series Data Format'-hdf5 datafile
...@@ -226,36 +243,44 @@ def save(filename, data, **kwargs): ...@@ -226,36 +243,44 @@ def save(filename, data, **kwargs):
time_step=2, time_step=2,
dtype=np.float64) dtype=np.float64)
""" """
if not filename.lower().endswith('.hdf5'): if not filename.lower().endswith('.hdf5'):
filename += ".hdf5" filename += ".hdf5"
# exist_ok does not exist in Python27 # exist_ok does not exist in Python27
if not os.path.exists(os.path.dirname(os.path.abspath(filename))): if not os.path.exists(os.path.dirname(os.path.abspath(filename))):
os.makedirs(os.path.dirname(os.path.abspath(filename))) #, exist_ok=True) os.makedirs(os.path.dirname(os.path.abspath(filename))) #, exist_ok=True)
_save_info(filename, data.shape, **kwargs)
append_block(filename, data, **kwargs)
def _save_info(filename, data_shape, **kwargs):
f = h5py.File(filename, "w") f = h5py.File(filename, "w")
try: try:
f.attrs["type"] = "General time series data format" f.attrs["type"] = "General time series data format"
no_observations, no_attributes = data.shape no_observations, no_attributes = data_shape
if 'name' in kwargs: if 'name' in kwargs:
f.attrs['name'] = kwargs['name'] f.attrs['name'] = kwargs['name']
if 'description' in kwargs: if 'description' in kwargs:
f.attrs['description'] = kwargs['description'] f.attrs['description'] = kwargs['description']
f.attrs['no_attributes'] = no_attributes f.attrs['no_attributes'] = no_attributes
if 'attribute_names' in kwargs: if 'attribute_names' in kwargs:
assert len(kwargs['attribute_names']) == no_attributes, "len(attribute_names)=%d but data shape is %s" % (len(kwargs['attribute_names']), data.shape) if no_attributes:
assert len(kwargs['attribute_names']) == no_attributes, "len(attribute_names)=%d but data shape is %s" % (len(kwargs['attribute_names']), data_shape)
f.create_dataset("attribute_names", data=np.array([v.encode('utf-8') for v in kwargs['attribute_names']])) f.create_dataset("attribute_names", data=np.array([v.encode('utf-8') for v in kwargs['attribute_names']]))
if 'attribute_units' in kwargs: if 'attribute_units' in kwargs:
assert(len(kwargs['attribute_units']) == no_attributes) if no_attributes:
assert(len(kwargs['attribute_units']) == no_attributes)
f.create_dataset("attribute_units", data=np.array([v.encode('utf-8') for v in kwargs['attribute_units']])) f.create_dataset("attribute_units", data=np.array([v.encode('utf-8') for v in kwargs['attribute_units']]))
if 'attribute_descriptions' in kwargs: if 'attribute_descriptions' in kwargs:
assert(len(kwargs['attribute_descriptions']) == no_attributes) if no_attributes:
assert(len(kwargs['attribute_descriptions']) == no_attributes)
f.create_dataset("attribute_descriptions", data=np.array([v.encode('utf-8') for v in kwargs['attribute_descriptions']])) f.create_dataset("attribute_descriptions", data=np.array([v.encode('utf-8') for v in kwargs['attribute_descriptions']]))
f.attrs['no_blocks'] = 0 f.attrs['no_blocks'] = 0
except Exception: except Exception:
raise raise
finally: finally:
f.close() f.close()
append_block(filename, data, **kwargs)
def append_block(filename, data, **kwargs): def append_block(filename, data, **kwargs):
"""Append a data block and corresponding time data to already existing file """Append a data block and corresponding time data to already existing file
...@@ -398,3 +423,42 @@ def check_type(f): ...@@ -398,3 +423,42 @@ def check_type(f):
raise ValueError("HDF5 file must contain a 'type'-attribute with the value 'General time series data format'") raise ValueError("HDF5 file must contain a 'type'-attribute with the value 'General time series data format'")
if 'no_blocks' not in f.attrs: if 'no_blocks' not in f.attrs:
raise ValueError("HDF5 file must contain an attribute named 'no_blocks'") raise ValueError("HDF5 file must contain an attribute named 'no_blocks'")
def _get_statistic(time, data, statistics=['min','mean','max','std','eq3','eq4','eq6','eq8','eq10','eq12']):
def get_stat(stat):
if hasattr(np, stat):
return getattr(np,stat)(data,0)
elif (stat.startswith("eq") and stat[2:].isdigit()):
m = float(stat[2:])
return [eq_load(sensor, 46, m, time[-1]-time[0]+time[1]-time[0])[0][0] for sensor in data.T]
return np.array([get_stat(stat) for stat in statistics]).T
def _add_statistic_data(file, stat_data, statistics=['min','mean','max','std','eq3','eq4','eq6','eq8','eq10','eq12']):
    """Write a 'Statistic' group (names + values) into an existing hdf5 file.

    Parameters
    ----------
    file : str
        Path of the hdf5 file (opened in append mode).
    stat_data : np.ndarray, shape (no_attributes, len(statistics))
        Statistic values as produced by _get_statistic.
    statistics : list of str
        Statistic names stored alongside the data.
    """
    f = h5py.File(file, "a")
    try:
        stat_grp = f.create_group("Statistic")
        stat_grp.create_dataset("statistic_names",
                                data=np.array([v.encode('utf-8') for v in statistics]))
        # np.float was removed in numpy>=1.20; use the explicit 64-bit dtype
        stat_grp.create_dataset("statistic_data", data=stat_data.astype(np.float64))
    finally:
        # release the handle even if group/dataset creation fails
        f.close()
def add_statistic(file, statistics=['min','mean','max','std','eq3','eq4','eq6','eq8','eq10','eq12']):
    """Compute the requested statistics of an existing gtsdf file and
    append them to the file in a 'Statistic' group.

    Parameters
    ----------
    file : str
        Path of the gtsdf hdf5 file.
    statistics : list of str
        Statistic names understood by _get_statistic.
    """
    t, d, _ = load(file)
    _add_statistic_data(file, _get_statistic(t, d, statistics), statistics)
def load_statistic(filename):
    """Load the 'Statistic' group of a gtsdf hdf5 file.

    Parameters
    ----------
    filename : str or h5py.File
        Path of (or open handle to) the gtsdf hdf5 file.

    Returns
    -------
    (pandas.DataFrame, dict)
        DataFrame with one column per statistic name, and the file info.
    """
    f = _open_h5py_file(filename)
    try:
        info = _load_info(f)
        names = decode(f['Statistic']['statistic_names'])
        data = np.array(f['Statistic']['statistic_data'])
    finally:
        # the original leaked the handle; mirror load()'s best-effort close
        try:
            f.close()
        except Exception:
            pass
    return pd.DataFrame(data, columns=names), info
def compress2statistics(filename, statistics=['min','mean','max','std','eq3','eq4','eq6','eq8','eq10','eq12']):
    """Replace the time series in *filename* with its statistics.

    The file is rewritten to contain only the info header plus a
    'Statistic' group; the raw time/data blocks are discarded.

    Parameters
    ----------
    filename : str
        Path of the gtsdf hdf5 file to compress in place.
    statistics : list of str
        Statistic names understood by _get_statistic.
    """
    t, d, info = load(filename)
    stats = _get_statistic(t, d, statistics)
    _save_info(filename, d.shape, **info)
    _add_statistic_data(filename, stats, statistics)
'''
Created on 12/09/2013
@author: mmpe
'''
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import absolute_import
from builtins import super
from builtins import range
from future import standard_library
standard_library.install_aliases()
import h5py
import numpy as np
from wetb import gtsdf
import unittest
import os
tmp_path = os.path.dirname(__file__) + "/tmp/"
tfp = os.path.dirname(os.path.abspath(__file__)) + "/test_files/"
class Test_gsdf(unittest.TestCase):
    """Tests for the gtsdf statistics functions
    (add_statistic, load_statistic, compress2statistics)."""

    def setUp(self):
        unittest.TestCase.setUp(self)
        # scratch directory for the hdf5 files written by the tests
        if not os.path.isdir(tmp_path):
            os.makedirs(tmp_path)

    @classmethod
    def tearDownClass(cls):
        super(Test_gsdf, cls).tearDownClass()
        #shutil.rmtree(tmp_path)

    def test_gtsdf_stat(self):
        time, data, info = gtsdf.load(tfp + 'test.hdf5')
        fn = tmp_path + "test_stat.hdf5"
        gtsdf.save(fn, data, time=time, **info)
        gtsdf.add_statistic(fn)
        stat_data, info = gtsdf.load_statistic(fn)
        # first statistic column is 'min', so [0, 0] is min of sensor 0
        self.assertEqual(data[:, 0].min(), stat_data.values[0, 0])
        self.assertEqual(stat_data.shape, (49, 10))

    def test_gtsdf_compress2stat(self):
        time, data, info = gtsdf.load(tfp + 'test.hdf5')
        fn = tmp_path + "test_compress2stat.hdf5"
        gtsdf.save(fn, data, time=time, **info)
        # np.float was removed in numpy>=1.20; builtin float is the same dtype
        gtsdf.save(tmp_path + "test_compress2stat2.hdf5", data, time=time,
                   dtype=float, **info)
        gtsdf.compress2statistics(fn)
        # the statistics-only file must be far smaller than the raw file
        self.assertLess(os.path.getsize(fn) * 50, os.path.getsize(tfp + 'test.hdf5'))
# Allow running this test module directly.
if __name__ == "__main__":
    unittest.main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment