From 4ef58a7531dbf7059154d2b47b6a40e98fdf24f3 Mon Sep 17 00:00:00 2001
From: dave <dave@dtu.dk>
Date: Tue, 9 Aug 2016 11:06:40 +0200
Subject: [PATCH] prepost.simchunks: merge archived log and statsdel analysis
 into df

---
 wetb/prepost/simchunks.py | 60 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/wetb/prepost/simchunks.py b/wetb/prepost/simchunks.py
index 0e230d15..0535cc2b 100644
--- a/wetb/prepost/simchunks.py
+++ b/wetb/prepost/simchunks.py
@@ -22,9 +22,12 @@ from builtins import object
 import os
 import zipfile
 import copy
+import tarfile
+import glob
 
 import numpy as np
 import pandas as pd
+#from tqdm import tqdm
 
 from wetb.prepost.Simulations import Cases
 
@@ -390,5 +393,62 @@ def create_chunks_htc_pbs(cases, sort_by_values=['[Windspeed]'], ppn=20,
     df_ind.to_hdf(fname+'.h5', 'table', compression=9, complib='zlib')
     df_ind.to_csv(fname+'.csv')
 
+
+def merge_from_tarfiles(df_fname, path, pattern, tarmode='r:xz',
+                        tqdm=False, header='infer', sep=','):
+    """Merge all csv files from various tar archives into one pd.HDFStore.
+
+    The frames of each archive are appended to the store under key 'table'.
+    An existing store with the same name is overwritten (mode='w').
+
+    Parameters
+    ----------
+    df_fname : str
+        file name of the pd.DataFrame h5 store in which all chunks will be
+        merged. Names usually used are:
+            * [sim_id]_ErrorLogs.h5
+            * [sim_id]_statistics.h5
+
+    path : str
+        Directory in which all chunks are located.
+
+    pattern : str
+        Search pattern used to select (using glob.glob) files in path
+
+    tarmode : str, default='r:xz'
+        File opening mode for tarfile (used when opening each of the chunks).
+
+    tqdm : boolean, default=False
+        If True, an interactive progress bar will be displayed (requires the
+        tqdm module). If set to False no progress bar will be displayed.
+
+    header : str, default='infer'
+        Argument passed on to pandas.read_csv. Set to None if the compressed
+        chunks do not contain any headers.
+
+    sep : str, default=','
+        Argument passed on to pandas.read_csv. Set to ';' when handling the
+        ErrorLogs.
+    """
+    if tqdm:
+        from tqdm import tqdm as progress
+    else:
+        def progress(iterable):
+            return iterable
+
+    # format is an argument of append/put, not of the HDFStore constructor;
+    # the context manager closes the store even when a chunk fails to load.
+    with pd.HDFStore(os.path.join(path, df_fname), mode='w', complevel=9,
+                     complib='zlib') as store:
+        for tar_fname in progress(glob.glob(os.path.join(path, pattern))):
+            with tarfile.open(tar_fname, mode=tarmode) as tar:
+                # extractfile() only yields a readable object for files
+                frames = [pd.read_csv(tar.extractfile(member), header=header,
+                                      sep=sep)
+                          for member in tar.getmembers() if member.isfile()]
+            if frames:
+                store.append('table', pd.concat(frames), min_itemsize={})
+
+
 if __name__ == '__main__':
     pass
-- 
GitLab