
Commit 03e2fc4

(issue 724):
- moved LHDFStore to inout/hdf.py
- implemented PandasStorer and PytablesStorer
- updated LArray/Axis/Group.to_hdf
- removed Metadata.to_hdf and Metadata.from_hdf
- renamed PandasHDFHandler to HDFHandler
alixdamman committed Aug 23, 2019
1 parent 40c8dd9 commit 03e2fc4
Showing 11 changed files with 660 additions and 165 deletions.
19 changes: 19 additions & 0 deletions doc/source/api.rst
@@ -706,6 +706,25 @@ ReportSheet
ReportSheet.add_graphs
ReportSheet.newline

HDF
===

.. autosummary::
:toctree: _generated/

LHDFStore

.. autosummary::
:toctree: _generated/

LHDFStore.filename
LHDFStore.is_open
LHDFStore.keys
LHDFStore.items
LHDFStore.summary
LHDFStore.close


.. _api-misc:

Miscellaneous
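The LHDFStore entries added to api.rst above only list names; as a rough usage sketch (store.put() appears in the to_hdf() implementations further down in this commit, the rest is assumed from the summary list, including whether summary is a method):

    from larray import LHDFStore, ndtest

    # open (or create) an HDF5 file; it is closed automatically at the end of the with block
    with LHDFStore('test.h5') as store:
        store.put('arrays/a', ndtest((2, 3)))   # store an LArray under a key
        print(store.filename, store.is_open)    # basic information about the store
        print(store.keys())                     # keys currently present in the file
        print(store.summary())                  # overview of stored objects (assumed to be a method)
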
3 changes: 2 additions & 1 deletion larray/__init__.py
@@ -26,7 +26,7 @@
from larray.inout.pandas import from_frame, from_series
from larray.inout.csv import read_csv, read_tsv, read_eurostat
from larray.inout.excel import read_excel
from larray.inout.hdf import read_hdf
from larray.inout.hdf import read_hdf, LHDFStore
from larray.inout.sas import read_sas
from larray.inout.stata import read_stata
from larray.inout.xw_excel import open_excel, Workbook
@@ -78,6 +78,7 @@
'from_lists', 'from_string', 'from_frame', 'from_series', 'read_csv', 'read_tsv',
'read_eurostat', 'read_excel', 'read_hdf', 'read_sas', 'read_stata',
'open_excel', 'Workbook', 'ExcelReport', 'ReportSheet',
'LHDFStore',
# utils
'get_options', 'set_options',
# viewer
10 changes: 3 additions & 7 deletions larray/core/array.py
@@ -62,7 +62,7 @@
from larray.core.axis import Axis, AxisReference, AxisCollection, X, _make_axis
from larray.util.misc import (table2str, size2str, basestring, izip, rproduct, ReprString, duplicates,
float_error_handler_factory, _isnoneslice, light_product, unique_list, common_type,
renamed_to, deprecate_kwarg, LHDFStore, lazy_attribute, unique_multi, SequenceZip,
renamed_to, deprecate_kwarg, lazy_attribute, unique_multi, SequenceZip,
Repeater, Product, ensure_no_numpy_type, PY2)
from larray.util.options import _OPTIONS, DISPLAY_MAXLINES, DISPLAY_EDGEITEMS, DISPLAY_WIDTH, DISPLAY_PRECISION

@@ -6734,13 +6734,9 @@ def to_hdf(self, filepath, key):
>>> a.to_hdf('test.h5', 'arrays/a') # doctest: +SKIP
"""
key = _translate_group_key_hdf(key)
from larray.inout.hdf import LHDFStore
with LHDFStore(filepath) as store:
store.put(key, self.to_frame())
attrs = store.get_storer(key).attrs
attrs.type = 'Array'
attrs.writer = 'LArray'
self.meta.to_hdf(store, key)
store.put(key, self)

def to_stata(self, filepath_or_buffer, **kwargs):
r"""
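With the change above, LArray.to_hdf() is reduced to a single LHDFStore.put() call. A quick roundtrip sketch using the key layout from the docstring example (read_hdf is already part of the public API):

    from larray import ndtest, read_hdf

    a = ndtest((2, 3))
    a.to_hdf('test.h5', 'arrays/a')        # now just LHDFStore.put(key, array) under the hood
    a2 = read_hdf('test.h5', 'arrays/a')   # read the array back
    assert a.equals(a2)
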
12 changes: 3 additions & 9 deletions larray/core/axis.py
@@ -16,7 +16,7 @@
_range_to_slice, _seq_group_to_name, _translate_group_key_hdf, remove_nested_groups)
from larray.util.oset import *
from larray.util.misc import (basestring, PY2, unicode, long, duplicates, array_lookup2, ReprString, index_by_id,
renamed_to, common_type, LHDFStore, lazy_attribute, _isnoneslice, unique_multi, Product)
renamed_to, common_type, lazy_attribute, _isnoneslice, unique_multi, Product)


np_frompyfunc = np.frompyfunc
@@ -1344,19 +1344,13 @@ def to_hdf(self, filepath, key=None):
>>> a.to_hdf('test.h5', 'axes/a') # doctest: +SKIP
"""
from larray.inout.hdf import LHDFStore
if key is None:
if self.name is None:
raise ValueError("Argument key must be provided explicitly in case of anonymous axis")
key = self.name
key = _translate_group_key_hdf(key)
dtype_kind = self.labels.dtype.kind
data = np.char.encode(self.labels, 'utf-8') if dtype_kind == 'U' else self.labels
s = pd.Series(data=data, name=self.name)
with LHDFStore(filepath) as store:
store.put(key, s)
store.get_storer(key).attrs.type = 'Axis'
store.get_storer(key).attrs.dtype_kind = dtype_kind
store.get_storer(key).attrs.wildcard = self.iswildcard
store.put(key, self)

@property
def dtype(self):
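Same idea for Axis.to_hdf() above: only the key resolution stays in the method, the actual storage goes through LHDFStore.put(). A small sketch of that key handling (the ValueError is the one raised by the code kept above):

    from larray import Axis

    a = Axis('a=a0..a2')
    a.to_hdf('test.h5', 'axes/a')            # named axis: the key could also be omitted (defaults to 'a')

    anon = Axis(['a0', 'a1', 'a2'])          # axis without a name
    # anon.to_hdf('test.h5')                 # would raise ValueError: key required for an anonymous axis
    anon.to_hdf('test.h5', key='axes/anon')  # fine once the key is given explicitly
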
20 changes: 3 additions & 17 deletions larray/core/group.py
@@ -13,7 +13,7 @@
from larray.core.abstractbases import ABCAxis, ABCAxisReference, ABCLArray
from larray.util.oset import *
from larray.util.misc import (basestring, PY2, unique, find_closing_chr, _parse_bound, _seq_summary, _isintstring,
renamed_to, LHDFStore)
renamed_to)


def _slice_to_str(key, repr_func=str):
@@ -1453,27 +1453,13 @@ def to_hdf(self, filepath, key=None, axis_key=None):
>>> # save both the group 'b01' and the associated axis 'b'
>>> b01.to_hdf('test.h5') # doctest: +SKIP
"""
from larray.inout.hdf import LHDFStore
if key is None:
if self.name is None:
raise ValueError("Argument key must be provided explicitly in case of anonymous group")
key = self.name
key = _translate_group_key_hdf(key)
if axis_key is None:
if self.axis.name is None:
raise ValueError("Argument axis_key must be provided explicitly if the associated axis is anonymous")
axis_key = self.axis.name
data = self.eval()
dtype_kind = data.dtype.kind if isinstance(data, np.ndarray) else ''
if dtype_kind == 'U':
data = np.char.encode(data, 'utf-8')
s = pd.Series(data=data, name=self.name)
with LHDFStore(filepath) as store:
store.put(key, s)
store.get_storer(key).attrs.type = 'Group'
store.get_storer(key).attrs.dtype_kind = dtype_kind
if axis_key not in store:
self.axis.to_hdf(store, key=axis_key)
store.get_storer(key).attrs.axis_key = axis_key
store.put(key, self, axis_key=axis_key)

# this makes range(LGroup(int)) possible
def __index__(self):
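Likewise for Group.to_hdf() above: the group and, if not already present, its axis are both written through a single LHDFStore.put(key, group, axis_key=...) call. A sketch based on the docstring example (the 'b01' group name and the keys are illustrative):

    from larray import Axis

    b = Axis('b=b0..b3')
    b01 = b['b0:b1'] >> 'b01'                # named group defined on axis 'b'
    b01.to_hdf('test.h5')                    # saves both the group 'b01' and the axis 'b'
    b01.to_hdf('test.h5', key='groups/b01', axis_key='axes/b')   # explicit keys
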
14 changes: 0 additions & 14 deletions larray/core/metadata.py
@@ -162,17 +162,3 @@ def _convert_value(value):
return value

return Metadata([(key, _convert_value(value)) for key, value in zip(array.axes.labels[0], array.data)])

# ---------- IO methods ----------
def to_hdf(self, hdfstore, key=None):
if len(self):
attrs = hdfstore.get_storer(key).attrs if key is not None else hdfstore.root._v_attrs
attrs.metadata = self

@classmethod
def from_hdf(cls, hdfstore, key=None):
attrs = hdfstore.get_storer(key).attrs if key is not None else hdfstore.root._v_attrs
if 'metadata' in attrs:
return attrs.metadata
else:
return None
55 changes: 29 additions & 26 deletions larray/core/session.py
@@ -1,22 +1,41 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function

import fnmatch
import os
import sys
import re
import fnmatch
import sys
import warnings
from collections import OrderedDict, Iterable

import numpy as np

from larray.core.metadata import Metadata
from larray.core.group import Group
from larray.core.array import LArray, get_axes, ndtest, zeros, zeros_like, sequence
from larray.core.axis import Axis
from larray.core.constants import nan
from larray.core.array import LArray, get_axes, ndtest, zeros, zeros_like, sequence, aslarray
from larray.util.misc import float_error_handler_factory, is_interactive_interpreter, renamed_to, inverseop, basestring
from larray.core.group import Group
from larray.core.metadata import Metadata
from larray.inout.session import ext_default_engine, get_file_handler
from larray.util.misc import float_error_handler_factory, is_interactive_interpreter, renamed_to, inverseop, basestring


def _get_handler(engine, fname, overwrite, **kwargs):
if engine == 'auto':
_, ext = os.path.splitext(fname)
ext = ext.strip('.') if '.' in ext else 'csv'
engine = ext_default_engine[ext]
if engine == 'hdf':
engine_hdf = 'auto'
if '_hdf' in engine:
engine_hdf, engine = engine.split('_')
handler_cls = get_file_handler(engine)
if engine == 'pandas_csv' and 'sep' in kwargs:
handler = handler_cls(fname, overwrite, kwargs['sep'])
elif engine == 'hdf':
handler = handler_cls(fname, overwrite, engine=engine_hdf)
else:
handler = handler_cls(fname, overwrite)
return handler


# XXX: inherit from OrderedDict or LArray?
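The new _get_handler() helper centralises the engine resolution that load() and save() previously duplicated (see the two hunks below). The '*_hdf' splitting works roughly like this; the actual contents of ext_default_engine are not visible in this diff, so the 'hdf' default is an assumption:

    # standalone illustration of the '*_hdf' splitting performed by _get_handler()
    engine = 'tables_hdf'          # one of the engines documented for load()/save() below
    engine_hdf = 'auto'
    if '_hdf' in engine:
        engine_hdf, engine = engine.split('_')
    print(engine, engine_hdf)      # -> hdf tables: the HDF handler is created with engine='tables',
                                   #    which presumably selects the new PytablesStorer
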
@@ -358,7 +377,7 @@ def load(self, fname, names=None, engine='auto', display=False, **kwargs):
List of objects to load.
If `fname` is None, list of paths to CSV files.
Defaults to all valid objects present in the file/directory.
engine : {'auto', 'pandas_csv', 'pandas_hdf', 'pandas_excel', 'xlwings_excel', 'pickle'}, optional
engine : {'auto', 'pandas_csv', 'pandas_hdf', 'tables_hdf', 'pandas_excel', 'xlwings_excel', 'pickle'}, optional
Load using `engine`. Defaults to 'auto' (use default engine for the format guessed from the file extension).
display : bool, optional
Whether or not to display which file is being worked on. Defaults to False.
@@ -415,15 +434,7 @@ def load(self, fname, names=None, engine='auto', display=False, **kwargs):
engine = ext_default_engine['csv']
else:
raise ValueError("List of paths to only CSV files expected. Got {}".format(names))
if engine == 'auto':
_, ext = os.path.splitext(fname)
ext = ext.strip('.') if '.' in ext else 'csv'
engine = ext_default_engine[ext]
handler_cls = get_file_handler(engine)
if engine == 'pandas_csv' and 'sep' in kwargs:
handler = handler_cls(fname, kwargs['sep'])
else:
handler = handler_cls(fname)
handler = _get_handler(engine, fname, False, **kwargs)
metadata, objects = handler.read(names, display=display, **kwargs)
for k, v in objects.items():
self[k] = v
@@ -442,7 +453,7 @@ def save(self, fname, names=None, engine='auto', overwrite=True, display=False,
List of names of LArray/Axis/Group objects to dump.
If `fname` is None, list of paths to CSV files.
Defaults to all objects present in the Session.
engine : {'auto', 'pandas_csv', 'pandas_hdf', 'pandas_excel', 'xlwings_excel', 'pickle'}, optional
engine : {'auto', 'pandas_csv', 'pandas_hdf', 'tables_hdf', 'pandas_excel', 'xlwings_excel', 'pickle'}, optional
Dump using `engine`. Defaults to 'auto' (use default engine for the format guessed from the file extension).
overwrite: bool, optional
Whether or not to overwrite an existing file, if any. Ignored for CSV files and 'pandas_excel' engine.
@@ -482,15 +493,7 @@ def save(self, fname, names=None, engine='auto', overwrite=True, display=False,
>>> # replace arr1 and add arr4 in file output.h5
>>> s2.save('output.h5', overwrite=False) # doctest: +SKIP
"""
if engine == 'auto':
_, ext = os.path.splitext(fname)
ext = ext.strip('.') if '.' in ext else 'csv'
engine = ext_default_engine[ext]
handler_cls = get_file_handler(engine)
if engine == 'pandas_csv' and 'sep' in kwargs:
handler = handler_cls(fname, overwrite, kwargs['sep'])
else:
handler = handler_cls(fname, overwrite)
handler = _get_handler(engine, fname, overwrite, **kwargs)
meta = self.meta if overwrite else None
items = self.items()
if names is not None:
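From the user side, load() and save() now also accept the 'tables_hdf' engine documented above. A minimal sketch (the keyword-argument Session constructor is only used for brevity):

    from larray import Session, ndtest

    s = Session(arr1=ndtest((2, 3)), arr2=ndtest(4))
    s.save('output.h5')                        # engine='auto': resolved from the .h5 extension
    s.save('output.h5', engine='tables_hdf')   # force the PyTables-backed HDF engine

    s2 = Session()
    s2.load('output.h5', engine='tables_hdf')
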
(diff for the remaining 4 changed files not shown)
