From f88090a0283299fee5d897eb31924f3582e7f00b Mon Sep 17 00:00:00 2001 From: Alix Damman Date: Mon, 15 Apr 2019 16:25:47 +0200 Subject: [PATCH 1/4] (issue 724) : - moved LHDFStore to inout/hdf.py - implemented PandasStorer and PytablesStorer - updated LArray/Axis/Group.to_hdf - removed Metadata.to_hdf and Metadata.from_hdf - renamed PandasHDFHandler as HDFHandler --- doc/source/api.rst | 19 ++ larray/__init__.py | 3 +- larray/core/array.py | 10 +- larray/core/axis.py | 12 +- larray/core/group.py | 20 +- larray/core/metadata.py | 14 - larray/core/session.py | 55 ++-- larray/inout/hdf.py | 622 +++++++++++++++++++++++++++++++---- larray/tests/test_array.py | 11 +- larray/tests/test_session.py | 44 ++- larray/util/misc.py | 22 -- 11 files changed, 667 insertions(+), 165 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index a23f3794b..6b6f54e62 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -706,6 +706,25 @@ ReportSheet ReportSheet.add_graphs ReportSheet.newline +HDF +=== + +.. autosummary:: + :toctree: _generated/ + + LHDFStore + +.. autosummary:: + :toctree: _generated/ + + LHDFStore.filename + LHDFStore.is_open + LHDFStore.keys + LHDFStore.items + LHDFStore.summary + LHDFStore.close + + .. _api-misc: Miscellaneous diff --git a/larray/__init__.py b/larray/__init__.py index ab65cf548..6e011a042 100644 --- a/larray/__init__.py +++ b/larray/__init__.py @@ -26,7 +26,7 @@ from larray.inout.pandas import from_frame, from_series from larray.inout.csv import read_csv, read_tsv, read_eurostat from larray.inout.excel import read_excel -from larray.inout.hdf import read_hdf +from larray.inout.hdf import read_hdf, LHDFStore from larray.inout.sas import read_sas from larray.inout.stata import read_stata from larray.inout.xw_excel import open_excel, Workbook @@ -78,6 +78,7 @@ 'from_lists', 'from_string', 'from_frame', 'from_series', 'read_csv', 'read_tsv', 'read_eurostat', 'read_excel', 'read_hdf', 'read_sas', 'read_stata', 'open_excel', 'Workbook', 'ExcelReport', 'ReportSheet', + 'LHDFStore', # utils 'get_options', 'set_options', # viewer diff --git a/larray/core/array.py b/larray/core/array.py index 17612fa91..1e07e5a77 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -62,7 +62,7 @@ from larray.core.axis import Axis, AxisReference, AxisCollection, X, _make_axis from larray.util.misc import (table2str, size2str, basestring, izip, rproduct, ReprString, duplicates, float_error_handler_factory, _isnoneslice, light_product, unique_list, common_type, - renamed_to, deprecate_kwarg, LHDFStore, lazy_attribute, unique_multi, SequenceZip, + renamed_to, deprecate_kwarg, lazy_attribute, unique_multi, SequenceZip, Repeater, Product, ensure_no_numpy_type, PY2) from larray.util.options import _OPTIONS, DISPLAY_MAXLINES, DISPLAY_EDGEITEMS, DISPLAY_WIDTH, DISPLAY_PRECISION @@ -6734,13 +6734,9 @@ def to_hdf(self, filepath, key): >>> a.to_hdf('test.h5', 'arrays/a') # doctest: +SKIP """ - key = _translate_group_key_hdf(key) + from larray.inout.hdf import LHDFStore with LHDFStore(filepath) as store: - store.put(key, self.to_frame()) - attrs = store.get_storer(key).attrs - attrs.type = 'Array' - attrs.writer = 'LArray' - self.meta.to_hdf(store, key) + store.put(key, self) def to_stata(self, filepath_or_buffer, **kwargs): r""" diff --git a/larray/core/axis.py b/larray/core/axis.py index 0c11f0997..f7ee22e38 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -16,7 +16,7 @@ _range_to_slice, _seq_group_to_name, _translate_group_key_hdf, remove_nested_groups) from 
larray.util.oset import * from larray.util.misc import (basestring, PY2, unicode, long, duplicates, array_lookup2, ReprString, index_by_id, - renamed_to, common_type, LHDFStore, lazy_attribute, _isnoneslice, unique_multi, Product) + renamed_to, common_type, lazy_attribute, _isnoneslice, unique_multi, Product) np_frompyfunc = np.frompyfunc @@ -1344,19 +1344,13 @@ def to_hdf(self, filepath, key=None): >>> a.to_hdf('test.h5', 'axes/a') # doctest: +SKIP """ + from larray.inout.hdf import LHDFStore if key is None: if self.name is None: raise ValueError("Argument key must be provided explicitly in case of anonymous axis") key = self.name - key = _translate_group_key_hdf(key) - dtype_kind = self.labels.dtype.kind - data = np.char.encode(self.labels, 'utf-8') if dtype_kind == 'U' else self.labels - s = pd.Series(data=data, name=self.name) with LHDFStore(filepath) as store: - store.put(key, s) - store.get_storer(key).attrs.type = 'Axis' - store.get_storer(key).attrs.dtype_kind = dtype_kind - store.get_storer(key).attrs.wildcard = self.iswildcard + store.put(key, self) @property def dtype(self): diff --git a/larray/core/group.py b/larray/core/group.py index b73417381..faf58a5cb 100644 --- a/larray/core/group.py +++ b/larray/core/group.py @@ -13,7 +13,7 @@ from larray.core.abstractbases import ABCAxis, ABCAxisReference, ABCLArray from larray.util.oset import * from larray.util.misc import (basestring, PY2, unique, find_closing_chr, _parse_bound, _seq_summary, _isintstring, - renamed_to, LHDFStore) + renamed_to) def _slice_to_str(key, repr_func=str): @@ -1453,27 +1453,13 @@ def to_hdf(self, filepath, key=None, axis_key=None): >>> # save both the group 'b01' and the associated axis 'b' >>> b01.to_hdf('test.h5') # doctest: +SKIP """ + from larray.inout.hdf import LHDFStore if key is None: if self.name is None: raise ValueError("Argument key must be provided explicitly in case of anonymous group") key = self.name - key = _translate_group_key_hdf(key) - if axis_key is None: - if self.axis.name is None: - raise ValueError("Argument axis_key must be provided explicitly if the associated axis is anonymous") - axis_key = self.axis.name - data = self.eval() - dtype_kind = data.dtype.kind if isinstance(data, np.ndarray) else '' - if dtype_kind == 'U': - data = np.char.encode(data, 'utf-8') - s = pd.Series(data=data, name=self.name) with LHDFStore(filepath) as store: - store.put(key, s) - store.get_storer(key).attrs.type = 'Group' - store.get_storer(key).attrs.dtype_kind = dtype_kind - if axis_key not in store: - self.axis.to_hdf(store, key=axis_key) - store.get_storer(key).attrs.axis_key = axis_key + store.put(key, self, axis_key=axis_key) # this makes range(LGroup(int)) possible def __index__(self): diff --git a/larray/core/metadata.py b/larray/core/metadata.py index c0d9f32b5..beb3d07e9 100644 --- a/larray/core/metadata.py +++ b/larray/core/metadata.py @@ -162,17 +162,3 @@ def _convert_value(value): return value return Metadata([(key, _convert_value(value)) for key, value in zip(array.axes.labels[0], array.data)]) - - # ---------- IO methods ---------- - def to_hdf(self, hdfstore, key=None): - if len(self): - attrs = hdfstore.get_storer(key).attrs if key is not None else hdfstore.root._v_attrs - attrs.metadata = self - - @classmethod - def from_hdf(cls, hdfstore, key=None): - attrs = hdfstore.get_storer(key).attrs if key is not None else hdfstore.root._v_attrs - if 'metadata' in attrs: - return attrs.metadata - else: - return None diff --git a/larray/core/session.py b/larray/core/session.py index 
110c0d1e3..6c597f854 100644 --- a/larray/core/session.py +++ b/larray/core/session.py @@ -1,22 +1,41 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, division, print_function +import fnmatch import os -import sys import re -import fnmatch +import sys import warnings from collections import OrderedDict, Iterable import numpy as np -from larray.core.metadata import Metadata -from larray.core.group import Group +from larray.core.array import LArray, get_axes, ndtest, zeros, zeros_like, sequence from larray.core.axis import Axis from larray.core.constants import nan -from larray.core.array import LArray, get_axes, ndtest, zeros, zeros_like, sequence, aslarray -from larray.util.misc import float_error_handler_factory, is_interactive_interpreter, renamed_to, inverseop, basestring +from larray.core.group import Group +from larray.core.metadata import Metadata from larray.inout.session import ext_default_engine, get_file_handler +from larray.util.misc import float_error_handler_factory, is_interactive_interpreter, renamed_to, inverseop, basestring + + +def _get_handler(engine, fname, overwrite, **kwargs): + if engine == 'auto': + _, ext = os.path.splitext(fname) + ext = ext.strip('.') if '.' in ext else 'csv' + engine = ext_default_engine[ext] + if engine == 'hdf': + engine_hdf = 'auto' + if '_hdf' in engine: + engine_hdf, engine = engine.split('_') + handler_cls = get_file_handler(engine) + if engine == 'pandas_csv' and 'sep' in kwargs: + handler = handler_cls(fname, overwrite, kwargs['sep']) + elif engine == 'hdf': + handler = handler_cls(fname, overwrite, engine=engine_hdf) + else: + handler = handler_cls(fname, overwrite) + return handler # XXX: inherit from OrderedDict or LArray? @@ -358,7 +377,7 @@ def load(self, fname, names=None, engine='auto', display=False, **kwargs): List of objects to load. If `fname` is None, list of paths to CSV files. Defaults to all valid objects present in the file/directory. - engine : {'auto', 'pandas_csv', 'pandas_hdf', 'pandas_excel', 'xlwings_excel', 'pickle'}, optional + engine : {'auto', 'pandas_csv', 'pandas_hdf', 'tables_hdf', 'pandas_excel', 'xlwings_excel', 'pickle'}, optional Load using `engine`. Defaults to 'auto' (use default engine for the format guessed from the file extension). display : bool, optional Whether or not to display which file is being worked on. Defaults to False. @@ -415,15 +434,7 @@ def load(self, fname, names=None, engine='auto', display=False, **kwargs): engine = ext_default_engine['csv'] else: raise ValueError("List of paths to only CSV files expected. Got {}".format(names)) - if engine == 'auto': - _, ext = os.path.splitext(fname) - ext = ext.strip('.') if '.' in ext else 'csv' - engine = ext_default_engine[ext] - handler_cls = get_file_handler(engine) - if engine == 'pandas_csv' and 'sep' in kwargs: - handler = handler_cls(fname, kwargs['sep']) - else: - handler = handler_cls(fname) + handler = _get_handler(engine, fname, False, **kwargs) metadata, objects = handler.read(names, display=display, **kwargs) for k, v in objects.items(): self[k] = v @@ -442,7 +453,7 @@ def save(self, fname, names=None, engine='auto', overwrite=True, display=False, List of names of LArray/Axis/Group objects to dump. If `fname` is None, list of paths to CSV files. Defaults to all objects present in the Session. 
- engine : {'auto', 'pandas_csv', 'pandas_hdf', 'pandas_excel', 'xlwings_excel', 'pickle'}, optional + engine : {'auto', 'pandas_csv', 'pandas_hdf', 'tables_hdf', 'pandas_excel', 'xlwings_excel', 'pickle'}, optional Dump using `engine`. Defaults to 'auto' (use default engine for the format guessed from the file extension). overwrite: bool, optional Whether or not to overwrite an existing file, if any. Ignored for CSV files and 'pandas_excel' engine. @@ -482,15 +493,7 @@ def save(self, fname, names=None, engine='auto', overwrite=True, display=False, >>> # replace arr1 and add arr4 in file output.h5 >>> s2.save('output.h5', overwrite=False) # doctest: +SKIP """ - if engine == 'auto': - _, ext = os.path.splitext(fname) - ext = ext.strip('.') if '.' in ext else 'csv' - engine = ext_default_engine[ext] - handler_cls = get_file_handler(engine) - if engine == 'pandas_csv' and 'sep' in kwargs: - handler = handler_cls(fname, overwrite, kwargs['sep']) - else: - handler = handler_cls(fname, overwrite) + handler = _get_handler(engine, fname, overwrite, **kwargs) meta = self.meta if overwrite else None items = self.items() if names is not None: diff --git a/larray/inout/hdf.py b/larray/inout/hdf.py index 92bbc7516..14d6d6b46 100644 --- a/larray/inout/hdf.py +++ b/larray/inout/hdf.py @@ -1,20 +1,564 @@ from __future__ import absolute_import, print_function +import os import warnings import numpy as np -from pandas import HDFStore +from pandas import Series, HDFStore from larray.core.array import LArray from larray.core.axis import Axis from larray.core.constants import nan from larray.core.group import Group, LGroup, _translate_group_key_hdf from larray.core.metadata import Metadata -from larray.util.misc import LHDFStore -from larray.inout.session import register_file_handler +from larray.example import get_example_filepath from larray.inout.common import FileHandler from larray.inout.pandas import df_aslarray -from larray.example import get_example_filepath +from larray.inout.session import register_file_handler + + +class ClosedFileError(Exception): + pass + + +class AbstractStorer(object): + def __init__(self, filepath, mode=None, complevel=None, complib=None, fletcher32=False, **kwargs): + pandas_hdfstore = HDFStore(filepath, mode, complevel, complib, fletcher32, **kwargs) + self._pandas_hdfstore = pandas_hdfstore + self._path = pandas_hdfstore._path + self._mode = pandas_hdfstore._mode + self._handle = pandas_hdfstore._handle + self._complevel = pandas_hdfstore._complevel + self._complib = pandas_hdfstore._complib + self._fletcher32 = pandas_hdfstore._fletcher32 + self._filters = pandas_hdfstore._filters + + @property + def root(self): + """ return the root node """ + return self._pandas_hdfstore.root + + @property + def attrs(self): + return self.root._v_attrs + + @property + def is_open(self): + """ + return a boolean indicating whether the file is open + """ + return self._pandas_hdfstore.is_open + + def __contains__(self, key): + return key in self._pandas_hdfstore + + def __len__(self): + return len(self._pandas_hdfstore) + + def get_node(self, key): + return self._pandas_hdfstore.get_node(key) + + def close(self): + self._pandas_hdfstore.close() + + def remove(self, key): + """ + Remove LArray object. + + Parameters + ---------- + key : str + Key associated to the object to be removed. + """ + s = self._pandas_hdfstore.get_storer(key) + s.group._f_remove(recursive=True) + + def groups(self): + """ + return a list of all groups containing an LArray object. 
+ """ + raise NotImplementedError() + + def _check_if_open(self): + if not self.is_open: + raise ClosedFileError("{} file is not open!".format(self._path)) + + def _get(self, key, **kwargs): + raise NotImplementedError() + + def get(self, key, **kwargs): + key = _translate_group_key_hdf(key) + return self._get(key, **kwargs) + + def _put(self, key, value, **kwargs): + raise NotImplementedError() + + def put(self, key, value, **kwargs): + key = _translate_group_key_hdf(key) + self._put(key, value, **kwargs) + + +class PytablesStorer(AbstractStorer): + """ + Read and write LArray objects into HDF5 file using pytables. + """ + def __init__(self, filepath, mode=None, complevel=None, complib=None, fletcher32=False, **kwargs): + AbstractStorer.__init__(self, filepath, mode, complevel, complib, fletcher32, **kwargs) + + def groups(self): + import tables + self._check_if_open() + return [g for g in self._handle.walk_groups() + if (not isinstance(g, tables.link.Link) and 'type' in g._v_attrs)] + + def _read_data(self, group, name, attrs): + dtype = np.dtype(attrs['dtype']) + data = group[name].read() + if dtype.kind == 'U': + data = np.char.decode(data, 'utf-8') + if dtype.kind == 'O': + data = data[0] + data = data.astype(dtype) + return data + + def _read_group(self, group): + def _get_name(attrs): + name = attrs['name'] + return name if name is None else str(name) + + attrs = group._v_attrs + _type = attrs.type if 'type' in attrs else 'Array' + _meta = attrs.metadata if 'metadata' in attrs else None + res = None + if _type == 'Array': + axes_keys = [n._v_pathname for n in group if n._v_name.startswith('axis')] + axes = [self._get(axis_key) for axis_key in axes_keys] + data = self._read_data(group, 'data', attrs) + res = LArray(data=data, axes=axes) + if _meta is not None: + res.meta = _meta + elif _type == 'Axis': + name = _get_name(attrs) + labels = self._read_data(group, 'labels', attrs) + res = Axis(labels=labels, name=name) + res._iswildcard = attrs['wildcard'] + elif _type == 'Group': + axis = self._get(attrs['axis_key']) + name = _get_name(attrs) + key = self._read_data(group, 'key', attrs) + res = LGroup(key=key, name=name, axis=axis) + return res + + def _get(self, key, **kwargs): + group = self.get_node(key) + if group is None: + raise KeyError('No object named {} in the file'.format(key)) + return self._read_group(group) + + def _dump_data(self, group, name, data, attrs): + import tables + data = np.asarray(data) + dtype = data.dtype + attrs['dtype'] = dtype + # https://www.pytables.org/MIGRATING_TO_3.x.html#unicode-all-the-strings + # Warning: In Python 3, all strings are natively in Unicode. + # This introduces some difficulties, as the native HDF5 string format is not Unicode-compatible. + # To minimize explicit conversion troubles when writing, especially when creating data sets + # from existing Python objects, string objects are implicitly cast to non-Unicode byte strings + # for HDF5 storage by default. + # To avoid such problem, one way is to use the VLArray class and dump unicode string arrays + # as object arrays. 
+ if dtype.kind == 'O': + vlarr = self._handle.create_vlarray(group, name=name, filters=self._filters, atom=tables.ObjectAtom()) + vlarr.append(data) + else: + if dtype.kind == 'U': + data = np.char.encode(data, 'utf-8') + self._handle.create_carray(group, name=name, obj=data, filters=self._filters) + + def _write_obj(self, group, value, **kwargs): + if isinstance(value, LArray): + attrs = group._v_attrs + attrs['type'] = 'Array' + # dump axes + for axis in value.axes: + axis_key = 'axis_{}'.format(value.axes.axis_id(axis)) + axis_group = self._handle.create_group(group, axis_key) + self._write_obj(axis_group, axis) + # dump data + self._dump_data(group, name='data', data=value.data, attrs=attrs) + # dump metadata + self._write_obj(group, value.meta) + elif isinstance(value, Axis): + attrs = group._v_attrs + attrs['type'] = 'Axis' + attrs['name'] = value.name + attrs['wildcard'] = value.iswildcard + self._dump_data(group, name='labels', data=value.labels, attrs=attrs) + elif isinstance(value, Group): + axis_key = kwargs.pop('axis_key', None) + if axis_key is None: + if value.axis.name is None: + raise ValueError( + "Argument axis_key must be provided explicitly if the associated axis is anonymous") + axis_key = value.axis.name + if self.get_node(axis_key) is None: + self._put(axis_key, value.axis) + attrs = group._v_attrs + attrs['type'] = 'Group' + attrs['name'] = value.name + attrs['axis_key'] = axis_key + self._dump_data(group, name='key', data=value.eval(), attrs=attrs) + elif isinstance(value, Metadata): + if len(value): + group._v_attrs['metadata'] = value + else: + warnings.warn('{}: Type {} is currently not supported'.format(group._v_name, type(value))) + + def _write_group(self, key, value, **kwargs): + # remove the group if exists already + group = self.get_node(key) + if group is not None: + self._handle.remove_node(group, recursive=True) + paths = key.split('/') + # recursively create the parent groups + path = '/' + for p in paths: + if not len(p): + continue + new_path = path + if not path.endswith('/'): + new_path += '/' + new_path += p + group = self.get_node(new_path) + if group is None: + group = self._handle.create_group(path, p) + path = new_path + self._write_obj(group, value, **kwargs) + + def _put(self, key, value, **kwargs): + key = _translate_group_key_hdf(key) + self._write_group(key, value, **kwargs) + + +class PandasStorer(AbstractStorer): + """ + Read and write LArray objects into HDF5 file using pandas. 
+ """ + def __init__(self, filepath, mode=None, complevel=None, complib=None, fletcher32=False, **kwargs): + AbstractStorer.__init__(self, filepath, mode, complevel, complib, fletcher32, **kwargs) + + def groups(self): + return self._pandas_hdfstore.groups() + + def _get(self, key, **kwargs): + name = kwargs.pop('name', None) + pd_obj = self._pandas_hdfstore.get(key) + attrs = self._pandas_hdfstore.get_storer(key).attrs + _writer = attrs.writer if 'writer' in attrs else None + # for backward compatibility but any object read from an hdf file should have an attribute 'type' + _type = attrs.type if 'type' in attrs else 'Array' + _meta = attrs.metadata if 'metadata' in attrs else None + res = None + if _type == 'Array': + sort_rows = kwargs.pop('sort_rows', False) + sort_columns = kwargs.pop('sort_columns', False) + fill_value = kwargs.pop('fill_value', nan) + # cartesian product is not necessary if the array was written by LArray + cartesian_prod = _writer != 'LArray' + res = df_aslarray(pd_obj, sort_rows=sort_rows, sort_columns=sort_columns, fill_value=fill_value, + parse_header=False, cartesian_prod=cartesian_prod) + if _meta is not None: + res.meta = _meta + elif _type == 'Axis': + if name is None: + name = str(pd_obj.name) + if name == 'None': + name = None + labels = pd_obj.values + dtype = attrs['dtype'] if 'dtype' in attrs else None + if dtype is not None and dtype.kind == 'U': + labels = np.char.decode(labels, 'utf-8') + res = Axis(labels=labels, name=name) + res._iswildcard = attrs['wildcard'] + elif _type == 'Group': + if name is None: + name = str(pd_obj.name) + if name == 'None': + name = None + key = pd_obj.values + dtype = attrs['dtype'] if 'dtype' in attrs else None + if dtype is not None and dtype.kind == 'U': + key = np.char.decode(key, 'utf-8') + axis = self._get(attrs['axis_key']) + res = LGroup(key=key, name=name, axis=axis) + return res + + def _put(self, key, value, **kwargs): + pd_store = self._pandas_hdfstore + if isinstance(value, LArray): + pd_store.put(key, value.to_frame()) + attrs = pd_store.get_storer(key).attrs + attrs.type = 'Array' + attrs.writer = 'LArray' + self._put(key, value.meta) + elif isinstance(value, Axis): + dtype = value.dtype + labels = np.char.encode(value.labels, 'utf-8') if dtype.kind == 'U' else value.labels + s = Series(data=labels, name=value.name) + pd_store.put(key, s) + attrs = pd_store.get_storer(key).attrs + attrs.type = 'Axis' + attrs.dtype = dtype + attrs.wildcard = value.iswildcard + elif isinstance(value, Group): + axis_key = kwargs.pop('axis_key', None) + if axis_key is None: + if value.axis.name is None: + raise ValueError( + "Argument axis_key must be provided explicitly if the associated axis is anonymous") + axis_key = value.axis.name + if axis_key not in pd_store: + self._put(axis_key, value.axis) + data = value.eval() + dtype = data.dtype if isinstance(data, np.ndarray) else None + if dtype is not None and dtype.kind == 'U': + data = np.char.encode(data, 'utf-8') + s = Series(data=data, name=value.name) + pd_store.put(key, s) + attrs = pd_store.get_storer(key).attrs + attrs.type = 'Group' + attrs.dtype = dtype + attrs.axis_key = axis_key + elif isinstance(value, Metadata): + if len(value): + pd_store.get_storer(key).attrs.metadata = value + else: + warnings.warn('{}: Type {} is currently not supported'.format(key, type(value))) + + +_hdf_store_cls = {'pandas': PandasStorer, 'tables': PytablesStorer} + + +class LHDFStore(object): + """Context manager for reading and writing LArray objects. 
+
+    Parameters
+    ----------
+    filepath : str or PathLike object
+        File path to HDF5 file
+    mode : {'a', 'w', 'r', 'r+'}, default 'a'
+
+        ``'r'``
+            Read-only; no data can be modified.
+        ``'w'``
+            Write; a new file is created (an existing file with the same
+            name would be deleted).
+        ``'a'``
+            Append; an existing file is opened for reading and writing,
+            and if the file does not exist it is created.
+        ``'r+'``
+            It is similar to ``'a'``, but the file must already exist.
+    complevel : int, 0-9, default None
+        Specifies a compression level for data.
+        A value of 0 disables compression.
+    complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
+        Specifies the compression library to be used.
+    fletcher32 : bool, default False
+        If applying compression use the fletcher32 checksum
+    engine: {'auto', 'tables', 'pandas'}, optional
+        Load using `engine`. Use 'pandas' to read an HDF file generated with a LArray version previous to 0.31.
+        Defaults to 'auto' (use default engine if you don't know the LArray version used to produce the HDF file).
+
+    Examples
+    --------
+    # TODO : write examples
+    """
+    def __init__(self, filepath, mode=None, complevel=None, complib=None,
+                 fletcher32=False, engine='auto', **kwargs):
+        try:
+            import tables
+        except ImportError:
+            raise ImportError('LHDFStore requires PyTables to be installed')
+
+        is_new_file = not os.path.exists(filepath)
+        if is_new_file and mode in ['r', 'r+']:
+            raise ValueError('The file {} has not been found.'.format(filepath))
+
+        if engine == 'auto':
+            if is_new_file:
+                engine = 'tables'
+            else:
+                import tables
+                handle = tables.open_file(filepath, mode='r')
+                # for backward compatibility, we assume that the used engine is 'pandas'
+                # if not found among root attributes
+                engine = getattr(handle.root._v_attrs, 'engine', 'pandas')
+                handle.close()
+        if engine not in _hdf_store_cls.keys():
+            raise ValueError("Value of the 'engine' argument must be in list: "
+                             + ", ".join(['auto'] + list(_hdf_store_cls.keys())))
+
+        storer = _hdf_store_cls[engine](filepath, mode, complevel, complib, fletcher32, **kwargs)
+
+        if is_new_file or mode == 'w':
+            storer.attrs['engine'] = engine
+
+        if getattr(storer.attrs, 'engine', 'pandas') != engine:
+            raise Exception("Cannot {action} file {file}. Passed value for 'engine' argument was {engine_arg} "
+                            "while the file {file} was originally created using "
+                            "{engine}".format(action="read from" if mode == 'r' else "write into", file=filepath,
+                                              engine_arg=engine, engine=storer.attrs['engine']))
+
+        self._storer = storer
+
+    def __fspath__(self):
+        return self._storer._path
+
+    @property
+    def filename(self):
+        """ File path to HDF5 file """
+        return self._storer._path
+
+    @property
+    def is_open(self):
+        """
+        Return a boolean indicating whether the file is open
+        """
+        return self._storer.is_open
+
+    @property
+    def meta(self):
+        return getattr(self._storer.attrs, 'metadata', Metadata())
+
+    @meta.setter
+    def meta(self, meta):
+        self._storer.attrs.metadata = meta
+
+    def __getitem__(self, key):
+        return self.get(key)
+
+    def __setitem__(self, key, value):
+        self.put(key, value)
+
+    def __delitem__(self, key):
+        return self._storer.remove(key)
+
+    # TODO: not sure about this. Should be implemented in LazySession.
+ def __getattr__(self, key): + """ allow attribute access to get stores """ + if key in self.keys(): + return self.get(key) + else: + raise AttributeError("'{}' object has no attribute '{}'".format(self.__class__.__name__, key)) + + def __contains__(self, key): + return key in self._storer + + def __len__(self): + return len(self._storer) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + def close(self): + """ + Close the PyTables file handle + """ + self._storer.close() + + def keys(self): + """ + Return a (potentially unordered) list of the keys corresponding to the + objects stored in the HDFStore. These are ABSOLUTE path-names (e.g. + have the leading '/' + """ + return [n._v_pathname for n in self._storer.groups()] + + def __iter__(self): + return iter(self.keys()) + + def items(self): + """ + Iterate on key->group + """ + for g in self._storer.groups(): + yield g._v_pathname, g + + iteritems = items + + def summary(self): + """ + Return a list of LArray stored in the HDF5 file. + + Examples + -------- + TODO: write examples + """ + if self.is_open: + res = "" + for name, group in self.items(): + _type = getattr(group._v_attrs, 'type', 'Unknown') + res += "{}: {}\n".format(name, _type) + return res + else: + return "File {} is CLOSED".format(self.filename) + + def get(self, key, **kwargs): + """ + Retrieve a larray object stored in file. + + Parameters + ---------- + key : str + Name of the object to read. + **kwargs + + * fill_value : scalar or LArray, optional + Value used to fill cells corresponding to label combinations which are not present in the input. + Defaults to NaN. + * sort_rows : bool, optional + Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting). + Defaults to False. + * sort_columns : bool, optional + Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting). + Defaults to False. + * name : str, optional + Name of the axis or group to return. If None, name is set to passed key. + Defaults to None. + + Returns + ------- + obj : same type as object stored in file. + + Examples + -------- + TODO : write examples + """ + return self._storer.get(key, **kwargs) + + def put(self, key, value, **kwargs): + """ + Dump a larray object in file. + + Parameters + ---------- + key: str + Name of the object to dump. + value: LArray, Axis or Group + Object to dump. + **kwargs + + * ??? + + Examples + -------- + TODO : write examples + """ + self._storer.put(key, value, **kwargs) def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, sort_columns=False, @@ -23,10 +567,10 @@ def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, s Parameters ---------- - filepath_or_buffer : str or pandas.HDFStore + filepath_or_buffer : str or LArrayHDFStore Path and name where the HDF5 file is stored or a HDFStore object. key : str or Group - Name of the array. + Name of the object to read. fill_value : scalar or LArray, optional Value used to fill cells corresponding to label combinations which are not present in the input. Defaults to NaN. @@ -66,56 +610,25 @@ def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, s fill_value = na warnings.warn("read_hdf `na` argument has been renamed to `fill_value`. 
Please use that instead.", FutureWarning, stacklevel=2) - - key = _translate_group_key_hdf(key) - res = None - with LHDFStore(filepath_or_buffer) as store: - pd_obj = store.get(key) - attrs = store.get_storer(key).attrs - writer = attrs.writer if 'writer' in attrs else None - # for backward compatibility but any object read from an hdf file should have an attribute 'type' - _type = attrs.type if 'type' in attrs else 'Array' - _meta = attrs.metadata if 'metadata' in attrs else None - if _type == 'Array': - # cartesian product is not necessary if the array was written by LArray - cartesian_prod = writer != 'LArray' - res = df_aslarray(pd_obj, sort_rows=sort_rows, sort_columns=sort_columns, fill_value=fill_value, - parse_header=False, cartesian_prod=cartesian_prod) - if _meta is not None: - res.meta = _meta - elif _type == 'Axis': - if name is None: - name = str(pd_obj.name) - if name == 'None': - name = None - labels = pd_obj.values - if 'dtype_kind' in attrs and attrs['dtype_kind'] == 'U': - labels = np.char.decode(labels, 'utf-8') - res = Axis(labels=labels, name=name) - res._iswildcard = attrs['wildcard'] - elif _type == 'Group': - if name is None: - name = str(pd_obj.name) - if name == 'None': - name = None - key = pd_obj.values - if 'dtype_kind' in attrs and attrs['dtype_kind'] == 'U': - key = np.char.decode(key, 'utf-8') - axis = read_hdf(filepath_or_buffer, attrs['axis_key']) - res = LGroup(key=key, name=name, axis=axis) + with LHDFStore(filepath_or_buffer, **kwargs) as store: + res = store.get(key, fill_value=fill_value, sort_rows=sort_rows, sort_columns=sort_columns, name=name) return res -@register_file_handler('pandas_hdf', ['h5', 'hdf']) -class PandasHDFHandler(FileHandler): +@register_file_handler('hdf', ['h5', 'hdf']) +class HDFHandler(FileHandler): r""" Handler for HDF5 files using Pandas. 
""" + def __init__(self, fname, overwrite_file=False, engine='auto'): + super(HDFHandler, self).__init__(fname, overwrite_file) + self.engine = engine + def _open_for_read(self): - self.handle = HDFStore(self.fname, mode='r') + self.handle = LHDFStore(self.fname, mode='r', engine=self.engine) def _open_for_write(self): - self.handle = HDFStore(self.fname) + self.handle = LHDFStore(self.fname, engine=self.engine) def list_items(self): keys = [key.strip('/') for key in self.handle.keys()] @@ -138,30 +651,25 @@ def _read_item(self, key, type, *args, **kwargs): kwargs['name'] = key else: raise TypeError() - return read_hdf(self.handle, hdf_key, *args, **kwargs) + return self.handle.get(hdf_key, **kwargs) def _dump_item(self, key, value, *args, **kwargs): if isinstance(value, LArray): hdf_key = '/' + key - value.to_hdf(self.handle, hdf_key, *args, **kwargs) elif isinstance(value, Axis): hdf_key = '__axes__/' + key - value.to_hdf(self.handle, hdf_key, *args, **kwargs) elif isinstance(value, Group): hdf_key = '__groups__/' + key - hdf_axis_key = '__axes__/' + value.axis.name - value.to_hdf(self.handle, hdf_key, hdf_axis_key, *args, **kwargs) + kwargs['axis_key'] = '__axes__/' + value.axis.name else: raise TypeError() + self.handle.put(hdf_key, value, **kwargs) def _read_metadata(self): - metadata = Metadata.from_hdf(self.handle) - if metadata is None: - metadata = Metadata() - return metadata + return self.handle.meta def _dump_metadata(self, metadata): - metadata.to_hdf(self.handle) + self.handle.meta = metadata def close(self): self.handle.close() diff --git a/larray/tests/test_array.py b/larray/tests/test_array.py index 0615f987e..50b51f01f 100644 --- a/larray/tests/test_array.py +++ b/larray/tests/test_array.py @@ -23,7 +23,8 @@ from_lists, from_string, open_excel, from_frame, sequence, nan, IGroup) from larray.inout.pandas import from_series from larray.core.axis import _to_ticks, _to_key -from larray.util.misc import StringIO, LHDFStore +from larray.util.misc import StringIO +from larray.inout.hdf import LHDFStore from larray.core.metadata import Metadata @@ -119,11 +120,9 @@ def test_read_set_update_delete_metadata(meta, tmpdir): def test_metadata_hdf(meta, tmpdir): key = 'meta' fname = os.path.join(tmpdir.strpath, 'test_metadata.hdf') - with LHDFStore(fname) as store: - ndtest(3).to_hdf(store, key) - meta.to_hdf(store, key) - meta2 = Metadata.from_hdf(store, key) - assert meta2 == meta + ndtest(3, meta=meta).to_hdf(fname, key) + arr = read_hdf(fname, key) + assert arr.meta == meta def test_meta_arg_array_creation(array): diff --git a/larray/tests/test_session.py b/larray/tests/test_session.py index e62f29e33..2652677a3 100644 --- a/larray/tests/test_session.py +++ b/larray/tests/test_session.py @@ -153,13 +153,13 @@ def test_names(session): assert session.names == ['a', 'a01', 'b', 'b12', 'c', 'd', 'e', 'f', 'g', 'h', 'i'] -def test_h5_io(tmpdir, session, meta): +def test_h5_pandas_io(tmpdir, session, meta): fpath = tmp_path(tmpdir, 'test_session.h5') session.meta = meta - session.save(fpath) + session.save(fpath, engine='pandas_hdf') s = Session() - s.load(fpath) + s.load(fpath, engine='pandas_hdf') # HDF does *not* keep ordering (ie, keys are always sorted + # read Axis objects, then Groups objects and finally LArray objects) assert list(s.keys()) == ['a', 'b', 'a01', 'b12', 'e', 'f', 'g'] @@ -169,9 +169,41 @@ def test_h5_io(tmpdir, session, meta): a2 = Axis('a=0..2') a2_01 = a2['0,1'] >> 'a01' e2 = ndtest((a2, 'b=b0..b2')) - Session(a=a2, a01=a2_01, e=e2).save(fpath, overwrite=False) + 
Session(a=a2, a01=a2_01, e=e2).save(fpath, overwrite=False, engine='pandas_hdf') + s = Session() + s.load(fpath, engine='pandas_hdf') + assert list(s.keys()) == ['a', 'b', 'a01', 'b12', 'e', 'f', 'g'] + assert s['a'].equals(a2) + assert all(s['a01'] == a2_01) + assert_array_nan_equal(s['e'], e2) + assert s.meta == meta + + # load only some objects + s = Session() + s.load(fpath, names=['a', 'a01', 'e', 'f'], engine='pandas_hdf') + assert list(s.keys()) == ['a', 'a01', 'e', 'f'] + assert s.meta == meta + + +def test_h5_tables_io(tmpdir, session, meta): + fpath = tmp_path(tmpdir, 'test_session.h5') + session.meta = meta + session.save(fpath, engine='tables_hdf') + + s = Session() + s.load(fpath, engine='tables_hdf') + # HDF does *not* keep ordering (ie, keys are always sorted + + # read Axis objects, then Groups objects and finally LArray objects) + assert list(s.keys()) == ['a', 'b', 'a01', 'b12', 'e', 'f', 'g'] + assert s.meta == meta + + # update a Group + an Axis + an array (overwrite=False) + a2 = Axis('a=0..2') + a2_01 = a2['0,1'] >> 'a01' + e2 = ndtest((a2, 'b=b0..b2')) + Session(a=a2, a01=a2_01, e=e2).save(fpath, overwrite=False, engine='tables_hdf') s = Session() - s.load(fpath) + s.load(fpath, engine='tables_hdf') assert list(s.keys()) == ['a', 'b', 'a01', 'b12', 'e', 'f', 'g'] assert s['a'].equals(a2) assert all(s['a01'] == a2_01) @@ -180,7 +212,7 @@ def test_h5_io(tmpdir, session, meta): # load only some objects s = Session() - s.load(fpath, names=['a', 'a01', 'e', 'f']) + s.load(fpath, names=['a', 'a01', 'e', 'f'], engine='tables_hdf') assert list(s.keys()) == ['a', 'a01', 'e', 'f'] assert s.meta == meta diff --git a/larray/util/misc.py b/larray/util/misc.py index b61547a0f..5aab1b25a 100644 --- a/larray/util/misc.py +++ b/larray/util/misc.py @@ -26,8 +26,6 @@ except TypeError: pass -import pandas as pd - if sys.version_info[0] < 3: basestring = basestring bytes = str @@ -772,26 +770,6 @@ def common_type(arrays): return object -class LHDFStore(object): - """Context manager for pandas HDFStore""" - def __init__(self, filepath_or_buffer, **kwargs): - if isinstance(filepath_or_buffer, pd.HDFStore): - if not filepath_or_buffer.is_open: - raise IOError('The HDFStore must be open for reading.') - self.store = filepath_or_buffer - self.close_store = False - else: - self.store = pd.HDFStore(filepath_or_buffer, **kwargs) - self.close_store = True - - def __enter__(self): - return self.store - - def __exit__(self, type_, value, traceback): - if self.close_store: - self.store.close() - - class SequenceZip(object): """ Represents the "combination" of several sequences. From 18f0e77753d20ef90b92aee3b8d4e007e4c60f35 Mon Sep 17 00:00:00 2001 From: Alix Damman Date: Mon, 26 Aug 2019 09:22:50 +0200 Subject: [PATCH 2/4] removed LHDFStore from api.rst file and __init__.py module --- doc/source/api.rst | 18 ------------------ larray/__init__.py | 3 +-- larray/inout/hdf.py | 8 ++++---- 3 files changed, 5 insertions(+), 24 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 6b6f54e62..31239c395 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -706,24 +706,6 @@ ReportSheet ReportSheet.add_graphs ReportSheet.newline -HDF -=== - -.. autosummary:: - :toctree: _generated/ - - LHDFStore - -.. autosummary:: - :toctree: _generated/ - - LHDFStore.filename - LHDFStore.is_open - LHDFStore.keys - LHDFStore.items - LHDFStore.summary - LHDFStore.close - .. 
_api-misc: diff --git a/larray/__init__.py b/larray/__init__.py index 6e011a042..ab65cf548 100644 --- a/larray/__init__.py +++ b/larray/__init__.py @@ -26,7 +26,7 @@ from larray.inout.pandas import from_frame, from_series from larray.inout.csv import read_csv, read_tsv, read_eurostat from larray.inout.excel import read_excel -from larray.inout.hdf import read_hdf, LHDFStore +from larray.inout.hdf import read_hdf from larray.inout.sas import read_sas from larray.inout.stata import read_stata from larray.inout.xw_excel import open_excel, Workbook @@ -78,7 +78,6 @@ 'from_lists', 'from_string', 'from_frame', 'from_series', 'read_csv', 'read_tsv', 'read_eurostat', 'read_excel', 'read_hdf', 'read_sas', 'read_stata', 'open_excel', 'Workbook', 'ExcelReport', 'ReportSheet', - 'LHDFStore', # utils 'get_options', 'set_options', # viewer diff --git a/larray/inout/hdf.py b/larray/inout/hdf.py index 14d6d6b46..8d77edb26 100644 --- a/larray/inout/hdf.py +++ b/larray/inout/hdf.py @@ -360,12 +360,12 @@ class LHDFStore(object): ``'r+'`` It is similar to ``'a'``, but the file must already exist. complevel : int, 0-9, default None - Specifies a compression level for data. - A value of 0 disables compression. + Specifies a compression level for data. + A value of 0 disables compression. complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib' - Specifies the compression library to be used. + Specifies the compression library to be used. fletcher32 : bool, default False - If applying compression use the fletcher32 checksum + If applying compression use the fletcher32 checksum engine: {'auto', 'tables', 'pandas'}, optional Load using `engine`. Use 'pandas' to read an HDF file generated with a LArray version previous to 0.31. Defaults to 'auto' (use default engine if you don't know the LArray version used to produced the HDF file). From 397012de14b7a7abc8fbe7ee8cce58434a584fc6 Mon Sep 17 00:00:00 2001 From: Alix Damman Date: Mon, 26 Aug 2019 10:07:43 +0200 Subject: [PATCH 3/4] added engine argument to the read_hdf method and all to_hdf methods --- larray/core/array.py | 7 +++++-- larray/core/axis.py | 7 +++++-- larray/core/group.py | 7 +++++-- larray/inout/hdf.py | 13 ++++++++----- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/larray/core/array.py b/larray/core/array.py index 1e07e5a77..a07d48ff9 100644 --- a/larray/core/array.py +++ b/larray/core/array.py @@ -6701,7 +6701,7 @@ def to_csv(self, filepath, sep=',', na_rep='', wide=True, value_name='value', dr series = self.to_series(value_name, dropna is not None) series.to_csv(filepath, sep=sep, na_rep=na_rep, header=True, **kwargs) - def to_hdf(self, filepath, key): + def to_hdf(self, filepath, key, engine='auto'): r""" Writes array to a HDF file. @@ -6714,6 +6714,9 @@ def to_hdf(self, filepath, key): Path where the hdf file has to be written. key : str or Group Key (path) of the array within the HDF file (see Notes below). + engine: {'auto', 'tables', 'pandas'}, optional + Dump using `engine`. Use 'pandas' to update an HDF file generated with a LArray version previous to 0.31. + Defaults to 'auto' (use default engine if you don't know the LArray version used to produced the HDF file). 
Notes ----- @@ -6735,7 +6738,7 @@ def to_hdf(self, filepath, key): >>> a.to_hdf('test.h5', 'arrays/a') # doctest: +SKIP """ from larray.inout.hdf import LHDFStore - with LHDFStore(filepath) as store: + with LHDFStore(filepath, engine=engine) as store: store.put(key, self) def to_stata(self, filepath_or_buffer, **kwargs): diff --git a/larray/core/axis.py b/larray/core/axis.py index f7ee22e38..74de282f2 100644 --- a/larray/core/axis.py +++ b/larray/core/axis.py @@ -1304,7 +1304,7 @@ def align(self, other, join='outer'): else: return self - def to_hdf(self, filepath, key=None): + def to_hdf(self, filepath, key=None, engine='auto'): r""" Writes axis to a HDF file. @@ -1319,6 +1319,9 @@ def to_hdf(self, filepath, key=None): Key (path) of the axis within the HDF file (see Notes below). If None, the name of the axis is used. Defaults to None. + engine: {'auto', 'tables', 'pandas'}, optional + Dump using `engine`. Use 'pandas' to update an HDF file generated with a LArray version previous to 0.31. + Defaults to 'auto' (use default engine if you don't know the LArray version used to produced the HDF file). Notes ----- @@ -1349,7 +1352,7 @@ def to_hdf(self, filepath, key=None): if self.name is None: raise ValueError("Argument key must be provided explicitly in case of anonymous axis") key = self.name - with LHDFStore(filepath) as store: + with LHDFStore(filepath, engine=engine) as store: store.put(key, self) @property diff --git a/larray/core/group.py b/larray/core/group.py index faf58a5cb..313360bb8 100644 --- a/larray/core/group.py +++ b/larray/core/group.py @@ -1396,7 +1396,7 @@ def containing(self, substring): substring = substring.eval() return LGroup([v for v in self.eval() if substring in v], axis=self.axis) - def to_hdf(self, filepath, key=None, axis_key=None): + def to_hdf(self, filepath, key=None, axis_key=None, engine='auto'): r""" Writes group to a HDF file. @@ -1417,6 +1417,9 @@ def to_hdf(self, filepath, key=None, axis_key=None): Key (path) of the associated axis in the HDF file (see Notes below). If None, the name of the axis associated with the group is used. Defaults to None. + engine: {'auto', 'tables', 'pandas'}, optional + Dump using `engine`. Use 'pandas' to update an HDF file generated with a LArray version previous to 0.31. + Defaults to 'auto' (use default engine if you don't know the LArray version used to produced the HDF file). Notes ----- @@ -1458,7 +1461,7 @@ def to_hdf(self, filepath, key=None, axis_key=None): if self.name is None: raise ValueError("Argument key must be provided explicitly in case of anonymous group") key = self.name - with LHDFStore(filepath) as store: + with LHDFStore(filepath, engine=engine) as store: store.put(key, self, axis_key=axis_key) # this makes range(LGroup(int)) possible diff --git a/larray/inout/hdf.py b/larray/inout/hdf.py index 8d77edb26..9e031158a 100644 --- a/larray/inout/hdf.py +++ b/larray/inout/hdf.py @@ -346,7 +346,7 @@ class LHDFStore(object): Parameters ---------- filepath : str or PathLike object - File path to HDF5 file + File path to HDF5 file. 
mode : {'a', 'w', 'r', 'r+'}, default 'a' ``'r'`` @@ -562,13 +562,13 @@ def put(self, key, value, **kwargs): def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, sort_columns=False, - name=None, **kwargs): + name=None, engine='auto', **kwargs): r"""Reads an axis or group or array named key from a HDF5 file in filepath (path+name) Parameters ---------- - filepath_or_buffer : str or LArrayHDFStore - Path and name where the HDF5 file is stored or a HDFStore object. + filepath_or_buffer : str or PathLike object + Path and name where the HDF5 file is stored. key : str or Group Name of the object to read. fill_value : scalar or LArray, optional @@ -585,6 +585,9 @@ def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, s name : str, optional Name of the axis or group to return. If None, name is set to passed key. Defaults to None. + engine: {'auto', 'tables', 'pandas'}, optional + Load using `engine`. Use 'pandas' to read an HDF file generated with a LArray version previous to 0.31. + Defaults to 'auto' (use default engine if you don't know the LArray version used to produced the HDF file). Returns ------- @@ -610,7 +613,7 @@ def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, s fill_value = na warnings.warn("read_hdf `na` argument has been renamed to `fill_value`. Please use that instead.", FutureWarning, stacklevel=2) - with LHDFStore(filepath_or_buffer, **kwargs) as store: + with LHDFStore(filepath_or_buffer, engine=engine, **kwargs) as store: res = store.get(key, fill_value=fill_value, sort_rows=sort_rows, sort_columns=sort_columns, name=name) return res From fa1c2221ed08f0c43f2931df6e7cc6a10a046511 Mon Sep 17 00:00:00 2001 From: Alix Damman Date: Mon, 26 Aug 2019 11:25:05 +0200 Subject: [PATCH 4/4] added doctests for LHDFStore + updated LHDFStore.summary() method --- larray/inout/hdf.py | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/larray/inout/hdf.py b/larray/inout/hdf.py index 9e031158a..bfa440864 100644 --- a/larray/inout/hdf.py +++ b/larray/inout/hdf.py @@ -372,7 +372,33 @@ class LHDFStore(object): Examples -------- - # TODO : write examples + >>> from larray import ndtest + >>> with LHDFStore('hdf_file.h5') as s: + ... # dump and read an axis + ... s['a'] = Axis("a=a0..a2") + ... a = s['a'] + ... # dump and read a group + ... s['a01'] = a['a0,a1'] >> 'a01' + ... a01 = s['a01'] + ... # dump and read an array + ... s['arr'] = ndtest((3, 3)) + ... arr = s['arr'] + ... # add and read top level metadata + ... s.meta.author = 'John Smith' + ... metadata = s.meta + ... # get filepath + ... s.filename + ... # display list of items stored in the hdf file + ... s.keys() + ... # display list of items and their type + ... print(s.summary()) + 'hdf_file.h5' + ['/a', '/a01', '/arr', '/arr/axis_a', '/arr/axis_b'] + /a: Axis + /a01: Group + /arr: Array + /arr/axis_a: Axis + /arr/axis_b: Axis """ def __init__(self, filepath, mode=None, complevel=None, complib=None, fletcher32=False, engine='auto', **kwargs): @@ -475,6 +501,10 @@ def keys(self): Return a (potentially unordered) list of the keys corresponding to the objects stored in the HDFStore. These are ABSOLUTE path-names (e.g. have the leading '/' + + See Also + -------- + LHDFStore """ return [n._v_pathname for n in self._storer.groups()] @@ -494,16 +524,13 @@ def summary(self): """ Return a list of LArray stored in the HDF5 file. 
-        Examples
+        See Also
         --------
-        TODO: write examples
+        LHDFStore
         """
         if self.is_open:
-            res = ""
-            for name, group in self.items():
-                _type = getattr(group._v_attrs, 'type', 'Unknown')
-                res += "{}: {}\n".format(name, _type)
-            return res
+            return '\n'.join(["{}: {}".format(name, getattr(group._v_attrs, 'type', 'Unknown'))
+                              for name, group in self.items()])
         else:
             return "File {} is CLOSED".format(self.filename)
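
As a quick orientation for reviewers, below is a minimal usage sketch of the API this series introduces, mirroring the doctest added in PATCH 4/4. The file name 'demo.h5' and the axis/array definitions are illustrative only, not part of the patches:

    from larray import Axis, ndtest, read_hdf
    from larray.inout.hdf import LHDFStore  # no longer exported from larray/__init__.py since PATCH 2/4

    # dump an array; engine='tables' selects the new PytablesStorer backend
    arr = ndtest((3, 3))
    arr.to_hdf('demo.h5', 'arr', engine='tables')

    # engine='auto' (the default) inspects the file's root attributes to pick the
    # right storer, falling back to 'pandas' for files written by older versions
    arr2 = read_hdf('demo.h5', 'arr')

    # the LHDFStore context manager gives dict-like access to the file
    with LHDFStore('demo.h5') as store:
        store['a'] = Axis('a=a0..a2')                # dump an Axis
        store['a01'] = store['a']['a0,a1'] >> 'a01'  # dump a Group (axis 'a' is reused)
        print(store.keys())                          # absolute path-names of stored objects
        print(store.summary())                       # one "name: type" line per object

    # Session-level dumps can force an engine too, as in the updated tests:
    # session.save('output.h5', engine='tables_hdf')  # or 'pandas_hdf'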