(issue 724): bypass pandas using pytables directly to work with HDF5 files #761

Open · wants to merge 4 commits into master
1 change: 1 addition & 0 deletions doc/source/api.rst
@@ -706,6 +706,7 @@ ReportSheet
ReportSheet.add_graphs
ReportSheet.newline


.. _api-misc:

Miscellaneous
17 changes: 8 additions & 9 deletions larray/core/array.py
@@ -62,7 +62,7 @@
from larray.core.axis import Axis, AxisReference, AxisCollection, X, _make_axis
from larray.util.misc import (table2str, size2str, basestring, izip, rproduct, ReprString, duplicates,
float_error_handler_factory, _isnoneslice, light_product, unique_list, common_type,
renamed_to, deprecate_kwarg, LHDFStore, lazy_attribute, unique_multi, SequenceZip,
renamed_to, deprecate_kwarg, lazy_attribute, unique_multi, SequenceZip,
Repeater, Product, ensure_no_numpy_type, PY2)
from larray.util.options import _OPTIONS, DISPLAY_MAXLINES, DISPLAY_EDGEITEMS, DISPLAY_WIDTH, DISPLAY_PRECISION

@@ -6701,7 +6701,7 @@ def to_csv(self, filepath, sep=',', na_rep='', wide=True, value_name='value', dr
series = self.to_series(value_name, dropna is not None)
series.to_csv(filepath, sep=sep, na_rep=na_rep, header=True, **kwargs)

def to_hdf(self, filepath, key):
def to_hdf(self, filepath, key, engine='auto'):
r"""
Writes array to a HDF file.

@@ -6714,6 +6714,9 @@ def to_hdf(self, filepath, key):
Path where the hdf file has to be written.
key : str or Group
Key (path) of the array within the HDF file (see Notes below).
engine : {'auto', 'tables', 'pandas'}, optional
Dump using `engine`. Use 'pandas' to update an HDF file generated by LArray versions prior to 0.31.
Defaults to 'auto' (use the default engine if you do not know which LArray version produced the HDF file).

Contributor (review comment): change "used to produced" to either "used to produce" or "which produced".

Notes
-----
@@ -6734,13 +6737,9 @@ def to_hdf(self, filepath, key):

>>> a.to_hdf('test.h5', 'arrays/a') # doctest: +SKIP
"""
key = _translate_group_key_hdf(key)
with LHDFStore(filepath) as store:
store.put(key, self.to_frame())
attrs = store.get_storer(key).attrs
attrs.type = 'Array'
attrs.writer = 'LArray'
self.meta.to_hdf(store, key)
from larray.inout.hdf import LHDFStore
with LHDFStore(filepath, engine=engine) as store:
store.put(key, self)

def to_stata(self, filepath_or_buffer, **kwargs):
r"""
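
As a usage sketch of the new `engine` argument on LArray.to_hdf (the array and file names below are hypothetical; only the signature shown in this diff is assumed):

>>> from larray import ndtest
>>> arr = ndtest((2, 3))
>>> # default: 'auto' lets LArray pick the engine
>>> arr.to_hdf('new_file.h5', 'arrays/arr')                       # doctest: +SKIP
>>> # force the pandas engine to update a file written by LArray < 0.31
>>> arr.to_hdf('legacy_file.h5', 'arrays/arr', engine='pandas')   # doctest: +SKIP
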
19 changes: 8 additions & 11 deletions larray/core/axis.py
@@ -16,7 +16,7 @@
_range_to_slice, _seq_group_to_name, _translate_group_key_hdf, remove_nested_groups)
from larray.util.oset import *
from larray.util.misc import (basestring, PY2, unicode, long, duplicates, array_lookup2, ReprString, index_by_id,
renamed_to, common_type, LHDFStore, lazy_attribute, _isnoneslice, unique_multi, Product)
renamed_to, common_type, lazy_attribute, _isnoneslice, unique_multi, Product)


np_frompyfunc = np.frompyfunc
@@ -1304,7 +1304,7 @@ def align(self, other, join='outer'):
else:
return self

def to_hdf(self, filepath, key=None):
def to_hdf(self, filepath, key=None, engine='auto'):
r"""
Writes axis to a HDF file.

@@ -1319,6 +1319,9 @@ def to_hdf(self, filepath, key=None):
Key (path) of the axis within the HDF file (see Notes below).
If None, the name of the axis is used.
Defaults to None.
engine : {'auto', 'tables', 'pandas'}, optional
Dump using `engine`. Use 'pandas' to update an HDF file generated by LArray versions prior to 0.31.
Defaults to 'auto' (use the default engine if you do not know which LArray version produced the HDF file).

Notes
-----
@@ -1344,19 +1347,13 @@ def to_hdf(self, filepath, key=None):

>>> a.to_hdf('test.h5', 'axes/a') # doctest: +SKIP
"""
from larray.inout.hdf import LHDFStore
if key is None:
if self.name is None:
raise ValueError("Argument key must be provided explicitly in case of anonymous axis")
key = self.name
key = _translate_group_key_hdf(key)
dtype_kind = self.labels.dtype.kind
data = np.char.encode(self.labels, 'utf-8') if dtype_kind == 'U' else self.labels
s = pd.Series(data=data, name=self.name)
with LHDFStore(filepath) as store:
store.put(key, s)
store.get_storer(key).attrs.type = 'Axis'
store.get_storer(key).attrs.dtype_kind = dtype_kind
store.get_storer(key).attrs.wildcard = self.iswildcard
with LHDFStore(filepath, engine=engine) as store:
store.put(key, self)

@property
def dtype(self):
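
A similar sketch for Axis.to_hdf (axis and file names are made up); per the docstring above, `key` defaults to the axis name, so a named axis can be dumped without specifying it:

>>> from larray import Axis
>>> age = Axis('age=0..9')
>>> # key defaults to the axis name ('age')
>>> age.to_hdf('test_axes.h5')                 # doctest: +SKIP
>>> # pandas engine, e.g. to update a pre-0.31 file
>>> age.to_hdf('legacy.h5', engine='pandas')   # doctest: +SKIP
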
27 changes: 8 additions & 19 deletions larray/core/group.py
@@ -13,7 +13,7 @@
from larray.core.abstractbases import ABCAxis, ABCAxisReference, ABCLArray
from larray.util.oset import *
from larray.util.misc import (basestring, PY2, unique, find_closing_chr, _parse_bound, _seq_summary, _isintstring,
renamed_to, LHDFStore)
renamed_to)


def _slice_to_str(key, repr_func=str):
@@ -1396,7 +1396,7 @@ def containing(self, substring):
substring = substring.eval()
return LGroup([v for v in self.eval() if substring in v], axis=self.axis)

def to_hdf(self, filepath, key=None, axis_key=None):
def to_hdf(self, filepath, key=None, axis_key=None, engine='auto'):
r"""
Writes group to a HDF file.

@@ -1417,6 +1417,9 @@ def to_hdf(self, filepath, key=None, axis_key=None):
Key (path) of the associated axis in the HDF file (see Notes below).
If None, the name of the axis associated with the group is used.
Defaults to None.
engine : {'auto', 'tables', 'pandas'}, optional
Dump using `engine`. Use 'pandas' to update an HDF file generated by LArray versions prior to 0.31.
Defaults to 'auto' (use the default engine if you do not know which LArray version produced the HDF file).

Notes
-----
@@ -1453,27 +1456,13 @@ def to_hdf(self, filepath, key=None, axis_key=None):
>>> # save both the group 'b01' and the associated axis 'b'
>>> b01.to_hdf('test.h5') # doctest: +SKIP
"""
from larray.inout.hdf import LHDFStore
if key is None:
if self.name is None:
raise ValueError("Argument key must be provided explicitly in case of anonymous group")
key = self.name
key = _translate_group_key_hdf(key)
if axis_key is None:
if self.axis.name is None:
raise ValueError("Argument axis_key must be provided explicitly if the associated axis is anonymous")
axis_key = self.axis.name
data = self.eval()
dtype_kind = data.dtype.kind if isinstance(data, np.ndarray) else ''
if dtype_kind == 'U':
data = np.char.encode(data, 'utf-8')
s = pd.Series(data=data, name=self.name)
with LHDFStore(filepath) as store:
store.put(key, s)
store.get_storer(key).attrs.type = 'Group'
store.get_storer(key).attrs.dtype_kind = dtype_kind
if axis_key not in store:
self.axis.to_hdf(store, key=axis_key)
store.get_storer(key).attrs.axis_key = axis_key
with LHDFStore(filepath, engine=engine) as store:
store.put(key, self, axis_key=axis_key)

# this makes range(LGroup(int)) possible
def __index__(self):
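
And for LGroup.to_hdf, a sketch assuming the usual larray `>>` naming operator and hypothetical file names; per the docstring above, the associated axis is written alongside the group under `axis_key`:

>>> from larray import Axis
>>> a = Axis('a=a0..a2')
>>> a01 = a['a0,a1'] >> 'a01'
>>> # key defaults to the group name, axis_key to the associated axis name ('a')
>>> a01.to_hdf('test_groups.h5')               # doctest: +SKIP
>>> a01.to_hdf('legacy.h5', engine='pandas')   # doctest: +SKIP
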
14 changes: 0 additions & 14 deletions larray/core/metadata.py
@@ -162,17 +162,3 @@ def _convert_value(value):
return value

return Metadata([(key, _convert_value(value)) for key, value in zip(array.axes.labels[0], array.data)])

# ---------- IO methods ----------
def to_hdf(self, hdfstore, key=None):
if len(self):
attrs = hdfstore.get_storer(key).attrs if key is not None else hdfstore.root._v_attrs
attrs.metadata = self

@classmethod
def from_hdf(cls, hdfstore, key=None):
attrs = hdfstore.get_storer(key).attrs if key is not None else hdfstore.root._v_attrs
if 'metadata' in attrs:
return attrs.metadata
else:
return None
55 changes: 29 additions & 26 deletions larray/core/session.py
@@ -1,22 +1,41 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function

import fnmatch
import os
import sys
import re
import fnmatch
import sys
import warnings
from collections import OrderedDict, Iterable

import numpy as np

from larray.core.metadata import Metadata
from larray.core.group import Group
from larray.core.array import LArray, get_axes, ndtest, zeros, zeros_like, sequence
from larray.core.axis import Axis
from larray.core.constants import nan
from larray.core.array import LArray, get_axes, ndtest, zeros, zeros_like, sequence, aslarray
from larray.util.misc import float_error_handler_factory, is_interactive_interpreter, renamed_to, inverseop, basestring
from larray.core.group import Group
from larray.core.metadata import Metadata
from larray.inout.session import ext_default_engine, get_file_handler
from larray.util.misc import float_error_handler_factory, is_interactive_interpreter, renamed_to, inverseop, basestring


def _get_handler(engine, fname, overwrite, **kwargs):
if engine == 'auto':
_, ext = os.path.splitext(fname)
ext = ext.strip('.') if '.' in ext else 'csv'
engine = ext_default_engine[ext]
if engine == 'hdf':
engine_hdf = 'auto'
if '_hdf' in engine:
engine_hdf, engine = engine.split('_')
handler_cls = get_file_handler(engine)
if engine == 'pandas_csv' and 'sep' in kwargs:
handler = handler_cls(fname, overwrite, kwargs['sep'])
elif engine == 'hdf':
handler = handler_cls(fname, overwrite, engine=engine_hdf)
else:
handler = handler_cls(fname, overwrite)
return handler
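
For clarity, a rough trace of how the new _get_handler helper resolves the hdf engine variants. The ext_default_engine mapping is defined in larray.inout.session and not shown in this diff, so the 'auto' case below is an assumption:

>>> # 'tables_hdf' is split into engine_hdf='tables' and engine='hdf'
>>> _get_handler('tables_hdf', 'dump.h5', True)    # doctest: +SKIP
>>> # 'pandas_hdf' is split into engine_hdf='pandas' and engine='hdf'
>>> _get_handler('pandas_hdf', 'dump.h5', True)    # doctest: +SKIP
>>> # assuming ext_default_engine['h5'] == 'hdf', 'auto' falls back to engine_hdf='auto'
>>> _get_handler('auto', 'dump.h5', True)          # doctest: +SKIP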


# XXX: inherit from OrderedDict or LArray?
@@ -358,7 +377,7 @@ def load(self, fname, names=None, engine='auto', display=False, **kwargs):
List of objects to load.
If `fname` is None, list of paths to CSV files.
Defaults to all valid objects present in the file/directory.
engine : {'auto', 'pandas_csv', 'pandas_hdf', 'pandas_excel', 'xlwings_excel', 'pickle'}, optional
engine : {'auto', 'pandas_csv', 'pandas_hdf', 'tables_hdf', 'pandas_excel', 'xlwings_excel', 'pickle'}, optional
Load using `engine`. Defaults to 'auto' (use default engine for the format guessed from the file extension).
display : bool, optional
Whether or not to display which file is being worked on. Defaults to False.
@@ -415,15 +434,7 @@ def load(self, fname, names=None, engine='auto', display=False, **kwargs):
engine = ext_default_engine['csv']
else:
raise ValueError("List of paths to only CSV files expected. Got {}".format(names))
if engine == 'auto':
_, ext = os.path.splitext(fname)
ext = ext.strip('.') if '.' in ext else 'csv'
engine = ext_default_engine[ext]
handler_cls = get_file_handler(engine)
if engine == 'pandas_csv' and 'sep' in kwargs:
handler = handler_cls(fname, kwargs['sep'])
else:
handler = handler_cls(fname)
handler = _get_handler(engine, fname, False, **kwargs)
metadata, objects = handler.read(names, display=display, **kwargs)
for k, v in objects.items():
self[k] = v
@@ -442,7 +453,7 @@ def save(self, fname, names=None, engine='auto', overwrite=True, display=False,
List of names of LArray/Axis/Group objects to dump.
If `fname` is None, list of paths to CSV files.
Defaults to all objects present in the Session.
engine : {'auto', 'pandas_csv', 'pandas_hdf', 'pandas_excel', 'xlwings_excel', 'pickle'}, optional
engine : {'auto', 'pandas_csv', 'pandas_hdf', 'tables_hdf', 'pandas_excel', 'xlwings_excel', 'pickle'}, optional
Dump using `engine`. Defaults to 'auto' (use default engine for the format guessed from the file extension).
overwrite: bool, optional
Whether or not to overwrite an existing file, if any. Ignored for CSV files and 'pandas_excel' engine.
@@ -482,15 +493,7 @@ def save(self, fname, names=None, engine='auto', overwrite=True, display=False,
>>> # replace arr1 and add arr4 in file output.h5
>>> s2.save('output.h5', overwrite=False) # doctest: +SKIP
"""
if engine == 'auto':
_, ext = os.path.splitext(fname)
ext = ext.strip('.') if '.' in ext else 'csv'
engine = ext_default_engine[ext]
handler_cls = get_file_handler(engine)
if engine == 'pandas_csv' and 'sep' in kwargs:
handler = handler_cls(fname, overwrite, kwargs['sep'])
else:
handler = handler_cls(fname, overwrite)
handler = _get_handler(engine, fname, overwrite, **kwargs)
meta = self.meta if overwrite else None
items = self.items()
if names is not None:
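
Finally, a sketch of the new 'tables_hdf' / 'pandas_hdf' choices at the Session level (session contents and file names below are hypothetical):

>>> from larray import Session, ndtest
>>> ses = Session()
>>> ses['arr1'] = ndtest((2, 3))
>>> # pytables-based writer
>>> ses.save('output.h5', engine='tables_hdf')   # doctest: +SKIP
>>> # pandas-based writer, e.g. to update a file produced by LArray < 0.31
>>> ses.save('legacy.h5', engine='pandas_hdf')   # doctest: +SKIP
>>> ses2 = Session()
>>> ses2.load('output.h5', display=True)         # doctest: +SKIP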