From bcda1c610f9cfe0e16186275fcdc157625024d22 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 10 Jan 2013 07:33:55 -0500 Subject: [PATCH 01/10] BUG: shape attribute on GenericStorer returns tuple correctly now (not string) data_orientation was potentially returning non-ints in a tuple (fixed downstream) --- pandas/io/pytables.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b7cdf1706b5e9..6b45bb9eb76ce 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1238,6 +1238,8 @@ def __repr__(self): self.infer_axes() s = self.shape if s is not None: + if isinstance(s, (list,tuple)): + s = "[%s]" % ','.join([ str(x) for x in s ]) return "%-12.12s (shape->%s)" % (self.pandas_type,s) return self.pandas_type @@ -1618,7 +1620,7 @@ class SeriesStorer(GenericStorer): @property def shape(self): try: - return "[%s]" % len(getattr(self.group,'values',None)) + return len(getattr(self.group,'values')), except: return None @@ -1748,7 +1750,7 @@ def shape(self): if self.is_shape_reversed: shape = shape[::-1] - return "[%s]" % ','.join([ str(x) for x in shape ]) + return shape except: return None @@ -1908,7 +1910,7 @@ def is_transposed(self): @property def data_orientation(self): """ return a tuple of my permutated axes, non_indexable at the front """ - return tuple(itertools.chain([a[0] for a in self.non_index_axes], [a.axis for a in self.index_axes])) + return tuple(itertools.chain([int(a[0]) for a in self.non_index_axes], [int(a.axis) for a in self.index_axes])) def queryables(self): """ return a dict of the kinds allowable columns for this object """ From 50eb56171cee74984a65d8cd805dbc330c03da60 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 14 Jan 2013 16:25:24 -0500 Subject: [PATCH 02/10] BUG: fix for GH #2694 (natural naming issue on __contains__) --- RELEASE.rst | 2 ++ pandas/io/pytables.py | 2 +- pandas/io/tests/test_pytables.py | 6 ++++++ 3 files changed, 9 insertions(+), 1 
deletion(-) diff --git a/RELEASE.rst b/RELEASE.rst index 59a86221d14a9..a6ffd088ec2d7 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -66,6 +66,7 @@ pandas 0.10.1 - handle correctly ``Term`` passed types (e.g. ``index<1000``, when index is ``Int64``), (closes GH512_) - handle Timestamp correctly in data_columns (closes GH2637_) + - contains correctly matches on non-natural names - Fix DataFrame.info bug with UTF8-encoded columns. (GH2576_) - Fix DatetimeIndex handling of FixedOffset tz (GH2604_) - More robust detection of being in IPython session for wide DataFrame @@ -98,6 +99,7 @@ pandas 0.10.1 .. _GH2625: https://github.com/pydata/pandas/issues/2625 .. _GH2643: https://github.com/pydata/pandas/issues/2643 .. _GH2637: https://github.com/pydata/pandas/issues/2637 +.. _GH2694: https://github.com/pydata/pandas/issues/2694 pandas 0.10.0 ============= diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6b45bb9eb76ce..22b207d9c571f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -220,7 +220,7 @@ def __contains__(self, key): node = self.get_node(key) if node is not None: name = node._v_pathname - return re.search(key, name) is not None + if name == key or name[1:] == key: return True return False def __len__(self): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index cb2d9dd2af58f..d5978ea150c26 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -111,6 +111,12 @@ def test_contains(self): self.assert_('/foo/b' not in self.store) self.assert_('bar' not in self.store) + # GH 2694 + warnings.filterwarnings('ignore', category=tables.NaturalNameWarning) + self.store['node())'] = tm.makeDataFrame() + self.assert_('node())' in self.store) + warnings.filterwarnings('always', category=tables.NaturalNameWarning) + def test_versioning(self): self.store['a'] = tm.makeTimeSeries() self.store['b'] = tm.makeDataFrame() From 314b574f040b672fbc825b00efa810dac0e8d519 Mon Sep 17 00:00:00 
2001 From: jreback Date: Tue, 15 Jan 2013 09:24:27 -0500 Subject: [PATCH 03/10] ENH: added ability to read in generic PyTables flavor tables to allow compatibility between other HDF5 systems --- RELEASE.rst | 1 + pandas/core/reshape.py | 4 +- pandas/io/pytables.py | 91 ++++++++++++++++++++++++++--- pandas/io/tests/pytables_native.h5 | Bin 0 -> 12336 bytes pandas/io/tests/test_pytables.py | 7 +++ 5 files changed, 92 insertions(+), 11 deletions(-) create mode 100644 pandas/io/tests/pytables_native.h5 diff --git a/RELEASE.rst b/RELEASE.rst index a6ffd088ec2d7..95e39f6510e08 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -52,6 +52,7 @@ pandas 0.10.1 - added method ``unique`` to select the unique values in an indexable or data column - added method ``copy`` to copy an existing store (and possibly upgrade) - show the shape of the data on disk for non-table stores when printing the store + - added ability to read PyTables flavor tables (allows compatiblity to other HDF5 systems) - Add ``logx`` option to DataFrame/Series.plot (GH2327_, #2565) - Support reading gzipped data from file-like object - ``pivot_table`` aggfunc can be anything used in GroupBy.aggregate (GH2643_) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 571bcf5008178..7db6d301f7e83 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -834,5 +834,5 @@ def block2d_to_blocknd(values, items, shape, labels, ref_items=None): def factor_indexer(shape, labels): """ given a tuple of shape and a list of Factor lables, return the expanded label indexer """ - mult = np.array(shape)[::-1].cumprod()[::-1] - return np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T + mult = np.array(shape, dtype = 'i8')[::-1].cumprod()[::-1] + return np.sum(np.array(labels, dtype = 'i8').T * np.append(mult, [1]), axis=1).T diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 22b207d9c571f..bfe00c430ae18 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -24,6 +24,7 @@ from 
pandas.core.common import _asarray_tuplesafe, _try_sort from pandas.core.internals import BlockManager, make_block, form_blocks from pandas.core.reshape import block2d_to_block3d, block2d_to_blocknd, factor_indexer +from pandas.core.index import Int64Index import pandas.core.common as com from pandas.tools.merge import concat @@ -71,6 +72,7 @@ class IncompatibilityWarning(Warning): pass # table class map _TABLE_MAP = { + 'generic_table' : 'GenericTable', 'appendable_frame' : 'AppendableFrameTable', 'appendable_multiframe' : 'AppendableMultiFrameTable', 'appendable_panel' : 'AppendablePanelTable', @@ -609,7 +611,7 @@ def create_table_index(self, key, **kwargs): def groups(self): """ return a list of all the top-level nodes (that are not themselves a pandas storage object) """ - return [ g for g in self.handle.walkGroups() if getattr(g._v_attrs,'pandas_type',None) ] + return [ g for g in self.handle.walkGroups() if getattr(g._v_attrs,'pandas_type',None) or getattr(g,'table',None) ] def get_node(self, key): """ return the node with the key or None if it does not exist """ @@ -684,16 +686,22 @@ def error(t): # infer the pt from the passed value if pt is None: if value is None: - raise Exception("cannot create a storer if the object is not existing nor a value are passed") - try: - pt = _TYPE_MAP[type(value)] - except: - error('_TYPE_MAP') + if getattr(group,'table',None): + pt = 'frame_table' + tt = 'generic_table' + else: + raise Exception("cannot create a storer if the object is not existing nor a value are passed") + else: + + try: + pt = _TYPE_MAP[type(value)] + except: + error('_TYPE_MAP') - # we are actually a table - if table or append: - pt += '_table' + # we are actually a table + if table or append: + pt += '_table' # a storer node if 'table' not in pt: @@ -959,6 +967,24 @@ def set_attr(self): """ set the kind for this colummn """ setattr(self.attrs, self.kind_attr, self.kind) +class GenericIndexCol(IndexCol): + """ an index which is not represented in the 
data of the table """ + + @property + def is_indexed(self): + return False + + def convert(self, values, nan_rep): + """ set the values from this selection: take = take ownership """ + + self.values = Int64Index(np.arange(self.table.nrows)) + return self + + def get_attr(self): + pass + + def set_attr(self): + pass class DataCol(IndexCol): """ a data holding column, by definition this is not indexable @@ -1194,6 +1220,12 @@ def get_atom_data(self, block): def get_atom_datetime64(self, block): return _tables().Int64Col() +class GenericDataIndexableCol(DataIndexableCol): + """ represent a generic pytables data column """ + + def get_attr(self): + pass + class Storer(object): """ represent an object in my store facilitate read/write of various types of objects @@ -2632,6 +2664,47 @@ def read(self, where=None, columns=None, **kwargs): return df +class GenericTable(AppendableFrameTable): + """ a table that read/writes the generic pytables table format """ + pandas_kind = 'frame_table' + table_type = 'generic_table' + ndim = 2 + obj_type = DataFrame + + @property + def pandas_type(self): + return self.pandas_kind + + def get_attrs(self): + """ retrieve our attributes """ + self.non_index_axes = [] + self.nan_rep = None + self.levels = [] + t = self.table + self.index_axes = [ a.infer(t) for a in self.indexables if a.is_an_indexable ] + self.values_axes = [ a.infer(t) for a in self.indexables if not a.is_an_indexable ] + self.data_columns = [ a.name for a in self.values_axes ] + + @property + def indexables(self): + """ create the indexables from the table description """ + if self._indexables is None: + + d = self.description + + # the index columns is just a simple index + self._indexables = [ GenericIndexCol(name='index',axis=0) ] + + for i, n in enumerate(d._v_names): + + dc = GenericDataIndexableCol(name = n, pos=i, values = [ n ], version = self.version) + self._indexables.append(dc) + + return self._indexables + + def write(self, **kwargs): + raise 
NotImplementedError("cannot write on an generic table") + class AppendableMultiFrameTable(AppendableFrameTable): """ a frame with a multi-index """ table_type = 'appendable_multiframe' diff --git a/pandas/io/tests/pytables_native.h5 b/pandas/io/tests/pytables_native.h5 new file mode 100644 index 0000000000000000000000000000000000000000..4786eea077533929868e41b46764b039b2c10ed7 GIT binary patch literal 12336 zcmeHMeNa?Y6n_h->qiK+f`g8($`sZlj};NrX%~fp8G{uVL@QiAmW0>^m&HI0cIXfA zqx=I6kx-FI)GReKKTvcOO!*MeX&5C$bHIvELCPm9bnmOt22n~$K$dKQYv5b^=fiHu;7#t!dS{)M~p;!0+=G$WbGM1gqy3J-~tlM06 zE+`Fvf9P3cWyO1bFdE+gJdP{hRpEhWD^Og)r$=qqsAqdWOZ5)F_wfCb_#ko%&S&Ae zSr@?w{$WAyUT}@76ak6=MSvne5ugZI1OkLts_<$^IS@8u=;aN^E6+|q^GQ87v^RRy z(hxAmD~0@!tXss4F@U$Y@LeZhpacixC}=a2VeF-H%o7xh`&h=k4{%fw<~IX8 z2zef!_aI)yVnPcwc)xKB%R7M2B?f;X{vnIOh92_@IADp4W&vBBIw>hxrjj4 zj;C*k4_Wu~Y_a3eKBf3FOFCxENj@IJyxokM#S9OQE40sM2Y^772mvOH9};U%8b1ac zKX4xO`=s#`;b!c7PN-?_?rdN7$RoZhJF9AcU2nMWnTBfD zYuj_WtA8u<3atNQL79rZ=$gH~{ReK@Wo>cmdCB>R6K9V%b$Q6Ei)xLYWh>pw{Zq?Z zj&i=GORI8bUbv&vahb9nOM8*0{`5J807K zi`P6{wBh=JZH_&%cQhrN9<+4c-xe1$O<@4rKQ>}JDeX`YXe9ibEHHk0c03p0H5q)- zXrl=H_YfFdC&B+CnAW?n>IHQ#E`0xKeYDr4K7yWxs*lFO`@oxl77(E^sjcCN3Z9-@ z(?>+ctHIsK^GMCy*_yeHA zsEdtW5sPZB!IlFfru8pBCl1wR0~UjOrgLJWwaJ=DwT@VgB^@&o&V@A}$e8Z$C7&MlNkmz1^?T(S#N?%7q~ zZ6z5EsIK*cWhvVi*zS(WySLbW-P--_el@4>8J#wltSU{5={~%3~XxIs_UEHmBnY?Jsh^>Yp%Yt)0+D;6j;@| zHZN%?3Y(R~H7>2_%AefCg|`2+E-5$i=<>#6JKwAxmzjF=_Nk`g&HL7;P!>(M66k)) ZQ)NM4;4Rlvxhp{xOB+RiBJeyA_zS3>0-yi@ literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index d5978ea150c26..39d63df9c5318 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1745,6 +1745,13 @@ def _check_roundtrip_table(self, obj, comparator, compression=False): store.close() os.remove(self.scratchpath) + def test_pytables_native_read(self): + pth = curpath() + store = HDFStore(os.path.join(pth, 'pytables_native.h5'), 'r') + d = store['detector'] + 
str(store) + store.close() + def test_legacy_read(self): pth = curpath() store = HDFStore(os.path.join(pth, 'legacy.h5'), 'r') From e3ae189889321503f3c633ca353b25d76dbb27a1 Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 15 Jan 2013 11:34:40 -0500 Subject: [PATCH 04/10] ENH: allow tables to be specified w/o using 'table' under the group node (only for GenericTable) --- pandas/io/pytables.py | 12 ++++++++---- pandas/io/tests/test_pytables.py | 5 +++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index bfe00c430ae18..f4b1af63cbbda 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -611,7 +611,8 @@ def create_table_index(self, key, **kwargs): def groups(self): """ return a list of all the top-level nodes (that are not themselves a pandas storage object) """ - return [ g for g in self.handle.walkGroups() if getattr(g._v_attrs,'pandas_type',None) or getattr(g,'table',None) ] + _tables() + return [ g for g in self.handle.walkGroups() if getattr(g._v_attrs,'pandas_type',None) or getattr(g,'table',None) or isinstance(g,_table_mod.table.Table) ] def get_node(self, key): """ return the node with the key or None if it does not exist """ @@ -687,7 +688,8 @@ def error(t): if pt is None: if value is None: - if getattr(group,'table',None): + _tables() + if getattr(group,'table',None) or isinstance(group,_table_mod.table.Table): pt = 'frame_table' tt = 'generic_table' else: @@ -2551,8 +2553,6 @@ def write_data_chunk(self, indexes, mask, search, values): self.table.append(rows) self.table.flush() except (Exception), detail: - import pdb - pdb.set_trace() raise Exception( "tables cannot write this data -> %s" % str(detail)) @@ -2675,6 +2675,10 @@ class GenericTable(AppendableFrameTable): def pandas_type(self): return self.pandas_kind + @property + def storable(self): + return getattr(self.group,'table',None) or self.group + def get_attrs(self): """ retrieve our attributes """ self.non_index_axes = [] 
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 39d63df9c5318..19343973dbcac 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1748,8 +1748,9 @@ def _check_roundtrip_table(self, obj, comparator, compression=False): def test_pytables_native_read(self): pth = curpath() store = HDFStore(os.path.join(pth, 'pytables_native.h5'), 'r') - d = store['detector'] - str(store) + d1 = store['detector'] + d2 = store['detector/table'] + assert_frame_equal(d1, d2) store.close() def test_legacy_read(self): From 71d5151b12954d9eb118cf9f5e0deb30e6a1a18d Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 15 Jan 2013 12:53:09 -0500 Subject: [PATCH 05/10] DOC: doc updates for external compatibility --- doc/source/io.rst | 24 +++++++++++++++++++++--- doc/source/v0.10.1.txt | 1 + 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 1b61de7bf8281..4f1d02e9dc95b 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1260,7 +1260,7 @@ To retrieve the *unique* values of an indexable or data column, use the method ` concat([ store.select('df_dc',c) for c in [ crit1, crit2 ] ]) -**Table Object** +**Storer Object** If you want to inspect the stored object, retrieve via ``get_storer``. You could use this progamatically to say get the number of rows in an object. @@ -1363,8 +1363,26 @@ Notes & Caveats # we have provided a minimum minor_axis indexable size store.root.wp_big_strings.table -Compatibility -~~~~~~~~~~~~~ +External Compatibility +~~~~~~~~~~~~~~~~~~~~~~ + +``HDFStore`` write storer objects in specific formats suitable for producing loss-less roundtrips to pandas objects. For external compatibility, ``HDFStore`` can read native ``PyTables`` format tables. It is possible to write an ``HDFStore`` object that can easily be imported into ``R`` using the ``rhdf5`` library. Create a table format store like this: + + .. 
ipython:: python + + store_export = HDFStore('export.h5') + store_export.append('df_dc',df_dc,data_columns=df_dc.columns) + store_export + + .. ipython:: python + :suppress: + + legacy_store.close() + import os + os.remove('store_export.h5') + +Backwards Compatibility +~~~~~~~~~~~~~~~~~~~~~~~ 0.10.1 of ``HDFStore`` is backwards compatible for reading tables created in a prior version of pandas however, query terms using the prior (undocumented) methodology are unsupported. ``HDFStore`` will issue a warning if you try to use a prior-version format file. You must read in the entire file and write it out using the new format, using the method ``copy`` to take advantage of the updates. The group attribute ``pandas_version`` contains the version information. ``copy`` takes a number of options, please see the docstring. diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt index 2eb40b2823214..86e8ede558611 100644 --- a/doc/source/v0.10.1.txt +++ b/doc/source/v0.10.1.txt @@ -119,6 +119,7 @@ Multi-table creation via ``append_to_multiple`` and selection via ``select_as_mu **Enhancements** +- ``HDFStore`` now can read native PyTables table format tables - You can pass ``nan_rep = 'my_nan_rep'`` to append, to change the default nan representation on disk (which converts to/from `np.nan`), this defaults to `nan`. - You can pass ``index`` to ``append``. This defaults to ``True``. This will automagically create indicies on the *indexables* and *data columns* of the table - You can pass ``chunksize=an integer`` to ``append``, to change the writing chunksize (default is 50000). This will signficantly lower your memory usage on writing. From 62119368fed007f4d7ddadd88036ce5569aa5ea7 Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 15 Jan 2013 14:22:17 -0500 Subject: [PATCH 06/10] BUG: reading non-well formed generic_tables not working (e.g. 
detector/readout), where readout is a table added pytables_native2.h5 for testing BUG: force travis to rebuild BUG: fix travis build 2 --- pandas/core/reshape.py | 4 ++-- pandas/io/pytables.py | 2 +- pandas/io/tests/pytables_native.h5 | Bin 12336 -> 74246 bytes pandas/io/tests/pytables_native2.h5 | Bin 0 -> 12336 bytes pandas/io/tests/test_pytables.py | 10 ++++++---- 5 files changed, 9 insertions(+), 7 deletions(-) create mode 100644 pandas/io/tests/pytables_native2.h5 diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 7db6d301f7e83..e16fdffd15e8d 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -834,5 +834,5 @@ def block2d_to_blocknd(values, items, shape, labels, ref_items=None): def factor_indexer(shape, labels): """ given a tuple of shape and a list of Factor lables, return the expanded label indexer """ - mult = np.array(shape, dtype = 'i8')[::-1].cumprod()[::-1] - return np.sum(np.array(labels, dtype = 'i8').T * np.append(mult, [1]), axis=1).T + mult = np.array(shape)[::-1].cumprod()[::-1] + return (np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T).astype('i8') diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f4b1af63cbbda..5812b8a7ef994 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -612,7 +612,7 @@ def create_table_index(self, key, **kwargs): def groups(self): """ return a list of all the top-level nodes (that are not themselves a pandas storage object) """ _tables() - return [ g for g in self.handle.walkGroups() if getattr(g._v_attrs,'pandas_type',None) or getattr(g,'table',None) or isinstance(g,_table_mod.table.Table) ] + return [ g for g in self.handle.walkNodes() if getattr(g._v_attrs,'pandas_type',None) or getattr(g,'table',None) or (isinstance(g,_table_mod.table.Table) and g._v_name != 'table') ] def get_node(self, key): """ return the node with the key or None if it does not exist """ diff --git a/pandas/io/tests/pytables_native.h5 b/pandas/io/tests/pytables_native.h5 
index 4786eea077533929868e41b46764b039b2c10ed7..a01b0f1dca3c00cf45e93fb69862077e681eb8ee 100644 GIT binary patch literal 74246 zcmeI%O>Y}T7{KvoohEf@Dx4P4mh!S74oFC$Nz*qZRBsxGL^5$?yQN$RAu(W-IFaK> z?TH=`f{y_Q#E~O#;D|VK0zLsp)C)%x=9w3Jv+Jf2AP(d|RlK`1^UUu2JUct~^~EmcLy^aGwf1MEW7_MT-OYDf z{b#8@qYg?M8?Lppx@jMI7Y+(AW!k2?Y$t&ha>tD3nug#w{mf1DS1OLBe&km&L_j;ryd%gol^$aoj(8c zzWC$jIzoTLyfQrQ(nI6IoH9Ia=HNIpuMF?!<$ZDKb#ZoX8Gha?`{K?0+V5wWUxuIe z>b|)2d9(9OI(|I${mw5|tNQ)w*a-8z9tUUL*EBqSY9$@-V%)@$k1yMHA~_{b^v^%t zj~_PwYe2po_hzH*4Ek}Y-%fg7KYaYmKa9V;FFrgQhMzy&pXqUar5~@)3GH*f-(_vi zZd#GtHexKL_)dz6V9-?BVx{u>F0{)ysZvpWmOO z40`lGUVdIGO5G zBKL1d!T$!aZGZm=@2a}ZRG&80%QE7Ae)#;`vhL`6geHA;&Q!l-Rzis!^Dom6%DNMN z$xYVKu{FP7h)arKKKgYA(L zSg0h++TlCC*RkLJaKViWe<<7C*3Qmuw-t^%tDAN|)VdZ54!NE1zr&A;YivuXygoH2hB<}aLY*GgW;mUetlU(Z&Q+m(7_v9@IEkC^gszNyKJ8fK19x;$SnYGQP^P$<}(5pzfC&P05} zlxQl;%t@Kq)5c7Se7_?f@3T!^7dH}p|GX*`AAQ0+di?(E$FI9z{AJ01J$?_GN1n9(a95^NM?A!G yaKGdo)A!f*z1|0X{LtC|&GvfPb!FCV?=O^>SG&FT+D7Zu*rv;w<13lttA7Cz{LOp- literal 12336 zcmeHMeNa?Y6n_h->qiK+f`g8($`sZlj};NrX%~fp8G{uVL@QiAmW0>^m&HI0cIXfA zqx=I6kx-FI)GReKKTvcOO!*MeX&5C$bHIvELCPm9bnmOt22n~$K$dKQYv5b^=fiHu;7#t!dS{)M~p;!0+=G$WbGM1gqy3J-~tlM06 zE+`Fvf9P3cWyO1bFdE+gJdP{hRpEhWD^Og)r$=qqsAqdWOZ5)F_wfCb_#ko%&S&Ae zSr@?w{$WAyUT}@76ak6=MSvne5ugZI1OkLts_<$^IS@8u=;aN^E6+|q^GQ87v^RRy z(hxAmD~0@!tXss4F@U$Y@LeZhpacixC}=a2VeF-H%o7xh`&h=k4{%fw<~IX8 z2zef!_aI)yVnPcwc)xKB%R7M2B?f;X{vnIOh92_@IADp4W&vBBIw>hxrjj4 zj;C*k4_Wu~Y_a3eKBf3FOFCxENj@IJyxokM#S9OQE40sM2Y^772mvOH9};U%8b1ac zKX4xO`=s#`;b!c7PN-?_?rdN7$RoZhJF9AcU2nMWnTBfD zYuj_WtA8u<3atNQL79rZ=$gH~{ReK@Wo>cmdCB>R6K9V%b$Q6Ei)xLYWh>pw{Zq?Z zj&i=GORI8bUbv&vahb9nOM8*0{`5J807K zi`P6{wBh=JZH_&%cQhrN9<+4c-xe1$O<@4rKQ>}JDeX`YXe9ibEHHk0c03p0H5q)- zXrl=H_YfFdC&B+CnAW?n>IHQ#E`0xKeYDr4K7yWxs*lFO`@oxl77(E^sjcCN3Z9-@ z(?>+ctHIsK^GMCy*_yeHA zsEdtW5sPZB!IlFfru8pBCl1wR0~UjOrgLJWwaJ=DwT@VgB^@&o&V@A}$e8Z$C7&MlNkmz1^?T(S#N?%7q~ zZ6z5EsIK*cWhvVi*zS(WySLbW-P--_el@4>8J#wltSU{5={~%3~XxIs_UEHmBnY?Jsh^>Yp%Yt)0+D;6j;@| 
zHZN%?3Y(R~H7>2_%AefCg|`2+E-5$i=<>#6JKwAxmzjF=_Nk`g&HL7;P!>(M66k)) ZQ)NM4;4Rlvxhp{xOB+RiBJeyA_zS3>0-yi@ diff --git a/pandas/io/tests/pytables_native2.h5 b/pandas/io/tests/pytables_native2.h5 new file mode 100644 index 0000000000000000000000000000000000000000..4786eea077533929868e41b46764b039b2c10ed7 GIT binary patch literal 12336 zcmeHMeNa?Y6n_h->qiK+f`g8($`sZlj};NrX%~fp8G{uVL@QiAmW0>^m&HI0cIXfA zqx=I6kx-FI)GReKKTvcOO!*MeX&5C$bHIvELCPm9bnmOt22n~$K$dKQYv5b^=fiHu;7#t!dS{)M~p;!0+=G$WbGM1gqy3J-~tlM06 zE+`Fvf9P3cWyO1bFdE+gJdP{hRpEhWD^Og)r$=qqsAqdWOZ5)F_wfCb_#ko%&S&Ae zSr@?w{$WAyUT}@76ak6=MSvne5ugZI1OkLts_<$^IS@8u=;aN^E6+|q^GQ87v^RRy z(hxAmD~0@!tXss4F@U$Y@LeZhpacixC}=a2VeF-H%o7xh`&h=k4{%fw<~IX8 z2zef!_aI)yVnPcwc)xKB%R7M2B?f;X{vnIOh92_@IADp4W&vBBIw>hxrjj4 zj;C*k4_Wu~Y_a3eKBf3FOFCxENj@IJyxokM#S9OQE40sM2Y^772mvOH9};U%8b1ac zKX4xO`=s#`;b!c7PN-?_?rdN7$RoZhJF9AcU2nMWnTBfD zYuj_WtA8u<3atNQL79rZ=$gH~{ReK@Wo>cmdCB>R6K9V%b$Q6Ei)xLYWh>pw{Zq?Z zj&i=GORI8bUbv&vahb9nOM8*0{`5J807K zi`P6{wBh=JZH_&%cQhrN9<+4c-xe1$O<@4rKQ>}JDeX`YXe9ibEHHk0c03p0H5q)- zXrl=H_YfFdC&B+CnAW?n>IHQ#E`0xKeYDr4K7yWxs*lFO`@oxl77(E^sjcCN3Z9-@ z(?>+ctHIsK^GMCy*_yeHA zsEdtW5sPZB!IlFfru8pBCl1wR0~UjOrgLJWwaJ=DwT@VgB^@&o&V@A}$e8Z$C7&MlNkmz1^?T(S#N?%7q~ zZ6z5EsIK*cWhvVi*zS(WySLbW-P--_el@4>8J#wltSU{5={~%3~XxIs_UEHmBnY?Jsh^>Yp%Yt)0+D;6j;@| zHZN%?3Y(R~H7>2_%AefCg|`2+E-5$i=<>#6JKwAxmzjF=_Nk`g&HL7;P!>(M66k)) ZQ)NM4;4Rlvxhp{xOB+RiBJeyA_zS3>0-yi@ literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 19343973dbcac..31cda093a685f 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -754,7 +754,7 @@ def test_big_table_panel(self): x = time.time() try: store = HDFStore(self.scratchpath) - store.prof_append('wp', wp) + store.append('wp', wp) rows = store.root.wp.table.nrows recons = store.select('wp') finally: @@ -1748,9 +1748,11 @@ def _check_roundtrip_table(self, obj, comparator, compression=False): def test_pytables_native_read(self): pth = 
curpath() store = HDFStore(os.path.join(pth, 'pytables_native.h5'), 'r') + d2 = store['detector/readout'] + store.close() + store = HDFStore(os.path.join(pth, 'pytables_native2.h5'), 'r') + str(store) d1 = store['detector'] - d2 = store['detector/table'] - assert_frame_equal(d1, d2) store.close() def test_legacy_read(self): @@ -1826,7 +1828,7 @@ def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): if a.is_indexed: self.assert_(new_t[a.name].is_indexed == True) - except: + except (Exception), detail: pass finally: store.close() From 8b288626ab6a397888e6866dd00f6f225778e299 Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 15 Jan 2013 20:54:08 -0500 Subject: [PATCH 07/10] BUG: hopefully fixed factor_indexer in reshape.py; failed on 32-bit platforms --- pandas/core/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index e16fdffd15e8d..40c4dc6e5efe7 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -835,4 +835,4 @@ def block2d_to_blocknd(values, items, shape, labels, ref_items=None): def factor_indexer(shape, labels): """ given a tuple of shape and a list of Factor lables, return the expanded label indexer """ mult = np.array(shape)[::-1].cumprod()[::-1] - return (np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T).astype('i8') + return com._ensure_platform_int(np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T) From 4cfc8cd2f9e5a7d03245512fbfb59ac8c27df455 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 18 Jan 2013 10:41:03 -0500 Subject: [PATCH 08/10] ENH/DOC: added PerformanceWarning which will trigger on putting a non-endemic type fixed legacy_0.10.h5 issues with docs (finding file) --- RELEASE.rst | 1 + doc/source/io.rst | 16 +++-- pandas/io/pytables.py | 29 +++++++-- pandas/io/tests/test_pytables.py | 103 +++++++++++++++++++++++++++---- 4 files changed, 127 insertions(+), 22 deletions(-) diff --git a/RELEASE.rst 
b/RELEASE.rst index 95e39f6510e08..2e5cfd886a13d 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -88,6 +88,7 @@ pandas 0.10.1 - refactored HFDStore to deal with non-table stores as objects, will allow future enhancements - removed keyword ``compression`` from ``put`` (replaced by keyword ``complib`` to be consistent across library) + - warn `PerformanceWarning` if you are attempting to store types that will be pickled by PyTables .. _GH512: https://github.com/pydata/pandas/issues/512 .. _GH1277: https://github.com/pydata/pandas/issues/1277 diff --git a/doc/source/io.rst b/doc/source/io.rst index 4f1d02e9dc95b..6b7ec3dfdd841 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1211,7 +1211,7 @@ You can create/modify an index for a table with ``create_table_index`` after dat Query via Data Columns ~~~~~~~~~~~~~~~~~~~~~~ -You can designate (and index) certain columns that you want to be able to perform queries (other than the `indexable` columns, which you can always query). For instance say you want to perform this common operation, on-disk, and return just the frame that matches this query. +You can designate (and index) certain columns that you want to be able to perform queries (other than the `indexable` columns, which you can always query). For instance say you want to perform this common operation, on-disk, and return just the frame that matches this query. You can specify ``data_columns = True`` to force all columns to be data_columns .. ipython:: python @@ -1377,9 +1377,9 @@ External Compatibility .. ipython:: python :suppress: - legacy_store.close() + store_export.close() import os - os.remove('store_export.h5') + os.remove('export.h5') Backwards Compatibility ~~~~~~~~~~~~~~~~~~~~~~~ @@ -1387,11 +1387,16 @@ Backwards Compatibility 0.10.1 of ``HDFStore`` is backwards compatible for reading tables created in a prior version of pandas however, query terms using the prior (undocumented) methodology are unsupported. 
``HDFStore`` will issue a warning if you try to use a prior-version format file. You must read in the entire file and write it out using the new format, using the method ``copy`` to take advantage of the updates. The group attribute ``pandas_version`` contains the version information. ``copy`` takes a number of options, please see the docstring. + .. ipython:: python + :suppress: + + import os + legacy_file_path = os.path.abspath('source/_static/legacy_0.10.h5') + .. ipython:: python # a legacy store - import os - legacy_store = HDFStore('legacy_0.10.h5', 'r') + legacy_store = HDFStore(legacy_file_path,'r') legacy_store # copy (and return the new handle) @@ -1415,6 +1420,7 @@ Performance - You can pass ``chunksize=an integer`` to ``append``, to change the writing chunksize (default is 50000). This will signficantly lower your memory usage on writing. - You can pass ``expectedrows=an integer`` to the first ``append``, to set the TOTAL number of expectedrows that ``PyTables`` will expected. This will optimize read/write performance. - Duplicate rows can be written to tables, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs) + - A ``PerformanceWarning`` will be raised if you are attempting to store types that will be pickled by PyTables (rather than stored as endemic types). See for more information and some solutions. 
Experimental ~~~~~~~~~~~~ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5812b8a7ef994..0ba38c5adb133 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -42,6 +42,11 @@ class IncompatibilityWarning(Warning): pass where criteria is being ignored as this version [%s] is too old (or not-defined), read the file in and write it out to a new file to upgrade (with the copy_to method) """ +class PerformanceWarning(Warning): pass +performance_doc = """ +your performance may suffer as PyTables swill pickle object types that it cannot map +directly to c-types [inferred_type->%s,key->%s] +""" # map object types _TYPE_MAP = { @@ -510,7 +515,7 @@ def append(self, key, value, columns=None, **kwargs): Optional Parameters ------------------- - data_columns : list of columns to create as data columns + data_columns : list of columns to create as data columns, or True to use all columns min_itemsize : dict of columns that specify minimum string sizes nan_rep : string to use as string nan represenation chunksize : size to chunk the writing @@ -1606,6 +1611,17 @@ def write_array(self, key, value): return if value.dtype.type == np.object_: + + # infer the type, warn if we have a non-string type here (for performance) + inferred_type = lib.infer_dtype(value.flatten()) + if empty_array: + pass + elif inferred_type == 'string': + pass + else: + ws = performance_doc % (inferred_type,key) + warnings.warn(ws, PerformanceWarning) + vlarr = self.handle.createVLArray(self.group, key, _tables().ObjectAtom()) vlarr.append(value) @@ -1846,7 +1862,7 @@ class Table(Storer): index_axes : a list of tuples of the (original indexing axis and index column) non_index_axes: a list of tuples of the (original index axis and columns on a non-indexing axis) values_axes : a list of the columns which comprise the data of this table - data_columns : a list of the columns that we are allowing indexing (these become single columns in values_axes) + data_columns : a list of the 
columns that we are allowing indexing (these become single columns in values_axes), or True to force all columns nan_rep : the string to use for nan representations for string objects levels : the names of levels @@ -2111,7 +2127,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, validate: validate the obj against an existiing object already written min_itemsize: a dict of the min size for a column in bytes nan_rep : a values to use for string column nan_rep - data_columns : a list of columns that we want to create separate to allow indexing + data_columns : a list of columns that we want to create separate to allow indexing (or True will force all colummns) """ @@ -2196,6 +2212,9 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, if data_columns is not None and len(self.non_index_axes): axis = self.non_index_axes[0][0] axis_labels = self.non_index_axes[0][1] + if data_columns is True: + data_columns = axis_labels + data_columns = [c for c in data_columns if c in axis_labels] if len(data_columns): blocks = block_obj.reindex_axis(Index(axis_labels) - Index( @@ -2238,7 +2257,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, except (NotImplementedError): raise except (Exception), detail: - raise Exception("cannot find the correct atom type -> [dtype->%s] %s" % (b.dtype.name, str(detail))) + raise Exception("cannot find the correct atom type -> [dtype->%s,items->%s] %s" % (b.dtype.name, b.items, str(detail))) j += 1 # validate the axes if we have an existing table @@ -2722,6 +2741,8 @@ def table_type_short(self): def write(self, obj, data_columns=None, **kwargs): if data_columns is None: data_columns = [] + elif data_columns is True: + data_columns = obj.columns[:] for n in obj.index.names: if n not in data_columns: data_columns.insert(0, n) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 31cda093a685f..bd04fe4c1ce31 100644 --- 
a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -9,7 +9,7 @@ from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, date_range, Index) -from pandas.io.pytables import HDFStore, get_store, Term, IncompatibilityWarning +from pandas.io.pytables import HDFStore, get_store, Term, IncompatibilityWarning, PerformanceWarning import pandas.util.testing as tm from pandas.tests.test_series import assert_series_equal from pandas.tests.test_frame import assert_frame_equal @@ -260,6 +260,28 @@ def test_put_integer(self): df = DataFrame(np.random.randn(50, 100)) self._check_roundtrip(df, tm.assert_frame_equal) + def test_put_mixed_type(self): + df = tm.makeTimeDataFrame() + df['obj1'] = 'foo' + df['obj2'] = 'bar' + df['bool1'] = df['A'] > 0 + df['bool2'] = df['B'] > 0 + df['bool3'] = True + df['int1'] = 1 + df['int2'] = 2 + df['timestamp1'] = Timestamp('20010102') + df['timestamp2'] = Timestamp('20010103') + df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0) + df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0) + df.ix[3:6, ['obj1']] = np.nan + df = df.consolidate().convert_objects() + self.store.remove('df') + warnings.filterwarnings('ignore', category=PerformanceWarning) + self.store.put('df',df) + expected = self.store.get('df') + tm.assert_frame_equal(expected,df) + warnings.filterwarnings('always', category=PerformanceWarning) + def test_append(self): df = tm.makeTimeDataFrame() @@ -703,7 +725,7 @@ def test_big_table_frame(self): print "\nbig_table frame [%s] -> %5.2f" % (rows, time.time() - x) def test_big_table2_frame(self): - # this is a really big table: 2.5m rows x 300 float columns, 20 string + # this is a really big table: 1m rows x 60 float columns, 20 string, 20 datetime # columns raise nose.SkipTest('no big table2 frame') @@ -711,10 +733,12 @@ def test_big_table2_frame(self): print "\nbig_table2 start" import time start_time = time.time() - df = DataFrame(np.random.randn(2.5 * 1000 * 1000, 300), index=range(int( - 
2.5 * 1000 * 1000)), columns=['E%03d' % i for i in xrange(300)]) - for x in range(20): + df = DataFrame(np.random.randn(1000 * 1000, 60), index=xrange(int( + 1000 * 1000)), columns=['E%03d' % i for i in xrange(60)]) + for x in xrange(20): df['String%03d' % x] = 'string%03d' % x + for x in xrange(20): + df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0) print "\nbig_table2 frame (creation of df) [rows->%s] -> %5.2f" % (len(df.index), time.time() - start_time) fn = 'big_table2.h5' @@ -728,7 +752,7 @@ def f(chunksize): store.close() return r - for c in [10000, 50000, 100000, 250000]: + for c in [10000, 50000, 250000]: start_time = time.time() print "big_table2 frame [chunk->%s]" % c rows = f(c) @@ -737,6 +761,35 @@ def f(chunksize): finally: os.remove(fn) + def test_big_put_frame(self): + raise nose.SkipTest('no big put frame') + + print "\nbig_put start" + import time + start_time = time.time() + df = DataFrame(np.random.randn(1000 * 1000, 60), index=xrange(int( + 1000 * 1000)), columns=['E%03d' % i for i in xrange(60)]) + for x in xrange(20): + df['String%03d' % x] = 'string%03d' % x + for x in xrange(20): + df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0) + + print "\nbig_put frame (creation of df) [rows->%s] -> %5.2f" % (len(df.index), time.time() - start_time) + fn = 'big_put.h5' + + try: + + start_time = time.time() + store = HDFStore(fn, mode='w') + store.put('df', df) + store.close() + + print df.get_dtype_counts() + print "big_put frame [shape->%s] -> %5.2f" % (df.shape, time.time() - start_time) + + finally: + os.remove(fn) + def test_big_table_panel(self): raise nose.SkipTest('no big table panel') @@ -823,15 +876,23 @@ def test_table_index_incompatible_dtypes(self): def test_table_values_dtypes_roundtrip(self): df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8') - self.store.append('df1', df1) - assert df1.dtypes == self.store['df1'].dtypes + self.store.append('df_f8', df1) + assert df1.dtypes == self.store['df_f8'].dtypes df2 = 
DataFrame({'a': [1, 2, 3]}, dtype='i8') - self.store.append('df2', df2) - assert df2.dtypes == self.store['df2'].dtypes + self.store.append('df_i8', df2) + assert df2.dtypes == self.store['df_i8'].dtypes # incompatible dtype - self.assertRaises(Exception, self.store.append, 'df2', df1) + self.assertRaises(Exception, self.store.append, 'df_i8', df1) + + #df1 = DataFrame({'a': Series([1, 2, 3], dtype='f4')}) + #self.store.append('df_f4', df1) + #assert df1.dtypes == self.store['df_f4'].dtypes + + #df2 = DataFrame({'a': Series([1, 2, 3], dtype='i4')}) + #self.store.append('df_i4', df2) + #assert df2.dtypes == self.store['df_i4'].dtypes def test_table_mixed_dtypes(self): @@ -1165,15 +1226,19 @@ def test_tuple_index(self): idx = [(0., 1.), (2., 3.), (4., 5.)] data = np.random.randn(30).reshape((3, 10)) DF = DataFrame(data, index=idx, columns=col) + warnings.filterwarnings('ignore', category=PerformanceWarning) self._check_roundtrip(DF, tm.assert_frame_equal) + warnings.filterwarnings('always', category=PerformanceWarning) def test_index_types(self): values = np.random.randn(2) func = lambda l, r: tm.assert_series_equal(l, r, True, True, True) + warnings.filterwarnings('ignore', category=PerformanceWarning) ser = Series(values, [0, 'y']) self._check_roundtrip(ser, func) + warnings.filterwarnings('always', category=PerformanceWarning) ser = Series(values, [datetime.datetime.today(), 0]) self._check_roundtrip(ser, func) @@ -1181,11 +1246,15 @@ def test_index_types(self): ser = Series(values, ['y', 0]) self._check_roundtrip(ser, func) + warnings.filterwarnings('ignore', category=PerformanceWarning) ser = Series(values, [datetime.date.today(), 'a']) self._check_roundtrip(ser, func) + warnings.filterwarnings('always', category=PerformanceWarning) + warnings.filterwarnings('ignore', category=PerformanceWarning) ser = Series(values, [1.23, 'b']) self._check_roundtrip(ser, func) + warnings.filterwarnings('always', category=PerformanceWarning) ser = Series(values, [1, 1.53]) 
self._check_roundtrip(ser, func) @@ -1456,6 +1525,13 @@ def test_select(self): expected = df[df.A > 0].reindex(columns=['A', 'B']) tm.assert_frame_equal(expected, result) + # all a data columns + self.store.remove('df') + self.store.append('df', df, data_columns=True) + result = self.store.select('df', ['A > 0'], columns=['A', 'B']) + expected = df[df.A > 0].reindex(columns=['A', 'B']) + tm.assert_frame_equal(expected, result) + # with a data column, but different columns self.store.remove('df') self.store.append('df', df, data_columns=['A']) @@ -1776,7 +1852,6 @@ def test_legacy_table_read(self): store.select('df2', typ='legacy_frame') # old version warning - import warnings warnings.filterwarnings('ignore', category=IncompatibilityWarning) self.assertRaises( Exception, store.select, 'wp1', Term('minor_axis', '=', 'B')) @@ -1915,9 +1990,11 @@ def test_tseries_indices_frame(self): def test_unicode_index(self): unicode_values = [u'\u03c3', u'\u03c3\u03c3'] - + warnings.filterwarnings('ignore', category=PerformanceWarning) s = Series(np.random.randn(len(unicode_values)), unicode_values) self._check_roundtrip(s, tm.assert_series_equal) + warnings.filterwarnings('always', category=PerformanceWarning) + def test_store_datetime_mixed(self): df = DataFrame( From 8b4a8744ef92303cb4c1e5100ebb78c9029c3312 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 18 Jan 2013 21:25:20 -0500 Subject: [PATCH 09/10] BUG: correctly store float32 dtypes (that are not-mixed with float64 dtypes) --- RELEASE.rst | 1 + doc/source/v0.10.1.txt | 2 ++ pandas/io/pytables.py | 26 +++++++++++++++++++++----- pandas/io/tests/test_pytables.py | 26 +++++++++++++++++++------- 4 files changed, 43 insertions(+), 12 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 2e5cfd886a13d..245e72d6bca6e 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -68,6 +68,7 @@ pandas 0.10.1 is ``Int64``), (closes GH512_) - handle Timestamp correctly in data_columns (closes GH2637_) - contains correctly matches on 
non-natural names + - correctly store ``float32`` dtypes in tables (if not other float types in the same table) - Fix DataFrame.info bug with UTF8-encoded columns. (GH2576_) - Fix DatetimeIndex handling of FixedOffset tz (GH2604_) - More robust detection of being in IPython session for wide DataFrame diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt index 86e8ede558611..8aa2dad2b35a0 100644 --- a/doc/source/v0.10.1.txt +++ b/doc/source/v0.10.1.txt @@ -126,6 +126,8 @@ Multi-table creation via ``append_to_multiple`` and selection via ``select_as_mu - You can pass ``expectedrows=an integer`` to the first ``append``, to set the TOTAL number of expectedrows that ``PyTables`` will expected. This will optimize read/write performance. - ``Select`` now supports passing ``start`` and ``stop`` to provide selection space limiting in selection. +**Bug Fixes** +- ``HDFStore`` tables can now store ``float32`` types correctly (cannot be mixed with ``float64`` however) See the `full release notes `__ or issue tracker diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 0ba38c5adb133..e8a2d40de6597 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1129,7 +1129,7 @@ def get_atom_data(self, block): def set_atom_data(self, block): self.kind = block.dtype.name self.typ = self.get_atom_data(block) - self.set_data(block.values.astype(self.typ._deftype)) + self.set_data(block.values.astype(self.typ.type)) def get_atom_datetime64(self, block): return _tables().Int64Col(shape=block.shape[0]) @@ -2116,6 +2116,22 @@ def get_object(self, obj): """ return the data for this obj """ return obj + def convert_objects(self, obj): + """ attempt to convert any object fields; don't touch other fields + if we are converting anything, copy the object and modify the copy """ + new_obj = None + convert_f = lambda x: lib.maybe_convert_objects(x, convert_datetime=True) + + for col, s in obj.iteritems(): + if s.dtype == np.object_: + if new_obj is None: + new_obj = 
obj.copy() + new_obj[col] = convert_f(s) + + if new_obj is not None: + return new_obj + return obj + def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs): """ create and return the axes leagcy tables create an indexable column, indexable index, non-indexable fields @@ -2162,10 +2178,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, self.nan_rep = nan_rep # convert the objects if we can to better divine dtypes - try: - obj = obj.convert_objects() - except: - pass + obj = self.convert_objects(obj) # create axes to index and non_index index_axes_map = dict() @@ -2767,6 +2780,9 @@ def get_object(self, obj): obj = obj.transpose(*self.data_orientation) return obj + def convert_objects(self, obj): + return obj + @property def is_transposed(self): return self.data_orientation != tuple(range(self.ndim)) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index bd04fe4c1ce31..5e0fe8d292e16 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -886,13 +886,25 @@ def test_table_values_dtypes_roundtrip(self): # incompatible dtype self.assertRaises(Exception, self.store.append, 'df_i8', df1) - #df1 = DataFrame({'a': Series([1, 2, 3], dtype='f4')}) - #self.store.append('df_f4', df1) - #assert df1.dtypes == self.store['df_f4'].dtypes - - #df2 = DataFrame({'a': Series([1, 2, 3], dtype='i4')}) - #self.store.append('df_i4', df2) - #assert df2.dtypes == self.store['df_i4'].dtypes + # check creation/storage/retrieval of float32 (a bit hacky to actually create them thought) + df1 = DataFrame(np.array([[1],[2],[3]],dtype='f4'),columns = ['A']) + self.store.append('df_f4', df1) + assert df1.dtypes == self.store['df_f4'].dtypes + assert df1.dtypes[0] == 'float32' + + # check with mixed dtypes (but not multi float types) + df1 = DataFrame(np.array([[1],[2],[3]],dtype='f4'),columns = ['float32']) + df1['string'] = 'foo' + 
self.store.append('df_mixed_dtypes1', df1) + assert (df1.dtypes == self.store['df_mixed_dtypes1'].dtypes).all() == True + assert df1.dtypes[0] == 'float32' + assert df1.dtypes[1] == 'object' + + ### this is not supported, e.g. mixed float32/float64 blocks ### + #df1 = DataFrame(np.array([[1],[2],[3]],dtype='f4'),columns = ['float32']) + #df1['float64'] = 1.0 + #self.store.append('df_mixed_dtypes2', df1) + #assert df1.dtypes == self.store['df_mixed_dtypes2'].dtypes).all() == True def test_table_mixed_dtypes(self): From cadca37ffc814a03d01a793c0ee8865c48e63705 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 19 Jan 2013 09:05:59 -0500 Subject: [PATCH 10/10] CLN: removed convert_objects step in create_axes (no longer needed) --- pandas/io/pytables.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e8a2d40de6597..78bd204f26993 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2116,22 +2116,6 @@ def get_object(self, obj): """ return the data for this obj """ return obj - def convert_objects(self, obj): - """ attempt to convert any object fields; don't touch other fields - if we are converting anything, copy the object and modify the copy """ - new_obj = None - convert_f = lambda x: lib.maybe_convert_objects(x, convert_datetime=True) - - for col, s in obj.iteritems(): - if s.dtype == np.object_: - if new_obj is None: - new_obj = obj.copy() - new_obj[col] = convert_f(s) - - if new_obj is not None: - return new_obj - return obj - def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs): """ create and return the axes leagcy tables create an indexable column, indexable index, non-indexable fields @@ -2177,9 +2161,6 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, nan_rep = 'nan' self.nan_rep = nan_rep - # convert the objects if we can to better divine dtypes - obj = self.convert_objects(obj) - 
# create axes to index and non_index index_axes_map = dict() for i, a in enumerate(obj.axes): @@ -2780,9 +2761,6 @@ def get_object(self, obj): obj = obj.transpose(*self.data_orientation) return obj - def convert_objects(self, obj): - return obj - @property def is_transposed(self): return self.data_orientation != tuple(range(self.ndim))