diff --git a/RELEASE.rst b/RELEASE.rst
index f3fb98535cb61..1a86ac02b2f7e 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -61,8 +61,20 @@ pandas 0.11.1
   - Fix regression in a DataFrame apply with axis=1, objects were not being
     converted back to base dtypes correctly (GH3480_)
   - Fix issue when storing uint dtypes in an HDFStore. (GH3493_)
+  - Non-unique index support clarified (GH3468_)
+
+    - Fix failure when assigning a new index to a DataFrame with a duplicate index (GH3468_)
+    - Fix construction of a DataFrame with a duplicate index
+    - ref_locs support to allow duplicate indices across dtypes;
+      iget can now always find the correct location (even across dtypes) (GH2194_)
+    - applymap on a DataFrame with a non-unique index now works
+      (the previous ValueError is no longer raised) (GH2786_, GH3230_)
+    - Fix to_csv to handle non-unique columns (GH3495_)

 .. _GH3164: https://github.com/pydata/pandas/issues/3164
+.. _GH2786: https://github.com/pydata/pandas/issues/2786
+.. _GH2194: https://github.com/pydata/pandas/issues/2194
+.. _GH3230: https://github.com/pydata/pandas/issues/3230
 .. _GH3251: https://github.com/pydata/pandas/issues/3251
 .. _GH3379: https://github.com/pydata/pandas/issues/3379
 .. _GH3480: https://github.com/pydata/pandas/issues/3480
@@ -75,8 +87,10 @@ pandas 0.11.1
 .. _GH3455: https://github.com/pydata/pandas/issues/3455
 .. _GH3457: https://github.com/pydata/pandas/issues/3457
 .. _GH3461: https://github.com/pydata/pandas/issues/3461
+.. _GH3468: https://github.com/pydata/pandas/issues/3468
 .. _GH3448: https://github.com/pydata/pandas/issues/3448
 .. _GH3449: https://github.com/pydata/pandas/issues/3449
+.. _GH3495: https://github.com/pydata/pandas/issues/3495
 .. _GH3493: https://github.com/pydata/pandas/issues/3493
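In user-facing terms, the non-unique column support summarized above now behaves as follows — a minimal sketch mirroring the new tests further down (pandas 0.11.1 assumed)::

    import pandas as pd

    # construction and repr with duplicate column labels now work
    df = pd.DataFrame([[1, 2, 'foo']], columns=['a', 'a', 'b'])
    str(df)

    # assigning a new index over a duplicate index no longer fails (GH3468)
    df.columns = ['a', 'a.1', 'b']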
diff --git a/pandas/core/common.py b/pandas/core/common.py
index e6ce9fc5fc925..490f269c8c104 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -1156,6 +1156,7 @@ def _default_index(n):
     values = np.arange(n, dtype=np.int64)
     result = values.view(Int64Index)
     result.name = None
+    result.is_unique = True
     return result

diff --git a/pandas/core/format.py b/pandas/core/format.py
index 5b68b26a41b77..fa2135bb4310c 100644
--- a/pandas/core/format.py
+++ b/pandas/core/format.py
@@ -820,21 +820,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
         self.blocks = self.obj._data.blocks
         ncols = sum(len(b.items) for b in self.blocks)
         self.data =[None] * ncols
-
-        if self.obj.columns.is_unique:
-            self.colname_map = dict((k,i) for i,k in enumerate(self.obj.columns))
-        else:
-            ks = [set(x.items) for x in self.blocks]
-            u = len(reduce(lambda a,x: a.union(x),ks,set()))
-            t = sum(map(len,ks))
-            if u != t:
-                if len(set(self.cols)) != len(self.cols):
-                    raise NotImplementedError("duplicate columns with differing dtypes are unsupported")
-                else:
-                    # if columns are not unique and we acces this,
-                    # we're doing it wrong
-                    pass
-
+        self.column_map = self.obj._data.get_items_map()

         if chunksize is None:
             chunksize = (100000/ (len(self.cols) or 1)) or 1
@@ -1034,18 +1020,13 @@ def _save_chunk(self, start_i, end_i):

         # create the data for a chunk
         slicer = slice(start_i,end_i)
-        if self.obj.columns.is_unique:
-            for i in range(len(self.blocks)):
-                b = self.blocks[i]
-                d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
-                for j, k in enumerate(b.items):
-                    # self.data is a preallocated list
-                    self.data[self.colname_map[k]] = d[j]
-        else:
-            # self.obj should contain a proper view of the dataframes
-            # with the specified ordering of cols if cols was specified
-            for i in range(len(self.obj.columns)):
-                self.data[i] = self.obj.icol(i).values[slicer].tolist()
+        for i in range(len(self.blocks)):
+            b = self.blocks[i]
+            d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
+            for j, item in enumerate(b.items):
+
+                # self.data is a preallocated list
+                self.data[self.column_map[b][j]] = d[j]

         ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
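The column_map consumed by the writer above comes from get_items_map(): for each block, the absolute column position of each of its items, so values can be written in native types without any unique-column assumption. A toy model of that inversion, with simplified stand-ins rather than the real BlockManager API::

    class Block(object):
        def __init__(self, name, nitems):
            self.name, self.nitems = name, nitems

    float_blk, int_blk = Block('float64', 2), Block('int64', 1)

    # column layout [float, int, float] as (block, location-in-block) pairs
    ref_locs = [(float_blk, 0), (int_blk, 0), (float_blk, 1)]

    items_map = {}
    for col, (blk, loc) in enumerate(ref_locs):
        items_map.setdefault(blk, [None] * blk.nitems)[loc] = col

    assert items_map[float_blk] == [0, 2]
    assert items_map[int_blk] == [1]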
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 2cb7608c7aba6..8bfdee3b75170 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4261,9 +4261,6 @@ def infer(x):
             if com.is_datetime64_dtype(x):
                 x = lib.map_infer(x, lib.Timestamp)
             return lib.map_infer(x, func)
-        #GH2786
-        if not self.columns.is_unique:
-            raise ValueError("applymap does not support dataframes having duplicate column labels")
         return self.apply(infer)

     #----------------------------------------------------------------------

diff --git a/pandas/core/index.py b/pandas/core/index.py
index 34edd26a49617..101b69ffc3c7e 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -278,7 +278,7 @@ def is_monotonic(self):
     def is_lexsorted_for_tuple(self, tup):
         return True

-    @cache_readonly
+    @cache_readonly(allow_setting=True)
     def is_unique(self):
         return self._engine.is_unique
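With the applymap guard removed above, duplicate-labeled frames take the same apply(infer) path as unique ones. A sketch of the behavior the updated test asserts (pandas test helper assumed available)::

    import numpy as np
    import pandas as pd
    from pandas.util.testing import assert_frame_equal

    df = pd.DataFrame(np.random.random((3, 4)))
    expected = df.applymap(str)          # unique columns: always worked
    df.columns = ['a', 'a', 'a', 'a']
    result = df.applymap(str)            # duplicate columns: no longer raises
    expected.columns = df.columns
    assert_frame_equal(result, expected)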
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 03cfd18f5afe5..5c0f9253beb62 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -61,9 +61,15 @@ def ref_locs(self):
             if (indexer == -1).any():
                 raise AssertionError('Some block items were not in block '
                                      'ref_items')
+            self._ref_locs = indexer
         return self._ref_locs

+    def set_ref_locs(self, placement):
+        """ explicitly set the ref_locs indexer; only necessary for duplicate indices """
+        if placement is not None:
+            self._ref_locs = np.array(placement, dtype='int64')
+
     def set_ref_items(self, ref_items, maybe_rename=True):
         """
         If maybe_rename=True, need to set the items for this guy
@@ -164,6 +170,9 @@ def get(self, item):
         loc = self.items.get_loc(item)
         return self.values[loc]

+    def iget(self, i):
+        return self.values[i]
+
     def set(self, item, value):
         """
         Modify Block in-place with new item value
@@ -710,7 +719,7 @@ def convert(self, convert_dates = True, convert_numeric = True, copy = True):
         # attempt to create new type blocks
         blocks = []
         for i, c in enumerate(self.items):
-            values = self.get(c)
+            values = self.iget(i)
             values = com._possibly_convert_objects(values, convert_dates=convert_dates, convert_numeric=convert_numeric)
             values = _block_shape(values)
@@ -879,7 +888,7 @@ class BlockManager(object):
     -----
     This is *not* a public API class
     """
-    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated']
+    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated', '_ref_locs', '_items_map']

     def __init__(self, blocks, axes, do_integrity_check=True):
         self.axes = [_ensure_index(ax) for ax in axes]
@@ -897,6 +906,10 @@ def __init__(self, blocks, axes, do_integrity_check=True):

         self._consolidate_check()

+        # we have a duplicate items index, set up the block maps
+        if not self.items.is_unique:
+            self._set_ref_locs(do_refs=True)
+
     @classmethod
     def make_empty(self):
         return BlockManager([], [[], []])
@@ -915,12 +928,141 @@ def set_axis(self, axis, value):
         if len(value) != len(cur_axis):
             raise Exception('Length mismatch (%d vs %d)' % (len(value), len(cur_axis)))
+
         self.axes[axis] = value

         if axis == 0:
+
+            # set/reset ref_locs based on the current index
+            # and map the new index if needed
+            self._set_ref_locs(labels=cur_axis)
+
+            # take via ref_locs
             for block in self.blocks:
                 block.set_ref_items(self.items, maybe_rename=True)

+            # set/reset ref_locs based on the new index
+            self._set_ref_locs(labels=value, do_refs=True)
+
+    def _set_ref_locs(self, labels=None, do_refs=False):
+        """
+        if we have a non-unique index on this axis, set the indexers:
+        we need an absolute indexer for the blocks, and we
+        return the indexer if we are not unique
+
+        labels : the (new) labels for this manager
+        do_refs : boolean, whether to rebuild the ref_locs mapping
+
+        """
+
+        im = None
+        if labels is None:
+            labels = self.items
+        else:
+            labels = _ensure_index(labels)
+
+        # we are unique, and coming from a unique
+        if labels.is_unique and not do_refs:
+
+            # reset our ref locs
+            self._ref_locs = None
+            for b in self.blocks:
+                b._ref_locs = None
+
+            return None
+
+        # we are going to a non-unique index;
+        # we have ref_locs on the blocks at this point,
+        # or if ref_locs are not set, then we must assume a block
+        # ordering
+        if not labels.is_unique and do_refs:

+            # create the items map
+            im = getattr(self, '_items_map', None)
+            if im is None:
+
+                im = dict()
+                def maybe_create_block(block):
+                    try:
+                        return im[block]
+                    except:
+                        im[block] = l = [ None ] * len(block.items)
+                        return l
+
+                count_items = 0
+                for block in self.blocks:
+
+                    # if we have a duplicate index but
+                    # _ref_locs have not been set....then we
+                    # have to assume ordered blocks were passed
+                    num_items = len(block.items)
+                    try:
+                        rl = block.ref_locs
+                    except:
+                        rl = np.arange(num_items) + count_items
+
+                    m = maybe_create_block(block)
+                    for i, item in enumerate(block.items):
+                        m[i] = rl[i]
+                    count_items += num_items
+
+                self._items_map = im
+
+            # create the _ref_loc map here
+            rl = np.empty(len(labels), dtype=object)
+            for block, items in im.items():
+                for i, loc in enumerate(items):
+                    rl[loc] = (block, i)
+            self._ref_locs = rl
+            return rl
+
+        # return our cached _ref_locs (or recompute when we
+        # recreate the block manager, if needed)
+        return getattr(self, '_ref_locs', None)
+
+    def get_items_map(self):
+        """
+        return an inverted ref_loc map for an item index:
+        block -> item (in that block) location -> column location
+        """
+
+        # cache check
+        im = getattr(self, '_items_map', None)
+        if im is not None:
+            return im
+
+        im = dict()
+        rl = self._set_ref_locs()
+
+        def maybe_create_block(block):
+            try:
+                return im[block]
+            except:
+                im[block] = l = [ None ] * len(block.items)
+                return l
+
+        # we have a non-duplicative index
+        if rl is None:
+
+            axis = self.axes[0]
+            for block in self.blocks:
+
+                m = maybe_create_block(block)
+                for i, item in enumerate(block.items):
+                    m[i] = axis.get_loc(item)
+
+        # use the ref_locs to construct the map
+        else:
+
+            for i, (block, idx) in enumerate(rl):
+
+                m = maybe_create_block(block)
+                m[idx] = i
+
+        self._items_map = im
+        return im
+
     # make items read only for now
     def _get_items(self):
         return self.axes[0]
@@ -1185,13 +1327,16 @@ def get_slice(self, slobj, axis=0, raise_on_error=False):
                                   new_items,
                                   klass=blk.__class__,
                                   fastpath=True)
+                newb.set_ref_locs(blk._ref_locs)
                 new_blocks = [newb]
             else:
                 return self.reindex_items(new_items)
         else:
             new_blocks = self._slice_blocks(slobj, axis)

-        return BlockManager(new_blocks, new_axes, do_integrity_check=False)
+        bm = BlockManager(new_blocks, new_axes, do_integrity_check=False)
+        bm._consolidate_inplace()
+        return bm

     def _slice_blocks(self, slobj, axis):
         new_blocks = []
@@ -1206,6 +1351,7 @@ def _slice_blocks(self, slobj, axis):
                               block.ref_items,
                               klass=block.__class__,
                               fastpath=True)
+            newb.set_ref_locs(block._ref_locs)
             new_blocks.append(newb)

         return new_blocks
@@ -1387,26 +1533,11 @@ def iget(self, i):
         item = self.items[i]
         if self.items.is_unique:
             return self.get(item)
-        else:
-            # ugh
-            try:
-                inds, = (self.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                inds, = self.items.map(lambda x: x == item).nonzero()
-
-            _, block = self._find_block(item)
-
-            try:
-                binds, = (block.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                binds, = block.items.map(lambda x: x == item).nonzero()
-
-            for j, (k, b) in enumerate(zip(inds, binds)):
-                if i == k:
-                    return block.values[b]
-
-            raise Exception('Cannot have duplicate column names '
-                            'split across dtypes')
+
+        # compute the duplicate-aware indexer if needed
+        ref_locs = self._set_ref_locs()
+        b, loc = ref_locs[i]
+        return b.iget(loc)

     def get_scalar(self, tup):
         """
@@ -1582,6 +1713,8 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
         # keep track of what items aren't found anywhere
         mask = np.zeros(len(item_order), dtype=bool)

+        new_axes = [new_items] + self.axes[1:]
+
         new_blocks = []
         for blk in self.blocks:
             blk_indexer = blk.items.get_indexer(item_order)
@@ -1605,7 +1738,7 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
                 new_blocks.append(na_block)
             new_blocks = _consolidate(new_blocks, new_items)

-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)

     def reindex_items(self, new_items, copy=True, fill_value=np.nan):
         """
@@ -1619,6 +1752,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan):

         # TODO: this part could be faster (!)
         new_items, indexer = self.items.reindex(new_items)
+        new_axes = [new_items] + self.axes[1:]

         # could have some pathological (MultiIndex) issues here
         new_blocks = []
@@ -1643,7 +1777,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan):
                 new_blocks.append(na_block)
             new_blocks = _consolidate(new_blocks, new_items)

-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)

     def _make_na_block(self, items, ref_items, fill_value=np.nan):
         # TODO: infer dtypes other than float64 from fill_value
@@ -1685,11 +1819,11 @@ def merge(self, other, lsuffix=None, rsuffix=None):
         this, other = self._maybe_rename_join(other, lsuffix, rsuffix)

         cons_items = this.items + other.items
-        consolidated = _consolidate(this.blocks + other.blocks, cons_items)
-
         new_axes = list(this.axes)
         new_axes[0] = cons_items

+        consolidated = _consolidate(this.blocks + other.blocks, cons_items)
+
         return BlockManager(consolidated, new_axes)

     def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True):
@@ -1842,54 +1976,55 @@ def form_blocks(arrays, names, axes):
     bool_items = []
     object_items = []
     datetime_items = []
-    for k, v in zip(names, arrays):
+    for i, (k, v) in enumerate(zip(names, arrays)):
         if issubclass(v.dtype.type, np.floating):
-            float_items.append((k, v))
+            float_items.append((i, k, v))
         elif issubclass(v.dtype.type, np.complexfloating):
-            complex_items.append((k, v))
+            complex_items.append((i, k, v))
         elif issubclass(v.dtype.type, np.datetime64):
             if v.dtype != _NS_DTYPE:
                 v = tslib.cast_to_nanoseconds(v)

             if hasattr(v, 'tz') and v.tz is not None:
-                object_items.append((k, v))
+                object_items.append((i, k, v))
             else:
-                datetime_items.append((k, v))
+                datetime_items.append((i, k, v))
         elif issubclass(v.dtype.type, np.integer):
             if v.dtype == np.uint64:
                 # HACK #2355 definite overflow
                 if (v > 2 ** 63 - 1).any():
-                    object_items.append((k, v))
+                    object_items.append((i, k, v))
                     continue
-            int_items.append((k, v))
+            int_items.append((i, k, v))
         elif v.dtype == np.bool_:
-            bool_items.append((k, v))
+            bool_items.append((i, k, v))
         else:
-            object_items.append((k, v))

+    is_unique = items.is_unique
     blocks = []
     if len(float_items):
-        float_blocks = _multi_blockify(float_items, items)
+        float_blocks = _multi_blockify(float_items, items, is_unique=is_unique)
         blocks.extend(float_blocks)

     if len(complex_items):
-        complex_blocks = _simple_blockify(complex_items, items, np.complex128)
+        complex_blocks = _simple_blockify(complex_items, items, np.complex128, is_unique=is_unique)
         blocks.extend(complex_blocks)

     if len(int_items):
-        int_blocks = _multi_blockify(int_items, items)
+        int_blocks = _multi_blockify(int_items, items, is_unique=is_unique)
         blocks.extend(int_blocks)

     if len(datetime_items):
-        datetime_blocks = _simple_blockify(datetime_items, items, _NS_DTYPE)
+        datetime_blocks = _simple_blockify(datetime_items, items, _NS_DTYPE, is_unique=is_unique)
         blocks.extend(datetime_blocks)

     if len(bool_items):
-        bool_blocks = _simple_blockify(bool_items, items, np.bool_)
+        bool_blocks = _simple_blockify(bool_items, items, np.bool_, is_unique=is_unique)
         blocks.extend(bool_blocks)

     if len(object_items) > 0:
-        object_blocks = _simple_blockify(object_items, items, np.object_)
+        object_blocks = _simple_blockify(object_items, items, np.object_, is_unique=is_unique)
         blocks.extend(object_blocks)

     if len(extra_items):
@@ -1897,38 +2032,40 @@ def form_blocks(arrays, names, axes):

         # empty items -> dtype object
         block_values = np.empty(shape, dtype=object)
-
         block_values.fill(nan)

         na_block = make_block(block_values, extra_items, items)
         blocks.append(na_block)
-
         blocks = _consolidate(blocks, items)

     return blocks


-def _simple_blockify(tuples, ref_items, dtype):
+def _simple_blockify(tuples, ref_items, dtype, is_unique=True):
     """ return a single array of a block that has a single dtype; if dtype is not None, coerce to this dtype """
-    block_items, values = _stack_arrays(tuples, ref_items, dtype)
+    block_items, values, placement = _stack_arrays(tuples, ref_items, dtype)

     # CHECK DTYPE?
     if dtype is not None and values.dtype != dtype:  # pragma: no cover
         values = values.astype(dtype)

-    return [ make_block(values, block_items, ref_items) ]
+    block = make_block(values, block_items, ref_items)
+    if not is_unique:
+        block.set_ref_locs(placement)
+    return [ block ]

-
-def _multi_blockify(tuples, ref_items, dtype = None):
+def _multi_blockify(tuples, ref_items, dtype = None, is_unique=True):
     """ return an array of blocks that potentially have different dtypes """

     # group by dtype
-    grouper = itertools.groupby(tuples, lambda x: x[1].dtype)
+    grouper = itertools.groupby(tuples, lambda x: x[2].dtype)

     new_blocks = []
     for dtype, tup_block in grouper:

-        block_items, values = _stack_arrays(list(tup_block), ref_items, dtype)
+        block_items, values, placement = _stack_arrays(list(tup_block), ref_items, dtype)
         block = make_block(values, block_items, ref_items)
+        if not is_unique:
+            block.set_ref_locs(placement)
         new_blocks.append(block)

     return new_blocks
@@ -1951,10 +2088,7 @@ def _shape_compat(x):
         else:
             return x.shape

-    names, arrays = zip(*tuples)
-
-    # index may box values
-    items = ref_items[ref_items.isin(names)]
+    placement, names, arrays = zip(*tuples)

     first = arrays[0]
     shape = (len(arrays),) + _shape_compat(first)
@@ -1963,7 +2097,15 @@ def _shape_compat(x):
     for i, arr in enumerate(arrays):
         stacked[i] = _asarray_compat(arr)

-    return items, stacked
+    # index may box values
+    if ref_items.is_unique:
+        items = ref_items[ref_items.isin(names)]
+    else:
+        items = _ensure_index([ n for n in names if n in ref_items ])
+        if len(items) != len(stacked):
+            raise Exception("invalid names passed to _stack_arrays")
+
+    return items, stacked, placement


 def _blocks_to_series_dict(blocks, index=None):
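The heart of the change: _set_ref_locs materializes an absolute-position -> (block, location-within-block) map, which is what lets iget resolve a position even when the same label lives in several dtype blocks. A toy model with simplified stand-ins (not the real API)::

    import numpy as np

    class Block(object):
        def __init__(self, values):
            self.values = np.asarray(values)

        def iget(self, loc):
            return self.values[loc]

    float_blk = Block([[1.5, 2.5], [3.5, 4.5]])  # two float items (each row is one column's values)
    obj_blk = Block([['foo', 'bar']])            # one object item

    # manager columns: ['a' (float), 'a' (object), 'a' (float)]
    ref_locs = [(float_blk, 0), (obj_blk, 0), (float_blk, 1)]

    def iget(i):
        blk, loc = ref_locs[i]
        return blk.iget(loc)

    assert list(iget(1)) == ['foo', 'bar']       # the object dup is found, not shadowed
    assert list(iget(2)) == [3.5, 4.5]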
diff --git a/pandas/src/properties.pyx b/pandas/src/properties.pyx
index 53bb561ef9110..1df11cecf7b94 100644
--- a/pandas/src/properties.pyx
+++ b/pandas/src/properties.pyx
@@ -4,16 +4,20 @@ from cpython cimport PyDict_Contains, PyDict_GetItem, PyDict_SetItem

 cdef class cache_readonly(object):

     cdef readonly:
-        object fget, name
+        object func, name, allow_setting

-    def __init__(self, func):
-        self.fget = func
-        self.name = func.__name__
+    def __init__(self, func=None, allow_setting=False):
+        if func is not None:
+            self.func = func
+            self.name = func.__name__
+        self.allow_setting = allow_setting

-    def __get__(self, obj, type):
-        if obj is None:
-            return self.fget
+    def __call__(self, func, doc=None):
+        self.func = func
+        self.name = func.__name__
+        return self

+    def __get__(self, obj, typ):
         # Get the cache or set a default one if needed
         cache = getattr(obj, '_cache', None)
@@ -23,12 +27,23 @@ cdef class cache_readonly(object):
         if PyDict_Contains(cache, self.name):
             # not necessary to Py_INCREF
             val = PyDict_GetItem(cache, self.name)
-            return val
         else:
-            val = self.fget(obj)
+            val = self.func(obj)
             PyDict_SetItem(cache, self.name, val)
-            return val
+        return val
+
+    def __set__(self, obj, value):
+
+        if not self.allow_setting:
+            raise Exception("cannot set values for [%s]" % self.name)
+
+        # Get the cache or set a default one if needed
+        cache = getattr(obj, '_cache', None)
+        if cache is None:
+            cache = obj._cache = {}
+        PyDict_SetItem(cache, self.name, value)
+

 cdef class AxisProperty(object):

     cdef:
         Py_ssize_t axis
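For readers not fluent in Cython, a pure-Python analogue of the revised descriptor — both decorator forms plus the new guarded setter (a sketch of the semantics, not the actual implementation)::

    class cache_readonly(object):
        def __init__(self, func=None, allow_setting=False):
            if func is not None:
                self.func, self.name = func, func.__name__
            self.allow_setting = allow_setting

        def __call__(self, func, doc=None):
            # supports the parametrized form: @cache_readonly(allow_setting=True)
            self.func, self.name = func, func.__name__
            return self

        def __get__(self, obj, typ):
            cache = obj.__dict__.setdefault('_cache', {})
            if self.name not in cache:
                cache[self.name] = self.func(obj)
            return cache[self.name]

        def __set__(self, obj, value):
            if not self.allow_setting:
                raise Exception("cannot set values for [%s]" % self.name)
            obj.__dict__.setdefault('_cache', {})[self.name] = value

    class Frame(object):
        @cache_readonly(allow_setting=True)
        def is_unique(self):
            return False            # stand-in for an expensive computation

    f = Frame()
    f.is_unique = True              # permitted because allow_setting=True
    assert f.is_unique is True

This is how _default_index can pin result.is_unique = True in common.py above without paying for the engine check.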
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 7bafed216b9b9..69225c40e36df 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -4973,17 +4973,33 @@ def test_to_csv_dups_cols(self):

         with ensure_clean() as filename:
             df.to_csv(filename)  # single dtype, fine
+            result = read_csv(filename,index_col=0)
+            result.columns = df.columns
+            assert_frame_equal(result,df)

-        df_float = DataFrame(np.random.randn(1000, 30),dtype='float64')
-        df_int = DataFrame(np.random.randn(1000, 30),dtype='int64')
-        df_bool = DataFrame(True,index=df_float.index,columns=df_float.columns)
-        df_object = DataFrame('foo',index=df_float.index,columns=df_float.columns)
-        df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=df_float.columns)
-        df = pan.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1)
+        df_float = DataFrame(np.random.randn(1000, 3),dtype='float64')
+        df_int = DataFrame(np.random.randn(1000, 3),dtype='int64')
+        df_bool = DataFrame(True,index=df_float.index,columns=range(3))
+        df_object = DataFrame('foo',index=df_float.index,columns=range(3))
+        df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=range(3))
+        df = pan.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1, ignore_index=True)
+
+        cols = []
+        for i in range(5):
+            cols.extend([0,1,2])
+        df.columns = cols

-        #### this raises because we have duplicate column names across dtypes ####
+        from pandas import to_datetime
         with ensure_clean() as filename:
-            self.assertRaises(Exception, df.to_csv, filename)
+            df.to_csv(filename)
+            result = read_csv(filename,index_col=0)
+
+            # date cols
+            for i in ['0.4','1.4','2.4']:
+                result[i] = to_datetime(result[i])
+
+            result.columns = df.columns
+            assert_frame_equal(result,df)

         # GH3457
         from pandas.util.testing import makeCustomDataframe as mkdf
@@ -7492,12 +7508,15 @@ def test_applymap(self):
         self.assert_(result.dtypes[0] == object)

         # GH2786
-        df = DataFrame(np.random.random((3,4)))
-        df.columns = ['a','a','a','a']
-        try:
-            df.applymap(str)
-        except ValueError as e:
-            self.assertTrue("support" in str(e))
+        df = DataFrame(np.random.random((3,4)))
+        df2 = df.copy()
+        cols = ['a','a','a','a']
+        df.columns = cols
+
+        expected = df2.applymap(str)
+        expected.columns = cols
+        result = df.applymap(str)
+        assert_frame_equal(result,expected)

     def test_filter(self):
         # items
@@ -9201,6 +9220,62 @@ def test_assign_columns(self):
         assert_series_equal(self.frame['C'], frame['baz'])
         assert_series_equal(self.frame['hi'], frame['foo2'])

+    def test_columns_with_dups(self):
+
+        # GH 3468 related
+
+        # basic
+        df = DataFrame([[1,2]], columns=['a','a'])
+        df.columns = ['a','a.1']
+        str(df)
+        expected = DataFrame([[1,2]], columns=['a','a.1'])
+        assert_frame_equal(df, expected)
+
+        df = DataFrame([[1,2,3]], columns=['b','a','a'])
+        df.columns = ['b','a','a.1']
+        str(df)
+        expected = DataFrame([[1,2,3]], columns=['b','a','a.1'])
+        assert_frame_equal(df, expected)
+
+        # with a dup index
+        df = DataFrame([[1,2]], columns=['a','a'])
+        df.columns = ['b','b']
+        str(df)
+        expected = DataFrame([[1,2]], columns=['b','b'])
+        assert_frame_equal(df, expected)
+
+        # multi-dtype
+        df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=['a','a','b','b','d','c','c'])
+        df.columns = list('ABCDEFG')
+        str(df)
+        expected = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('ABCDEFG'))
+        assert_frame_equal(df, expected)
+
+        # this is an error because we cannot disambiguate the dup columns
+        self.assertRaises(Exception, lambda: DataFrame([[1,2,'foo','bar']], columns=['a','a','a','a']))
+
+        # dups across blocks
+        df_float = DataFrame(np.random.randn(10, 3),dtype='float64')
+        df_int = DataFrame(np.random.randn(10, 3),dtype='int64')
+        df_bool = DataFrame(True,index=df_float.index,columns=df_float.columns)
+        df_object = DataFrame('foo',index=df_float.index,columns=df_float.columns)
+        df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=df_float.columns)
+        df = pan.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1)
+
+        result = df._data._set_ref_locs()
+        self.assert_(len(result) == len(df.columns))
+
+        # testing iget
+        for i in range(len(df.columns)):
+            df.iloc[:,i]
+
+        # dup columns across dtype GH 2079/2194
+        vals = [[1, -1, 2.], [2, -2, 3.]]
+        rs = DataFrame(vals, columns=['A', 'A', 'B'])
+        xp = DataFrame(vals)
+        xp.columns = ['A', 'A', 'B']
+        assert_frame_equal(rs, xp)
+
     def test_cast_internals(self):
         casted = DataFrame(self.frame._data, dtype=int)
         expected = DataFrame(self.frame._series, dtype=int)
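A note on the round-trip assertions above: read_csv de-duplicates repeated header names with a .N suffix (hence the '0.4', '1.4', '2.4' date columns), so the tests restore the original labels before comparing. For example (illustrative file name)::

    import pandas as pd

    df = pd.DataFrame([[1, 2]], columns=['a', 'a'])
    df.to_csv('dups.csv')
    result = pd.read_csv('dups.csv', index_col=0)
    list(result.columns)          # ['a', 'a.1'] -- names mangled on read
    result.columns = df.columns   # restore the duplicates before comparing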
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
index 86cd0ef524b35..ae71ec8b35422 100644
--- a/pandas/tests/test_indexing.py
+++ b/pandas/tests/test_indexing.py
@@ -772,6 +772,19 @@ def test_dups_fancy_indexing(self):
         expected = Index(['b','a','a'])
         self.assert_(result.equals(expected))

+        # across dtypes
+        df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('aaaaaaa'))
+        df.head()
+        str(df)
+        result = DataFrame([[1,2,1.,2.,3.,'foo','bar']])
+        result.columns = list('aaaaaaa')
+
+        df_v = df.iloc[:,4]
+        res_v = result.iloc[:,4]
+
+        assert_frame_equal(df,result)
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
index eec5f5632d36b..e25bd0de769a7 100644
--- a/pandas/tests/test_internals.py
+++ b/pandas/tests/test_internals.py
@@ -268,7 +268,7 @@ def test_duplicate_item_failure(self):
             b.ref_items = items

         mgr = BlockManager(blocks, [items, np.arange(N)])
-        self.assertRaises(Exception, mgr.iget, 1)
+        mgr.iget(1)

     def test_contains(self):
         self.assert_('a' in self.mgr)