@@ -243,33 +243,27 @@ def init_dict(data, index, columns, dtype=None):
243
243
mask = np .isin (extra_positions , positions , invert = True )
244
244
extra_positions = extra_positions [mask ]
245
245
246
- # And now, what should the dtype of this new guys be? We'll that's a little
246
+ # And now, what should the dtype of these new guys be? Well, that's
247
247
# tricky.
248
248
# 1. User provided dtype, just use that...
249
249
# unless the user provided dtype=int and an index (GH-24385)
250
+ # - DataFrame(None, index=idx, columns=cols, dtype=int) :: float
251
+ # - DataFrame(None, index=idx, columns=cols, dtype=object) :: object
250
252
# 2. Empty data.keys() & columns is object (unless specified by the user)
251
253
# 3. No data and No dtype is object (unless specified by the user).
254
+ # 4. For string-like `dtype`, things are even more subtle.
255
+ # a.) We rely on arrays_to_mgr to coerce values to strings, when
256
+ # the user provides dtype=str
257
+ # b.) But we don't want the values coercion for newly-created
258
+ # columns. This only partly works. See
259
+ # https://github.com/pandas-dev/pandas/issues/24388 for more.
252
260
253
- # https://github.com/pandas-dev/pandas/issues/24385
254
- # Series(None, dtype=int) and DataFrame(None, dtype=dtype)
255
- # differ when the index is provided.
256
- # But if dtype is not provided, then we fall use object.
257
- # we have to pass this dtype through to arrays_to_mgr
258
-
259
- # Some things I'd like to change
260
- # With DataFrame(None, index=[1], columns=['a'], dtype=dtype):
261
- # For dtype=object, the result is object
262
- # But for dtype=int, the result is float
263
261
empty_columns = len (positions .index & columns ) == 0
264
262
265
263
if empty_columns and dtype is None :
266
264
dtype = object
267
265
elif (index_len
268
266
and is_integer_dtype (dtype )):
269
- # That's one complicated condition:
270
- # DataFrame(None, index=idx, columns=cols, dtype=int) must be float
271
- # DataFrame(None, index=idx, columns=cols, dtype=object) is object
272
- # DataFrame({'a': 2}, columns=['b']) is object (empty)
273
267
dtype = float
274
268
elif not data and dtype is None :
275
269
dtype = np .dtype ('object' )
@@ -279,8 +273,15 @@ def init_dict(data, index, columns, dtype=None):
279
273
280
274
arrays = [new_data [i ] for i in range (len (columns ))]
281
275
282
- # hrm this probably belongs in arrays_to_mgr...
283
- if is_string_dtype (dtype ) and not is_categorical_dtype (dtype ):
276
+ if (empty_columns
277
+ and is_string_dtype (dtype )
278
+ and not is_categorical_dtype (dtype )):
279
+ # For user-provided `dtype=str`, we want to preserve that so
280
+ # that arrays_to_mgr handles the *values* coercion of user data
281
+ # to strings. *But* we don't want to do that for columns that were
282
+ # newly created. But there's a bug: we only handle this correctly
283
+ # when all the columns are newly created. See
284
+ # https://github.com/pandas-dev/pandas/issues/24388 for more.
284
285
dtype = np .dtype ("object" )
285
286
286
287
return arrays_to_mgr (arrays , columns , index , columns , dtype = dtype )
0 commit comments