Skip to content

Commit 025fb91

Browse files
committed
Handle implicit string conversion
1 parent 29a638c commit 025fb91

File tree

2 files changed

+23
-17
lines changed

2 files changed

+23
-17
lines changed

pandas/core/internals/construction.py

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -243,33 +243,27 @@ def init_dict(data, index, columns, dtype=None):
243243
mask = np.isin(extra_positions, positions, invert=True)
244244
extra_positions = extra_positions[mask]
245245

246-
# And now, what should the dtype of this new guys be? We'll that's a little
246+
# And now, what should the dtype of these new guys be? Well, that's
247247
# tricky.
248248
# 1. User provided dtype, just use that...
249249
# unless the user provided dtype=int and an index (GH-24385)
250+
# - DataFrame(None, index=idx, columns=cols, dtype=int) :: float
251+
# - DataFrame(None, index=idx, columns=cols, dtype=object) :: object
250252
# 2. Empty data.keys() & columns is object (unless specified by the user)
251253
# 3. No data and No dtype is object (unless specified by the user).
254+
# 4. For string-like `dtype`, things are even more subtle.
255+
# a.) We rely on arrays_to_mgr to coerce values to strings, when
256+
# the user provides dtype=str
257+
# b.) But we don't want the values coercion for newly-created
258+
# columns. This only partly works. See
259+
# https://github.com/pandas-dev/pandas/issues/24388 for more.
252260

253-
# https://github.com/pandas-dev/pandas/issues/24385
254-
# Series(None, dtype=int) and DataFrame(None, dtype=dtype)
255-
# differ when the index is provided.
256-
# But if dtype is not provided, then we fall use object.
257-
# we have to pass this dtype through to arrays_to_mgr
258-
259-
# Some things I'd like to change
260-
# With DataFrame(None, index=[1], columns=['a'], dtype=dtype):
261-
# For dtype=object, the result is object
262-
# But for dtype=int, the result is float
263261
empty_columns = len(positions.index & columns) == 0
264262

265263
if empty_columns and dtype is None:
266264
dtype = object
267265
elif (index_len
268266
and is_integer_dtype(dtype)):
269-
# That's one complicated condition:
270-
# DataFrame(None, index=idx, columns=cols, dtype=int) must be float
271-
# DataFrame(None, index=idx, columns=cols, dtype=object) is object
272-
# DataFrame({'a': 2}, columns=['b']) is object (empty)
273267
dtype = float
274268
elif not data and dtype is None:
275269
dtype = np.dtype('object')
@@ -279,8 +273,15 @@ def init_dict(data, index, columns, dtype=None):
279273

280274
arrays = [new_data[i] for i in range(len(columns))]
281275

282-
# hrm this probably belongs in arrays_to_mgr...
283-
if is_string_dtype(dtype) and not is_categorical_dtype(dtype):
276+
if (empty_columns
277+
and is_string_dtype(dtype)
278+
and not is_categorical_dtype(dtype)):
279+
# For user-provided `dtype=str`, we want to preserve that so
280+
# that arrays_to_mgr handles the *values* coercion from user-provided
281+
# to strings. *But* we don't want to do that for columns that were
282+
# newly created. But, there's the bug. We only handle this correctly
283+
# when all the columns are newly created. See
284+
# https://github.com/pandas-dev/pandas/issues/24388 for more.
284285
dtype = np.dtype("object")
285286

286287
return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)

pandas/tests/frame/test_constructors.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1419,6 +1419,11 @@ def test_constructor_empty_with_string_dtype(self):
14191419
df = DataFrame(index=[0, 1], columns=[0, 1], dtype='U5')
14201420
tm.assert_frame_equal(df, expected)
14211421

1422+
def test_constsructor_string_dtype_coerces_values(self):
1423+
result = pd.DataFrame({"A": [1, 2]}, dtype=str)
1424+
expected = pd.DataFrame({"A": ['1', '2']}, dtype=object)
1425+
tm.assert_frame_equal(result, expected)
1426+
14221427
def test_constructor_single_value(self):
14231428
# expecting single value upcasting here
14241429
df = DataFrame(0., index=[1, 2, 3], columns=['a', 'b', 'c'])

0 commit comments

Comments
 (0)