diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 177a38b526c6e..42416f70d9ca5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -37,6 +37,7 @@ Other enhancements - Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`) - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`) - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) +- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 402bbdb872a18..f01fe8ecef930 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -178,7 +178,7 @@ class DtypeWarning(Warning): ... ) # doctest: +SKIP >>> df.to_csv("test.csv", index=False) # doctest: +SKIP >>> df2 = pd.read_csv("test.csv") # doctest: +SKIP - ... # DtypeWarning: Columns (0) have mixed types + ... # DtypeWarning: Columns (0: a) have mixed types Important to notice that ``df2`` will contain both `str` and `int` for the same input, '1'. diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 67f3e5a9f4880..6e5d36ad39c8a 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -235,7 +235,7 @@ def read( if self.low_memory: chunks = self._reader.read_low_memory(nrows) # destructive to chunks - data = _concatenate_chunks(chunks) + data = _concatenate_chunks(chunks, self.names) # type: ignore[has-type] else: data = self._reader.read(nrows) @@ -358,7 +358,9 @@ def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): return values -def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: +def _concatenate_chunks( + chunks: list[dict[int, ArrayLike]], column_names: list[str] +) -> dict: """ Concatenate chunks of data read with low_memory=True. @@ -381,10 +383,12 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: else: result[name] = concat_compat(arrs) if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object): - warning_columns.append(str(name)) + warning_columns.append(column_names[name]) if warning_columns: - warning_names = ",".join(warning_columns) + warning_names = ", ".join( + [f"{index}: {name}" for index, name in enumerate(warning_columns)] + ) warning_message = " ".join( [ f"Columns ({warning_names}) have mixed types. " diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index cdf4d6ae77f91..78a0b016bd353 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -253,7 +253,7 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): else: df = parser.read_csv_check_warnings( warning_type, - r"Columns \(0\) have mixed types. " + r"Columns \(0: a\) have mixed types. " "Specify dtype option on import or set low_memory=False.", buf, ) diff --git a/pandas/tests/io/parser/test_concatenate_chunks.py b/pandas/tests/io/parser/test_concatenate_chunks.py index 1bae2317a2fc6..3e418faeff8e0 100644 --- a/pandas/tests/io/parser/test_concatenate_chunks.py +++ b/pandas/tests/io/parser/test_concatenate_chunks.py @@ -16,7 +16,7 @@ def test_concatenate_chunks_pyarrow(): {0: ArrowExtensionArray(pa.array([1.5, 2.5]))}, {0: ArrowExtensionArray(pa.array([1, 2]))}, ] - result = _concatenate_chunks(chunks) + result = _concatenate_chunks(chunks, ["column_0", "column_1"]) expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0])) tm.assert_extension_array_equal(result[0], expected) @@ -28,8 +28,10 @@ def test_concatenate_chunks_pyarrow_strings(): {0: ArrowExtensionArray(pa.array([1.5, 2.5]))}, {0: ArrowExtensionArray(pa.array(["a", "b"]))}, ] - with tm.assert_produces_warning(DtypeWarning, match="have mixed types"): - result = _concatenate_chunks(chunks) + with tm.assert_produces_warning( + DtypeWarning, match="Columns \\(0: column_0\\) have mixed types" + ): + result = _concatenate_chunks(chunks, ["column_0", "column_1"]) expected = np.concatenate( [np.array([1.5, 2.5], dtype=object), np.array(["a", "b"])] )