BUG: str dtype ignored for column with dot (#50364)

natmokval · web-flow · commit 060b9dae7cd4 · 2023-03-16T18:08:45.000Z
* BUG: str dtype ignored for column with dot I

* BUG: add test to str dtype ignored for column with dot I

* BUG: str dtype ignored for column with dot III

* BUG: str dtype ignored for column with dot IV

* BUG: str dtype ignored for column with dot V

* BUG: str dtype ignored for column with dot VI

* TEST: added assert for mypy

* TEST: added assert for mypy II

* BUG: str dtype ignored for column with dot VII

* specify int64 explicitly

* specify int64 explicitly II

* add the original example and remove the redundant check

* remove unnecessary check

* add parametrize to thousand separator test

* BUG: remove duplicative test

* BUG: add additional parameters in parametrization

* BUG: exclude bool and add object dtype in parametrization

* BUG: change parameters of parametrization and add a second test
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -29,7 +29,11 @@
     ParserError,
 )
 
-from pandas.core.dtypes.common import is_integer
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_integer,
+    is_numeric_dtype,
+)
 from pandas.core.dtypes.inference import is_dict_like
 
 from pandas.io.common import (
@@ -155,12 +159,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
             self._col_indices = list(range(len(self.columns)))
 
         self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
-        no_thousands_columns: set[int] | None = None
-        if self.parse_dates:
-            no_thousands_columns = self._set_noconvert_dtype_columns(
-                self._col_indices, self.columns
-            )
-        self._no_thousands_columns = no_thousands_columns
+        self._no_thousands_columns = self._set_no_thousand_columns()
 
         if len(self.decimal) != 1:
             raise ValueError("Only length-1 decimal markers supported")
@@ -889,7 +888,7 @@ def _search_replace_num_columns(
                 if (
                     not isinstance(x, str)
                     or search not in x
-                    or (self._no_thousands_columns and i in self._no_thousands_columns)
+                    or i in self._no_thousands_columns
                     or not self.num.search(x.strip())
                 ):
                     rl.append(x)
@@ -1162,6 +1161,31 @@ def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar
             ]
         return new_rows
 
+    def _set_no_thousand_columns(self) -> set[int]:
+        no_thousands_columns: set[int] = set()
+        if self.columns and self.parse_dates:
+            assert self._col_indices is not None
+            no_thousands_columns = self._set_noconvert_dtype_columns(
+                self._col_indices, self.columns
+            )
+        if self.columns and self.dtype:
+            assert self._col_indices is not None
+            for i in self._col_indices:
+                if not isinstance(self.dtype, dict) and not is_numeric_dtype(
+                    self.dtype
+                ):
+                    no_thousands_columns.add(i)
+                if (
+                    isinstance(self.dtype, dict)
+                    and self.columns[i] in self.dtype
+                    and (
+                        not is_numeric_dtype(self.dtype[self.columns[i]])
+                        or is_bool_dtype(self.dtype[self.columns[i]])
+                    )
+                ):
+                    no_thousands_columns.add(i)
+        return no_thousands_columns
+
 
 class FixedWidthReader(abc.Iterator):
     """
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
@@ -14,6 +14,7 @@
 )
 from typing import Iterator
 
+import numpy as np
 import pytest
 
 from pandas.errors import (
@@ -488,3 +489,71 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse
     )
     expected = DataFrame({"a": ["a", "c", "f"]})
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}]
+)
+def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, dtype):
+    # GH#50270
+    parser = python_parser_only
+    data = """\
+a;b;c
+0000.7995;16.000;0
+3.03.001.00514;0;4.000
+4923.600.041;23.000;131"""
+    result = parser.read_csv(
+        StringIO(data),
+        sep=";",
+        dtype=dtype,
+        thousands=".",
+    )
+    expected = DataFrame(
+        {
+            "a": ["0000.7995", "3.03.001.00514", "4923.600.041"],
+            "b": [16000, 0, 23000],
+            "c": [0, 4000, 131],
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype,expected",
+    [
+        (
+            {"a": str, "b": np.float64, "c": np.int64},
+            DataFrame(
+                {
+                    "b": [16000.1, 0, 23000],
+                    "c": [0, 4001, 131],
+                }
+            ),
+        ),
+        (
+            str,
+            DataFrame(
+                {
+                    "b": ["16,000.1", "0", "23,000"],
+                    "c": ["0", "4,001", "131"],
+                }
+            ),
+        ),
+    ],
+)
+def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, expected):
+    # GH#50270
+    parser = python_parser_only
+    data = """a;b;c
+0000,7995;16,000.1;0
+3,03,001,00514;0;4,001
+4923,600,041;23,000;131
+"""
+    result = parser.read_csv(
+        StringIO(data),
+        sep=";",
+        dtype=dtype,
+        thousands=",",
+    )
+    expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
+    tm.assert_frame_equal(result, expected)