Skip to content

Commit 060b9da

Browse files
authored
BUG: str dtype ignored for column with dot (#50364)
* BUG: str dtype ignored for column with dot I
* BUG: add test to str dtype ignored for column with dot I
* BUG: str dtype ignored for column with dot III
* BUG: str dtype ignored for column with dot IV
* BUG: str dtype ignored for column with dot V
* BUG: str dtype ignored for column with dot VI
* TEST: added assert for mypy
* TEST: added assert for mypy II
* BUG: str dtype ignored for column with dot VII
* specify int64 explicitly
* specify int64 explicitly II
* add the original example and remove the redundant check
* remove unnecessary check
* add parametrize to thousand separator test
* BUG: remove duplicative test
* BUG: add additional parameters in parametrization
* BUG: exclude bool and add object dtype in parametrization
* BUG: change parameters of parametrization and add a second test
1 parent 6b27de3 commit 060b9da

File tree

2 files changed

+101
-8
lines changed

2 files changed

+101
-8
lines changed

pandas/io/parsers/python_parser.py

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,11 @@
2929
ParserError,
3030
)
3131

32-
from pandas.core.dtypes.common import is_integer
32+
from pandas.core.dtypes.common import (
33+
is_bool_dtype,
34+
is_integer,
35+
is_numeric_dtype,
36+
)
3337
from pandas.core.dtypes.inference import is_dict_like
3438

3539
from pandas.io.common import (
@@ -155,12 +159,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
155159
self._col_indices = list(range(len(self.columns)))
156160

157161
self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
158-
no_thousands_columns: set[int] | None = None
159-
if self.parse_dates:
160-
no_thousands_columns = self._set_noconvert_dtype_columns(
161-
self._col_indices, self.columns
162-
)
163-
self._no_thousands_columns = no_thousands_columns
162+
self._no_thousands_columns = self._set_no_thousand_columns()
164163

165164
if len(self.decimal) != 1:
166165
raise ValueError("Only length-1 decimal markers supported")
@@ -889,7 +888,7 @@ def _search_replace_num_columns(
889888
if (
890889
not isinstance(x, str)
891890
or search not in x
892-
or (self._no_thousands_columns and i in self._no_thousands_columns)
891+
or i in self._no_thousands_columns
893892
or not self.num.search(x.strip())
894893
):
895894
rl.append(x)
@@ -1162,6 +1161,31 @@ def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar
11621161
]
11631162
return new_rows
11641163

1164+
def _set_no_thousand_columns(self) -> set[int]:
    """
    Collect the column indices whose values must keep the thousands separator.

    The result is stored by ``__init__`` as ``self._no_thousands_columns``
    and consulted by ``_search_replace_num_columns``, which leaves a value
    untouched when its column index is in this set.

    A column index is included when:

    * ``parse_dates`` is set and ``_set_noconvert_dtype_columns`` reports it;
    * ``dtype`` is a single non-dict dtype that is not numeric (then every
      index in ``self._col_indices`` is included);
    * ``dtype`` is a dict and the entry for that column's label is either
      non-numeric or boolean.

    Returns
    -------
    set[int]
        Column indices for which thousands-separator replacement is skipped.
        Empty when there are no columns or neither ``parse_dates`` nor
        ``dtype`` is set.
    """
    no_thousands_columns: set[int] = set()
    if not self.columns:
        return no_thousands_columns

    if self.parse_dates:
        # The assert narrows the Optional type for the type checker.
        assert self._col_indices is not None
        no_thousands_columns = self._set_noconvert_dtype_columns(
            self._col_indices, self.columns
        )

    if self.dtype:
        assert self._col_indices is not None
        if isinstance(self.dtype, dict):
            # Pair each index with its label positionally instead of using
            # ``self.columns[i]``: the two agree when ``_col_indices`` is
            # ``range(len(self.columns))``, but indexing by value raises
            # IndexError (or picks the wrong label) for any other index list.
            # NOTE(review): presumably this arises when only a subset of the
            # file's columns is selected -- confirm against the callers.
            for i, col in zip(self._col_indices, self.columns):
                if col not in self.dtype:
                    continue
                col_dtype = self.dtype[col]
                # ``is_numeric_dtype`` accepts bool, so bool is excluded
                # explicitly.
                if not is_numeric_dtype(col_dtype) or is_bool_dtype(col_dtype):
                    no_thousands_columns.add(i)
        elif not is_numeric_dtype(self.dtype):
            # One non-numeric dtype applies to every column.
            no_thousands_columns.update(self._col_indices)

    return no_thousands_columns
1188+
11651189

11661190
class FixedWidthReader(abc.Iterator):
11671191
"""

pandas/tests/io/parser/test_python_parser_only.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
)
1515
from typing import Iterator
1616

17+
import numpy as np
1718
import pytest
1819

1920
from pandas.errors import (
@@ -488,3 +489,71 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse
488489
)
489490
expected = DataFrame({"a": ["a", "c", "f"]})
490491
tm.assert_frame_equal(result, expected)
492+
493+
494+
@pytest.mark.parametrize(
    "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}]
)
def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, dtype):
    # GH#50270
    # With "." as the thousands separator, column "a" (declared object/str)
    # must come back verbatim, while "b" and "c" have their dots stripped
    # and are read as integers.
    csv_text = """\
a;b;c
0000.7995;16.000;0
3.03.001.00514;0;4.000
4923.600.041;23.000;131"""
    expected = DataFrame(
        {
            "a": ["0000.7995", "3.03.001.00514", "4923.600.041"],
            "b": [16000, 0, 23000],
            "c": [0, 4000, 131],
        }
    )

    result = python_parser_only.read_csv(
        StringIO(csv_text), sep=";", dtype=dtype, thousands="."
    )

    tm.assert_frame_equal(result, expected)
519+
520+
521+
@pytest.mark.parametrize(
    "dtype,expected",
    [
        (
            {"a": str, "b": np.float64, "c": np.int64},
            DataFrame({"b": [16000.1, 0, 23000], "c": [0, 4001, 131]}),
        ),
        (
            str,
            DataFrame(
                {"b": ["16,000.1", "0", "23,000"], "c": ["0", "4,001", "131"]}
            ),
        ),
    ],
)
def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, expected):
    # GH#50270
    # With "," as the thousands separator, only columns declared numeric have
    # their commas removed; string columns are returned exactly as written.
    csv_text = """a;b;c
0000,7995;16,000.1;0
3,03,001,00514;0;4,001
4923,600,041;23,000;131
"""
    # Column "a" is expected unchanged in every parametrization, so it is
    # prepended here rather than repeated in each case.
    expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])

    result = python_parser_only.read_csv(
        StringIO(csv_text), sep=";", dtype=dtype, thousands=","
    )

    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)