Skip to content

BUG: repr of Categorical does not distinguish int and str. #34222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jun 24, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pandas/io/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -1237,6 +1237,8 @@ def _format(x):
fmt_values.append(f" {_format(v)}")
elif is_float_type[i]:
fmt_values.append(float_format(v))
elif isinstance(v, str):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GenericArrayFormatter has a quoting parameter. format_array does not have this parameter to be able to pass it on.

_repr_categories in pandas\core\arrays\categorical.py uses format_array.

Is it feasible to add a quoting parameter to format_array instead of changing this?

Copy link
Member Author

@MarcoGorelli MarcoGorelli May 21, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@simonjayhawkins thanks for the suggestion!

So, in _repr_categories from pandas\core\arrays\categorical.py, format_array is called. This could be changed to, say,

category_strs = fmt.format_array(self.categories, None, quoting=csv.QUOTE_NONNUMERIC)

and then, inside format_array, pass quoting on to fmt_klass, so that (here) GenericArrayFormatter would be initialised with it.

It's unclear to me what GenericArrayFormatter currently does with quoting, I don't see it being used. Should there be no need to change anything here in _format_strings?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's unclear to me what GenericArrayFormatter currently does with quoting, I don't see it being used. Should there be no need to change anything here in _format_strings?

That does seem strange; I must have made a wrong assumption about where the quoting parameter is used. I will look again, but your suggestions sound good to me.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of this, you need to adjust _format (line 1221); you may be able to simply adjust the arguments to pprint_thing, which can quote as needed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, thanks for pointing me towards that, that was useful!

So, pprint_thing takes a quote_strings argument, which is a boolean, while quoting is Optional[int], so I've tried doing this by setting

quote_strings = self.quoting is not None and self.quoting != QUOTE_NONE

and then passing quote_strings to pprint_thing

fmt_values.append(f"'{v}'")
else:
if leading_space is False:
# False specifically, so that the default is
Expand Down
30 changes: 19 additions & 11 deletions pandas/tests/arrays/categorical/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@

class TestCategoricalReprWithFactor(TestCategorical):
def test_print(self):
expected = ["[a, b, b, a, a, c, c, c]", "Categories (3, object): [a < b < c]"]
expected = [
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
"Categories (3, object): ['a' < 'b' < 'c']",
]
expected = "\n".join(expected)
actual = repr(self.factor)
assert actual == expected
Expand All @@ -24,9 +27,9 @@ class TestCategoricalRepr:
def test_big_print(self):
factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ["a", "b", "c"], fastpath=True)
expected = [
"[a, b, c, a, b, ..., b, c, a, b, c]",
"['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']",
"Length: 600",
"Categories (3, object): [a, b, c]",
"Categories (3, object): ['a', 'b', 'c']",
]
expected = "\n".join(expected)

Expand All @@ -36,13 +39,13 @@ def test_big_print(self):

def test_empty_print(self):
factor = Categorical([], ["a", "b", "c"])
expected = "[], Categories (3, object): [a, b, c]"
expected = "[], Categories (3, object): ['a', 'b', 'c']"
actual = repr(factor)
assert actual == expected

assert expected == actual
factor = Categorical([], ["a", "b", "c"], ordered=True)
expected = "[], Categories (3, object): [a < b < c]"
expected = "[], Categories (3, object): ['a' < 'b' < 'c']"
actual = repr(factor)
assert expected == actual

Expand All @@ -64,17 +67,17 @@ def test_print_none_width(self):
def test_unicode_print(self):
c = Categorical(["aaaaa", "bb", "cccc"] * 20)
expected = """\
[aaaaa, bb, cccc, aaaaa, bb, ..., bb, cccc, aaaaa, bb, cccc]
['aaaaa', 'bb', 'cccc', 'aaaaa', 'bb', ..., 'bb', 'cccc', 'aaaaa', 'bb', 'cccc']
Length: 60
Categories (3, object): [aaaaa, bb, cccc]"""
Categories (3, object): ['aaaaa', 'bb', 'cccc']"""

assert repr(c) == expected

c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
expected = """\
[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう']
Length: 60
Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa
Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa

assert repr(c) == expected

Expand All @@ -83,9 +86,9 @@ def test_unicode_print(self):
with option_context("display.unicode.east_asian_width", True):

c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
expected = """[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
expected = """['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう']
Length: 60
Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa
Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa

assert repr(c) == expected

Expand Down Expand Up @@ -523,3 +526,8 @@ def test_categorical_index_repr_timedelta_ordered(self):
categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=True, dtype='category')""" # noqa

assert repr(i) == exp

def test_categorical_str_repr(self):
result = repr(Categorical([1, "2", 3, 4]))
expected = "[1, '2', 3, 4]\nCategories (4, object): [1, 3, 4, '2']"
assert result == expected