Commit 5b16f10

Merge branch 'master' into add-compression-stata-reading
2 parents 1f52991 + 6d80717 commit 5b16f10

File tree

20 files changed: +327 / -302 lines

asv_bench/benchmarks/series_methods.py

Lines changed: 18 additions & 5 deletions
@@ -284,16 +284,29 @@ def time_clip(self, n):
 
 class ValueCounts:
 
-    params = ["int", "uint", "float", "object"]
-    param_names = ["dtype"]
+    params = [[10 ** 3, 10 ** 4, 10 ** 5], ["int", "uint", "float", "object"]]
+    param_names = ["N", "dtype"]
 
-    def setup(self, dtype):
-        self.s = Series(np.random.randint(0, 1000, size=100000)).astype(dtype)
+    def setup(self, N, dtype):
+        self.s = Series(np.random.randint(0, N, size=10 * N)).astype(dtype)
 
-    def time_value_counts(self, dtype):
+    def time_value_counts(self, N, dtype):
         self.s.value_counts()
 
 
+class Mode:
+
+    params = [[10 ** 3, 10 ** 4, 10 ** 5], ["int", "uint", "float", "object"]]
+    param_names = ["N", "dtype"]
+
+    def setup(self, N, dtype):
+        np.random.seed(42)
+        self.s = Series(np.random.randint(0, N, size=10 * N)).astype(dtype)
+
+    def time_mode(self, N, dtype):
+        self.s.mode()
+
+
 class Dir:
     def setup(self):
         self.s = Series(index=tm.makeStringIndex(10000))
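
Note: asv expands the two params lists into a full grid, so each timing method above runs once per (N, dtype) combination. A rough standalone sketch of what a single combination exercises (values and names here are illustrative, not part of the benchmark suite):

    import numpy as np
    import pandas as pd

    # e.g. N = 10 ** 4 with dtype "int": a Series of 10 * N draws from range(N)
    N, dtype = 10 ** 4, "int"
    s = pd.Series(np.random.randint(0, N, size=10 * N)).astype(dtype)

    s.value_counts()  # what ValueCounts.time_value_counts times
    s.mode()          # what Mode.time_mode times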

doc/source/user_guide/visualization.rst

Lines changed: 18 additions & 0 deletions
@@ -552,6 +552,9 @@ These can be specified by the ``x`` and ``y`` keywords.
 .. ipython:: python
 
     df = pd.DataFrame(np.random.rand(50, 4), columns=["a", "b", "c", "d"])
+    df["species"] = pd.Categorical(
+        ["setosa"] * 20 + ["versicolor"] * 20 + ["virginica"] * 10
+    )
 
     @savefig scatter_plot.png
     df.plot.scatter(x="a", y="b");
@@ -579,6 +582,21 @@ each point:
     df.plot.scatter(x="a", y="b", c="c", s=50);
 
 
+.. ipython:: python
+    :suppress:
+
+    plt.close("all")
+
+If a categorical column is passed to ``c``, then a discrete colorbar will be produced:
+
+.. versionadded:: 1.3.0
+
+.. ipython:: python
+
+    @savefig scatter_plot_categorical.png
+    df.plot.scatter(x="a", y="b", c="species", cmap="viridis", s=50);
+
+
 .. ipython:: python
     :suppress:
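
For context, a self-contained version of the documented example (assuming pandas >= 1.3 with this enhancement and matplotlib installed; the data and column names follow the doc snippet above):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.rand(50, 4), columns=["a", "b", "c", "d"])
    df["species"] = pd.Categorical(
        ["setosa"] * 20 + ["versicolor"] * 20 + ["virginica"] * 10
    )

    # Passing the categorical column to ``c`` produces a discrete colorbar.
    ax = df.plot.scatter(x="a", y="b", c="species", cmap="viridis", s=50)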

doc/source/whatsnew/v1.3.0.rst

Lines changed: 5 additions & 2 deletions
@@ -52,8 +52,10 @@ Other enhancements
 - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`)
 - :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. ``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`)
 - :meth:`Series.apply` can now accept list-like or dictionary-like arguments that aren't lists or dictionaries, e.g. ``ser.apply(np.array(["sum", "mean"]))``, which was already the case for :meth:`DataFrame.apply` (:issue:`39140`)
+- :meth:`DataFrame.plot.scatter` can now accept a categorical column as the argument to ``c`` (:issue:`12380`, :issue:`31357`)
 - :meth:`.Styler.set_tooltips` allows on hover tooltips to be added to styled HTML dataframes.
-- :meth:`pandas.read_stata` and :class:`StataReader` support reading data from compressed files.
+- :meth:`pandas.read_stata` and :class:`StataReader` support reading data from compressed files.
+- :meth:`Series.loc.__getitem__` and :meth:`Series.loc.__setitem__` with :class:`MultiIndex` now raising helpful error message when indexer has too many dimensions (:issue:`35349`)
 
 .. ---------------------------------------------------------------------------
 
@@ -326,6 +328,7 @@ I/O
 - Bug in :func:`to_hdf` raising ``KeyError`` when trying to apply for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`)
 - Bug in :meth:`~HDFStore.put` raising a wrong ``TypeError`` when saving a DataFrame with non-string dtype (:issue:`34274`)
 - Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`)
+- Bug in :func:`read_csv` apllying thousands separator to date columns when column should be parsed for dates and ``usecols`` is specified for ``engine="python"`` (:issue:`39365`)
 - Bug in :func:`read_excel` forward filling :class:`MultiIndex` names with multiple header and index columns specified (:issue:`34673`)
 - :func:`read_excel` now respects :func:`set_option` (:issue:`34252`)
 - Bug in :func:`read_csv` not switching ``true_values`` and ``false_values`` for nullable ``boolean`` dtype (:issue:`34655`)
@@ -364,7 +367,7 @@ Reshaping
 - Bug in :func:`join` over :class:`MultiIndex` returned wrong result, when one of both indexes had only one level (:issue:`36909`)
 - :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`)
 - Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`)
-- :meth:`Series.value_counts` returns keys in original order (:issue:`12679`, :issue:`11227`)
+- :meth:`Series.value_counts` and :meth:`Series.mode` return consistent keys in original order (:issue:`12679`, :issue:`11227` and :issue:`39007`)
 - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`)
 -
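
The read_stata entry above is the enhancement this branch tracks: compressed .dta files can be read directly. A minimal sketch, assuming a pandas build with this change and a hypothetical gzip-compressed file path:

    import pandas as pd

    # "data.dta.gz" is a hypothetical path; per the enhancement note, the
    # compression is inferred from the file extension (an explicit
    # ``compression`` argument is also accepted).
    df = pd.read_stata("data.dta.gz")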

pandas/_libs/hashtable_func_helper.pxi.in

Lines changed: 49 additions & 121 deletions
@@ -28,52 +28,6 @@ dtypes = [('Complex128', 'complex128', 'complex128',
 {{for name, dtype, ttype, c_type, to_c_type in dtypes}}
 
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-{{if dtype == 'object'}}
-cdef build_count_table_{{dtype}}(ndarray[{{dtype}}] values,
-                                 kh_{{ttype}}_t *table, bint dropna):
-{{else}}
-cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values,
-                                 kh_{{ttype}}_t *table, bint dropna):
-{{endif}}
-    cdef:
-        khiter_t k
-        Py_ssize_t i, n = len(values)
-
-        {{c_type}} val
-
-        int ret = 0
-
-    {{if dtype == 'object'}}
-    kh_resize_{{ttype}}(table, n // 10)
-
-    for i in range(n):
-        val = values[i]
-        if not checknull(val) or not dropna:
-            k = kh_get_{{ttype}}(table, <PyObject*>val)
-            if k != table.n_buckets:
-                table.vals[k] += 1
-            else:
-                k = kh_put_{{ttype}}(table, <PyObject*>val, &ret)
-                table.vals[k] = 1
-    {{else}}
-    with nogil:
-        kh_resize_{{ttype}}(table, n)
-
-        for i in range(n):
-            val = {{to_c_type}}(values[i])
-
-            if not is_nan_{{c_type}}(val) or not dropna:
-                k = kh_get_{{ttype}}(table, val)
-                if k != table.n_buckets:
-                    table.vals[k] += 1
-                else:
-                    k = kh_put_{{ttype}}(table, val, &ret)
-                    table.vals[k] = 1
-    {{endif}}
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
@@ -84,8 +38,6 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
     cdef:
         Py_ssize_t i = 0
         Py_ssize_t n = len(values)
-        size_t unique_key_index = 0
-        size_t unique_key_count = 0
         kh_{{ttype}}_t *table
 
         # Don't use Py_ssize_t, since table.n_buckets is unsigned
@@ -98,12 +50,10 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
 
     # we track the order in which keys are first seen (GH39009),
     # khash-map isn't insertion-ordered, thus:
-    #   table maps key to index_of_appearence
-    #   result_keys maps index_of_appearence to key
-    #   result_counts maps index_of_appearence to number of elements
+    #   table maps keys to counts
+    #   result_keys remembers the original order of keys
 
     result_keys = {{name}}Vector()
-    result_counts = Int64Vector()
     table = kh_init_{{ttype}}()
 
     {{if dtype == 'object'}}
@@ -118,14 +68,11 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
                 val = navalue
             k = kh_get_{{ttype}}(table, <PyObject*>val)
             if k != table.n_buckets:
-                unique_key_index = table.vals[k]
-                result_counts.data.data[unique_key_index] += 1
+                table.vals[k] += 1
             else:
                 k = kh_put_{{ttype}}(table, <PyObject*>val, &ret)
-                table.vals[k] = unique_key_count
+                table.vals[k] = 1
                 result_keys.append(val)
-                result_counts.append(1)
-                unique_key_count+=1
     {{else}}
     kh_resize_{{ttype}}(table, n)
 
@@ -135,19 +82,26 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
         if not is_nan_{{c_type}}(val) or not dropna:
             k = kh_get_{{ttype}}(table, val)
             if k != table.n_buckets:
-                unique_key_index = table.vals[k]
-                result_counts.data.data[unique_key_index] += 1
+                table.vals[k] += 1
             else:
                 k = kh_put_{{ttype}}(table, val, &ret)
-                table.vals[k] = unique_key_count
+                table.vals[k] = 1
                 result_keys.append(val)
-                result_counts.append(1)
-                unique_key_count+=1
     {{endif}}
 
+    # collect counts in the order corresponding to result_keys:
+    cdef int64_t[:] result_counts = np.empty(table.size, dtype=np.int64)
+    for i in range(table.size):
+        {{if dtype == 'object'}}
+        k = kh_get_{{ttype}}(table, result_keys.data[i])
+        {{else}}
+        k = kh_get_{{ttype}}(table, result_keys.data.data[i])
+        {{endif}}
+        result_counts[i] = table.vals[k]
+
     kh_destroy_{{ttype}}(table)
 
-    return result_keys.to_array(), result_counts.to_array()
+    return result_keys.to_array(), result_counts.base
 
 
 @cython.wraparound(False)
@@ -294,78 +248,42 @@ def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
     kh_destroy_{{ttype}}(table)
     return result.view(np.bool_)
 
-{{endfor}}
-
-
 # ----------------------------------------------------------------------
 # Mode Computations
 # ----------------------------------------------------------------------
 
-{{py:
-
-# dtype, ctype, table_type, npy_dtype
-dtypes = [('complex128', 'khcomplex128_t', 'complex128', 'complex128'),
-          ('complex64', 'khcomplex64_t', 'complex64', 'complex64'),
-          ('float64', 'float64_t', 'float64', 'float64'),
-          ('float32', 'float32_t', 'float32', 'float32'),
-          ('int64', 'int64_t', 'int64', 'int64'),
-          ('int32', 'int32_t', 'int32', 'int32'),
-          ('int16', 'int16_t', 'int16', 'int16'),
-          ('int8', 'int8_t', 'int8', 'int8'),
-          ('uint64', 'uint64_t', 'uint64', 'uint64'),
-          ('uint32', 'uint32_t', 'uint32', 'uint32'),
-          ('uint16', 'uint16_t', 'uint16', 'uint16'),
-          ('uint8', 'uint8_t', 'uint8', 'uint8'),
-          ('object', 'object', 'pymap', 'object_')]
-}}
-
-{{for dtype, ctype, table_type, npy_dtype in dtypes}}
-
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-
 {{if dtype == 'object'}}
-
-
-def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna):
+def mode_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
 {{else}}
-
-
 def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
 {{endif}}
     cdef:
-        int count, max_count = 1
-        int j = -1  # so you can do +=
-        # Don't use Py_ssize_t, since table.n_buckets is unsigned
-        khiter_t k
-        kh_{{table_type}}_t *table
-        ndarray[{{ctype}}] modes
+        {{if dtype == 'object'}}
+        ndarray[{{dtype}}] keys
+        ndarray[{{dtype}}] modes
+        {{else}}
+        {{dtype}}_t[:] keys
+        ndarray[{{dtype}}_t] modes
+        {{endif}}
+        int64_t[:] counts
+        int64_t count, max_count = -1
+        Py_ssize_t k, j = 0
 
-    table = kh_init_{{table_type}}()
-    build_count_table_{{dtype}}(values, table, dropna)
+    keys, counts = value_count_{{dtype}}(values, dropna)
 
-    modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}})
+    {{if dtype == 'object'}}
+    modes = np.empty(len(keys), dtype=np.object_)
+    {{else}}
+    modes = np.empty(len(keys), dtype=np.{{dtype}})
+    {{endif}}
 
     {{if dtype != 'object'}}
    with nogil:
-        for k in range(table.n_buckets):
-            if kh_exist_{{table_type}}(table, k):
-                count = table.vals[k]
-                if count == max_count:
-                    j += 1
-                elif count > max_count:
-                    max_count = count
-                    j = 0
-                else:
-                    continue
-
-                modes[j] = table.keys[k]
-    {{else}}
-    for k in range(table.n_buckets):
-        if kh_exist_{{table_type}}(table, k):
-            count = table.vals[k]
-
+        for k in range(len(keys)):
+            count = counts[k]
             if count == max_count:
                 j += 1
             elif count > max_count:
@@ -374,11 +292,21 @@ def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
             else:
                 continue
 
-            modes[j] = <object>table.keys[k]
+            modes[j] = keys[k]
+    {{else}}
+    for k in range(len(keys)):
+        count = counts[k]
+        if count == max_count:
+            j += 1
+        elif count > max_count:
+            max_count = count
+            j = 0
+        else:
+            continue
+
+        modes[j] = keys[k]
     {{endif}}
 
-    kh_destroy_{{table_type}}(table)
-
     return modes[:j + 1]
 
 {{endfor}}
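
The net effect of the changes above: value_count_* now keeps only a key-to-count khash table plus the first-seen order of keys, and mode_* is rebuilt on top of value_count_* instead of the removed build_count_table_*. A rough pure-Python model of that logic (illustrative only, not the generated Cython):

    def value_count(values, dropna):
        # key -> count, remembering first-seen order (plays the role of the
        # khash table plus result_keys)
        counts = {}
        for val in values:
            if dropna and val != val:  # crude NaN check, stands in for is_nan/checknull
                continue
            counts[val] = counts.get(val, 0) + 1
        keys = list(counts)  # dict insertion order == order of first appearance
        return keys, [counts[k] for k in keys]

    def mode(values, dropna):
        keys, counts = value_count(values, dropna)
        max_count = -1
        modes = []
        for key, count in zip(keys, counts):
            if count == max_count:
                modes.append(key)
            elif count > max_count:
                max_count = count
                modes = [key]
        return modes  # every most-frequent key, in original order of appearance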

pandas/core/dtypes/cast.py

Lines changed: 3 additions & 0 deletions
@@ -1952,4 +1952,7 @@ def can_hold_element(dtype: np.dtype, element: Any) -> bool:
             return tipo.kind == "b"
         return lib.is_bool(element)
 
+    elif dtype == object:
+        return True
+
     raise NotImplementedError(dtype)
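
A minimal sketch of the added branch's behaviour, using the private helper with the signature shown in the hunk above (other pandas versions may differ):

    import numpy as np
    from pandas.core.dtypes.cast import can_hold_element

    # With the added branch, an object dtype reports it can hold any element.
    can_hold_element(np.dtype(object), 3.5)         # True
    can_hold_element(np.dtype(object), "a string")  # True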

pandas/core/dtypes/common.py

Lines changed: 1 addition & 1 deletion
@@ -1677,7 +1677,7 @@ def _is_dtype_type(arr_or_dtype, condition) -> bool:
     return condition(tipo)
 
 
-def infer_dtype_from_object(dtype):
+def infer_dtype_from_object(dtype) -> DtypeObj:
     """
     Get a numpy dtype.type-style object for a dtype object.
