44"""
55from __future__ import annotations
66
7+ import inspect
78import operator
89from textwrap import dedent
910from typing import (
1415 cast ,
1516 final ,
1617)
17- from warnings import warn
18+ import warnings
1819
1920import numpy as np
2021
@@ -586,7 +587,8 @@ def factorize_array(
586587def factorize (
587588 values ,
588589 sort : bool = False ,
589- na_sentinel : int | None = - 1 ,
590+ na_sentinel : int | None | lib .NoDefault = lib .no_default ,
591+ use_na_sentinel : bool | lib .NoDefault = lib .no_default ,
590592 size_hint : int | None = None ,
591593) -> tuple [np .ndarray , np .ndarray | Index ]:
592594 """
@@ -604,7 +606,19 @@ def factorize(
604606 Value to mark "not found". If None, will not drop the NaN
605607 from the uniques of the values.
606608
609+ .. deprecated:: 1.5.0
610+ The na_sentinel argument is deprecated and
611+ will be removed in a future version of pandas. Specify use_na_sentinel as
612+ either True or False.
613+
607614 .. versionchanged:: 1.1.2
615+
616+ use_na_sentinel : bool, default True
617+ If True, the sentinel -1 will be used for NaN values. If False,
618+ NaN values will be encoded as non-negative integers and will not drop the
619+ NaN from the uniques of the values.
620+
621+ .. versionadded:: 1.5.0
608622 {size_hint}\
609623
610624 Returns
@@ -652,8 +666,8 @@ def factorize(
652666 >>> uniques
653667 array(['a', 'b', 'c'], dtype=object)
654668
655- Missing values are indicated in `codes` with `na_sentinel`
656- (`` -1`` by default). Note that missing values are never
669+ When ``use_na_sentinel=True`` (the default), missing values are indicated in
670+ the `codes` with the sentinel value `` -1`` and missing values are not
657671 included in `uniques`.
658672
659673 >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
@@ -688,16 +702,16 @@ def factorize(
688702 Index(['a', 'c'], dtype='object')
689703
690704 If NaN is in the values, and we want to include NaN in the uniques of the
691- values, it can be achieved by setting ``na_sentinel=None ``.
705+ values, it can be achieved by setting ``use_na_sentinel=False ``.
692706
693707 >>> values = np.array([1, 2, 1, np.nan])
694- >>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1
708+ >>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True
695709 >>> codes
696710 array([ 0, 1, 0, -1])
697711 >>> uniques
698712 array([1., 2.])
699713
700- >>> codes, uniques = pd.factorize(values, na_sentinel=None )
714+ >>> codes, uniques = pd.factorize(values, use_na_sentinel=False )
701715 >>> codes
702716 array([0, 1, 0, 2])
703717 >>> uniques
@@ -712,6 +726,7 @@ def factorize(
712726 # responsible only for factorization. All data coercion, sorting and boxing
713727 # should happen here.
714728
729+ na_sentinel = resolve_na_sentinel (na_sentinel , use_na_sentinel )
715730 if isinstance (values , ABCRangeIndex ):
716731 return values .factorize (sort = sort )
717732
@@ -736,9 +751,22 @@ def factorize(
736751 codes , uniques = values .factorize (sort = sort )
737752 return _re_wrap_factorize (original , uniques , codes )
738753
739- if not isinstance (values .dtype , np .dtype ):
740- # i.e. ExtensionDtype
741- codes , uniques = values .factorize (na_sentinel = na_sentinel )
754+ elif not isinstance (values .dtype , np .dtype ):
755+ if (
756+ na_sentinel == - 1
757+ and "use_na_sentinel" in inspect .signature (values .factorize ).parameters
758+ ):
759+ # Avoid using catch_warnings when possible
760+ # GH#46910 - TimelikeOps has deprecated signature
761+ codes , uniques = values .factorize ( # type: ignore[call-arg]
762+ use_na_sentinel = True
763+ )
764+ else :
765+ with warnings .catch_warnings ():
766+ # We've already warned above
767+ warnings .filterwarnings ("ignore" , ".*use_na_sentinel.*" , FutureWarning )
768+ codes , uniques = values .factorize (na_sentinel = na_sentinel )
769+
742770 else :
743771 values = np .asarray (values ) # convert DTA/TDA/MultiIndex
744772 codes , uniques = factorize_array (
@@ -763,6 +791,56 @@ def factorize(
763791 return _re_wrap_factorize (original , uniques , codes )
764792
765793
def resolve_na_sentinel(
    na_sentinel: int | None | lib.NoDefault,
    use_na_sentinel: bool | lib.NoDefault,
) -> int | None:
    """
    Determine value of na_sentinel for factorize methods.

    See GH#46910 for details on the deprecation.

    Parameters
    ----------
    na_sentinel : int, None, or lib.no_default
        Value passed to the method.
    use_na_sentinel : bool or lib.no_default
        Value passed to the method.

    Returns
    -------
    int or None
        Resolved value of na_sentinel: ``-1`` when a sentinel should be used
        (the default), ``None`` when ``use_na_sentinel`` is False, or the
        explicitly passed ``na_sentinel`` value.

    Raises
    ------
    ValueError
        If both ``na_sentinel`` and ``use_na_sentinel`` are specified.

    Warns
    -----
    FutureWarning
        If ``na_sentinel`` is specified explicitly, since that argument is
        deprecated in favor of ``use_na_sentinel``.
    """
    if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default:
        # The two arguments express the same choice; accepting both would
        # force us to pick a winner silently.
        raise ValueError(
            "Cannot specify both `na_sentinel` and `use_na_sentinel`; "
            f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`"
        )

    if na_sentinel is lib.no_default:
        # Non-deprecated path: an unspecified use_na_sentinel behaves as True.
        if use_na_sentinel is lib.no_default or use_na_sentinel:
            return -1
        return None

    # Deprecated path: na_sentinel was given explicitly. Tailor the message to
    # point the caller at the equivalent use_na_sentinel value.
    if na_sentinel is None:
        msg = (
            "Specifying `na_sentinel=None` is deprecated, specify "
            "`use_na_sentinel=False` instead."
        )
    elif na_sentinel == -1:
        msg = (
            "Specifying `na_sentinel=-1` is deprecated, specify "
            "`use_na_sentinel=True` instead."
        )
    else:
        msg = (
            "Specifying the specific value to use for `na_sentinel` is "
            "deprecated and will be removed in a future version of pandas. "
            "Specify `use_na_sentinel=True` to use the sentinel value -1, and "
            "`use_na_sentinel=False` to encode NaN values."
        )
    warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
    return na_sentinel
842+
843+
766844def _re_wrap_factorize (original , uniques , codes : np .ndarray ):
767845 """
768846 Wrap factorize results in Series or Index depending on original type.
@@ -956,7 +1034,7 @@ def mode(
9561034 try :
9571035 npresult = np .sort (npresult )
9581036 except TypeError as err :
959- warn (f"Unable to sort modes: { err } " )
1037+ warnings . warn (f"Unable to sort modes: { err } " )
9601038
9611039 result = _reconstruct_data (npresult , original .dtype , original )
9621040 return result
@@ -1576,7 +1654,7 @@ def diff(arr, n: int, axis: int = 0):
15761654 raise ValueError (f"cannot diff { type (arr ).__name__ } on axis={ axis } " )
15771655 return op (arr , arr .shift (n ))
15781656 else :
1579- warn (
1657+ warnings . warn (
15801658 "dtype lost in 'diff()'. In the future this will raise a "
15811659 "TypeError. Convert to a suitable dtype prior to calling 'diff'." ,
15821660 FutureWarning ,
0 commit comments