pandas-dev · MarcoGorelli · Nov 25, 2022 · Nov 25, 2022 · Nov 25, 2022 · Nov 25, 2022
@@ -36,6 +36,8 @@ jobs:
 
     - name: Run pre-commit
       uses: pre-commit/action@v2.0.3
+      with:
+        extra_args: --verbose --all-files
 
   docstring_typing_pylint:
     name: Docstring validation, typing, and pylint
@@ -89,7 +91,7 @@ jobs:
     - name: Typing + pylint
       uses: pre-commit/action@v2.0.3
       with:
-        extra_args: --hook-stage manual --all-files
+        extra_args: --verbose --hook-stage manual --all-files
       if: ${{ steps.build.outcome == 'success' && always() }}
 
     - name: Run docstring validation script tests

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -17,21 +17,18 @@ repos:
         entry: python scripts/run_vulture.py
         pass_filenames: true
         require_serial: false
--   repo: https://github.com/python/black
-    rev: 22.10.0
-    hooks:
-    -   id: black
 -   repo: https://github.com/codespell-project/codespell
     rev: v2.2.2
     hooks:
     -   id: codespell
         types_or: [python, rst, markdown]
 -   repo: https://github.com/MarcoGorelli/cython-lint
-    rev: v0.2.1
+    rev: v0.9.1
     hooks:
     -   id: cython-lint
+    -   id: double-quote-cython-strings
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.3.0
+    rev: v4.4.0
     hooks:
     -   id: debug-statements
     -   id: end-of-file-fixer
@@ -50,22 +47,22 @@ repos:
         exclude: ^pandas/_libs/src/(klib|headers)/
         args: [--quiet, '--extensions=c,h', '--headers=h', --recursive, '--filter=-readability/casting,-runtime/int,-build/include_subdir']
 -   repo: https://github.com/PyCQA/flake8
-    rev: 5.0.4
+    rev: 6.0.0
     hooks:
     -   id: flake8
         # Need to patch os.remove rule in pandas-dev-flaker
         exclude: ^ci/fix_wheels.py
         additional_dependencies: &flake8_dependencies
-        - flake8==5.0.4
+        - flake8==6.0.0
         - flake8-bugbear==22.7.1
         - pandas-dev-flaker==0.5.0
 -   repo: https://github.com/pycqa/pylint
-    rev: v2.15.5
+    rev: v2.15.6
     hooks:
     -   id: pylint
         stages: [manual]
 -   repo: https://github.com/pycqa/pylint
-    rev: v2.15.5
+    rev: v2.15.6
     hooks:
     -   id: pylint
         alias: redefined-outer-name
@@ -88,7 +85,7 @@ repos:
     hooks:
     -   id: isort
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v3.2.0
+    rev: v3.2.2
     hooks:
     -   id: pyupgrade
         args: [--py38-plus]
@@ -111,8 +108,19 @@ repos:
     hooks:
     -   id: yesqa
         additional_dependencies: *flake8_dependencies
+        stages: [manual]
 -   repo: local
     hooks:
+    # NOTE: we make `black` a local hook because if it's installed from
+    # PyPI (rather than from source) then it'll run twice as fast thanks to mypyc
+    -   id: black
+        name: black
+        description: "Black: The uncompromising Python code formatter"
+        entry: black
+        language: python
+        require_serial: true
+        types_or: [python, pyi]
+        additional_dependencies: [black==22.10.0]
     -   id: pyright
         # note: assumes python env is setup and activated
         name: pyright

diff --git a/environment.yml b/environment.yml
@@ -84,7 +84,7 @@ dependencies:
   # code checks
   - black=22.3.0
   - cpplint
-  - flake8=5.0.4
+  - flake8=6.0.0
   - flake8-bugbear=22.7.1 # used by flake8, find likely bugs
   - isort>=5.2.1  # check that imports are in the right order
   - mypy=0.990

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -180,7 +180,7 @@ def is_lexsorted(list_of_arrays: list) -> bint:
     cdef int64_t **vecs = <int64_t**>malloc(nlevels * sizeof(int64_t*))
     for i in range(nlevels):
         arr = list_of_arrays[i]
-        assert arr.dtype.name == 'int64'
+        assert arr.dtype.name == "int64"
         vecs[i] = <int64_t*>cnp.PyArray_DATA(arr)
 
     # Assume uniqueness??
@@ -514,9 +514,9 @@ def validate_limit(nobs: int | None, limit=None) -> int:
         lim = nobs
     else:
         if not util.is_integer_object(limit):
-            raise ValueError('Limit must be an integer')
+            raise ValueError("Limit must be an integer")
         if limit < 1:
-            raise ValueError('Limit must be greater than 0')
+            raise ValueError("Limit must be greater than 0")
         lim = limit
 
     return lim
@@ -958,7 +958,7 @@ def rank_1d(
         if not ascending:
             tiebreak = TIEBREAK_FIRST_DESCENDING
 
-    keep_na = na_option == 'keep'
+    keep_na = na_option == "keep"
 
     N = len(values)
     if labels is not None:
@@ -984,7 +984,7 @@ def rank_1d(
     # with mask, without obfuscating location of missing data
     # in values array
     if numeric_object_t is object and values.dtype != np.object_:
-        masked_vals = values.astype('O')
+        masked_vals = values.astype("O")
     else:
         masked_vals = values.copy()
 
@@ -1005,7 +1005,7 @@ def rank_1d(
     # If descending, fill with highest value since descending
     # will flip the ordering to still end up with lowest rank.
     # Symmetric logic applies to `na_option == 'bottom'`
-    nans_rank_highest = ascending ^ (na_option == 'top')
+    nans_rank_highest = ascending ^ (na_option == "top")
     nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, <numeric_object_t>0)
     if nans_rank_highest:
         order = [masked_vals, mask]
@@ -1345,7 +1345,7 @@ def rank_2d(
         if not ascending:
             tiebreak = TIEBREAK_FIRST_DESCENDING
 
-    keep_na = na_option == 'keep'
+    keep_na = na_option == "keep"
 
     # For cases where a mask is not possible, we can avoid mask checks
     check_mask = (
@@ -1362,9 +1362,9 @@ def rank_2d(
 
     if numeric_object_t is object:
         if values.dtype != np.object_:
-            values = values.astype('O')
+            values = values.astype("O")
 
-    nans_rank_highest = ascending ^ (na_option == 'top')
+    nans_rank_highest = ascending ^ (na_option == "top")
     if check_mask:
         nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, <numeric_object_t>0)
 
@@ -1385,7 +1385,7 @@ def rank_2d(
         order = (values, ~np.asarray(mask))
 
     n, k = (<object>values).shape
-    out = np.empty((n, k), dtype='f8', order='F')
+    out = np.empty((n, k), dtype="f8", order="F")
     grp_sizes = np.ones(n, dtype=np.int64)
 
     # lexsort is slower, so only use if we need to worry about the mask

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -604,12 +604,12 @@ def group_any_all(
         intp_t lab
         int8_t flag_val, val
 
-    if val_test == 'all':
+    if val_test == "all":
         # Because the 'all' value of an empty iterable in Python is True we can
         # start with an array full of ones and set to zero when a False value
         # is encountered
         flag_val = 0
-    elif val_test == 'any':
+    elif val_test == "any":
         # Because the 'any' value of an empty iterable in Python is False we
         # can start with an array full of zeros and set to one only if any
         # value encountered is True
@@ -1061,7 +1061,7 @@ def group_ohlc(
     N, K = (<object>values).shape
 
     if out.shape[1] != 4:
-        raise ValueError('Output array must have 4 columns')
+        raise ValueError("Output array must have 4 columns")
 
     if K > 1:
         raise NotImplementedError("Argument 'values' must have only one dimension")
@@ -1157,11 +1157,11 @@ def group_quantile(
         )
 
     inter_methods = {
-        'linear': INTERPOLATION_LINEAR,
-        'lower': INTERPOLATION_LOWER,
-        'higher': INTERPOLATION_HIGHER,
-        'nearest': INTERPOLATION_NEAREST,
-        'midpoint': INTERPOLATION_MIDPOINT,
+        "linear": INTERPOLATION_LINEAR,
+        "lower": INTERPOLATION_LOWER,
+        "higher": INTERPOLATION_HIGHER,
+        "nearest": INTERPOLATION_NEAREST,
+        "midpoint": INTERPOLATION_MIDPOINT,
     }
     interp = inter_methods[interpolation]
 

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -184,8 +184,8 @@ cdef class IndexEngine:
         if self.is_monotonic_increasing:
             values = self.values
             try:
-                left = values.searchsorted(val, side='left')
-                right = values.searchsorted(val, side='right')
+                left = values.searchsorted(val, side="left")
+                right = values.searchsorted(val, side="right")
             except TypeError:
                 # e.g. GH#29189 get_loc(None) with a Float64Index
                 #  2021-09-29 Now only reached for object-dtype
@@ -353,8 +353,8 @@ cdef class IndexEngine:
             remaining_stargets = set()
             for starget in stargets:
                 try:
-                    start = values.searchsorted(starget, side='left')
-                    end = values.searchsorted(starget, side='right')
+                    start = values.searchsorted(starget, side="left")
+                    end = values.searchsorted(starget, side="right")
                 except TypeError:  # e.g. if we tried to search for string in int array
                     remaining_stargets.add(starget)
                 else:
@@ -551,7 +551,7 @@ cdef class DatetimeEngine(Int64Engine):
                 return self._get_loc_duplicates(conv)
             values = self.values
 
-            loc = values.searchsorted(conv, side='left')
+            loc = values.searchsorted(conv, side="left")
 
             if loc == len(values) or values[loc] != conv:
                 raise KeyError(val)
@@ -655,8 +655,8 @@ cdef class BaseMultiIndexCodesEngine:
         # with positive integers (-1 for NaN becomes 1). This enables us to
         # differentiate between values that are missing in other and matching
         # NaNs. We will set values that are not found to 0 later:
-        labels_arr = np.array(labels, dtype='int64').T + multiindex_nulls_shift
-        codes = labels_arr.astype('uint64', copy=False)
+        labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift
+        codes = labels_arr.astype("uint64", copy=False)
         self.level_has_nans = [-1 in lab for lab in labels]
 
         # Map each codes combination in the index to an integer unambiguously
@@ -693,7 +693,7 @@ cdef class BaseMultiIndexCodesEngine:
             if self.level_has_nans[i] and codes.hasnans:
                 result[codes.isna()] += 1
             level_codes.append(result)
-        return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)
+        return self._codes_to_ints(np.array(level_codes, dtype="uint64").T)
 
     def get_indexer(self, target: np.ndarray) -> np.ndarray:
         """
@@ -754,12 +754,12 @@ cdef class BaseMultiIndexCodesEngine:
             ndarray[int64_t, ndim=1] new_codes, new_target_codes
             ndarray[intp_t, ndim=1] sorted_indexer
 
-        target_order = np.argsort(target).astype('int64')
+        target_order = np.argsort(target).astype("int64")
         target_values = target[target_order]
         num_values, num_target_values = len(values), len(target_values)
         new_codes, new_target_codes = (
-            np.empty((num_values,)).astype('int64'),
-            np.empty((num_target_values,)).astype('int64'),
+            np.empty((num_values,)).astype("int64"),
+            np.empty((num_target_values,)).astype("int64"),
         )
 
         # `values` and `target_values` are both sorted, so we walk through them
@@ -809,7 +809,7 @@ cdef class BaseMultiIndexCodesEngine:
             raise KeyError(key)
 
         # Transform indices into single integer:
-        lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))
+        lab_int = self._codes_to_ints(np.array(indices, dtype="uint64"))
 
         return self._base.get_loc(self, lab_int)
 
@@ -940,8 +940,8 @@ cdef class SharedEngine:
         if self.is_monotonic_increasing:
             values = self.values
             try:
-                left = values.searchsorted(val, side='left')
-                right = values.searchsorted(val, side='right')
+                left = values.searchsorted(val, side="left")
+                right = values.searchsorted(val, side="right")
             except TypeError:
                 # e.g. GH#29189 get_loc(None) with a Float64Index
                 raise KeyError(val)

diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
@@ -69,7 +69,7 @@ cdef class BlockPlacement:
                 or not cnp.PyArray_ISWRITEABLE(val)
                 or (<ndarray>val).descr.type_num != cnp.NPY_INTP
             ):
-                arr = np.require(val, dtype=np.intp, requirements='W')
+                arr = np.require(val, dtype=np.intp, requirements="W")
             else:
                 arr = val
             # Caller is responsible for ensuring arr.ndim == 1