Update cython-lint and replace flake8 with ruff (rapidsai#13699)
This PR makes three changes to linting:
- Updates the cython-lint version and fixes associated errors
- Adds the [ruff](https://beta.ruff.rs/docs/) linter
- Removes flake8

[cython-lint has achieved parity with flake8 for Cython files](MarcoGorelli/cython-lint#45), so we no longer need flake8 for that. Meanwhile, [ruff advertises itself as a suitable replacement for flake8 under three conditions](https://beta.ruff.rs/docs/faq/#how-does-ruff-compare-to-flake8): 1) few plugins, 2) using it with black, and 3) Python 3 only. All of those apply to cudf code. From a quick analysis, there is a small handful of error codes supported by flake8 (via pyflakes or pycodestyle) that are not yet supported by ruff (pyflakes: F723, F842; pycodestyle: W391, W503, W504, W505, W606). On the flip side, ruff autofixes a range of issues that flake8 can't, runs more than an order of magnitude faster, supports more flexible configuration via pyproject.toml, and promises to replace multiple tools over time. Therefore, I think this is a good switch for us to make.

Note that ruff could in theory also replace isort for Python files, but it does not support Cython yet, so I have left our isort hook in place. We could revisit in the future if that changes.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - AJ Schmidt (https://github.com/ajschmidt8)

URL: rapidsai#13699
vyasr authored Jul 17, 2023
1 parent 620fe81 commit 89735d8
Showing 26 changed files with 83 additions and 99 deletions.
24 changes: 0 additions & 24 deletions .flake8

This file was deleted.

17 changes: 7 additions & 10 deletions .pre-commit-config.yaml
@@ -30,17 +30,8 @@ repos:
files: python/.*
# Explicitly specify the pyproject.toml at the repo root, not per-project.
args: ["--config", "pyproject.toml"]
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8
args: ["--config=.flake8"]
files: python/.*$
types: [file]
types_or: [python, cython]
additional_dependencies: ["flake8-force"]
- repo: https://github.com/MarcoGorelli/cython-lint
rev: v0.1.10
rev: v0.15.0
hooks:
- id: cython-lint
- repo: https://github.com/pre-commit/mirrors-mypy
@@ -165,6 +156,12 @@ repos:
hooks:
- id: rapids-dependency-file-generator
args: ["--clean"]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.278
hooks:
- id: ruff
files: python/.*$


default_language_version:
python: python3
1 change: 0 additions & 1 deletion ci/checks/copyright.py
@@ -26,7 +26,6 @@
re.compile(r"CMakeLists[.]txt$"),
re.compile(r"CMakeLists_standalone[.]txt$"),
re.compile(r"setup[.]cfg$"),
re.compile(r"[.]flake8[.]cython$"),
re.compile(r"meta[.]yaml$"),
]
ExemptFiles = [
5 changes: 2 additions & 3 deletions docs/cudf/source/developer_guide/contributing_guide.md
@@ -15,7 +15,7 @@ Developers are strongly recommended to set up `pre-commit` prior to any developm
The `.pre-commit-config.yaml` file at the root of the repo is the primary source of truth for linting.
Specifically, cuDF uses the following tools:

- [`flake8`](https://github.com/pycqa/flake8) checks for general code formatting compliance.
- [`ruff`](https://beta.ruff.rs/) checks for general code formatting compliance.
- [`black`](https://github.com/psf/black) is an automatic code formatter.
- [`isort`](https://pycqa.github.io/isort/) ensures imports are sorted consistently.
- [`mypy`](http://mypy-lang.org/) performs static type checking.
@@ -28,10 +28,9 @@ Linter config data is stored in a number of files.
We generally use `pyproject.toml` over `setup.cfg` and avoid project-specific files (e.g. `pyproject.toml` > `python/cudf/pyproject.toml`).
However, differences between tools and the different packages in the repo result in the following caveats:

- `flake8` has no plans to support `pyproject.toml`, so it must live in `.flake8`.
- `isort` must be configured per project to set which project is the "first party" project.

As a result, we currently maintain both root and project-level `pyproject.toml` files as well as a `.flake8` file.
As a result, we currently maintain both root and project-level `pyproject.toml` files.

For more information on how to use pre-commit hooks, see the code formatting section of the
[overall contributing guide](https://github.com/rapidsai/cudf/blob/main/CONTRIBUTING.md#python--pre-commit-hooks).
13 changes: 13 additions & 0 deletions pyproject.toml
@@ -55,3 +55,16 @@ ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b"
ignore-words-list = "inout,unparseable,falsy"
builtin = "clear"
quiet-level = 3

[tool.ruff]
select = ["E", "F", "W"]
ignore = [
# whitespace before :
"E203",
]
fixable = ["ALL"]
exclude = [
# TODO: Remove this in a follow-up where we fix __all__.
"__init__.py",
]
line-length = 79
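The `select` list above enables the pycodestyle (`E`, `W`) and pyflakes (`F`) rule families, while `ignore = ["E203"]` avoids a well-known conflict with black: black itself can put a space before the `:` in complex slice bounds, which pycodestyle reports as E203 ("whitespace before ':'"). A small illustration (the function name is hypothetical, not from the repo):

```python
# E203 is ignored in the ruff config above because black may format
# complex slice bounds with spaces around ":", as in the return below.
# With E203 enabled, ruff and black would fight over this line.
def middle_half(seq):
    n = len(seq)
    return seq[n // 4 : 3 * n // 4]  # E203 would fire here if enabled

middle_half(list(range(8)))  # returns [2, 3, 4, 5]
```

Keeping the two tools from disagreeing is exactly condition 2 ("using it with black") from the ruff FAQ quoted in the description.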
3 changes: 1 addition & 2 deletions python/cudf/cmake/Modules/ProtobufHelpers.cmake
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
@@ -36,7 +36,6 @@ function(codegen_protoc)
file(
WRITE "${pb2_py_path}"
[=[
# flake8: noqa
# fmt: off
]=]
)
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/column.pyi
@@ -2,7 +2,7 @@

from __future__ import annotations

from typing import Dict, Optional, Tuple, TypeVar
from typing import Dict, Optional, Tuple

from typing_extensions import Self

4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/column.pyx
@@ -49,7 +49,7 @@ from cudf._lib.cpp.column.column_factories cimport (
make_numeric_column,
)
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.null_mask cimport null_count as c_null_count
from cudf._lib.cpp.null_mask cimport null_count as cpp_null_count
from cudf._lib.cpp.scalar.scalar cimport scalar
from cudf._lib.scalar cimport DeviceScalar

@@ -323,7 +323,7 @@ cdef class Column:
with acquire_spill_lock():
if not self.nullable:
return 0
return c_null_count(
return cpp_null_count(
<libcudf_types.bitmask_type*><uintptr_t>(
self.base_mask.get_ptr(mode="read")
),
6 changes: 2 additions & 4 deletions python/cudf/cudf/_lib/copying.pyx
@@ -154,10 +154,8 @@ def copy_range(Column source_column,
example via ``slice.indices``.
"""

assert (
source_end - source_begin == target_end - target_begin,
"Source and target ranges must be same length"
)
msg = "Source and target ranges must be same length"
assert source_end - source_begin == target_end - target_begin, msg
if target_end >= target_begin and inplace:
# FIXME: Are we allowed to do this when inplace=False?
return target_column
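The `copy_range` hunk above fixes a classic Python pitfall that pyflakes reports as F631: wrapping both the condition and the message in parentheses, `assert (cond, msg)`, asserts a two-element tuple, which is always truthy, so the check can never fail. A minimal sketch of the fixed pattern (the function name is hypothetical):

```python
def check_ranges(source_begin, source_end, target_begin, target_end):
    # Fixed form: the condition comes first, the message after the comma.
    msg = "Source and target ranges must be same length"
    assert source_end - source_begin == target_end - target_begin, msg

# The buggy form wrapped both parts in parentheses:
#   assert (cond, msg)
# which asserts a non-empty tuple -- always true, so it never fires.

check_ranges(0, 5, 10, 15)  # passes: both ranges have length 5
```

This is one of the issues the stricter linting setup now catches consistently across Cython files.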
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/nvtext/stemmer.pyx
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from cudf.core.buffer import acquire_spill_lock

@@ -12,7 +12,7 @@ from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.nvtext.stemmer cimport (
is_letter as cpp_is_letter,
letter_type as letter_type,
letter_type,
porter_stemmer_measure as cpp_porter_stemmer_measure,
underlying_type_t_letter_type,
)
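The `letter_type as letter_type` change above removes a redundant self-alias, which the updated cython-lint flags: in a regular module, `from m import name as name` binds nothing beyond the plain form. A sketch using stdlib names (not from the repo):

```python
# Redundant self-alias in an ordinary module: the "as" clause adds
# nothing, and the updated cython-lint flags it.
from math import pi as pi   # flagged: identical to "from math import pi"
from math import tau        # preferred plain form
```

Note the exception that keeps this rule from firing everywhere: in `.pyi` stub files, PEP 484 gives `import X as X` the special meaning of an explicit re-export, so the self-alias is intentional there.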
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/parquet.pyx
@@ -51,7 +51,6 @@ from cudf._lib.cpp.io.parquet cimport (
write_parquet as parquet_writer,
)
from cudf._lib.cpp.io.types cimport column_in_metadata, table_input_metadata
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport data_type, size_type
from cudf._lib.io.datasource cimport NativeFileDatasource
44 changes: 24 additions & 20 deletions python/cudf/cudf/_lib/sort.pyx
@@ -30,7 +30,7 @@ from cudf._lib.cpp.sorting cimport (
)
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport null_order, null_policy, order
from cudf._lib.cpp.types cimport null_order, null_policy, order as cpp_order
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns


@@ -63,23 +63,25 @@ def is_sorted(
``null_position``, False otherwise.
"""

cdef vector[order] column_order
cdef vector[cpp_order] column_order
cdef vector[null_order] null_precedence

if ascending is None:
column_order = vector[order](len(source_columns), order.ASCENDING)
column_order = vector[cpp_order](
len(source_columns), cpp_order.ASCENDING
)
else:
if len(ascending) != len(source_columns):
raise ValueError(
f"Expected a list-like of length {len(source_columns)}, "
f"got length {len(ascending)} for `ascending`"
)
column_order = vector[order](
len(source_columns), order.DESCENDING
column_order = vector[cpp_order](
len(source_columns), cpp_order.DESCENDING
)
for idx, val in enumerate(ascending):
if val:
column_order[idx] = order.ASCENDING
column_order[idx] = cpp_order.ASCENDING

if null_position is None:
null_precedence = vector[null_order](
@@ -110,7 +112,7 @@ def is_sorted(
return c_result


cdef pair[vector[order], vector[null_order]] ordering(
cdef pair[vector[cpp_order], vector[null_order]] ordering(
column_order, null_precedence
):
"""
@@ -129,17 +131,19 @@ cdef pair[vector[order], vector[null_order]] ordering(
-------
pair of vectors (order, and null_order)
"""
cdef vector[order] c_column_order
cdef vector[cpp_order] c_column_order
cdef vector[null_order] c_null_precedence
for asc, null in zip(column_order, null_precedence):
c_column_order.push_back(order.ASCENDING if asc else order.DESCENDING)
c_column_order.push_back(
cpp_order.ASCENDING if asc else cpp_order.DESCENDING
)
if asc ^ (null == "first"):
c_null_precedence.push_back(null_order.AFTER)
elif asc ^ (null == "last"):
c_null_precedence.push_back(null_order.BEFORE)
else:
raise ValueError(f"Invalid null precedence {null}")
return pair[vector[order], vector[null_order]](
return pair[vector[cpp_order], vector[null_order]](
c_column_order, c_null_precedence
)

@@ -176,7 +180,7 @@ def order_by(
cdef table_view source_table_view = table_view_from_columns(
columns_from_table
)
cdef pair[vector[order], vector[null_order]] order = ordering(
cdef pair[vector[cpp_order], vector[null_order]] order = ordering(
ascending, repeat(na_position)
)
cdef unique_ptr[column] c_result
@@ -218,7 +222,7 @@ def sort(
cdef table_view values_view = table_view_from_columns(values)
cdef unique_ptr[table] result
ncol = len(values)
cdef pair[vector[order], vector[null_order]] order = ordering(
cdef pair[vector[cpp_order], vector[null_order]] order = ordering(
column_order or repeat(True, ncol),
null_precedence or repeat("first", ncol),
)
@@ -268,7 +272,7 @@ def sort_by_key(
"""
cdef table_view value_view = table_view_from_columns(values)
cdef table_view key_view = table_view_from_columns(keys)
cdef pair[vector[order], vector[null_order]] order = ordering(
cdef pair[vector[cpp_order], vector[null_order]] order = ordering(
ascending, na_position
)
cdef unique_ptr[table] c_result
@@ -329,7 +333,7 @@ def segmented_sort_by_key(
cdef column_view offsets_view = segment_offsets.view()
cdef unique_ptr[table] result
ncol = len(values)
cdef pair[vector[order], vector[null_order]] order = ordering(
cdef pair[vector[cpp_order], vector[null_order]] order = ordering(
column_order or repeat(True, ncol),
null_precedence or repeat("first", ncol),
)
@@ -375,10 +379,10 @@ def digitize(list source_columns, list bins, bool right=False):
cdef table_view source_table_view = table_view_from_columns(
source_columns
)
cdef vector[order] column_order = (
vector[order](
cdef vector[cpp_order] column_order = (
vector[cpp_order](
bins_view.num_columns(),
order.ASCENDING
cpp_order.ASCENDING
)
)
cdef vector[null_order] null_precedence = (
@@ -420,10 +424,10 @@ def rank_columns(list source_columns, object method, str na_option,
< underlying_type_t_rank_method > method
)

cdef order column_order = (
order.ASCENDING
cdef cpp_order column_order = (
cpp_order.ASCENDING
if ascending
else order.DESCENDING
else cpp_order.DESCENDING
)
# ascending
# #top = na_is_smallest
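The pervasive `order` to `cpp_order` rename in `sort.pyx` above is not cosmetic: several functions in that file bind a local variable named `order` (e.g. `cdef pair[...] order = ordering(...)`), which shadowed the cimported `order` enum. Importing the enum under a distinct name removes the collision. A Python sketch of the pattern (`cpp_order` here is a hypothetical stand-in for the cimported libcudf enum):

```python
from enum import Enum

class cpp_order(Enum):
    # Stand-in for libcudf's order enum; the real code now cimports it
    # as "order as cpp_order".
    ASCENDING = 0
    DESCENDING = 1

def column_order(ascending_flags):
    # A local variable named "order" mirrors the locals in sort.pyx;
    # because the enum lives under the name cpp_order, this local
    # cannot shadow it.
    order = [
        cpp_order.ASCENDING if asc else cpp_order.DESCENDING
        for asc in ascending_flags
    ]
    return order
```

The same renamed-import convention (`cpp_` prefix) was already used elsewhere in `_lib`, e.g. `null_count as cpp_null_count` in `column.pyx`, so this change also makes the naming consistent.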
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/strings/char_types.pyx
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.


from libcpp cimport bool
@@ -14,7 +14,7 @@ from cudf._lib.cpp.scalar.scalar cimport string_scalar
from cudf._lib.cpp.strings.char_types cimport (
all_characters_of_type as cpp_all_characters_of_type,
filter_characters_of_type as cpp_filter_characters_of_type,
string_character_types as string_character_types,
string_character_types,
)
from cudf._lib.scalar cimport DeviceScalar

6 changes: 3 additions & 3 deletions python/cudf/cudf/_lib/strings/combine.pyx
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from cudf.core.buffer import acquire_spill_lock

@@ -13,8 +13,8 @@ from cudf._lib.cpp.strings.combine cimport (
concatenate as cpp_concatenate,
join_list_elements as cpp_join_list_elements,
join_strings as cpp_join_strings,
output_if_empty_list as output_if_empty_list,
separator_on_nulls as separator_on_nulls,
output_if_empty_list,
separator_on_nulls,
)
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.scalar cimport DeviceScalar
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/strings/translate.pyx
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
# Copyright (c) 2018-2023, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
@@ -14,7 +14,7 @@ from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.scalar.scalar cimport string_scalar
from cudf._lib.cpp.strings.translate cimport (
filter_characters as cpp_filter_characters,
filter_type as filter_type,
filter_type,
translate as cpp_translate,
)
from cudf._lib.cpp.types cimport char_utf8
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/strings_udf.pyx
@@ -33,8 +33,8 @@ def column_to_string_view_array(Column strings_col):
with nogil:
c_buffer = move(cpp_to_string_view_array(input_view))

device_buffer = DeviceBuffer.c_from_unique_ptr(move(c_buffer))
return as_buffer(device_buffer, exposed=True)
db = DeviceBuffer.c_from_unique_ptr(move(c_buffer))
return as_buffer(db, exposed=True)


def column_from_udf_string_array(DeviceBuffer d_buffer):