Add DataFrame.iter_columns() and simplify (#326)

MarcoGorelli · web-flow · commit 8ecbea372848 · 2023-12-07T21:59:06.000Z
* add DataFrame.column_iter

* iter_columns instead

* lint
diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py
@@ -3,7 +3,7 @@
 from typing import TYPE_CHECKING, Any, Literal, NoReturn, Protocol
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping, Sequence
+    from collections.abc import Iterator, Mapping, Sequence
 
     from typing_extensions import Self
 
@@ -275,6 +275,10 @@ def schema(self) -> dict[str, DType]:
         """
         ...
 
+    def iter_columns(self) -> Iterator[Column]:
+        """Return an iterator yielding each column of the dataframe as a ``Column``."""
+        ...
+
     def sort(
         self,
         *keys: str,
@@ -905,23 +909,20 @@ def persist(self) -> Self:
             .. code-block:: python
 
                 df: DataFrame
-                features = []
                 result = df.std() > 0
                 result = result.persist()
-                for column_name in df.column_names:
-                    if result.col(column_name).get_value(0):
-                        features.append(column_name)
+                features = [c.name for c in result.iter_columns() if c.get_value(0)]
 
             instead of this:
 
             .. code-block:: python
 
                 df: DataFrame
-                features = []
-                for column_name in df.column_names:
-                    # Do NOT call `persist` on a `DataFrame` within a for-loop!
-                    # This may re-trigger the same computation multiple times
-                    if df.persist().col(column_name).std() > 0:
-                        features.append(column_name)
+                result = df.std() > 0
+                features = [
+                    # Do NOT do this! It executes the entire pipeline once per column!
+                    col.name
+                    for col in result.iter_columns() if col.get_value(0).persist()
+                ]
         """
         ...
diff --git a/spec/API_specification/examples/01_standardise_columns.py b/spec/API_specification/examples/01_standardise_columns.py
@@ -9,11 +9,17 @@
 def my_dataframe_agnostic_function(df_non_standard: SupportsDataFrameAPI) -> Any:
+    """Append a standardised ``<name>_scaled`` copy of each column except "species".
+
+    Each new column is ``(col - col.mean()) / col.std()``; the updated
+    dataframe is handed back through ``df.dataframe``.
+    """
     df = df_non_standard.__dataframe_consortium_standard__(api_version="2023.09-beta")
 
-    for column_name in df.column_names:
-        if column_name == "species":
-            continue
-        new_column = df.col(column_name)
-        new_column = (new_column - new_column.mean()) / new_column.std()
-        df = df.assign(new_column.rename(f"{column_name}_scaled"))
+    new_columns = [
+        ((col - col.mean()) / col.std()).rename(f"{col.name}_scaled")
+        for col in df.iter_columns()
+        # "species" is deliberately left unscaled.
+        if col.name != "species"
+    ]
+    df = df.assign(*new_columns)
 
     return df.dataframe
diff --git a/spec/API_specification/examples/04_datatypes.py b/spec/API_specification/examples/04_datatypes.py
@@ -12,11 +12,7 @@ def main(df_raw: SupportsDataFrameAPI) -> SupportsDataFrameAPI:
     df = df_raw.__dataframe_consortium_standard__(api_version="2023-11.beta").persist()
     pdx = df.__dataframe_namespace__()
     df = df.select(
-        *[
-            col_name
-            for col_name in df.column_names
-            if isinstance(df.col(col_name).dtype, pdx.Int64)
-        ],
+        *[col.name for col in df.iter_columns() if isinstance(col.dtype, pdx.Int64)],
     )
     arr = df.to_array()
     arr = some_array_function(arr)
diff --git a/spec/design_topics/execution_model.md b/spec/design_topics/execution_model.md
@@ -11,17 +11,13 @@ not be supported in some cases.
 For example, let's consider the following:
 ```python
 df: DataFrame
-features = []
-for column_name in df.column_names:
-    if df.col(column_name).std() > 0:
-        features.append(column_name)
-return features
+features = [col.name for col in df.iter_columns() if col.std() > 0]
 ```
-If `df` is a lazy dataframe, then the call `df.col(column_name).std() > 0` returns
+If `df` is a lazy dataframe, then the call `col.std() > 0` returns
 a (ducktyped) Python boolean scalar. No issues so far. Problem is,
-what happens when `if df.col(column_name).std() > 0` is called?
+what happens when `if col.std() > 0` is evaluated?
 
-Under the hood, Python will call `(df.col(column_name).std() > 0).__bool__()` in
+Under the hood, Python will call `(col.std() > 0).__bool__()` in
 order to extract a Python boolean. This is a problem for "lazy" implementations,
 as the laziness needs breaking in order to evaluate the above.