feat: ensure Series.str.len() can get length of array columns (#497)

tswast · web-flow · commit 10c044686228 · 2024-03-22T20:00:28.000Z
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:

- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes #<issue_number_goes_here> 🦕
diff --git a/tests/system/conftest.py b/tests/system/conftest.py
@@ -357,8 +357,6 @@ def nested_pandas_df() -> pd.DataFrame:
         DATA_DIR / "nested.jsonl",
         lines=True,
     )
-    tests.system.utils.convert_pandas_dtypes(df, bytes_col=True)
-
     df = df.set_index("rowindex")
     return df
 
diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py
@@ -181,6 +181,26 @@ def test_len(scalars_dfs):
     )
 
 
+def test_len_with_array_column(nested_df, nested_pandas_df):
+    """
+    Series.str.len() is expected to work on columns containing lists as well as strings.
+
+    See: https://stackoverflow.com/a/41340543/101923
+    """
+    col_name = "event_sequence"
+    bf_series: bigframes.series.Series = nested_df[col_name]
+    bf_result = bf_series.str.len().to_pandas()
+    pd_result = nested_pandas_df[col_name].str.len()
+
+    # One of the dtype mismatches to be documented: here, `bf_result.dtype` is `Int64` but
+    # `pd_result.dtype` is `float64`. See https://github.com/pandas-dev/pandas/issues/51948
+    assert_series_equal(
+        pd_result.astype(pd.Int64Dtype()),
+        bf_result,
+        check_index_type=False,
+    )
+
+
 def test_lower(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     col_name = "string_col"

Original file line number	Diff line number	Diff line change
`@@ -357,8 +357,6 @@ def nested_pandas_df() -> pd.DataFrame:`
`357`	`357`	`DATA_DIR / "nested.jsonl",`
`358`	`358`	`lines=True,`
`359`	`359`	`)`
`360`		`- tests.system.utils.convert_pandas_dtypes(df, bytes_col=True)`
`361`		`-`
`362`	`360`	`df = df.set_index("rowindex")`
`363`	`361`	`return df`
`364`	`362`