|
2 | 2 | import re
|
3 | 3 | from typing import (
|
4 | 4 | Any,
|
| 5 | + Dict, |
| 6 | + List, |
5 | 7 | Optional,
|
6 | 8 | Tuple,
|
7 | 9 | Union,
|
|
22 | 24 | Endianness,
|
23 | 25 | )
|
24 | 26 |
|
25 |
| -_NP_DTYPES = { |
| 27 | +_NP_DTYPES: Dict[DtypeKind, Dict[int, Any]] = { |
26 | 28 | DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
|
27 | 29 | DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
|
28 | 30 | DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
|
@@ -90,7 +92,7 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
|
90 | 92 | """
|
91 | 93 | # We need a dict of columns here, with each column being a NumPy array (at
|
92 | 94 | # least for now, deal with non-NumPy dtypes later).
|
93 |
| - columns = {} |
| 95 | + columns: Dict[str, Any] = {} |
94 | 96 | buffers = [] # hold on to buffers, keeps memory alive
|
95 | 97 | for name in df.column_names():
|
96 | 98 | if not isinstance(name, str):
|
@@ -210,6 +212,7 @@ def string_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]:
|
210 | 212 |
|
211 | 213 | buffers = col.get_buffers()
|
212 | 214 |
|
| 215 | + assert buffers["offsets"], "String buffers must contain offsets" |
213 | 216 | # Retrieve the data buffer containing the UTF-8 code units
|
214 | 217 | data_buff, protocol_data_dtype = buffers["data"]
|
215 | 218 | # We're going to reinterpret the buffer as uint8, so make sure we can do it safely
|
@@ -238,13 +241,14 @@ def string_column_to_ndarray(col: Column) -> Tuple[np.ndarray, Any]:
|
238 | 241 |
|
239 | 242 | null_pos = None
|
240 | 243 | if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
|
| 244 | + assert buffers["validity"], "Validity buffers cannot be empty for masks" |
241 | 245 | valid_buff, valid_dtype = buffers["validity"]
|
242 | 246 | null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size)
|
243 | 247 | if sentinel_val == 0:
|
244 | 248 | null_pos = ~null_pos
|
245 | 249 |
|
246 | 250 | # Assemble the strings from the code units
|
247 |
| - str_list = [None] * col.size |
| 251 | + str_list: List[Union[None, float, str]] = [None] * col.size |
248 | 252 | for i in range(col.size):
|
249 | 253 | # Check for missing values
|
250 | 254 | if null_pos is not None and null_pos[i]:
|
@@ -448,7 +452,7 @@ def bitmask_to_bool_ndarray(
|
448 | 452 | def set_nulls(
|
449 | 453 | data: Union[np.ndarray, pd.Series],
|
450 | 454 | col: Column,
|
451 |
| - validity: Tuple[Buffer, Tuple[DtypeKind, int, str, str]], |
| 455 | + validity: Optional[Tuple[Buffer, Tuple[DtypeKind, int, str, str]]], |
452 | 456 | allow_modify_inplace: bool = True,
|
453 | 457 | ):
|
454 | 458 | """
|
|
0 commit comments