1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions Justfile
@@ -1,3 +1,5 @@
set dotenv-load := true

build-release:
cargo build --release

1 change: 1 addition & 0 deletions connectorx-python/Cargo.toml
@@ -37,6 +37,7 @@ tokio = {version = "1", features = ["rt", "rt-multi-thread", "net"]}
tokio-util = "0.6"
url = "2"
uuid = "0.8"
lazy_static = "1.4.0"

[build-dependencies]
built = {version = "0.5", features = ["chrono"]}
22 changes: 16 additions & 6 deletions connectorx-python/connectorx/tests/test_postgres.py
@@ -318,8 +318,9 @@ def test_read_sql_on_utf8(postgres_url: str) -> None:


def test_types_binary(postgres_url: str) -> None:
query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum FROM test_types"
df = read_sql(postgres_url, query)
query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum, test_farray, test_iarray FROM test_types"
df = read_sql(postgres_url, query,
partition_on="test_int16", partition_num=3)
expected = pd.DataFrame(
index=range(4),
data={
@@ -367,14 +368,18 @@ def test_types_binary(postgres_url: str) -> None:
"test_enum": pd.Series(
["happy", "very happy", "ecstatic", "ecstatic"], dtype="object"
),
"test_farray": pd.Series([[], None, [0.0123], [0.000234, -12.987654321]], dtype="object"),
"test_iarray": pd.Series([[-1, 0, 1123], [], [-324324], None], dtype="object"),
},
)
print(df)
assert_frame_equal(df, expected, check_names=True)


def test_types_csv(postgres_url: str) -> None:
query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum::text FROM test_types"
df = read_sql(postgres_url, query, protocol="csv")
query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum::text, test_farray, test_iarray FROM test_types"
df = read_sql(postgres_url, query, protocol="csv",
partition_on="test_int16", partition_num=2)
expected = pd.DataFrame(
index=range(4),
data={
@@ -422,14 +427,17 @@ def test_types_csv(postgres_url: str) -> None:
"test_enum": pd.Series(
["happy", "very happy", "ecstatic", "ecstatic"], dtype="object"
),
"test_farray": pd.Series([[], None, [0.0123], [0.000234, -12.987654321]], dtype="object"),
"test_iarray": pd.Series([[-1, 0, 1123], [], [-324324], None], dtype="object"),
},
)
assert_frame_equal(df, expected, check_names=True)


def test_types_cursor(postgres_url: str) -> None:
query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum::text FROM test_types"
df = read_sql(postgres_url, query, protocol="cursor")
query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum::text, test_farray, test_iarray FROM test_types"
df = read_sql(postgres_url, query, protocol="cursor",
partition_on="test_int16", partition_num=4)
expected = pd.DataFrame(
index=range(4),
data={
@@ -477,6 +485,8 @@ def test_types_cursor(postgres_url: str) -> None:
"test_enum": pd.Series(
["happy", "very happy", "ecstatic", "ecstatic"], dtype="object"
),
"test_farray": pd.Series([[], None, [0.0123], [0.000234, -12.987654321]], dtype="object"),
"test_iarray": pd.Series([[-1, 0, 1123], [], [-324324], None], dtype="object"),
},
)
assert_frame_equal(df, expected, check_names=True)
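The three tests above exercise the new test_farray/test_iarray columns through the binary, csv, and cursor protocols, each combined with partitioned reads. As a standalone sketch of the same call pattern (the connection string is a placeholder, and it assumes a Postgres instance with the test_types fixture these tests use):

import connectorx as cx

postgres_url = "postgresql://user:password@localhost:5432/testdb"  # placeholder
query = "SELECT test_int16, test_farray, test_iarray FROM test_types"

# partition_on splits the query into range queries on test_int16; the partial
# DataFrames are assembled back into one result.
df = cx.read_sql(postgres_url, query, partition_on="test_int16", partition_num=3)
print(df.dtypes)  # array columns arrive as dtype=object, one Python list (or None) per row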
3 changes: 3 additions & 0 deletions connectorx-python/src/lib.rs
@@ -11,6 +11,9 @@ use pyo3::prelude::*;
use pyo3::{wrap_pyfunction, PyResult};
use std::sync::Once;

#[macro_use]
extern crate lazy_static;

static START: Once = Once::new();

// https://github.com/PyO3/pyo3-built/issues/21
32 changes: 30 additions & 2 deletions connectorx-python/src/pandas/destination.rs
@@ -1,7 +1,7 @@
use super::{
pandas_columns::{
-        BooleanBlock, BytesBlock, DateTimeBlock, Float64Block, HasPandasColumn, Int64Block,
-        PandasColumn, PandasColumnObject, PyBytes, StringBlock,
+        ArrayBlock, BooleanBlock, BytesBlock, DateTimeBlock, Float64Block, HasPandasColumn,
+        Int64Block, PandasColumn, PandasColumnObject, PyBytes, StringBlock,
},
pystring::PyString,
typesystem::{PandasArrayType, PandasBlockType, PandasTypeSystem},
@@ -158,6 +158,12 @@ impl<'a> Destination for PandasDestination<'a> {
PandasBlockType::Float64 => {
self.allocate_array::<f64>(dt, placement)?;
}
PandasBlockType::Float64Array => {
self.allocate_array::<super::pandas_columns::PyList>(dt, placement)?;
}
PandasBlockType::Int64Array => {
self.allocate_array::<super::pandas_columns::PyList>(dt, placement)?;
}
PandasBlockType::String => {
self.allocate_array::<PyString>(dt, placement)?;
}
@@ -210,6 +216,28 @@
.collect()
}
}
PandasBlockType::Float64Array => {
let fblock = ArrayBlock::<f64>::extract(buf)?;
let fcols = fblock.split()?;
for (&cid, fcol) in block.cids.iter().zip_eq(fcols) {
partitioned_columns[cid] = fcol
.partition(&counts)
.into_iter()
.map(|c| Box::new(c) as _)
.collect()
}
}
PandasBlockType::Int64Array => {
let fblock = ArrayBlock::<i64>::extract(buf)?;
let fcols = fblock.split()?;
for (&cid, fcol) in block.cids.iter().zip_eq(fcols) {
partitioned_columns[cid] = fcol
.partition(&counts)
.into_iter()
.map(|c| Box::new(c) as _)
.collect()
}
}
PandasBlockType::Int64(_) => {
let ublock = Int64Block::extract(buf)?;
let ucols = ublock.split()?;
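For the two new block types, the destination allocates a 2-D numpy array with dtype=object, split() then hands out one row of that matrix per DataFrame column, and partition() slices each column by the per-partition row counts. A rough numpy model of that layout (the names here are illustrative, not the crate's API):

import numpy as np

# One block holding two array-typed columns over six rows; pandas blocks are
# stored transposed, so the shape is (n_cols, n_rows) and each cell is an object.
block = np.empty((2, 6), dtype=object)

# split(): one flat, writable view per DataFrame column.
columns = [block[i, :] for i in range(block.shape[0])]

# partition(counts): carve each column into consecutive per-partition slices.
def partition(col, counts):
    parts, start = [], 0
    for n in counts:
        parts.append(col[start:start + n])
        start += n
    return parts

chunks = [partition(col, [2, 3, 1]) for col in columns]
print([c.shape for c in chunks[0]])  # (2,), (3,), (1,)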
227 changes: 227 additions & 0 deletions connectorx-python/src/pandas/pandas_columns/array.rs
@@ -0,0 +1,227 @@
use super::{check_dtype, HasPandasColumn, PandasColumn, PandasColumnObject, GIL_MUTEX};
use crate::errors::ConnectorXPythonError;
use anyhow::anyhow;
use fehler::throws;
use ndarray::{ArrayViewMut2, Axis, Ix2};
use numpy::{npyffi::NPY_TYPES, Element, PyArray, PyArrayDescr};
use pyo3::{FromPyObject, Py, PyAny, PyResult, Python, ToPyObject};
use std::any::TypeId;
use std::marker::PhantomData;

#[derive(Clone)]
#[repr(transparent)]
pub struct PyList(Py<pyo3::types::PyList>);

// Implement numpy's Element so PyList values can be stored in a numpy object array
impl Element for PyList {
const DATA_TYPE: numpy::DataType = numpy::DataType::Object;
fn is_same_type(dtype: &PyArrayDescr) -> bool {
unsafe { *dtype.as_dtype_ptr() }.type_num == NPY_TYPES::NPY_OBJECT as i32
}
}

pub struct ArrayBlock<'a, V> {
data: ArrayViewMut2<'a, PyList>,
buf_size_mb: usize,
_value_type: PhantomData<V>,
}

impl<'a, V> FromPyObject<'a> for ArrayBlock<'a, V> {
fn extract(ob: &'a PyAny) -> PyResult<Self> {
check_dtype(ob, "object")?;
let array = ob.downcast::<PyArray<PyList, Ix2>>()?;
let data = unsafe { array.as_array_mut() };
Ok(ArrayBlock::<V> {
data,
buf_size_mb: 16, // in MB
_value_type: PhantomData,
})
}
}

impl<'a, V> ArrayBlock<'a, V> {
#[throws(ConnectorXPythonError)]
pub fn split(self) -> Vec<ArrayColumn<'a, V>> {
let mut ret = vec![];
let mut view = self.data;

let nrows = view.ncols(); // blocks are stored transposed: the view's shape is (n_cols, n_rows)
while view.nrows() > 0 {
let (col, rest) = view.split_at(Axis(0), 1);
view = rest;
ret.push(ArrayColumn::<V> {
data: col
.into_shape(nrows)?
.into_slice()
.ok_or_else(|| anyhow!("get None for split array data"))?,
next_write: 0,
lengths: vec![],
buffer: Vec::with_capacity(self.buf_size_mb * (1 << 17) * 11 / 10), // allocate a little bit more memory to avoid Vec growth
buf_size: self.buf_size_mb * (1 << 17),
})
}
ret
}
}

pub struct ArrayColumn<'a, V> {
data: &'a mut [PyList],
next_write: usize,
buffer: Vec<V>,
lengths: Vec<usize>, // usize::MAX if the array is None
buf_size: usize,
}

impl<'a, V> PandasColumnObject for ArrayColumn<'a, V>
where
V: Send + ToPyObject,
{
fn typecheck(&self, id: TypeId) -> bool {
id == TypeId::of::<PyList>() || id == TypeId::of::<Option<PyList>>()
}
fn len(&self) -> usize {
self.data.len()
}
fn typename(&self) -> &'static str {
std::any::type_name::<PyList>()
}

#[throws(ConnectorXPythonError)]
fn finalize(&mut self) {
self.flush()?;
}
}

impl<'a> PandasColumn<Vec<f64>> for ArrayColumn<'a, f64> {
#[throws(ConnectorXPythonError)]
fn write(&mut self, val: Vec<f64>) {
self.lengths.push(val.len());
self.buffer.extend_from_slice(&val[..]);
self.try_flush()?;
}
}

impl<'a> PandasColumn<Option<Vec<f64>>> for ArrayColumn<'a, f64> {
#[throws(ConnectorXPythonError)]
fn write(&mut self, val: Option<Vec<f64>>) {
match val {
Some(v) => {
self.lengths.push(v.len());
self.buffer.extend_from_slice(&v[..]);
self.try_flush()?;
}
None => {
self.lengths.push(usize::MAX);
}
}
}
}

impl<'a> PandasColumn<Vec<i64>> for ArrayColumn<'a, i64> {
#[throws(ConnectorXPythonError)]
fn write(&mut self, val: Vec<i64>) {
self.lengths.push(val.len());
self.buffer.extend_from_slice(&val[..]);
self.try_flush()?;
}
}

impl<'a> PandasColumn<Option<Vec<i64>>> for ArrayColumn<'a, i64> {
#[throws(ConnectorXPythonError)]
fn write(&mut self, val: Option<Vec<i64>>) {
match val {
Some(v) => {
self.lengths.push(v.len());
self.buffer.extend_from_slice(&v[..]);
self.try_flush()?;
}
None => {
self.lengths.push(usize::MAX);
}
}
}
}

impl HasPandasColumn for Vec<f64> {
type PandasColumn<'a> = ArrayColumn<'a, f64>;
}

impl HasPandasColumn for Option<Vec<f64>> {
type PandasColumn<'a> = ArrayColumn<'a, f64>;
}

impl HasPandasColumn for Vec<i64> {
type PandasColumn<'a> = ArrayColumn<'a, i64>;
}

impl HasPandasColumn for Option<Vec<i64>> {
type PandasColumn<'a> = ArrayColumn<'a, i64>;
}
impl<'a, V> ArrayColumn<'a, V>
where
V: Send + ToPyObject,
{
pub fn partition(self, counts: &[usize]) -> Vec<ArrayColumn<'a, V>> {
let mut partitions = vec![];
let mut data = self.data;

for &c in counts {
let (splitted, rest) = data.split_at_mut(c);
data = rest;
partitions.push(ArrayColumn {
data: splitted,
next_write: 0,
lengths: vec![],
buffer: Vec::with_capacity(self.buf_size),
buf_size: self.buf_size,
});
}
partitions
}

#[throws(ConnectorXPythonError)]
pub fn flush(&mut self) {
let nvecs = self.lengths.len();

if nvecs > 0 {
let py = unsafe { Python::assume_gil_acquired() };

{
// allocation in python is not thread safe
let _guard = GIL_MUTEX
.lock()
.map_err(|e| anyhow!("mutex poisoned {}", e))?;
let mut start = 0;
for (i, &len) in self.lengths.iter().enumerate() {
if len != usize::MAX {
let end = start + len;
unsafe {
// allocate and write in the same time
*self.data.get_unchecked_mut(self.next_write + i) = PyList(
pyo3::types::PyList::new(py, &self.buffer[start..end]).into(),
);
};
start = end;
} else {
unsafe {
let n: &pyo3::types::PyList =
py.from_borrowed_ptr(pyo3::ffi::Py_None());
*self.data.get_unchecked_mut(self.next_write + i) = PyList(n.into());
}
}
}
}

self.buffer.truncate(0);
self.lengths.truncate(0);
self.next_write += nvecs;
}
}

#[throws(ConnectorXPythonError)]
pub fn try_flush(&mut self) {
if self.buffer.len() >= self.buf_size {
self.flush()?;
}
}
}
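Rather than allocating one Python list per write, ArrayColumn flattens incoming vectors into a single buffer and records each vector's length, using usize::MAX as the sentinel for None; once the buffer reaches buf_size, flush() materializes one PyList per row while holding the GIL mutex. A small Python model of just that encode/decode scheme (the sentinel constant stands in for usize::MAX):

SENTINEL = 2**64 - 1  # stands in for usize::MAX

def encode(rows):
    buffer, lengths = [], []
    for row in rows:
        if row is None:
            lengths.append(SENTINEL)  # nothing is buffered for a None row
        else:
            lengths.append(len(row))
            buffer.extend(row)
    return buffer, lengths

def decode(buffer, lengths):
    rows, start = [], 0
    for n in lengths:
        if n == SENTINEL:
            rows.append(None)
        else:
            rows.append(buffer[start:start + n])
            start += n
    return rows

rows = [[-1, 0, 1123], [], [-324324], None]
buf, lens = encode(rows)
assert decode(buf, lens) == rows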