1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions Justfile
@@ -1,3 +1,5 @@
set dotenv-load := true

build-release:
cargo build --release

1 change: 1 addition & 0 deletions connectorx-python/Cargo.toml
@@ -37,6 +37,7 @@ tokio = {version = "1", features = ["rt", "rt-multi-thread", "net"]}
tokio-util = "0.6"
url = "2"
uuid = "0.8"
lazy_static = "1.4.0"

[build-dependencies]
built = {version = "0.5", features = ["chrono"]}
22 changes: 16 additions & 6 deletions connectorx-python/connectorx/tests/test_postgres.py
@@ -318,8 +318,9 @@ def test_read_sql_on_utf8(postgres_url: str) -> None:


def test_types_binary(postgres_url: str) -> None:
query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum FROM test_types"
df = read_sql(postgres_url, query)
query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum, test_farray, test_iarray FROM test_types"
df = read_sql(postgres_url, query,
partition_on="test_int16", partition_num=3)
expected = pd.DataFrame(
index=range(4),
data={
@@ -367,14 +368,18 @@ def test_types_binary(postgres_url: str) -> None:
"test_enum": pd.Series(
["happy", "very happy", "ecstatic", "ecstatic"], dtype="object"
),
"test_farray": pd.Series([[], None, [0.0123], [0.000234, -12.987654321]], dtype="object"),
"test_iarray": pd.Series([[-1, 0, 1123], [], [-324324], None], dtype="object"),
},
)
print(df)
assert_frame_equal(df, expected, check_names=True)


def test_types_csv(postgres_url: str) -> None:
query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum::text FROM test_types"
df = read_sql(postgres_url, query, protocol="csv")
query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum::text, test_farray, test_iarray FROM test_types"
df = read_sql(postgres_url, query, protocol="csv",
partition_on="test_int16", partition_num=2)
expected = pd.DataFrame(
index=range(4),
data={
@@ -422,14 +427,17 @@ def test_types_csv(postgres_url: str) -> None:
"test_enum": pd.Series(
["happy", "very happy", "ecstatic", "ecstatic"], dtype="object"
),
"test_farray": pd.Series([[], None, [0.0123], [0.000234, -12.987654321]], dtype="object"),
"test_iarray": pd.Series([[-1, 0, 1123], [], [-324324], None], dtype="object"),
},
)
assert_frame_equal(df, expected, check_names=True)


def test_types_cursor(postgres_url: str) -> None:
query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum::text FROM test_types"
df = read_sql(postgres_url, query, protocol="cursor")
query = "SELECT test_int16, test_char, test_uuid, test_time, test_json, test_jsonb, test_bytea, test_enum::text, test_farray, test_iarray FROM test_types"
df = read_sql(postgres_url, query, protocol="cursor",
partition_on="test_int16", partition_num=4)
expected = pd.DataFrame(
index=range(4),
data={
@@ -477,6 +485,8 @@ def test_types_cursor(postgres_url: str) -> None:
"test_enum": pd.Series(
["happy", "very happy", "ecstatic", "ecstatic"], dtype="object"
),
"test_farray": pd.Series([[], None, [0.0123], [0.000234, -12.987654321]], dtype="object"),
"test_iarray": pd.Series([[-1, 0, 1123], [], [-324324], None], dtype="object"),
},
)
assert_frame_equal(df, expected, check_names=True)
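The three tests above exercise the new test_farray/test_iarray columns through the binary, csv, and cursor protocols, each combined with partitioned reads. As a standalone sketch of the same call pattern (the connection string is a placeholder, and it assumes a Postgres instance with the test_types fixture these tests use):

import connectorx as cx

postgres_url = "postgresql://user:password@localhost:5432/testdb"  # placeholder
query = "SELECT test_int16, test_farray, test_iarray FROM test_types"

# partition_on splits the query into range queries on test_int16; the partial
# DataFrames are assembled back into one result.
df = cx.read_sql(postgres_url, query, partition_on="test_int16", partition_num=3)
print(df.dtypes)  # array columns arrive as dtype=object, one Python list (or None) per row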
3 changes: 3 additions & 0 deletions connectorx-python/src/lib.rs
@@ -11,6 +11,9 @@ use pyo3::prelude::*;
use pyo3::{wrap_pyfunction, PyResult};
use std::sync::Once;

#[macro_use]
extern crate lazy_static;

static START: Once = Once::new();

// https://github.com/PyO3/pyo3-built/issues/21
32 changes: 30 additions & 2 deletions connectorx-python/src/pandas/destination.rs
@@ -1,7 +1,7 @@
use super::{
pandas_columns::{
-        BooleanBlock, BytesBlock, DateTimeBlock, Float64Block, HasPandasColumn, Int64Block,
-        PandasColumn, PandasColumnObject, PyBytes, StringBlock,
+        ArrayBlock, BooleanBlock, BytesBlock, DateTimeBlock, Float64Block, HasPandasColumn,
+        Int64Block, PandasColumn, PandasColumnObject, PyBytes, StringBlock,
},
pystring::PyString,
typesystem::{PandasArrayType, PandasBlockType, PandasTypeSystem},
@@ -158,6 +158,12 @@ impl<'a> Destination for PandasDestination<'a> {
PandasBlockType::Float64 => {
self.allocate_array::<f64>(dt, placement)?;
}
PandasBlockType::Float64Array => {
self.allocate_array::<super::pandas_columns::PyList>(dt, placement)?;
}
PandasBlockType::Int64Array => {
self.allocate_array::<super::pandas_columns::PyList>(dt, placement)?;
}
PandasBlockType::String => {
self.allocate_array::<PyString>(dt, placement)?;
}
@@ -210,6 +216,28 @@
.collect()
}
}
PandasBlockType::Float64Array => {
let fblock = ArrayBlock::<f64>::extract(buf)?;
let fcols = fblock.split()?;
for (&cid, fcol) in block.cids.iter().zip_eq(fcols) {
partitioned_columns[cid] = fcol
.partition(&counts)
.into_iter()
.map(|c| Box::new(c) as _)
.collect()
}
}
PandasBlockType::Int64Array => {
let fblock = ArrayBlock::<i64>::extract(buf)?;
let fcols = fblock.split()?;
for (&cid, fcol) in block.cids.iter().zip_eq(fcols) {
partitioned_columns[cid] = fcol
.partition(&counts)
.into_iter()
.map(|c| Box::new(c) as _)
.collect()
}
}
PandasBlockType::Int64(_) => {
let ublock = Int64Block::extract(buf)?;
let ucols = ublock.split()?;
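For the two new block types, the destination allocates a 2-D numpy array with dtype=object, split() then hands out one row of that matrix per DataFrame column, and partition() slices each column by the per-partition row counts. A rough numpy model of that layout (the names here are illustrative, not the crate's API):

import numpy as np

# One block holding two array-typed columns over six rows; pandas blocks are
# stored transposed, so the shape is (n_cols, n_rows) and each cell is an object.
block = np.empty((2, 6), dtype=object)

# split(): one flat, writable view per DataFrame column.
columns = [block[i, :] for i in range(block.shape[0])]

# partition(counts): carve each column into consecutive per-partition slices.
def partition(col, counts):
    parts, start = [], 0
    for n in counts:
        parts.append(col[start:start + n])
        start += n
    return parts

chunks = [partition(col, [2, 3, 1]) for col in columns]
print([c.shape for c in chunks[0]])  # (2,), (3,), (1,)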
227 changes: 227 additions & 0 deletions connectorx-python/src/pandas/pandas_columns/array.rs
@@ -0,0 +1,227 @@
use super::{check_dtype, HasPandasColumn, PandasColumn, PandasColumnObject, GIL_MUTEX};
use crate::errors::ConnectorXPythonError;
use anyhow::anyhow;
use fehler::throws;
use ndarray::{ArrayViewMut2, Axis, Ix2};
use numpy::{npyffi::NPY_TYPES, Element, PyArray, PyArrayDescr};
use pyo3::{FromPyObject, Py, PyAny, PyResult, Python, ToPyObject};
use std::any::TypeId;
use std::marker::PhantomData;

#[derive(Clone)]
#[repr(transparent)]
pub struct PyList(Py<pyo3::types::PyList>);

// Implement numpy's Element so PyList values can be stored in a numpy object array
impl Element for PyList {
const DATA_TYPE: numpy::DataType = numpy::DataType::Object;
fn is_same_type(dtype: &PyArrayDescr) -> bool {
unsafe { *dtype.as_dtype_ptr() }.type_num == NPY_TYPES::NPY_OBJECT as i32
}
}

pub struct ArrayBlock<'a, V> {
data: ArrayViewMut2<'a, PyList>,
buf_size_mb: usize,
_value_type: PhantomData<V>,
}

impl<'a, V> FromPyObject<'a> for ArrayBlock<'a, V> {
fn extract(ob: &'a PyAny) -> PyResult<Self> {
check_dtype(ob, "object")?;
let array = ob.downcast::<PyArray<PyList, Ix2>>()?;
let data = unsafe { array.as_array_mut() };
Ok(ArrayBlock::<V> {
data,
buf_size_mb: 16, // in MB
_value_type: PhantomData,
})
}
}

impl<'a, V> ArrayBlock<'a, V> {
#[throws(ConnectorXPythonError)]
pub fn split(self) -> Vec<ArrayColumn<'a, V>> {
let mut ret = vec![];
let mut view = self.data;

let nrows = view.ncols(); // blocks are stored transposed: the view's shape is (n_cols, n_rows)
while view.nrows() > 0 {
let (col, rest) = view.split_at(Axis(0), 1);
view = rest;
ret.push(ArrayColumn::<V> {
data: col
.into_shape(nrows)?
.into_slice()
.ok_or_else(|| anyhow!("get None for split array data"))?,
next_write: 0,
lengths: vec![],
buffer: Vec::with_capacity(self.buf_size_mb * (1 << 17) * 11 / 10), // allocate a little bit more memory to avoid Vec growth
buf_size: self.buf_size_mb * (1 << 17),
})
}
ret
}
}

pub struct ArrayColumn<'a, V> {
data: &'a mut [PyList],
next_write: usize,
buffer: Vec<V>,
lengths: Vec<usize>, // usize::MAX if the array is None
buf_size: usize,
}

impl<'a, V> PandasColumnObject for ArrayColumn<'a, V>
where
V: Send + ToPyObject,
{
fn typecheck(&self, id: TypeId) -> bool {
id == TypeId::of::<PyList>() || id == TypeId::of::<Option<PyList>>()
}
fn len(&self) -> usize {
self.data.len()
}
fn typename(&self) -> &'static str {
std::any::type_name::<PyList>()
}

#[throws(ConnectorXPythonError)]
fn finalize(&mut self) {
self.flush()?;
}
}

impl<'a> PandasColumn<Vec<f64>> for ArrayColumn<'a, f64> {
#[throws(ConnectorXPythonError)]
fn write(&mut self, val: Vec<f64>) {
self.lengths.push(val.len());
self.buffer.extend_from_slice(&val[..]);
self.try_flush()?;
}
}

impl<'a> PandasColumn<Option<Vec<f64>>> for ArrayColumn<'a, f64> {
#[throws(ConnectorXPythonError)]
fn write(&mut self, val: Option<Vec<f64>>) {
match val {
Some(v) => {
self.lengths.push(v.len());
self.buffer.extend_from_slice(&v[..]);
self.try_flush()?;
}
None => {
self.lengths.push(usize::MAX);
}
}
}
}

impl<'a> PandasColumn<Vec<i64>> for ArrayColumn<'a, i64> {
#[throws(ConnectorXPythonError)]
fn write(&mut self, val: Vec<i64>) {
self.lengths.push(val.len());
self.buffer.extend_from_slice(&val[..]);
self.try_flush()?;
}
}

impl<'a> PandasColumn<Option<Vec<i64>>> for ArrayColumn<'a, i64> {
#[throws(ConnectorXPythonError)]
fn write(&mut self, val: Option<Vec<i64>>) {
match val {
Some(v) => {
self.lengths.push(v.len());
self.buffer.extend_from_slice(&v[..]);
self.try_flush()?;
}
None => {
self.lengths.push(usize::MAX);
}
}
}
}

impl HasPandasColumn for Vec<f64> {
type PandasColumn<'a> = ArrayColumn<'a, f64>;
}

impl HasPandasColumn for Option<Vec<f64>> {
type PandasColumn<'a> = ArrayColumn<'a, f64>;
}

impl HasPandasColumn for Vec<i64> {
type PandasColumn<'a> = ArrayColumn<'a, i64>;
}

impl HasPandasColumn for Option<Vec<i64>> {
type PandasColumn<'a> = ArrayColumn<'a, i64>;
}
impl<'a, V> ArrayColumn<'a, V>
where
V: Send + ToPyObject,
{
pub fn partition(self, counts: &[usize]) -> Vec<ArrayColumn<'a, V>> {
let mut partitions = vec![];
let mut data = self.data;

for &c in counts {
let (splitted, rest) = data.split_at_mut(c);
data = rest;
partitions.push(ArrayColumn {
data: splitted,
next_write: 0,
lengths: vec![],
buffer: Vec::with_capacity(self.buf_size),
buf_size: self.buf_size,
});
}
partitions
}

#[throws(ConnectorXPythonError)]
pub fn flush(&mut self) {
let nvecs = self.lengths.len();

if nvecs > 0 {
let py = unsafe { Python::assume_gil_acquired() };

{
// allocation in python is not thread safe
let _guard = GIL_MUTEX
.lock()
.map_err(|e| anyhow!("mutex poisoned {}", e))?;
let mut start = 0;
for (i, &len) in self.lengths.iter().enumerate() {
if len != usize::MAX {
let end = start + len;
unsafe {
// allocate and write in the same time
*self.data.get_unchecked_mut(self.next_write + i) = PyList(
pyo3::types::PyList::new(py, &self.buffer[start..end]).into(),
);
};
start = end;
} else {
unsafe {
let n: &pyo3::types::PyList =
py.from_borrowed_ptr(pyo3::ffi::Py_None());
*self.data.get_unchecked_mut(self.next_write + i) = PyList(n.into());
}
}
}
}

self.buffer.truncate(0);
self.lengths.truncate(0);
self.next_write += nvecs;
}
}

#[throws(ConnectorXPythonError)]
pub fn try_flush(&mut self) {
if self.buffer.len() >= self.buf_size {
self.flush()?;
}
}
}
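Rather than allocating one Python list per write, ArrayColumn flattens incoming vectors into a single buffer and records each vector's length, using usize::MAX as the sentinel for None; once the buffer reaches buf_size, flush() materializes one PyList per row while holding the GIL mutex. A small Python model of just that encode/decode scheme (the sentinel constant stands in for usize::MAX):

SENTINEL = 2**64 - 1  # stands in for usize::MAX

def encode(rows):
    buffer, lengths = [], []
    for row in rows:
        if row is None:
            lengths.append(SENTINEL)  # nothing is buffered for a None row
        else:
            lengths.append(len(row))
            buffer.extend(row)
    return buffer, lengths

def decode(buffer, lengths):
    rows, start = [], 0
    for n in lengths:
        if n == SENTINEL:
            rows.append(None)
        else:
            rows.append(buffer[start:start + n])
            start += n
    return rows

rows = [[-1, 0, 1123], [], [-324324], None]
buf, lens = encode(rows)
assert decode(buf, lens) == rows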