Skip to content

Support arrow Table/RecordBatch types #2103

Closed
@dhirschfeld

Description

@dhirschfeld

xref: #614

I'm sure this is possible with #606, so this issue is mostly just to document my attempt to get this working. Out of the box (1.22.0+9.gdb758d0f), attempting to pass an Arrow Table or RecordBatch results in a TypeError:

TypeError: no default __reduce__ due to non-trivial __cinit__
from distributed import Client
import pandas as pd
import pyarrow as pa

# Create a distributed Client with no arguments (library defaults).
client = Client()

# Small pandas frame: one string column 'A' and one integer column 'B'.
df = pd.DataFrame({'A': list('abc'), 'B': [1,2,3]})
# Arrow table built from the same frame; preserve_index=False asks pyarrow
# not to carry the pandas index over into the table.
tbl = pa.Table.from_pandas(df, preserve_index=False)

def echo(arg):
    """Return ``arg`` unchanged.

    Submitted to the client below as a trivial task, so the only work
    involved is passing the argument to the task and the result back.
    """
    return arg
>>> client.submit(echo, df).result().equals(df)
True
>>> client.submit(echo, tbl).result()
distributed.protocol.pickle - INFO - Failed to serialize (pyarrow.Table
A: string
B: int64
metadata
--------
{b'pandas': b'{"index_columns": [], "column_indexes": [], "columns": [{"name":'
            b' "A", "field_name": "A", "pandas_type": "unicode", "numpy_type":'
            b' "object", "metadata": null}, {"name": "B", "field_name": "B", "'
            b'pandas_type": "int64", "numpy_type": "int64", "metadata": null}]'
            b', "pandas_version": "0.23.1"}'},). Exception: no default __reduce__ due to non-trivial __cinit__
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
C:\Miniconda3\lib\site-packages\distributed\protocol\pickle.py in dumps(x)
     37     try:
---> 38         result = pickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
     39         if len(result) < 1000:

C:\Miniconda3\lib\site-packages\pyarrow\lib.cp36-win_amd64.pyd in pyarrow.lib.RecordBatch.__reduce_cython__()

TypeError: no default __reduce__ due to non-trivial __cinit__

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-48-4e8e2ea90e79> in <module>()
----> 1 client.submit(echo, tbl).result()

C:\Miniconda3\lib\site-packages\distributed\client.py in submit(self, func, *args, **kwargs)
   1236                                          resources={skey: resources} if resources else None,
   1237                                          retries=retries,
-> 1238                                          fifo_timeout=fifo_timeout)
   1239 
   1240         logger.debug("Submit %s(...), %s", funcname(func), key)

C:\Miniconda3\lib\site-packages\distributed\client.py in _graph_to_futures(self, dsk, keys, restrictions, loose_restrictions, priority, user_priority, resources, retries, fifo_timeout)
   2093 
   2094             self._send_to_scheduler({'op': 'update-graph',
-> 2095                                      'tasks': valmap(dumps_task, dsk3),
   2096                                      'dependencies': dependencies,
   2097                                      'keys': list(flatkeys),

C:\Miniconda3\lib\site-packages\cytoolz\dicttoolz.pyx in cytoolz.dicttoolz.valmap()

C:\Miniconda3\lib\site-packages\cytoolz\dicttoolz.pyx in cytoolz.dicttoolz.valmap()

C:\Miniconda3\lib\site-packages\distributed\worker.py in dumps_task(task)
    799         elif not any(map(_maybe_complex, task[1:])):
    800             return {'function': dumps_function(task[0]),
--> 801                     'args': warn_dumps(task[1:])}
    802     return to_serialize(task)
    803 

C:\Miniconda3\lib\site-packages\distributed\worker.py in warn_dumps(obj, dumps, limit)
    808 def warn_dumps(obj, dumps=pickle.dumps, limit=1e6):
    809     """ Dump an object to bytes, warn if those bytes are large """
--> 810     b = dumps(obj)
    811     if not _warn_dumps_warned[0] and len(b) > limit:
    812         _warn_dumps_warned[0] = True

C:\Miniconda3\lib\site-packages\distributed\protocol\pickle.py in dumps(x)
     49     except Exception:
     50         try:
---> 51             return cloudpickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
     52         except Exception as e:
     53             logger.info("Failed to serialize %s. Exception: %s", x, e)

C:\Miniconda3\lib\site-packages\cloudpickle\cloudpickle.py in dumps(obj, protocol)
    893     try:
    894         cp = CloudPickler(file, protocol=protocol)
--> 895         cp.dump(obj)
    896         return file.getvalue()
    897     finally:

C:\Miniconda3\lib\site-packages\cloudpickle\cloudpickle.py in dump(self, obj)
    266         self.inject_addons()
    267         try:
--> 268             return Pickler.dump(self, obj)
    269         except RuntimeError as e:
    270             if 'recursion' in e.args[0]:

C:\Miniconda3\lib\pickle.py in dump(self, obj)
    407         if self.proto >= 4:
    408             self.framer.start_framing()
--> 409         self.save(obj)
    410         self.write(STOP)
    411         self.framer.end_framing()

C:\Miniconda3\lib\pickle.py in save(self, obj, save_persistent_id)
    474         f = self.dispatch.get(t)
    475         if f is not None:
--> 476             f(self, obj) # Call unbound method with explicit self
    477             return
    478 

C:\Miniconda3\lib\pickle.py in save_tuple(self, obj)
    734         if n <= 3 and self.proto >= 2:
    735             for element in obj:
--> 736                 save(element)
    737             # Subtle.  Same as in the big comment below.
    738             if id(obj) in memo:

C:\Miniconda3\lib\pickle.py in save(self, obj, save_persistent_id)
    494             reduce = getattr(obj, "__reduce_ex__", None)
    495             if reduce is not None:
--> 496                 rv = reduce(self.proto)
    497             else:
    498                 reduce = getattr(obj, "__reduce__", None)

C:\Miniconda3\lib\site-packages\pyarrow\lib.cp36-win_amd64.pyd in pyarrow.lib.RecordBatch.__reduce_cython__()

TypeError: no default __reduce__ due to non-trivial __cinit__

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions