diff --git a/.gitignore b/.gitignore index eaf84826..a3fa088f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,22 @@ -src/questdb/ingress.html src/questdb/ingress.c +src/questdb/*.html rustup-init.exe +# Linux Perf profiles +perf.data* +perf/*.svg + +# Atheris Crash/OOM and other files +fuzz-artifact/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class +# Parquet files generated as part of example runs +*.parquet + # C extensions *.so diff --git a/.vscode/settings.json b/.vscode/settings.json index a7d0fc7b..c710dcde 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,7 @@ { - "esbonio.sphinx.confDir": "" + "esbonio.sphinx.confDir": "", + "cmake.configureOnOpen": false, + "files.associations": { + "ingress_helper.h": "c" + } } \ No newline at end of file diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1015d247..0fe8b7a3 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,9 +2,47 @@ Changelog ========= +1.1.0 (2023-01-04) +------------------ + +Features +~~~~~~~~ + +* High-performance ingestion of `Pandas `_ + dataframes into QuestDB via ILP. + We now support most Pandas column types. The logic is implemented in native + code and is orders of magnitude faster than iterating the dataframe + in Python and calling the ``Buffer.row()`` or ``Sender.row()`` methods: The + ``Buffer`` can be written from Pandas at hundreds of MiB/s per CPU core. + The new ``dataframe()`` method continues working with the ``auto_flush`` + feature. + See API documentation and examples for the new ``dataframe()`` method + available on both the ``Sender`` and ``Buffer`` classes. + +* New ``TimestampNanos.now()`` and ``TimestampMicros.now()`` methods. + *These are the new recommended way of getting the current timestamp.* + +* The Python GIL is now released during calls to ``Sender.flush()`` and when + ``auto_flush`` is triggered. This should improve throughput when using the + ``Sender`` from multiple threads. + +Errata +~~~~~~ + +* In previous releases the documentation for the ``from_datetime()`` methods of + the ``TimestampNanos`` and ``TimestampMicros`` types recommended calling + ``datetime.datetime.utcnow()`` to get the current timestamp. This is incorrect + as it will (confusingly) return an object with the local timezone instead of UTC. + This documentation has been corrected and now recommends calling + ``datetime.datetime.now(tz=datetime.timezone.utc)`` or (more efficiently) the + new ``TimestampNanos.now()`` and ``TimestampMicros.now()`` methods. + 1.0.2 (2022-10-31) ------------------ +Features +~~~~~~~~ + * Support for Python 3.11. * Updated to version 2.1.1 of the ``c-questdb-client`` library: @@ -14,13 +52,20 @@ Changelog 1.0.1 (2022-08-16) ------------------ +Features +~~~~~~~~ + +* As a matter of convenience, the ``Buffer.row`` method can now take ``None`` column + values. This has the same semantics as skipping the column altogether. + Closes `#3 `_. + +Bugfixes +~~~~~~~~ + * Fixed a major bug where Python ``int`` and ``float`` types were handled with 32-bit instead of 64-bit precision. This caused certain ``int`` values to be rejected and other ``float`` values to be rounded incorrectly. Closes `#13 `_. -* As a matter of convenience, the ``Buffer.row`` method can now take ``None`` column - values. This has the same semantics as skipping the column altogether. - Closes `#3 `_. * Fixed a minor bug where an error auto-flush caused a second clean-up error. Closes `#4 `_.
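Editor's note: the 1.1.0 entries above introduce ``dataframe()`` and ``TimestampNanos.now()``. The following is a minimal sketch of how the two fit together, based only on the features described in this changelog and the examples elsewhere in this patch; host, port, table and column names are illustrative and not taken from the diff.

```python
import pandas as pd
from questdb.ingress import Sender, TimestampNanos

# Illustrative data; any dataframe with supported column types works.
df = pd.DataFrame({
    'id': pd.Categorical(['toronto1', 'paris3']),
    'temperature': [20.0, 21.0]})

with Sender('localhost', 9009) as sender:
    # New in 1.1.0: serialize the whole dataframe in native code.
    # auto_flush behaves the same as for row().
    sender.dataframe(df, table_name='sensors')

    # TimestampNanos.now() is the newly recommended way to obtain the
    # current timestamp (rather than datetime.datetime.utcnow()).
    sender.row(
        'sensors',
        symbols={'id': 'madrid2'},
        columns={'temperature': 22.0},
        at=TimestampNanos.now())
```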
@@ -28,6 +73,9 @@ Changelog 1.0.0 (2022-07-15) ------------------ +Features +~~~~~~~~ + * First stable release. * Insert data into QuestDB via ILP. * Sender and Buffer APIs. @@ -38,6 +86,9 @@ Changelog 0.0.3 (2022-07-14) ------------------ +Features +~~~~~~~~ + * Initial set of features to connect to the database. * ``Buffer`` and ``Sender`` classes. * First release where ``pip install questdb`` should work. @@ -46,4 +97,7 @@ Changelog 0.0.1 (2022-07-08) ------------------ +Features +~~~~~~~~ + * First release on PyPI. diff --git a/README.rst b/README.rst index ae0e4947..a5dc9469 100644 --- a/README.rst +++ b/README.rst @@ -34,6 +34,22 @@ The latest version of the library is 1.0.2. columns={'temperature': 20.0, 'humidity': 0.5}) sender.flush() +You can also send Pandas dataframes: + +.. code-block:: python + + import pandas as pd + from questdb.ingress import Sender + + df = pd.DataFrame({ + 'id': pd.Categorical(['toronto1', 'paris3']), + 'temperature': [20.0, 21.0], + 'humidity': [0.5, 0.6], + 'timestamp': pd.to_datetime(['2021-01-01', '2021-01-02'])}) + + with Sender('localhost', 9009) as sender: + sender.dataframe(df, table_name='sensors') + Docs ==== diff --git a/TODO.rst b/TODO.rst index 7bb28af2..cb07d6f6 100644 --- a/TODO.rst +++ b/TODO.rst @@ -6,8 +6,6 @@ TODO Build Tooling ============= -* **[HIGH]** Transition to Azure, move Linux arm to ARM pipeline without QEMU. - * **[MEDIUM]** Automate Apple Silicon as part of CI. * **[LOW]** Release to PyPI from CI. @@ -19,13 +17,3 @@ Docs * **[MEDIUM]** Examples should be tested as part of the unit tests (as they are in the C client). This is to ensure they don't "bit rot" as the code changes. - -* **[MEDIUM]** Document on a per-version basis. - -Development -=========== - -* **[HIGH]** Implement ``tabular()`` API in the buffer. - -* **[MEDIUM]** Implement ``pandas()`` API in the buffer. - *This can probably wait for a future release.* \ No newline at end of file diff --git a/c-questdb-client b/c-questdb-client index 2cd4e7fb..ad3776ef 160000 --- a/c-questdb-client +++ b/c-questdb-client @@ -1 +1 @@ -Subproject commit 2cd4e7fb3ad10b20a7de28527cdf18cf240b9634 +Subproject commit ad3776efb057d09a86a83e15c0f39ae40d75485b diff --git a/ci/cibuildwheel.yaml b/ci/cibuildwheel.yaml index 1f612720..d6fa6c32 100644 --- a/ci/cibuildwheel.yaml +++ b/ci/cibuildwheel.yaml @@ -68,7 +68,7 @@ stages: - bash: | set -o errexit python3 -m pip install --upgrade pip - pip3 install cibuildwheel==2.11.1 + python3 -m pip install cibuildwheel==2.11.2 displayName: Install dependencies - bash: cibuildwheel --output-dir wheelhouse . displayName: Build wheels @@ -83,7 +83,7 @@ stages: - bash: | set -o errexit python3 -m pip install --upgrade pip - pip3 install cibuildwheel==2.11.1 + python3 -m pip install cibuildwheel==2.11.2 displayName: Install dependencies - bash: cibuildwheel --output-dir wheelhouse . displayName: Build wheels @@ -100,7 +100,7 @@ stages: - bash: | set -o errexit python3 -m pip install --upgrade pip - pip3 install cibuildwheel==2.11.1 + python3 -m pip install cibuildwheel==2.11.2 displayName: Install dependencies - bash: cibuildwheel --output-dir wheelhouse . displayName: Build wheels @@ -117,7 +117,7 @@ stages: - bash: | set -o errexit python3 -m pip install --upgrade pip - pip3 install cibuildwheel==2.11.1 + python3 -m pip install cibuildwheel==2.11.2 displayName: Install dependencies - bash: cibuildwheel --output-dir wheelhouse .
displayName: Build wheels @@ -134,7 +134,7 @@ stages: - bash: | set -o errexit python3 -m pip install --upgrade pip - pip3 install cibuildwheel==2.11.1 + python3 -m pip install cibuildwheel==2.11.2 displayName: Install dependencies - bash: cibuildwheel --output-dir wheelhouse . displayName: Build wheels @@ -151,7 +151,7 @@ stages: - bash: | set -o errexit python3 -m pip install --upgrade pip - python3 -m pip install cibuildwheel==2.11.1 + python3 -m pip install cibuildwheel==2.11.2 displayName: Install dependencies - bash: cibuildwheel --output-dir wheelhouse . displayName: Build wheels @@ -165,8 +165,8 @@ stages: - task: UsePythonVersion@0 - bash: | set -o errexit - python -m pip install --upgrade pip - pip install cibuildwheel==2.11.1 + python3 -m pip install --upgrade pip + python3 -m pip install cibuildwheel==2.11.2 displayName: Install dependencies - bash: cibuildwheel --output-dir wheelhouse . displayName: Build wheels diff --git a/ci/pip_install_deps.py b/ci/pip_install_deps.py new file mode 100644 index 00000000..13e9f247 --- /dev/null +++ b/ci/pip_install_deps.py @@ -0,0 +1,74 @@ +import sys +import subprocess +import shlex +import textwrap +import platform + + +class UnsupportedDependency(Exception): + pass + + +def pip_install(package): + args = [ + sys.executable, + '-m', 'pip', 'install', + '--upgrade', + '--only-binary', ':all:', + package] + args_s = ' '.join(shlex.quote(arg) for arg in args) + sys.stderr.write(args_s + '\n') + res = subprocess.run( + args, + stderr=subprocess.STDOUT, + stdout=subprocess.PIPE) + if res.returncode == 0: + return + output = res.stdout.decode('utf-8') + if 'Could not find a version that satisfies the requirement' in output: + raise UnsupportedDependency(output) + else: + sys.stderr.write(output + '\n') + sys.exit(res.returncode) + + +def try_pip_install(package): + try: + pip_install(package) + except UnsupportedDependency as e: + msg = textwrap.indent(str(e), ' ' * 8) + sys.stderr.write(f' Ignored unsatisfiable dependency:\n{msg}\n') + + +def ensure_timezone(): + try: + import zoneinfo + if platform.system() == 'Windows': + pip_install('tzdata') # for zoneinfo + except ImportError: + pip_install('pytz') + + +def main(): + ensure_timezone() + try_pip_install('fastparquet>=2022.12.0') + try_pip_install('pandas') + try_pip_install('numpy') + try_pip_install('pyarrow') + + on_linux_is_glibc = ( + (not platform.system() == 'Linux') or + (platform.libc_ver()[0] == 'glibc')) + is_64bits = sys.maxsize > 2**32 + is_cpython = platform.python_implementation() == 'CPython' + if on_linux_is_glibc and is_64bits and is_cpython: + # Ensure that we've managed to install the expected dependencies. 
+ import pandas + import numpy + import pyarrow + if sys.version_info >= (3, 8): + import fastparquet + + +if __name__ == "__main__": + main() diff --git a/ci/run_tests_pipeline.yaml b/ci/run_tests_pipeline.yaml index 78834b50..84f62c7e 100644 --- a/ci/run_tests_pipeline.yaml +++ b/ci/run_tests_pipeline.yaml @@ -28,7 +28,9 @@ stages: submodules: true - task: UsePythonVersion@0 - script: python3 --version - - script: python3 -m pip install cython + - script: | + python3 -m pip install cython + python3 ci/pip_install_deps.py displayName: Installing Python dependencies - script: python3 proj.py build displayName: "Build" diff --git a/dev_requirements.txt b/dev_requirements.txt index a5e835a7..c639c014 100644 --- a/dev_requirements.txt +++ b/dev_requirements.txt @@ -1,8 +1,12 @@ setuptools>=45.2.0 Cython>=0.29.32 wheel>=0.34.2 -cibuildwheel>=2.11.1 +cibuildwheel>=2.11.2 Sphinx>=5.0.2 sphinx-rtd-theme>=1.0.0 twine>=4.0.1 bump2version>=1.0.1 +pandas>=1.3.5 +numpy>=1.21.6 +pyarrow>=10.0.1 +fastparquet>=2022.12.0 diff --git a/docs/examples.rst b/docs/examples.rst index a1a4409e..b9e01081 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -5,6 +5,9 @@ Examples Basics ====== +Row-by-row Insertion +-------------------- + The following example connects to the database and sends two rows (lines). The connection is unauthenticated and the data is sent at the end of the @@ -18,7 +21,7 @@ Here the :class:`questdb.ingress.Sender` is constructed with just ``host`` and Authentication and TLS -====================== +---------------------- Continuing from the previous example, the connection is authenticated and also uses TLS. @@ -31,7 +34,7 @@ and ``tls`` arguments. Explicit Buffers -================ +---------------- For more advanced use cases where the same messages need to be sent to multiple questdb instances or you want to decouple serialization and sending (as may be @@ -48,7 +51,7 @@ all data is sent. Ticking Random Data and Timer-based Flush -========================================= +----------------------------------------- The following example somewhat mimics the behavior of a loop in an application. @@ -57,3 +60,57 @@ based on a timer if the auto-flushing logic was not triggered recently. .. literalinclude:: ../examples/random_data.py :language: python + + +Data Frames +=========== + +Pandas Basics +------------- + +The following example shows how to insert data from a Pandas DataFrame to the +``'trades'`` table. + +.. literalinclude:: ../examples/pandas_basic.py + :language: python + +For details on all options, see the +:func:`questdb.ingress.Buffer.dataframe` method. + + +``pd.Categorical`` and multiple tables +-------------------------------------- + +The next example shows some more advanced features inserting data from Pandas. + +* The data is sent to multiple tables. + +* It uses the ``pd.Categorical`` type to determine the table to insert and also + uses it for the sensor name. + +* Columns of type ``pd.Categorical`` are sent as ``SYMBOL`` types. + +* The ``at`` parameter is specified using a column index: -1 is the last column. + +.. literalinclude:: ../examples/pandas_advanced.py + :language: python + +After running this example, the rows will be split across the ``'humidity'``, +``'temp_c'`` and ``'voc_index'`` tables. + +For details on all options, see the +:func:`questdb.ingress.Buffer.dataframe` method. + +Loading Pandas from a Parquet File +---------------------------------- + +The following example shows how to load a Pandas DataFrame from a Parquet file. 
+ +The example also relies on the dataframe's index name to determine the table +name. + +.. literalinclude:: ../examples/pandas_parquet.py + :language: python + +For details on all options, see the +:func:`questdb.ingress.Buffer.dataframe` method. diff --git a/docs/installation.rst b/docs/installation.rst index 0d926ffe..7f450b3b 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -5,9 +5,9 @@ Installation The Python QuestDB client does not have any additional run-time dependencies and will run on any version of Python >= 3.7 on most platforms and architectures. -You can install it globally by running:: +You can install it (or update it) globally by running:: - python3 -m pip install questdb + python3 -m pip install -U questdb Or, from within a virtual environment:: @@ -20,6 +20,15 @@ If you're using poetry, you can add ``questdb`` as a dependency:: poetry add questdb +Note that the :func:`questdb.ingress.Buffer.dataframe` and the +:func:`questdb.ingress.Sender.dataframe` methods also require the following +dependencies to be installed: + +* ``pandas`` +* ``pyarrow`` +* ``numpy`` + + Verifying the Installation ========================== @@ -34,3 +43,16 @@ following statements from a ``python3`` interactive shell: >>> str(buf) 'test,a=b\n' + +If you also want to check if you can serialize from Pandas +(which requires additional dependencies): + +.. code-block:: python + + >>> import questdb.ingress + >>> import pandas as pd + >>> df = pd.DataFrame({'a': [1, 2]}) + >>> buf = questdb.ingress.Buffer() + >>> buf.dataframe(df, table_name='test') + >>> str(buf) + 'test a=1i\ntest a=2i\n' diff --git a/examples.manifest.yaml b/examples.manifest.yaml index 9379adbf..75a1e3fb 100644 --- a/examples.manifest.yaml +++ b/examples.manifest.yaml @@ -5,6 +5,9 @@ Python client library [docs](https://py-questdb-client.readthedocs.io/en/latest/) and [repo](https://github.com/questdb/py-questdb-client). + See more [examples](https://py-questdb-client.readthedocs.io/en/latest/examples.html), + including ingesting data from Pandas dataframes. + ``` python3 -m pip install questdb ``` @@ -15,6 +18,9 @@ Python client library [docs](https://py-questdb-client.readthedocs.io/en/latest/) and [repo](https://github.com/questdb/py-questdb-client). + See more [examples](https://py-questdb-client.readthedocs.io/en/latest/examples.html), + including ingesting data from Pandas dataframes. + ``` python3 -m pip install questdb ``` diff --git a/examples/pandas_advanced.py b/examples/pandas_advanced.py new file mode 100644 index 00000000..7b163cd4 --- /dev/null +++ b/examples/pandas_advanced.py @@ -0,0 +1,33 @@ +from questdb.ingress import Sender, IngressError + +import sys +import pandas as pd + + +def example(host: str = 'localhost', port: int = 9009): + df = pd.DataFrame({ + 'metric': pd.Categorical( + ['humidity', 'temp_c', 'voc_index', 'temp_c']), + 'sensor': pd.Categorical( + ['paris-01', 'london-02', 'london-01', 'paris-01']), + 'value': [ + 0.83, 22.62, 100.0, 23.62], + 'ts': [ + pd.Timestamp('2022-08-06 07:35:23.189062'), + pd.Timestamp('2022-08-06 07:35:23.189062'), + pd.Timestamp('2022-08-06 07:35:23.189062'), + pd.Timestamp('2022-08-06 07:35:23.189062')]}) + try: + with Sender(host, port) as sender: + sender.dataframe( + df, + table_name_col='metric', # Table name from 'metric' column. + symbols='auto', # Category columns as SYMBOL. (Default) + at=-1) # Last column contains the designated timestamps.
+ + except IngressError as e: + sys.stderr.write(f'Got error: {e}\n') + + +if __name__ == '__main__': + example() diff --git a/examples/pandas_basic.py b/examples/pandas_basic.py new file mode 100644 index 00000000..3c07d7fc --- /dev/null +++ b/examples/pandas_basic.py @@ -0,0 +1,29 @@ +from questdb.ingress import Sender, IngressError + +import sys +import pandas as pd + + +def example(host: str = 'localhost', port: int = 9009): + df = pd.DataFrame({ + 'pair': ['USDGBP', 'EURJPY'], + 'traded_price': [0.83, 142.62], + 'qty': [100, 400], + 'limit_price': [0.84, None], + 'timestamp': [ + pd.Timestamp('2022-08-06 07:35:23.189062', tz='UTC'), + pd.Timestamp('2022-08-06 07:35:23.189062', tz='UTC')]}) + try: + with Sender(host, port) as sender: + sender.dataframe( + df, + table_name='trades', # Table name to insert into. + symbols=['pair'], # Columns to be inserted as SYMBOL types. + at='timestamp') # Column containing the designated timestamps. + + except IngressError as e: + sys.stderr.write(f'Got error: {e}\n') + + +if __name__ == '__main__': + example() diff --git a/examples/pandas_parquet.py b/examples/pandas_parquet.py new file mode 100644 index 00000000..0d3b315d --- /dev/null +++ b/examples/pandas_parquet.py @@ -0,0 +1,43 @@ +from questdb.ingress import Sender +import pandas as pd + + +def write_parquet_file(): + df = pd.DataFrame({ + 'location': pd.Categorical( + ['BP-5541', 'UB-3355', 'SL-0995', 'BP-6653']), + 'provider': pd.Categorical( + ['BP Pulse', 'Ubitricity', 'Source London', 'BP Pulse']), + 'speed_kwh': pd.Categorical( + [50, 7, 7, 120]), + 'connector_type': pd.Categorical( + ['Type 2 & 2+CCS', 'Type 1 & 2', 'Type 1 & 2', 'Type 2 & 2+CCS']), + 'current_type': pd.Categorical( + ['dc', 'ac', 'ac', 'dc']), + 'price_pence': + [54, 34, 32, 59], + 'in_use': + [True, False, False, True], + 'ts': [ + pd.Timestamp('2022-12-30 12:15:00'), + pd.Timestamp('2022-12-30 12:16:00'), + pd.Timestamp('2022-12-30 12:18:00'), + pd.Timestamp('2022-12-30 12:19:00')]}) + name = 'ev_chargers' + df.index.name = name # We set the dataframe's index name here! + filename = f'{name}.parquet' + df.to_parquet(filename) + return filename + + +def example(host: str = 'localhost', port: int = 9009): + filename = write_parquet_file() + + df = pd.read_parquet(filename) + with Sender(host, port) as sender: + # Note: Table name is looked up from the dataframe's index name. + sender.dataframe(df, at='ts') + + +if __name__ == '__main__': + example() diff --git a/install_rust.py b/install_rust.py index a0adacca..d0de3696 100644 --- a/install_rust.py +++ b/install_rust.py @@ -1,4 +1,5 @@ import sys +sys.dont_write_bytecode = True import os import subprocess import pathlib diff --git a/perf/README.md b/perf/README.md new file mode 100644 index 00000000..a6ba49d8 --- /dev/null +++ b/perf/README.md @@ -0,0 +1,28 @@ +# Profiling with Linux Perf + +https://juanjose.garciaripoll.com/blog/profiling-code-with-linux-perf/index.html + +```bash +$ TEST_QUESTDB_PATCH_PATH=1 perf record -g --call-graph dwarf python3 test/benchmark.py -v TestBencharkPandas.test_string_encoding_1m +test_string_encoding_1m (__main__.TestBencharkPandas.test_string_encoding_1m) ... Time: 4.682273147998785, size: 4593750000 +ok + +---------------------------------------------------------------------- +Ran 1 test in 10.166s + +OK +[ perf record: Woken up 1341 times to write data ] +Warning: +Processed 54445 events and lost 91 chunks! + +Check IO/CPU overload! 
+ +[ perf record: Captured and wrote 405.575 MB perf.data (50622 samples) ] +``` + +# Rendering results + +```bash +$ perf script | python3 perf/gprof2dot.py --format=perf | dot -Tsvg > perf/profile_graph.svg +$ (cd perf && python3 -m http.server) +``` \ No newline at end of file diff --git a/perf/gprof2dot.py b/perf/gprof2dot.py new file mode 100644 index 00000000..99554b2a --- /dev/null +++ b/perf/gprof2dot.py @@ -0,0 +1,3555 @@ +#!/usr/bin/env python3 +# +# Copyright 2008-2017 Jose Fonseca +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . +# + +"""Generate a dot graph from the output of several profilers.""" + +__author__ = "Jose Fonseca et al" + + +import sys +import math +import os.path +import re +import textwrap +import optparse +import xml.parsers.expat +import collections +import locale +import json +import fnmatch + +# Python 2.x/3.x compatibility +if sys.version_info[0] >= 3: + PYTHON_3 = True + def compat_iteritems(x): return x.items() # No iteritems() in Python 3 + def compat_itervalues(x): return x.values() # No itervalues() in Python 3 + def compat_keys(x): return list(x.keys()) # keys() is a generator in Python 3 + basestring = str # No class basestring in Python 3 + unichr = chr # No unichr in Python 3 + xrange = range # No xrange in Python 3 +else: + PYTHON_3 = False + def compat_iteritems(x): return x.iteritems() + def compat_itervalues(x): return x.itervalues() + def compat_keys(x): return x.keys() + + + +######################################################################## +# Model + + +MULTIPLICATION_SIGN = unichr(0xd7) + + +def times(x): + return "%u%s" % (x, MULTIPLICATION_SIGN) + +def percentage(p): + return "%.02f%%" % (p*100.0,) + +def add(a, b): + return a + b + +def fail(a, b): + assert False + + +tol = 2 ** -23 + +def ratio(numerator, denominator): + try: + ratio = float(numerator)/float(denominator) + except ZeroDivisionError: + # 0/0 is undefined, but 1.0 yields more useful results + return 1.0 + if ratio < 0.0: + if ratio < -tol: + sys.stderr.write('warning: negative ratio (%s/%s)\n' % (numerator, denominator)) + return 0.0 + if ratio > 1.0: + if ratio > 1.0 + tol: + sys.stderr.write('warning: ratio greater than one (%s/%s)\n' % (numerator, denominator)) + return 1.0 + return ratio + + +class UndefinedEvent(Exception): + """Raised when attempting to get an event which is undefined.""" + + def __init__(self, event): + Exception.__init__(self) + self.event = event + + def __str__(self): + return 'unspecified event %s' % self.event.name + + +class Event(object): + """Describe a kind of event, and its basic operations.""" + + def __init__(self, name, null, aggregator, formatter = str): + self.name = name + self._null = null + self._aggregator = aggregator + self._formatter = formatter + + def __eq__(self, other): + return self is other + + def __hash__(self): + return id(self) + + def null(self): + return self._null + + def aggregate(self, val1, val2): + """Aggregate two event 
values.""" + assert val1 is not None + assert val2 is not None + return self._aggregator(val1, val2) + + def format(self, val): + """Format an event value.""" + assert val is not None + return self._formatter(val) + + +CALLS = Event("Calls", 0, add, times) +SAMPLES = Event("Samples", 0, add, times) +SAMPLES2 = Event("Samples", 0, add, times) + +# Count of samples where a given function was either executing or on the stack. +# This is used to calculate the total time ratio according to the +# straightforward method described in Mike Dunlavey's answer to +# stackoverflow.com/questions/1777556/alternatives-to-gprof, item 4 (the myth +# "that recursion is a tricky confusing issue"), last edited 2012-08-30: it's +# just the ratio of TOTAL_SAMPLES over the number of samples in the profile. +# +# Used only when totalMethod == callstacks +TOTAL_SAMPLES = Event("Samples", 0, add, times) + +TIME = Event("Time", 0.0, add, lambda x: '(' + str(x) + ')') +TIME_RATIO = Event("Time ratio", 0.0, add, lambda x: '(' + percentage(x) + ')') +TOTAL_TIME = Event("Total time", 0.0, fail) +TOTAL_TIME_RATIO = Event("Total time ratio", 0.0, fail, percentage) + +labels = { + 'self-time': TIME, + 'self-time-percentage': TIME_RATIO, + 'total-time': TOTAL_TIME, + 'total-time-percentage': TOTAL_TIME_RATIO, +} +defaultLabelNames = ['total-time-percentage', 'self-time-percentage'] + +totalMethod = 'callratios' + + +class Object(object): + """Base class for all objects in profile which can store events.""" + + def __init__(self, events=None): + if events is None: + self.events = {} + else: + self.events = events + + def __hash__(self): + return id(self) + + def __eq__(self, other): + return self is other + + def __lt__(self, other): + return id(self) < id(other) + + def __contains__(self, event): + return event in self.events + + def __getitem__(self, event): + try: + return self.events[event] + except KeyError: + raise UndefinedEvent(event) + + def __setitem__(self, event, value): + if value is None: + if event in self.events: + del self.events[event] + else: + self.events[event] = value + + +class Call(Object): + """A call between functions. + + There should be at most one call object for every pair of functions. 
+ """ + + def __init__(self, callee_id): + Object.__init__(self) + self.callee_id = callee_id + self.ratio = None + self.weight = None + + +class Function(Object): + """A function.""" + + def __init__(self, id, name): + Object.__init__(self) + self.id = id + self.name = name + self.module = None + self.process = None + self.calls = {} + self.called = None + self.weight = None + self.cycle = None + self.filename = None + + def add_call(self, call): + if call.callee_id in self.calls: + sys.stderr.write('warning: overwriting call from function %s to %s\n' % (str(self.id), str(call.callee_id))) + self.calls[call.callee_id] = call + + def get_call(self, callee_id): + if not callee_id in self.calls: + call = Call(callee_id) + call[SAMPLES] = 0 + call[SAMPLES2] = 0 + call[CALLS] = 0 + self.calls[callee_id] = call + return self.calls[callee_id] + + _parenthesis_re = re.compile(r'\([^()]*\)') + _angles_re = re.compile(r'<[^<>]*>') + _const_re = re.compile(r'\s+const$') + + def stripped_name(self): + """Remove extraneous information from C++ demangled function names.""" + + name = self.name + + # Strip function parameters from name by recursively removing paired parenthesis + while True: + name, n = self._parenthesis_re.subn('', name) + if not n: + break + + # Strip const qualifier + name = self._const_re.sub('', name) + + # Strip template parameters from name by recursively removing paired angles + while True: + name, n = self._angles_re.subn('', name) + if not n: + break + + return name + + # TODO: write utility functions + + def __repr__(self): + return self.name + + def dump(self, sep1=",\n\t", sep2=":=", sep3="\n"): + """ Returns as a string all information available in this Function object + separators sep1:between entries + sep2:between attribute name and value, + sep3: inserted at end + """ + return sep1.join("".join(k,sep2,v) for (k,v) in sorted(self.__dict__.items())) + sep3 + +class Cycle(Object): + """A cycle made from recursive function calls.""" + + def __init__(self): + Object.__init__(self) + self.functions = set() + + def add_function(self, function): + assert function not in self.functions + self.functions.add(function) + if function.cycle is not None: + for other in function.cycle.functions: + if function not in self.functions: + self.add_function(other) + function.cycle = self + + +class Profile(Object): + """The whole profile.""" + + def __init__(self): + Object.__init__(self) + self.functions = {} + self.cycles = [] + + def add_function(self, function): + if function.id in self.functions: + sys.stderr.write('warning: overwriting function %s (id %s)\n' % (function.name, str(function.id))) + self.functions[function.id] = function + + def add_cycle(self, cycle): + self.cycles.append(cycle) + + def validate(self): + """Validate the edges.""" + + for function in compat_itervalues(self.functions): + for callee_id in compat_keys(function.calls): + assert function.calls[callee_id].callee_id == callee_id + if callee_id not in self.functions: + sys.stderr.write('warning: call to undefined function %s from function %s\n' % (str(callee_id), function.name)) + del function.calls[callee_id] + + def find_cycles(self): + """Find cycles using Tarjan's strongly connected components algorithm.""" + + # Apply the Tarjan's algorithm successively until all functions are visited + stack = [] + data = {} + order = 0 + for function in compat_itervalues(self.functions): + order = self._tarjan(function, order, stack, data) + cycles = [] + for function in compat_itervalues(self.functions): + if 
function.cycle is not None and function.cycle not in cycles: + cycles.append(function.cycle) + self.cycles = cycles + if 0: + for cycle in cycles: + sys.stderr.write("Cycle:\n") + for member in cycle.functions: + sys.stderr.write("\tFunction %s\n" % member.name) + + def prune_root(self, roots, depth=-1): + visited = set() + frontier = set([(root_node, depth) for root_node in roots]) + while len(frontier) > 0: + node, node_depth = frontier.pop() + visited.add(node) + if node_depth == 0: + continue + f = self.functions[node] + newNodes = set(f.calls.keys()) - visited + frontier = frontier.union({(new_node, node_depth - 1) for new_node in newNodes}) + subtreeFunctions = {} + for n in visited: + f = self.functions[n] + newCalls = {} + for c in f.calls.keys(): + if c in visited: + newCalls[c] = f.calls[c] + f.calls = newCalls + subtreeFunctions[n] = f + self.functions = subtreeFunctions + + def prune_leaf(self, leafs, depth=-1): + edgesUp = collections.defaultdict(set) + for f in self.functions.keys(): + for n in self.functions[f].calls.keys(): + edgesUp[n].add(f) + # build the tree up + visited = set() + frontier = set([(leaf_node, depth) for leaf_node in leafs]) + while len(frontier) > 0: + node, node_depth = frontier.pop() + visited.add(node) + if node_depth == 0: + continue + newNodes = edgesUp[node] - visited + frontier = frontier.union({(new_node, node_depth - 1) for new_node in newNodes}) + downTree = set(self.functions.keys()) + upTree = visited + path = downTree.intersection(upTree) + pathFunctions = {} + for n in path: + f = self.functions[n] + newCalls = {} + for c in f.calls.keys(): + if c in path: + newCalls[c] = f.calls[c] + f.calls = newCalls + pathFunctions[n] = f + self.functions = pathFunctions + + def getFunctionIds(self, funcName): + function_names = {v.name: k for (k, v) in self.functions.items()} + return [function_names[name] for name in fnmatch.filter(function_names.keys(), funcName)] + + def getFunctionId(self, funcName): + for f in self.functions: + if self.functions[f].name == funcName: + return f + return False + + def printFunctionIds(self, selector=None, file=sys.stderr): + """ Print to file function entries selected by fnmatch.fnmatch like in + method getFunctionIds, with following extensions: + - selector starts with "%": dump all information available + - selector is '+' or '-': select all function entries + """ + if selector is None or selector in ("+", "*"): + v = ",\n".join(("%s:\t%s" % (kf,self.functions[kf].name) + for kf in self.functions.keys())) + else: + if selector[0]=="%": + selector=selector[1:] + function_info={k:v for (k,v) + in self.functions.items() + if fnmatch.fnmatch(v.name,selector)} + v = ",\n".join( ("%s\t({k})\t(%s)::\n\t%s" % (v.name,type(v),v.dump()) + for (k,v) in function_info.items() + )) + + else: + function_names = (v.name for v in self.functions.values()) + v = ",\n".join( ( nm for nm in fnmatch.filter(function_names,selector ))) + + file.write(v+"\n") + file.flush() + + class _TarjanData: + def __init__(self, order): + self.order = order + self.lowlink = order + self.onstack = False + + def _tarjan(self, function, order, stack, data): + """Tarjan's strongly connected components algorithm. 
+ + See also: + - http://en.wikipedia.org/wiki/Tarjan's_strongly_connected_components_algorithm + """ + + try: + func_data = data[function.id] + return order + except KeyError: + func_data = self._TarjanData(order) + data[function.id] = func_data + order += 1 + pos = len(stack) + stack.append(function) + func_data.onstack = True + for call in compat_itervalues(function.calls): + try: + callee_data = data[call.callee_id] + if callee_data.onstack: + func_data.lowlink = min(func_data.lowlink, callee_data.order) + except KeyError: + callee = self.functions[call.callee_id] + order = self._tarjan(callee, order, stack, data) + callee_data = data[call.callee_id] + func_data.lowlink = min(func_data.lowlink, callee_data.lowlink) + if func_data.lowlink == func_data.order: + # Strongly connected component found + members = stack[pos:] + del stack[pos:] + if len(members) > 1: + cycle = Cycle() + for member in members: + cycle.add_function(member) + data[member.id].onstack = False + else: + for member in members: + data[member.id].onstack = False + return order + + def call_ratios(self, event): + # Aggregate for incoming calls + cycle_totals = {} + for cycle in self.cycles: + cycle_totals[cycle] = 0.0 + function_totals = {} + for function in compat_itervalues(self.functions): + function_totals[function] = 0.0 + + # Pass 1: function_total gets the sum of call[event] for all + # incoming arrows. Same for cycle_total for all arrows + # that are coming into the *cycle* but are not part of it. + for function in compat_itervalues(self.functions): + for call in compat_itervalues(function.calls): + if call.callee_id != function.id: + callee = self.functions[call.callee_id] + if event in call.events: + function_totals[callee] += call[event] + if callee.cycle is not None and callee.cycle is not function.cycle: + cycle_totals[callee.cycle] += call[event] + else: + sys.stderr.write("call_ratios: No data for " + function.name + " call to " + callee.name + "\n") + + # Pass 2: Compute the ratios. Each call[event] is scaled by the + # function_total of the callee. Calls into cycles use the + # cycle_total, but not calls within cycles. + for function in compat_itervalues(self.functions): + for call in compat_itervalues(function.calls): + assert call.ratio is None + if call.callee_id != function.id: + callee = self.functions[call.callee_id] + if event in call.events: + if callee.cycle is not None and callee.cycle is not function.cycle: + total = cycle_totals[callee.cycle] + else: + total = function_totals[callee] + call.ratio = ratio(call[event], total) + else: + # Warnings here would only repeat those issued above. + call.ratio = 0.0 + + def integrate(self, outevent, inevent): + """Propagate function time ratio along the function calls. + + Must be called after finding the cycles. 
+ + See also: + - http://citeseer.ist.psu.edu/graham82gprof.html + """ + + # Sanity checking + assert outevent not in self + for function in compat_itervalues(self.functions): + assert outevent not in function + assert inevent in function + for call in compat_itervalues(function.calls): + assert outevent not in call + if call.callee_id != function.id: + assert call.ratio is not None + + # Aggregate the input for each cycle + for cycle in self.cycles: + total = inevent.null() + for function in compat_itervalues(self.functions): + total = inevent.aggregate(total, function[inevent]) + self[inevent] = total + + # Integrate along the edges + total = inevent.null() + for function in compat_itervalues(self.functions): + total = inevent.aggregate(total, function[inevent]) + self._integrate_function(function, outevent, inevent) + self[outevent] = total + + def _integrate_function(self, function, outevent, inevent): + if function.cycle is not None: + return self._integrate_cycle(function.cycle, outevent, inevent) + else: + if outevent not in function: + total = function[inevent] + for call in compat_itervalues(function.calls): + if call.callee_id != function.id: + total += self._integrate_call(call, outevent, inevent) + function[outevent] = total + return function[outevent] + + def _integrate_call(self, call, outevent, inevent): + assert outevent not in call + assert call.ratio is not None + callee = self.functions[call.callee_id] + subtotal = call.ratio *self._integrate_function(callee, outevent, inevent) + call[outevent] = subtotal + return subtotal + + def _integrate_cycle(self, cycle, outevent, inevent): + if outevent not in cycle: + + # Compute the outevent for the whole cycle + total = inevent.null() + for member in cycle.functions: + subtotal = member[inevent] + for call in compat_itervalues(member.calls): + callee = self.functions[call.callee_id] + if callee.cycle is not cycle: + subtotal += self._integrate_call(call, outevent, inevent) + total += subtotal + cycle[outevent] = total + + # Compute the time propagated to callers of this cycle + callees = {} + for function in compat_itervalues(self.functions): + if function.cycle is not cycle: + for call in compat_itervalues(function.calls): + callee = self.functions[call.callee_id] + if callee.cycle is cycle: + try: + callees[callee] += call.ratio + except KeyError: + callees[callee] = call.ratio + + for member in cycle.functions: + member[outevent] = outevent.null() + + for callee, call_ratio in compat_iteritems(callees): + ranks = {} + call_ratios = {} + partials = {} + self._rank_cycle_function(cycle, callee, ranks) + self._call_ratios_cycle(cycle, callee, ranks, call_ratios, set()) + partial = self._integrate_cycle_function(cycle, callee, call_ratio, partials, ranks, call_ratios, outevent, inevent) + + # Ensure `partial == max(partials.values())`, but with round-off tolerance + max_partial = max(partials.values()) + assert abs(partial - max_partial) <= 1e-7*max_partial + + assert abs(call_ratio*total - partial) <= 0.001*call_ratio*total + + return cycle[outevent] + + def _rank_cycle_function(self, cycle, function, ranks): + """Dijkstra's shortest paths algorithm. 
+ + See also: + - http://en.wikipedia.org/wiki/Dijkstra's_algorithm + """ + + import heapq + Q = [] + Qd = {} + p = {} + visited = set([function]) + + ranks[function] = 0 + for call in compat_itervalues(function.calls): + if call.callee_id != function.id: + callee = self.functions[call.callee_id] + if callee.cycle is cycle: + ranks[callee] = 1 + item = [ranks[callee], function, callee] + heapq.heappush(Q, item) + Qd[callee] = item + + while Q: + cost, parent, member = heapq.heappop(Q) + if member not in visited: + p[member]= parent + visited.add(member) + for call in compat_itervalues(member.calls): + if call.callee_id != member.id: + callee = self.functions[call.callee_id] + if callee.cycle is cycle: + member_rank = ranks[member] + rank = ranks.get(callee) + if rank is not None: + if rank > 1 + member_rank: + rank = 1 + member_rank + ranks[callee] = rank + Qd_callee = Qd[callee] + Qd_callee[0] = rank + Qd_callee[1] = member + heapq._siftdown(Q, 0, Q.index(Qd_callee)) + else: + rank = 1 + member_rank + ranks[callee] = rank + item = [rank, member, callee] + heapq.heappush(Q, item) + Qd[callee] = item + + def _call_ratios_cycle(self, cycle, function, ranks, call_ratios, visited): + if function not in visited: + visited.add(function) + for call in compat_itervalues(function.calls): + if call.callee_id != function.id: + callee = self.functions[call.callee_id] + if callee.cycle is cycle: + if ranks[callee] > ranks[function]: + call_ratios[callee] = call_ratios.get(callee, 0.0) + call.ratio + self._call_ratios_cycle(cycle, callee, ranks, call_ratios, visited) + + def _integrate_cycle_function(self, cycle, function, partial_ratio, partials, ranks, call_ratios, outevent, inevent): + if function not in partials: + partial = partial_ratio*function[inevent] + for call in compat_itervalues(function.calls): + if call.callee_id != function.id: + callee = self.functions[call.callee_id] + if callee.cycle is not cycle: + assert outevent in call + partial += partial_ratio*call[outevent] + else: + if ranks[callee] > ranks[function]: + callee_partial = self._integrate_cycle_function(cycle, callee, partial_ratio, partials, ranks, call_ratios, outevent, inevent) + call_ratio = ratio(call.ratio, call_ratios[callee]) + call_partial = call_ratio*callee_partial + try: + call[outevent] += call_partial + except UndefinedEvent: + call[outevent] = call_partial + partial += call_partial + partials[function] = partial + try: + function[outevent] += partial + except UndefinedEvent: + function[outevent] = partial + return partials[function] + + def aggregate(self, event): + """Aggregate an event for the whole profile.""" + + total = event.null() + for function in compat_itervalues(self.functions): + try: + total = event.aggregate(total, function[event]) + except UndefinedEvent: + return + self[event] = total + + def ratio(self, outevent, inevent): + assert outevent not in self + assert inevent in self + for function in compat_itervalues(self.functions): + assert outevent not in function + assert inevent in function + function[outevent] = ratio(function[inevent], self[inevent]) + for call in compat_itervalues(function.calls): + assert outevent not in call + if inevent in call: + call[outevent] = ratio(call[inevent], self[inevent]) + self[outevent] = 1.0 + + def prune(self, node_thres, edge_thres, paths, color_nodes_by_selftime): + """Prune the profile""" + + # compute the prune ratios + for function in compat_itervalues(self.functions): + try: + function.weight = function[TOTAL_TIME_RATIO] + except UndefinedEvent: + pass + 
+ for call in compat_itervalues(function.calls): + callee = self.functions[call.callee_id] + + if TOTAL_TIME_RATIO in call: + # handle exact cases first + call.weight = call[TOTAL_TIME_RATIO] + else: + try: + # make a safe estimate + call.weight = min(function[TOTAL_TIME_RATIO], callee[TOTAL_TIME_RATIO]) + except UndefinedEvent: + pass + + # prune the nodes + for function_id in compat_keys(self.functions): + function = self.functions[function_id] + if function.weight is not None: + if function.weight < node_thres: + del self.functions[function_id] + + # prune file paths + for function_id in compat_keys(self.functions): + function = self.functions[function_id] + if paths and function.filename and not any(function.filename.startswith(path) for path in paths): + del self.functions[function_id] + elif paths and function.module and not any((function.module.find(path)>-1) for path in paths): + del self.functions[function_id] + + # prune the edges + for function in compat_itervalues(self.functions): + for callee_id in compat_keys(function.calls): + call = function.calls[callee_id] + if callee_id not in self.functions or call.weight is not None and call.weight < edge_thres: + del function.calls[callee_id] + + if color_nodes_by_selftime: + weights = [] + for function in compat_itervalues(self.functions): + try: + weights.append(function[TIME_RATIO]) + except UndefinedEvent: + pass + max_ratio = max(weights or [1]) + + # apply rescaled weights for coloriung + for function in compat_itervalues(self.functions): + try: + function.weight = function[TIME_RATIO] / max_ratio + except (ZeroDivisionError, UndefinedEvent): + pass + + def dump(self): + for function in compat_itervalues(self.functions): + sys.stderr.write('Function %s:\n' % (function.name,)) + self._dump_events(function.events) + for call in compat_itervalues(function.calls): + callee = self.functions[call.callee_id] + sys.stderr.write(' Call %s:\n' % (callee.name,)) + self._dump_events(call.events) + for cycle in self.cycles: + sys.stderr.write('Cycle:\n') + self._dump_events(cycle.events) + for function in cycle.functions: + sys.stderr.write(' Function %s\n' % (function.name,)) + + def _dump_events(self, events): + for event, value in compat_iteritems(events): + sys.stderr.write(' %s: %s\n' % (event.name, event.format(value))) + + + +######################################################################## +# Parsers + + +class Struct: + """Masquerade a dictionary with a structure-like behavior.""" + + def __init__(self, attrs = None): + if attrs is None: + attrs = {} + self.__dict__['_attrs'] = attrs + + def __getattr__(self, name): + try: + return self._attrs[name] + except KeyError: + raise AttributeError(name) + + def __setattr__(self, name, value): + self._attrs[name] = value + + def __str__(self): + return str(self._attrs) + + def __repr__(self): + return repr(self._attrs) + + +class ParseError(Exception): + """Raised when parsing to signal mismatches.""" + + def __init__(self, msg, line): + Exception.__init__(self) + self.msg = msg + # TODO: store more source line information + self.line = line + + def __str__(self): + return '%s: %r' % (self.msg, self.line) + + +class Parser: + """Parser interface.""" + + stdinInput = True + multipleInput = False + + def __init__(self): + pass + + def parse(self): + raise NotImplementedError + + +class JsonParser(Parser): + """Parser for a custom JSON representation of profile data. + + See schema.json for details. 
+ """ + + + def __init__(self, stream): + Parser.__init__(self) + self.stream = stream + + def parse(self): + + obj = json.load(self.stream) + + assert obj['version'] == 0 + + profile = Profile() + profile[SAMPLES] = 0 + + fns = obj['functions'] + + for functionIndex in range(len(fns)): + fn = fns[functionIndex] + function = Function(functionIndex, fn['name']) + try: + function.module = fn['module'] + except KeyError: + pass + try: + function.process = fn['process'] + except KeyError: + pass + function[SAMPLES] = 0 + function.called = 0 + profile.add_function(function) + + for event in obj['events']: + callchain = [] + + for functionIndex in event['callchain']: + function = profile.functions[functionIndex] + callchain.append(function) + + # increment the call count of the first in the callchain + function = profile.functions[event['callchain'][0]] + function.called = function.called + 1 + + cost = event['cost'][0] + + callee = callchain[0] + callee[SAMPLES] += cost + profile[SAMPLES] += cost + + for caller in callchain[1:]: + try: + call = caller.calls[callee.id] + except KeyError: + call = Call(callee.id) + call[SAMPLES2] = cost + caller.add_call(call) + else: + call[SAMPLES2] += cost + + callee = caller + + if False: + profile.dump() + + # compute derived data + profile.validate() + profile.find_cycles() + profile.ratio(TIME_RATIO, SAMPLES) + profile.call_ratios(SAMPLES2) + profile.integrate(TOTAL_TIME_RATIO, TIME_RATIO) + + return profile + + +class LineParser(Parser): + """Base class for parsers that read line-based formats.""" + + def __init__(self, stream): + Parser.__init__(self) + self._stream = stream + self.__line = None + self.__eof = False + self.line_no = 0 + + def readline(self): + line = self._stream.readline() + if not line: + self.__line = '' + self.__eof = True + else: + self.line_no += 1 + line = line.rstrip('\r\n') + if not PYTHON_3: + encoding = self._stream.encoding + if encoding is None: + encoding = locale.getpreferredencoding() + line = line.decode(encoding) + self.__line = line + + def lookahead(self): + assert self.__line is not None + return self.__line + + def consume(self): + assert self.__line is not None + line = self.__line + self.readline() + return line + + def eof(self): + assert self.__line is not None + return self.__eof + + +XML_ELEMENT_START, XML_ELEMENT_END, XML_CHARACTER_DATA, XML_EOF = range(4) + + +class XmlToken: + + def __init__(self, type, name_or_data, attrs = None, line = None, column = None): + assert type in (XML_ELEMENT_START, XML_ELEMENT_END, XML_CHARACTER_DATA, XML_EOF) + self.type = type + self.name_or_data = name_or_data + self.attrs = attrs + self.line = line + self.column = column + + def __str__(self): + if self.type == XML_ELEMENT_START: + return '<' + self.name_or_data + ' ...>' + if self.type == XML_ELEMENT_END: + return '' + if self.type == XML_CHARACTER_DATA: + return self.name_or_data + if self.type == XML_EOF: + return 'end of file' + assert 0 + + +class XmlTokenizer: + """Expat based XML tokenizer.""" + + def __init__(self, fp, skip_ws = True): + self.fp = fp + self.tokens = [] + self.index = 0 + self.final = False + self.skip_ws = skip_ws + + self.character_pos = 0, 0 + self.character_data = '' + + self.parser = xml.parsers.expat.ParserCreate() + self.parser.StartElementHandler = self.handle_element_start + self.parser.EndElementHandler = self.handle_element_end + self.parser.CharacterDataHandler = self.handle_character_data + + def handle_element_start(self, name, attributes): + self.finish_character_data() + line, column 
= self.pos() + token = XmlToken(XML_ELEMENT_START, name, attributes, line, column) + self.tokens.append(token) + + def handle_element_end(self, name): + self.finish_character_data() + line, column = self.pos() + token = XmlToken(XML_ELEMENT_END, name, None, line, column) + self.tokens.append(token) + + def handle_character_data(self, data): + if not self.character_data: + self.character_pos = self.pos() + self.character_data += data + + def finish_character_data(self): + if self.character_data: + if not self.skip_ws or not self.character_data.isspace(): + line, column = self.character_pos + token = XmlToken(XML_CHARACTER_DATA, self.character_data, None, line, column) + self.tokens.append(token) + self.character_data = '' + + def next(self): + size = 16*1024 + while self.index >= len(self.tokens) and not self.final: + self.tokens = [] + self.index = 0 + data = self.fp.read(size) + self.final = len(data) < size + self.parser.Parse(data, self.final) + if self.index >= len(self.tokens): + line, column = self.pos() + token = XmlToken(XML_EOF, None, None, line, column) + else: + token = self.tokens[self.index] + self.index += 1 + return token + + def pos(self): + return self.parser.CurrentLineNumber, self.parser.CurrentColumnNumber + + +class XmlTokenMismatch(Exception): + + def __init__(self, expected, found): + Exception.__init__(self) + self.expected = expected + self.found = found + + def __str__(self): + return '%u:%u: %s expected, %s found' % (self.found.line, self.found.column, str(self.expected), str(self.found)) + + +class XmlParser(Parser): + """Base XML document parser.""" + + def __init__(self, fp): + Parser.__init__(self) + self.tokenizer = XmlTokenizer(fp) + self.consume() + + def consume(self): + self.token = self.tokenizer.next() + + def match_element_start(self, name): + return self.token.type == XML_ELEMENT_START and self.token.name_or_data == name + + def match_element_end(self, name): + return self.token.type == XML_ELEMENT_END and self.token.name_or_data == name + + def element_start(self, name): + while self.token.type == XML_CHARACTER_DATA: + self.consume() + if self.token.type != XML_ELEMENT_START: + raise XmlTokenMismatch(XmlToken(XML_ELEMENT_START, name), self.token) + if self.token.name_or_data != name: + raise XmlTokenMismatch(XmlToken(XML_ELEMENT_START, name), self.token) + attrs = self.token.attrs + self.consume() + return attrs + + def element_end(self, name): + while self.token.type == XML_CHARACTER_DATA: + self.consume() + if self.token.type != XML_ELEMENT_END: + raise XmlTokenMismatch(XmlToken(XML_ELEMENT_END, name), self.token) + if self.token.name_or_data != name: + raise XmlTokenMismatch(XmlToken(XML_ELEMENT_END, name), self.token) + self.consume() + + def character_data(self, strip = True): + data = '' + while self.token.type == XML_CHARACTER_DATA: + data += self.token.name_or_data + self.consume() + if strip: + data = data.strip() + return data + + +class GprofParser(Parser): + """Parser for GNU gprof output. 
+ + See also: + - Chapter "Interpreting gprof's Output" from the GNU gprof manual + http://sourceware.org/binutils/docs-2.18/gprof/Call-Graph.html#Call-Graph + - File "cg_print.c" from the GNU gprof source code + http://sourceware.org/cgi-bin/cvsweb.cgi/~checkout~/src/gprof/cg_print.c?rev=1.12&cvsroot=src + """ + + def __init__(self, fp): + Parser.__init__(self) + self.fp = fp + self.functions = {} + self.cycles = {} + + def readline(self): + line = self.fp.readline() + if not line: + sys.stderr.write('error: unexpected end of file\n') + sys.exit(1) + line = line.rstrip('\r\n') + return line + + _int_re = re.compile(r'^\d+$') + _float_re = re.compile(r'^\d+\.\d+$') + + def translate(self, mo): + """Extract a structure from a match object, while translating the types in the process.""" + attrs = {} + groupdict = mo.groupdict() + for name, value in compat_iteritems(groupdict): + if value is None: + value = None + elif self._int_re.match(value): + value = int(value) + elif self._float_re.match(value): + value = float(value) + attrs[name] = (value) + return Struct(attrs) + + _cg_header_re = re.compile( + # original gprof header + r'^\s+called/total\s+parents\s*$|' + + r'^index\s+%time\s+self\s+descendents\s+called\+self\s+name\s+index\s*$|' + + r'^\s+called/total\s+children\s*$|' + + # GNU gprof header + r'^index\s+%\s+time\s+self\s+children\s+called\s+name\s*$' + ) + + _cg_ignore_re = re.compile( + # spontaneous + r'^\s+\s*$|' + # internal calls (such as "mcount") + r'^.*\((\d+)\)$' + ) + + _cg_primary_re = re.compile( + r'^\[(?P\d+)\]?' + + r'\s+(?P\d+\.\d+)' + + r'\s+(?P\d+\.\d+)' + + r'\s+(?P\d+\.\d+)' + + r'\s+(?:(?P\d+)(?:\+(?P\d+))?)?' + + r'\s+(?P\S.*?)' + + r'(?:\s+\d+)>)?' + + r'\s\[(\d+)\]$' + ) + + _cg_parent_re = re.compile( + r'^\s+(?P\d+\.\d+)?' + + r'\s+(?P\d+\.\d+)?' + + r'\s+(?P\d+)(?:/(?P\d+))?' + + r'\s+(?P\S.*?)' + + r'(?:\s+\d+)>)?' + + r'\s\[(?P\d+)\]$' + ) + + _cg_child_re = _cg_parent_re + + _cg_cycle_header_re = re.compile( + r'^\[(?P\d+)\]?' + + r'\s+(?P\d+\.\d+)' + + r'\s+(?P\d+\.\d+)' + + r'\s+(?P\d+\.\d+)' + + r'\s+(?:(?P\d+)(?:\+(?P\d+))?)?' + + r'\s+\d+)\sas\sa\swhole>' + + r'\s\[(\d+)\]$' + ) + + _cg_cycle_member_re = re.compile( + r'^\s+(?P\d+\.\d+)?' + + r'\s+(?P\d+\.\d+)?' + + r'\s+(?P\d+)(?:\+(?P\d+))?' + + r'\s+(?P\S.*?)' + + r'(?:\s+\d+)>)?' 
+ + r'\s\[(?P\d+)\]$' + ) + + _cg_sep_re = re.compile(r'^--+$') + + def parse_function_entry(self, lines): + parents = [] + children = [] + + while True: + if not lines: + sys.stderr.write('warning: unexpected end of entry\n') + line = lines.pop(0) + if line.startswith('['): + break + + # read function parent line + mo = self._cg_parent_re.match(line) + if not mo: + if self._cg_ignore_re.match(line): + continue + sys.stderr.write('warning: unrecognized call graph entry: %r\n' % line) + else: + parent = self.translate(mo) + parents.append(parent) + + # read primary line + mo = self._cg_primary_re.match(line) + if not mo: + sys.stderr.write('warning: unrecognized call graph entry: %r\n' % line) + return + else: + function = self.translate(mo) + + while lines: + line = lines.pop(0) + + # read function subroutine line + mo = self._cg_child_re.match(line) + if not mo: + if self._cg_ignore_re.match(line): + continue + sys.stderr.write('warning: unrecognized call graph entry: %r\n' % line) + else: + child = self.translate(mo) + children.append(child) + + function.parents = parents + function.children = children + + self.functions[function.index] = function + + def parse_cycle_entry(self, lines): + + # read cycle header line + line = lines[0] + mo = self._cg_cycle_header_re.match(line) + if not mo: + sys.stderr.write('warning: unrecognized call graph entry: %r\n' % line) + return + cycle = self.translate(mo) + + # read cycle member lines + cycle.functions = [] + for line in lines[1:]: + mo = self._cg_cycle_member_re.match(line) + if not mo: + sys.stderr.write('warning: unrecognized call graph entry: %r\n' % line) + continue + call = self.translate(mo) + cycle.functions.append(call) + + self.cycles[cycle.cycle] = cycle + + def parse_cg_entry(self, lines): + if lines[0].startswith("["): + self.parse_cycle_entry(lines) + else: + self.parse_function_entry(lines) + + def parse_cg(self): + """Parse the call graph.""" + + # skip call graph header + while not self._cg_header_re.match(self.readline()): + pass + line = self.readline() + while self._cg_header_re.match(line): + line = self.readline() + + # process call graph entries + entry_lines = [] + while line != '\014': # form feed + if line and not line.isspace(): + if self._cg_sep_re.match(line): + self.parse_cg_entry(entry_lines) + entry_lines = [] + else: + entry_lines.append(line) + line = self.readline() + + def parse(self): + self.parse_cg() + self.fp.close() + + profile = Profile() + profile[TIME] = 0.0 + + cycles = {} + for index in self.cycles: + cycles[index] = Cycle() + + for entry in compat_itervalues(self.functions): + # populate the function + function = Function(entry.index, entry.name) + function[TIME] = entry.self + if entry.called is not None: + function.called = entry.called + if entry.called_self is not None: + call = Call(entry.index) + call[CALLS] = entry.called_self + function.called += entry.called_self + + # populate the function calls + for child in entry.children: + call = Call(child.index) + + assert child.called is not None + call[CALLS] = child.called + + if child.index not in self.functions: + # NOTE: functions that were never called but were discovered by gprof's + # static call graph analysis dont have a call graph entry so we need + # to add them here + missing = Function(child.index, child.name) + function[TIME] = 0.0 + function.called = 0 + profile.add_function(missing) + + function.add_call(call) + + profile.add_function(function) + + if entry.cycle is not None: + try: + cycle = cycles[entry.cycle] + except KeyError: 
+ sys.stderr.write('warning: entry missing\n' % entry.cycle) + cycle = Cycle() + cycles[entry.cycle] = cycle + cycle.add_function(function) + + profile[TIME] = profile[TIME] + function[TIME] + + for cycle in compat_itervalues(cycles): + profile.add_cycle(cycle) + + # Compute derived events + profile.validate() + profile.ratio(TIME_RATIO, TIME) + profile.call_ratios(CALLS) + profile.integrate(TOTAL_TIME, TIME) + profile.ratio(TOTAL_TIME_RATIO, TOTAL_TIME) + + return profile + + +# Clone&hack of GprofParser for VTune Amplifier XE 2013 gprof-cc output. +# Tested only with AXE 2013 for Windows. +# - Use total times as reported by AXE. +# - In the absence of call counts, call ratios are faked from the relative +# proportions of total time. This affects only the weighting of the calls. +# - Different header, separator, and end marker. +# - Extra whitespace after function names. +# - You get a full entry for , which does not have parents. +# - Cycles do have parents. These are saved but unused (as they are +# for functions). +# - Disambiguated "unrecognized call graph entry" error messages. +# Notes: +# - Total time of functions as reported by AXE passes the val3 test. +# - CPU Time:Children in the input is sometimes a negative number. This +# value goes to the variable descendants, which is unused. +# - The format of gprof-cc reports is unaffected by the use of +# -knob enable-call-counts=true (no call counts, ever), or +# -show-as=samples (results are quoted in seconds regardless). +class AXEParser(Parser): + "Parser for VTune Amplifier XE 2013 gprof-cc report output." + + def __init__(self, fp): + Parser.__init__(self) + self.fp = fp + self.functions = {} + self.cycles = {} + + def readline(self): + line = self.fp.readline() + if not line: + sys.stderr.write('error: unexpected end of file\n') + sys.exit(1) + line = line.rstrip('\r\n') + return line + + _int_re = re.compile(r'^\d+$') + _float_re = re.compile(r'^\d+\.\d+$') + + def translate(self, mo): + """Extract a structure from a match object, while translating the types in the process.""" + attrs = {} + groupdict = mo.groupdict() + for name, value in compat_iteritems(groupdict): + if value is None: + value = None + elif self._int_re.match(value): + value = int(value) + elif self._float_re.match(value): + value = float(value) + attrs[name] = (value) + return Struct(attrs) + + _cg_header_re = re.compile( + '^Index |' + '^-----+ ' + ) + + _cg_footer_re = re.compile(r'^Index\s+Function\s*$') + + _cg_primary_re = re.compile( + r'^\[(?P\d+)\]?' + + r'\s+(?P\d+\.\d+)' + + r'\s+(?P\d+\.\d+)' + + r'\s+(?P\d+\.\d+)' + + r'\s+(?P\S.*?)' + + r'(?:\s+\d+)>)?' + + r'\s+\[(\d+)\]' + + r'\s*$' + ) + + _cg_parent_re = re.compile( + r'^\s+(?P\d+\.\d+)?' + + r'\s+(?P\d+\.\d+)?' + + r'\s+(?P\S.*?)' + + r'(?:\s+\d+)>)?' + + r'(?:\s+\[(?P\d+)\]\s*)?' + + r'\s*$' + ) + + _cg_child_re = _cg_parent_re + + _cg_cycle_header_re = re.compile( + r'^\[(?P\d+)\]?' + + r'\s+(?P\d+\.\d+)' + + r'\s+(?P\d+\.\d+)' + + r'\s+(?P\d+\.\d+)' + + r'\s+\d+)\sas\sa\swhole>' + + r'\s+\[(\d+)\]' + + r'\s*$' + ) + + _cg_cycle_member_re = re.compile( + r'^\s+(?P\d+\.\d+)?' + + r'\s+(?P\d+\.\d+)?' + + r'\s+(?P\S.*?)' + + r'(?:\s+\d+)>)?' 
+ + r'\s+\[(?P\d+)\]' + + r'\s*$' + ) + + def parse_function_entry(self, lines): + parents = [] + children = [] + + while True: + if not lines: + sys.stderr.write('warning: unexpected end of entry\n') + return + line = lines.pop(0) + if line.startswith('['): + break + + # read function parent line + mo = self._cg_parent_re.match(line) + if not mo: + sys.stderr.write('warning: unrecognized call graph entry (1): %r\n' % line) + else: + parent = self.translate(mo) + if parent.name != '': + parents.append(parent) + + # read primary line + mo = self._cg_primary_re.match(line) + if not mo: + sys.stderr.write('warning: unrecognized call graph entry (2): %r\n' % line) + return + else: + function = self.translate(mo) + + while lines: + line = lines.pop(0) + + # read function subroutine line + mo = self._cg_child_re.match(line) + if not mo: + sys.stderr.write('warning: unrecognized call graph entry (3): %r\n' % line) + else: + child = self.translate(mo) + if child.name != '': + children.append(child) + + if function.name != '': + function.parents = parents + function.children = children + + self.functions[function.index] = function + + def parse_cycle_entry(self, lines): + + # Process the parents that were not there in gprof format. + parents = [] + while True: + if not lines: + sys.stderr.write('warning: unexpected end of cycle entry\n') + return + line = lines.pop(0) + if line.startswith('['): + break + mo = self._cg_parent_re.match(line) + if not mo: + sys.stderr.write('warning: unrecognized call graph entry (6): %r\n' % line) + else: + parent = self.translate(mo) + if parent.name != '': + parents.append(parent) + + # read cycle header line + mo = self._cg_cycle_header_re.match(line) + if not mo: + sys.stderr.write('warning: unrecognized call graph entry (4): %r\n' % line) + return + cycle = self.translate(mo) + + # read cycle member lines + cycle.functions = [] + for line in lines[1:]: + mo = self._cg_cycle_member_re.match(line) + if not mo: + sys.stderr.write('warning: unrecognized call graph entry (5): %r\n' % line) + continue + call = self.translate(mo) + cycle.functions.append(call) + + cycle.parents = parents + self.cycles[cycle.cycle] = cycle + + def parse_cg_entry(self, lines): + if any("as a whole" in linelooper for linelooper in lines): + self.parse_cycle_entry(lines) + else: + self.parse_function_entry(lines) + + def parse_cg(self): + """Parse the call graph.""" + + # skip call graph header + line = self.readline() + while self._cg_header_re.match(line): + line = self.readline() + + # process call graph entries + entry_lines = [] + # An EOF in readline terminates the program without returning. + while not self._cg_footer_re.match(line): + if line.isspace(): + self.parse_cg_entry(entry_lines) + entry_lines = [] + else: + entry_lines.append(line) + line = self.readline() + + def parse(self): + sys.stderr.write('warning: for axe format, edge weights are unreliable estimates derived from function total times.\n') + self.parse_cg() + self.fp.close() + + profile = Profile() + profile[TIME] = 0.0 + + cycles = {} + for index in self.cycles: + cycles[index] = Cycle() + + for entry in compat_itervalues(self.functions): + # populate the function + function = Function(entry.index, entry.name) + function[TIME] = entry.self + function[TOTAL_TIME_RATIO] = entry.percentage_time / 100.0 + + # populate the function calls + for child in entry.children: + call = Call(child.index) + # The following bogus value affects only the weighting of + # the calls. 
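+                # Descriptive note (added): AXE gprof-cc reports carry no call counts,
+                # so each call borrows the caller's total-time ratio as a stand-in
+                # weight (see the format notes above the AXEParser class).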
+ call[TOTAL_TIME_RATIO] = function[TOTAL_TIME_RATIO] + + if child.index not in self.functions: + # NOTE: functions that were never called but were discovered by gprof's + # static call graph analysis dont have a call graph entry so we need + # to add them here + # FIXME: Is this applicable? + missing = Function(child.index, child.name) + function[TIME] = 0.0 + profile.add_function(missing) + + function.add_call(call) + + profile.add_function(function) + + if entry.cycle is not None: + try: + cycle = cycles[entry.cycle] + except KeyError: + sys.stderr.write('warning: entry missing\n' % entry.cycle) + cycle = Cycle() + cycles[entry.cycle] = cycle + cycle.add_function(function) + + profile[TIME] = profile[TIME] + function[TIME] + + for cycle in compat_itervalues(cycles): + profile.add_cycle(cycle) + + # Compute derived events. + profile.validate() + profile.ratio(TIME_RATIO, TIME) + # Lacking call counts, fake call ratios based on total times. + profile.call_ratios(TOTAL_TIME_RATIO) + # The TOTAL_TIME_RATIO of functions is already set. Propagate that + # total time to the calls. (TOTAL_TIME is neither set nor used.) + for function in compat_itervalues(profile.functions): + for call in compat_itervalues(function.calls): + if call.ratio is not None: + callee = profile.functions[call.callee_id] + call[TOTAL_TIME_RATIO] = call.ratio * callee[TOTAL_TIME_RATIO] + + return profile + + +class CallgrindParser(LineParser): + """Parser for valgrind's callgrind tool. + + See also: + - http://valgrind.org/docs/manual/cl-format.html + """ + + _call_re = re.compile(r'^calls=\s*(\d+)\s+((\d+|\+\d+|-\d+|\*)\s+)+$') + + def __init__(self, infile): + LineParser.__init__(self, infile) + + # Textual positions + self.position_ids = {} + self.positions = {} + + # Numeric positions + self.num_positions = 1 + self.cost_positions = ['line'] + self.last_positions = [0] + + # Events + self.num_events = 0 + self.cost_events = [] + + self.profile = Profile() + self.profile[SAMPLES] = 0 + + def parse(self): + # read lookahead + self.readline() + + self.parse_key('version') + self.parse_key('creator') + while self.parse_part(): + pass + if not self.eof(): + sys.stderr.write('warning: line %u: unexpected line\n' % self.line_no) + sys.stderr.write('%s\n' % self.lookahead()) + + # compute derived data + self.profile.validate() + self.profile.find_cycles() + self.profile.ratio(TIME_RATIO, SAMPLES) + self.profile.call_ratios(SAMPLES2) + self.profile.integrate(TOTAL_TIME_RATIO, TIME_RATIO) + + return self.profile + + def parse_part(self): + if not self.parse_header_line(): + return False + while self.parse_header_line(): + pass + if not self.parse_body_line(): + return False + while self.parse_body_line(): + pass + return True + + def parse_header_line(self): + return \ + self.parse_empty() or \ + self.parse_comment() or \ + self.parse_part_detail() or \ + self.parse_description() or \ + self.parse_event_specification() or \ + self.parse_cost_line_def() or \ + self.parse_cost_summary() + + _detail_keys = set(('cmd', 'pid', 'thread', 'part')) + + def parse_part_detail(self): + return self.parse_keys(self._detail_keys) + + def parse_description(self): + return self.parse_key('desc') is not None + + def parse_event_specification(self): + event = self.parse_key('event') + if event is None: + return False + return True + + def parse_cost_line_def(self): + pair = self.parse_keys(('events', 'positions')) + if pair is None: + return False + key, value = pair + items = value.split() + if key == 'events': + self.num_events = len(items) + 
self.cost_events = items + if key == 'positions': + self.num_positions = len(items) + self.cost_positions = items + self.last_positions = [0]*self.num_positions + return True + + def parse_cost_summary(self): + pair = self.parse_keys(('summary', 'totals')) + if pair is None: + return False + return True + + def parse_body_line(self): + return \ + self.parse_empty() or \ + self.parse_comment() or \ + self.parse_cost_line() or \ + self.parse_position_spec() or \ + self.parse_association_spec() + + __subpos_re = r'(0x[0-9a-fA-F]+|\d+|\+\d+|-\d+|\*)' + _cost_re = re.compile(r'^' + + __subpos_re + r'( +' + __subpos_re + r')*' + + r'( +\d+)*' + + '$') + + def parse_cost_line(self, calls=None): + line = self.lookahead().rstrip() + mo = self._cost_re.match(line) + if not mo: + return False + + function = self.get_function() + + if calls is None: + # Unlike other aspects, call object (cob) is relative not to the + # last call object, but to the caller's object (ob), so try to + # update it when processing a functions cost line + try: + self.positions['cob'] = self.positions['ob'] + except KeyError: + pass + + values = line.split() + assert len(values) <= self.num_positions + self.num_events + + positions = values[0 : self.num_positions] + events = values[self.num_positions : ] + events += ['0']*(self.num_events - len(events)) + + for i in range(self.num_positions): + position = positions[i] + if position == '*': + position = self.last_positions[i] + elif position[0] in '-+': + position = self.last_positions[i] + int(position) + elif position.startswith('0x'): + position = int(position, 16) + else: + position = int(position) + self.last_positions[i] = position + + events = [float(event) for event in events] + + if calls is None: + function[SAMPLES] += events[0] + self.profile[SAMPLES] += events[0] + else: + callee = self.get_callee() + callee.called += calls + + try: + call = function.calls[callee.id] + except KeyError: + call = Call(callee.id) + call[CALLS] = calls + call[SAMPLES2] = events[0] + function.add_call(call) + else: + call[CALLS] += calls + call[SAMPLES2] += events[0] + + self.consume() + return True + + def parse_association_spec(self): + line = self.lookahead() + if not line.startswith('calls='): + return False + + _, values = line.split('=', 1) + values = values.strip().split() + calls = int(values[0]) + call_position = values[1:] + self.consume() + + self.parse_cost_line(calls) + + return True + + _position_re = re.compile(r'^(?P[cj]?(?:ob|fl|fi|fe|fn))=\s*(?:\((?P\d+)\))?(?:\s*(?P.+))?') + + _position_table_map = { + 'ob': 'ob', + 'fl': 'fl', + 'fi': 'fl', + 'fe': 'fl', + 'fn': 'fn', + 'cob': 'ob', + 'cfl': 'fl', + 'cfi': 'fl', + 'cfe': 'fl', + 'cfn': 'fn', + 'jfi': 'fl', + } + + _position_map = { + 'ob': 'ob', + 'fl': 'fl', + 'fi': 'fl', + 'fe': 'fl', + 'fn': 'fn', + 'cob': 'cob', + 'cfl': 'cfl', + 'cfi': 'cfl', + 'cfe': 'cfl', + 'cfn': 'cfn', + 'jfi': 'jfi', + } + + def parse_position_spec(self): + line = self.lookahead() + + if line.startswith('jump=') or line.startswith('jcnd='): + self.consume() + return True + + mo = self._position_re.match(line) + if not mo: + return False + + position, id, name = mo.groups() + if id: + table = self._position_table_map[position] + if name: + self.position_ids[(table, id)] = name + else: + name = self.position_ids.get((table, id), '') + self.positions[self._position_map[position]] = name + + self.consume() + return True + + def parse_empty(self): + if self.eof(): + return False + line = self.lookahead() + if line.strip(): + return False + 
self.consume() + return True + + def parse_comment(self): + line = self.lookahead() + if not line.startswith('#'): + return False + self.consume() + return True + + _key_re = re.compile(r'^(\w+):') + + def parse_key(self, key): + pair = self.parse_keys((key,)) + if not pair: + return None + key, value = pair + return value + + def parse_keys(self, keys): + line = self.lookahead() + mo = self._key_re.match(line) + if not mo: + return None + key, value = line.split(':', 1) + if key not in keys: + return None + value = value.strip() + self.consume() + return key, value + + def make_function(self, module, filename, name): + # FIXME: module and filename are not being tracked reliably + #id = '|'.join((module, filename, name)) + id = name + try: + function = self.profile.functions[id] + except KeyError: + function = Function(id, name) + if module: + function.module = os.path.basename(module) + function[SAMPLES] = 0 + function.called = 0 + self.profile.add_function(function) + return function + + def get_function(self): + module = self.positions.get('ob', '') + filename = self.positions.get('fl', '') + function = self.positions.get('fn', '') + return self.make_function(module, filename, function) + + def get_callee(self): + module = self.positions.get('cob', '') + filename = self.positions.get('cfi', '') + function = self.positions.get('cfn', '') + return self.make_function(module, filename, function) + + def readline(self): + # Override LineParser.readline to ignore comment lines + while True: + LineParser.readline(self) + if self.eof() or not self.lookahead().startswith('#'): + break + + +class PerfParser(LineParser): + """Parser for linux perf callgraph output. + + It expects output generated with + + perf record -g + perf script | gprof2dot.py --format=perf + """ + + def __init__(self, infile): + LineParser.__init__(self, infile) + self.profile = Profile() + + def readline(self): + # Override LineParser.readline to ignore comment lines + while True: + LineParser.readline(self) + if self.eof() or not self.lookahead().startswith('#'): + break + + def parse(self): + # read lookahead + self.readline() + + profile = self.profile + profile[SAMPLES] = 0 + while not self.eof(): + self.parse_event() + + # compute derived data + profile.validate() + profile.find_cycles() + profile.ratio(TIME_RATIO, SAMPLES) + profile.call_ratios(SAMPLES2) + if totalMethod == "callratios": + # Heuristic approach. TOTAL_SAMPLES is unused. + profile.integrate(TOTAL_TIME_RATIO, TIME_RATIO) + elif totalMethod == "callstacks": + # Use the actual call chains for functions. + profile[TOTAL_SAMPLES] = profile[SAMPLES] + profile.ratio(TOTAL_TIME_RATIO, TOTAL_SAMPLES) + # Then propagate that total time to the calls. + for function in compat_itervalues(profile.functions): + for call in compat_itervalues(function.calls): + if call.ratio is not None: + callee = profile.functions[call.callee_id] + call[TOTAL_TIME_RATIO] = call.ratio * callee[TOTAL_TIME_RATIO] + else: + assert False + + return profile + + def parse_event(self): + if self.eof(): + return + + line = self.consume() + assert line + + callchain = self.parse_callchain() + if not callchain: + return + + callee = callchain[0] + callee[SAMPLES] += 1 + self.profile[SAMPLES] += 1 + + for caller in callchain[1:]: + try: + call = caller.calls[callee.id] + except KeyError: + call = Call(callee.id) + call[SAMPLES2] = 1 + caller.add_call(call) + else: + call[SAMPLES2] += 1 + + callee = caller + + # Increment TOTAL_SAMPLES only once on each function. 
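+        # Descriptive note (added): a function can appear several times in one
+        # callchain (recursion); collapsing the chain to a set below avoids
+        # counting the same sample more than once for that function.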
+ stack = set(callchain) + for function in stack: + function[TOTAL_SAMPLES] += 1 + + def parse_callchain(self): + callchain = [] + while self.lookahead(): + function = self.parse_call() + if function is None: + break + callchain.append(function) + if self.lookahead() == '': + self.consume() + return callchain + + call_re = re.compile(r'^\s+(?P
[0-9a-fA-F]+)\s+(?P.*)\s+\((?P.*)\)$') + addr2_re = re.compile(r'\+0x[0-9a-fA-F]+$') + + def parse_call(self): + line = self.consume() + mo = self.call_re.match(line) + assert mo + if not mo: + return None + + function_name = mo.group('symbol') + + # If present, amputate program counter from function name. + if function_name: + function_name = re.sub(self.addr2_re, '', function_name) + + if not function_name or function_name == '[unknown]': + function_name = mo.group('address') + + module = mo.group('module') + + function_id = function_name + ':' + module + + try: + function = self.profile.functions[function_id] + except KeyError: + function = Function(function_id, function_name) + function.module = os.path.basename(module) + function[SAMPLES] = 0 + function[TOTAL_SAMPLES] = 0 + self.profile.add_function(function) + + return function + + +class OprofileParser(LineParser): + """Parser for oprofile callgraph output. + + See also: + - http://oprofile.sourceforge.net/doc/opreport.html#opreport-callgraph + """ + + _fields_re = { + 'samples': r'(\d+)', + '%': r'(\S+)', + 'linenr info': r'(?P\(no location information\)|\S+:\d+)', + 'image name': r'(?P\S+(?:\s\(tgid:[^)]*\))?)', + 'app name': r'(?P\S+)', + 'symbol name': r'(?P\(no symbols\)|.+?)', + } + + def __init__(self, infile): + LineParser.__init__(self, infile) + self.entries = {} + self.entry_re = None + + def add_entry(self, callers, function, callees): + try: + entry = self.entries[function.id] + except KeyError: + self.entries[function.id] = (callers, function, callees) + else: + callers_total, function_total, callees_total = entry + self.update_subentries_dict(callers_total, callers) + function_total.samples += function.samples + self.update_subentries_dict(callees_total, callees) + + def update_subentries_dict(self, totals, partials): + for partial in compat_itervalues(partials): + try: + total = totals[partial.id] + except KeyError: + totals[partial.id] = partial + else: + total.samples += partial.samples + + def parse(self): + # read lookahead + self.readline() + + self.parse_header() + while self.lookahead(): + self.parse_entry() + + profile = Profile() + + reverse_call_samples = {} + + # populate the profile + profile[SAMPLES] = 0 + for _callers, _function, _callees in compat_itervalues(self.entries): + function = Function(_function.id, _function.name) + function[SAMPLES] = _function.samples + profile.add_function(function) + profile[SAMPLES] += _function.samples + + if _function.application: + function.process = os.path.basename(_function.application) + if _function.image: + function.module = os.path.basename(_function.image) + + total_callee_samples = 0 + for _callee in compat_itervalues(_callees): + total_callee_samples += _callee.samples + + for _callee in compat_itervalues(_callees): + if not _callee.self: + call = Call(_callee.id) + call[SAMPLES2] = _callee.samples + function.add_call(call) + + # compute derived data + profile.validate() + profile.find_cycles() + profile.ratio(TIME_RATIO, SAMPLES) + profile.call_ratios(SAMPLES2) + profile.integrate(TOTAL_TIME_RATIO, TIME_RATIO) + + return profile + + def parse_header(self): + while not self.match_header(): + self.consume() + line = self.lookahead() + fields = re.split(r'\s\s+', line) + entry_re = r'^\s*' + r'\s+'.join([self._fields_re[field] for field in fields]) + r'(?P\s+\[self\])?$' + self.entry_re = re.compile(entry_re) + self.skip_separator() + + def parse_entry(self): + callers = self.parse_subentries() + if self.match_primary(): + function = self.parse_subentry() + if 
function is not None: + callees = self.parse_subentries() + self.add_entry(callers, function, callees) + self.skip_separator() + + def parse_subentries(self): + subentries = {} + while self.match_secondary(): + subentry = self.parse_subentry() + subentries[subentry.id] = subentry + return subentries + + def parse_subentry(self): + entry = Struct() + line = self.consume() + mo = self.entry_re.match(line) + if not mo: + raise ParseError('failed to parse', line) + fields = mo.groupdict() + entry.samples = int(mo.group(1)) + if 'source' in fields and fields['source'] != '(no location information)': + source = fields['source'] + filename, lineno = source.split(':') + entry.filename = filename + entry.lineno = int(lineno) + else: + source = '' + entry.filename = None + entry.lineno = None + entry.image = fields.get('image', '') + entry.application = fields.get('application', '') + if 'symbol' in fields and fields['symbol'] != '(no symbols)': + entry.symbol = fields['symbol'] + else: + entry.symbol = '' + if entry.symbol.startswith('"') and entry.symbol.endswith('"'): + entry.symbol = entry.symbol[1:-1] + entry.id = ':'.join((entry.application, entry.image, source, entry.symbol)) + entry.self = fields.get('self', None) != None + if entry.self: + entry.id += ':self' + if entry.symbol: + entry.name = entry.symbol + else: + entry.name = entry.image + return entry + + def skip_separator(self): + while not self.match_separator(): + self.consume() + self.consume() + + def match_header(self): + line = self.lookahead() + return line.startswith('samples') + + def match_separator(self): + line = self.lookahead() + return line == '-'*len(line) + + def match_primary(self): + line = self.lookahead() + return not line[:1].isspace() + + def match_secondary(self): + line = self.lookahead() + return line[:1].isspace() + + +class HProfParser(LineParser): + """Parser for java hprof output + + See also: + - http://java.sun.com/developer/technicalArticles/Programming/HPROF.html + """ + + trace_re = re.compile(r'\t(.*)\((.*):(.*)\)') + trace_id_re = re.compile(r'^TRACE (\d+):$') + + def __init__(self, infile): + LineParser.__init__(self, infile) + self.traces = {} + self.samples = {} + + def parse(self): + # read lookahead + self.readline() + + while not self.lookahead().startswith('------'): self.consume() + while not self.lookahead().startswith('TRACE '): self.consume() + + self.parse_traces() + + while not self.lookahead().startswith('CPU'): + self.consume() + + self.parse_samples() + + # populate the profile + profile = Profile() + profile[SAMPLES] = 0 + + functions = {} + + # build up callgraph + for id, trace in compat_iteritems(self.traces): + if not id in self.samples: continue + mtime = self.samples[id][0] + last = None + + for func, file, line in trace: + if not func in functions: + function = Function(func, func) + function[SAMPLES] = 0 + profile.add_function(function) + functions[func] = function + + function = functions[func] + # allocate time to the deepest method in the trace + if not last: + function[SAMPLES] += mtime + profile[SAMPLES] += mtime + else: + c = function.get_call(last) + c[SAMPLES2] += mtime + + last = func + + # compute derived data + profile.validate() + profile.find_cycles() + profile.ratio(TIME_RATIO, SAMPLES) + profile.call_ratios(SAMPLES2) + profile.integrate(TOTAL_TIME_RATIO, TIME_RATIO) + + return profile + + def parse_traces(self): + while self.lookahead().startswith('TRACE '): + self.parse_trace() + + def parse_trace(self): + l = self.consume() + mo = self.trace_id_re.match(l) + 
tid = mo.group(1) + last = None + trace = [] + + while self.lookahead().startswith('\t'): + l = self.consume() + match = self.trace_re.search(l) + if not match: + #sys.stderr.write('Invalid line: %s\n' % l) + break + else: + function_name, file, line = match.groups() + trace += [(function_name, file, line)] + + self.traces[int(tid)] = trace + + def parse_samples(self): + self.consume() + self.consume() + + while not self.lookahead().startswith('CPU'): + rank, percent_self, percent_accum, count, traceid, method = self.lookahead().split() + self.samples[int(traceid)] = (int(count), method) + self.consume() + + +class SysprofParser(XmlParser): + + def __init__(self, stream): + XmlParser.__init__(self, stream) + + def parse(self): + objects = {} + nodes = {} + + self.element_start('profile') + while self.token.type == XML_ELEMENT_START: + if self.token.name_or_data == 'objects': + assert not objects + objects = self.parse_items('objects') + elif self.token.name_or_data == 'nodes': + assert not nodes + nodes = self.parse_items('nodes') + else: + self.parse_value(self.token.name_or_data) + self.element_end('profile') + + return self.build_profile(objects, nodes) + + def parse_items(self, name): + assert name[-1] == 's' + items = {} + self.element_start(name) + while self.token.type == XML_ELEMENT_START: + id, values = self.parse_item(name[:-1]) + assert id not in items + items[id] = values + self.element_end(name) + return items + + def parse_item(self, name): + attrs = self.element_start(name) + id = int(attrs['id']) + values = self.parse_values() + self.element_end(name) + return id, values + + def parse_values(self): + values = {} + while self.token.type == XML_ELEMENT_START: + name = self.token.name_or_data + value = self.parse_value(name) + assert name not in values + values[name] = value + return values + + def parse_value(self, tag): + self.element_start(tag) + value = self.character_data() + self.element_end(tag) + if value.isdigit(): + return int(value) + if value.startswith('"') and value.endswith('"'): + return value[1:-1] + return value + + def build_profile(self, objects, nodes): + profile = Profile() + + profile[SAMPLES] = 0 + for id, object in compat_iteritems(objects): + # Ignore fake objects (process names, modules, "Everything", "kernel", etc.) + if object['self'] == 0: + continue + + function = Function(id, object['name']) + function[SAMPLES] = object['self'] + profile.add_function(function) + profile[SAMPLES] += function[SAMPLES] + + for id, node in compat_iteritems(nodes): + # Ignore fake calls + if node['self'] == 0: + continue + + # Find a non-ignored parent + parent_id = node['parent'] + while parent_id != 0: + parent = nodes[parent_id] + caller_id = parent['object'] + if objects[caller_id]['self'] != 0: + break + parent_id = parent['parent'] + if parent_id == 0: + continue + + callee_id = node['object'] + + assert objects[caller_id]['self'] + assert objects[callee_id]['self'] + + function = profile.functions[caller_id] + + samples = node['self'] + try: + call = function.calls[callee_id] + except KeyError: + call = Call(callee_id) + call[SAMPLES2] = samples + function.add_call(call) + else: + call[SAMPLES2] += samples + + # Compute derived events + profile.validate() + profile.find_cycles() + profile.ratio(TIME_RATIO, SAMPLES) + profile.call_ratios(SAMPLES2) + profile.integrate(TOTAL_TIME_RATIO, TIME_RATIO) + + return profile + + +class XPerfParser(Parser): + """Parser for CSVs generated by XPerf, from Microsoft Windows Performance Tools. 
+ """ + + def __init__(self, stream): + Parser.__init__(self) + self.stream = stream + self.profile = Profile() + self.profile[SAMPLES] = 0 + self.column = {} + + def parse(self): + import csv + reader = csv.reader( + self.stream, + delimiter = ',', + quotechar = None, + escapechar = None, + doublequote = False, + skipinitialspace = True, + lineterminator = '\r\n', + quoting = csv.QUOTE_NONE) + header = True + for row in reader: + if header: + self.parse_header(row) + header = False + else: + self.parse_row(row) + + # compute derived data + self.profile.validate() + self.profile.find_cycles() + self.profile.ratio(TIME_RATIO, SAMPLES) + self.profile.call_ratios(SAMPLES2) + self.profile.integrate(TOTAL_TIME_RATIO, TIME_RATIO) + + return self.profile + + def parse_header(self, row): + for column in range(len(row)): + name = row[column] + assert name not in self.column + self.column[name] = column + + def parse_row(self, row): + fields = {} + for name, column in compat_iteritems(self.column): + value = row[column] + for factory in int, float: + try: + value = factory(value) + except ValueError: + pass + else: + break + fields[name] = value + + process = fields['Process Name'] + symbol = fields['Module'] + '!' + fields['Function'] + weight = fields['Weight'] + count = fields['Count'] + + if process == 'Idle': + return + + function = self.get_function(process, symbol) + function[SAMPLES] += weight * count + self.profile[SAMPLES] += weight * count + + stack = fields['Stack'] + if stack != '?': + stack = stack.split('/') + assert stack[0] == '[Root]' + if stack[-1] != symbol: + # XXX: some cases the sampled function does not appear in the stack + stack.append(symbol) + caller = None + for symbol in stack[1:]: + callee = self.get_function(process, symbol) + if caller is not None: + try: + call = caller.calls[callee.id] + except KeyError: + call = Call(callee.id) + call[SAMPLES2] = count + caller.add_call(call) + else: + call[SAMPLES2] += count + caller = callee + + def get_function(self, process, symbol): + function_id = process + '!' + symbol + + try: + function = self.profile.functions[function_id] + except KeyError: + module, name = symbol.split('!', 1) + function = Function(function_id, name) + function.process = process + function.module = module + function[SAMPLES] = 0 + self.profile.add_function(function) + + return function + + +class SleepyParser(Parser): + """Parser for GNU gprof output. 
+ + See also: + - http://www.codersnotes.com/sleepy/ + - http://sleepygraph.sourceforge.net/ + """ + + stdinInput = False + + def __init__(self, filename): + Parser.__init__(self) + + from zipfile import ZipFile + + self.database = ZipFile(filename) + + self.symbols = {} + self.calls = {} + + self.profile = Profile() + + _symbol_re = re.compile( + r'^(?P\w+)' + + r'\s+"(?P[^"]*)"' + + r'\s+"(?P[^"]*)"' + + r'\s+"(?P[^"]*)"' + + r'\s+(?P\d+)$' + ) + + def openEntry(self, name): + # Some versions of verysleepy use lowercase filenames + for database_name in self.database.namelist(): + if name.lower() == database_name.lower(): + name = database_name + break + + return self.database.open(name, 'r') + + def parse_symbols(self): + for line in self.openEntry('Symbols.txt'): + line = line.decode('UTF-8').rstrip('\r\n') + + mo = self._symbol_re.match(line) + if mo: + symbol_id, module, procname, sourcefile, sourceline = mo.groups() + + function_id = ':'.join([module, procname]) + + try: + function = self.profile.functions[function_id] + except KeyError: + function = Function(function_id, procname) + function.module = module + function[SAMPLES] = 0 + self.profile.add_function(function) + + self.symbols[symbol_id] = function + + def parse_callstacks(self): + for line in self.openEntry('Callstacks.txt'): + line = line.decode('UTF-8').rstrip('\r\n') + + fields = line.split() + samples = float(fields[0]) + callstack = fields[1:] + + callstack = [self.symbols[symbol_id] for symbol_id in callstack] + + callee = callstack[0] + + callee[SAMPLES] += samples + self.profile[SAMPLES] += samples + + for caller in callstack[1:]: + try: + call = caller.calls[callee.id] + except KeyError: + call = Call(callee.id) + call[SAMPLES2] = samples + caller.add_call(call) + else: + call[SAMPLES2] += samples + + callee = caller + + def parse(self): + profile = self.profile + profile[SAMPLES] = 0 + + self.parse_symbols() + self.parse_callstacks() + + # Compute derived events + profile.validate() + profile.find_cycles() + profile.ratio(TIME_RATIO, SAMPLES) + profile.call_ratios(SAMPLES2) + profile.integrate(TOTAL_TIME_RATIO, TIME_RATIO) + + return profile + + +class PstatsParser: + """Parser python profiling statistics saved with te pstats module.""" + + stdinInput = False + multipleInput = True + + def __init__(self, *filename): + import pstats + try: + self.stats = pstats.Stats(*filename) + except ValueError: + if PYTHON_3: + sys.stderr.write('error: failed to load %s, maybe they are generated by different python version?\n' % ', '.join(filename)) + sys.exit(1) + import hotshot.stats + self.stats = hotshot.stats.load(filename[0]) + self.profile = Profile() + self.function_ids = {} + + def get_function_name(self, key): + filename, line, name = key + module = os.path.splitext(filename)[0] + module = os.path.basename(module) + return "%s:%d:%s" % (module, line, name) + + def get_function(self, key): + try: + id = self.function_ids[key] + except KeyError: + id = len(self.function_ids) + name = self.get_function_name(key) + function = Function(id, name) + function.filename = key[0] + self.profile.functions[id] = function + self.function_ids[key] = id + else: + function = self.profile.functions[id] + return function + + def parse(self): + self.profile[TIME] = 0.0 + self.profile[TOTAL_TIME] = self.stats.total_tt + for fn, (cc, nc, tt, ct, callers) in compat_iteritems(self.stats.stats): + callee = self.get_function(fn) + callee.called = nc + callee[TOTAL_TIME] = ct + callee[TIME] = tt + self.profile[TIME] += tt + 
self.profile[TOTAL_TIME] = max(self.profile[TOTAL_TIME], ct) + for fn, value in compat_iteritems(callers): + caller = self.get_function(fn) + call = Call(callee.id) + if isinstance(value, tuple): + for i in xrange(0, len(value), 4): + nc, cc, tt, ct = value[i:i+4] + if CALLS in call: + call[CALLS] += cc + else: + call[CALLS] = cc + + if TOTAL_TIME in call: + call[TOTAL_TIME] += ct + else: + call[TOTAL_TIME] = ct + + else: + call[CALLS] = value + call[TOTAL_TIME] = ratio(value, nc)*ct + + caller.add_call(call) + + if False: + self.stats.print_stats() + self.stats.print_callees() + + # Compute derived events + self.profile.validate() + self.profile.ratio(TIME_RATIO, TIME) + self.profile.ratio(TOTAL_TIME_RATIO, TOTAL_TIME) + + return self.profile + +class DtraceParser(LineParser): + """Parser for linux perf callgraph output. + + It expects output generated with + + # Refer to https://github.com/brendangregg/FlameGraph#dtrace + # 60 seconds of user-level stacks, including time spent in-kernel, for PID 12345 at 97 Hertz + sudo dtrace -x ustackframes=100 -n 'profile-97 /pid == 12345/ { @[ustack()] = count(); } tick-60s { exit(0); }' -o out.user_stacks + + # The dtrace output + gprof2dot.py -f dtrace out.user_stacks + + # Notice: sometimes, the dtrace outputs format may be latin-1, and gprof2dot will fail to parse it. + # To solve this problem, you should use iconv to convert to UTF-8 explicitly. + # TODO: add an encoding flag to tell gprof2dot how to decode the profile file. + iconv -f ISO-8859-1 -t UTF-8 out.user_stacks | gprof2dot.py -f dtrace + """ + + def __init__(self, infile): + LineParser.__init__(self, infile) + self.profile = Profile() + + def readline(self): + # Override LineParser.readline to ignore comment lines + while True: + LineParser.readline(self) + if self.eof(): + break + + line = self.lookahead().strip() + if line.startswith('CPU'): + # The format likes: + # CPU ID FUNCTION:NAME + # 1 29684 :tick-60s + # Skip next line + LineParser.readline(self) + elif not line == '': + break + + + def parse(self): + # read lookahead + self.readline() + + profile = self.profile + profile[SAMPLES] = 0 + while not self.eof(): + self.parse_event() + + # compute derived data + profile.validate() + profile.find_cycles() + profile.ratio(TIME_RATIO, SAMPLES) + profile.call_ratios(SAMPLES2) + if totalMethod == "callratios": + # Heuristic approach. TOTAL_SAMPLES is unused. + profile.integrate(TOTAL_TIME_RATIO, TIME_RATIO) + elif totalMethod == "callstacks": + # Use the actual call chains for functions. + profile[TOTAL_SAMPLES] = profile[SAMPLES] + profile.ratio(TOTAL_TIME_RATIO, TOTAL_SAMPLES) + # Then propagate that total time to the calls. + for function in compat_itervalues(profile.functions): + for call in compat_itervalues(function.calls): + if call.ratio is not None: + callee = profile.functions[call.callee_id] + call[TOTAL_TIME_RATIO] = call.ratio * callee[TOTAL_TIME_RATIO] + else: + assert False + + return profile + + def parse_event(self): + if self.eof(): + return + + callchain, count = self.parse_callchain() + if not callchain: + return + + callee = callchain[0] + callee[SAMPLES] += count + self.profile[SAMPLES] += count + + for caller in callchain[1:]: + try: + call = caller.calls[callee.id] + except KeyError: + call = Call(callee.id) + call[SAMPLES2] = count + caller.add_call(call) + else: + call[SAMPLES2] += count + + callee = caller + + # Increment TOTAL_SAMPLES only once on each function. 
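+        # Descriptive note (added): dtrace emits each unique stack with an
+        # aggregated count, so the count is added once per distinct function
+        # in the stack, which also keeps recursive frames from double-counting.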
+ stack = set(callchain) + for function in stack: + function[TOTAL_SAMPLES] += count + + + def parse_callchain(self): + callchain = [] + count = 0 + while self.lookahead(): + function, count = self.parse_call() + if function is None: + break + callchain.append(function) + return callchain, count + + call_re = re.compile(r'^\s+(?P.*)`(?P.*)') + addr2_re = re.compile(r'\+0x[0-9a-fA-F]+$') + + def parse_call(self): + line = self.consume() + mo = self.call_re.match(line) + if not mo: + # The line must be the stack count + return None, int(line.strip()) + + function_name = mo.group('symbol') + + # If present, amputate program counter from function name. + if function_name: + function_name = re.sub(self.addr2_re, '', function_name) + + # if not function_name or function_name == '[unknown]': + # function_name = mo.group('address') + + module = mo.group('module') + + function_id = function_name + ':' + module + + try: + function = self.profile.functions[function_id] + except KeyError: + function = Function(function_id, function_name) + function.module = os.path.basename(module) + function[SAMPLES] = 0 + function[TOTAL_SAMPLES] = 0 + self.profile.add_function(function) + + return function, None + +formats = { + "axe": AXEParser, + "callgrind": CallgrindParser, + "hprof": HProfParser, + "json": JsonParser, + "oprofile": OprofileParser, + "perf": PerfParser, + "prof": GprofParser, + "pstats": PstatsParser, + "sleepy": SleepyParser, + "sysprof": SysprofParser, + "xperf": XPerfParser, + "dtrace": DtraceParser, +} + + +######################################################################## +# Output + + +class Theme: + + def __init__(self, + bgcolor = (0.0, 0.0, 1.0), + mincolor = (0.0, 0.0, 0.0), + maxcolor = (0.0, 0.0, 1.0), + fontname = "Arial", + fontcolor = "white", + nodestyle = "filled", + minfontsize = 10.0, + maxfontsize = 10.0, + minpenwidth = 0.5, + maxpenwidth = 4.0, + gamma = 2.2, + skew = 1.0): + self.bgcolor = bgcolor + self.mincolor = mincolor + self.maxcolor = maxcolor + self.fontname = fontname + self.fontcolor = fontcolor + self.nodestyle = nodestyle + self.minfontsize = minfontsize + self.maxfontsize = maxfontsize + self.minpenwidth = minpenwidth + self.maxpenwidth = maxpenwidth + self.gamma = gamma + self.skew = skew + + def graph_bgcolor(self): + return self.hsl_to_rgb(*self.bgcolor) + + def graph_fontname(self): + return self.fontname + + def graph_fontcolor(self): + return self.fontcolor + + def graph_fontsize(self): + return self.minfontsize + + def node_bgcolor(self, weight): + return self.color(weight) + + def node_fgcolor(self, weight): + if self.nodestyle == "filled": + return self.graph_bgcolor() + else: + return self.color(weight) + + def node_fontsize(self, weight): + return self.fontsize(weight) + + def node_style(self): + return self.nodestyle + + def edge_color(self, weight): + return self.color(weight) + + def edge_fontsize(self, weight): + return self.fontsize(weight) + + def edge_penwidth(self, weight): + return max(weight*self.maxpenwidth, self.minpenwidth) + + def edge_arrowsize(self, weight): + return 0.5 * math.sqrt(self.edge_penwidth(weight)) + + def fontsize(self, weight): + return max(weight**2 * self.maxfontsize, self.minfontsize) + + def color(self, weight): + weight = min(max(weight, 0.0), 1.0) + + hmin, smin, lmin = self.mincolor + hmax, smax, lmax = self.maxcolor + + if self.skew < 0: + raise ValueError("Skew must be greater than 0") + elif self.skew == 1.0: + h = hmin + weight*(hmax - hmin) + s = smin + weight*(smax - smin) + l = lmin + weight*(lmax - 
lmin) + else: + base = self.skew + h = hmin + ((hmax-hmin)*(-1.0 + (base ** weight)) / (base - 1.0)) + s = smin + ((smax-smin)*(-1.0 + (base ** weight)) / (base - 1.0)) + l = lmin + ((lmax-lmin)*(-1.0 + (base ** weight)) / (base - 1.0)) + + return self.hsl_to_rgb(h, s, l) + + def hsl_to_rgb(self, h, s, l): + """Convert a color from HSL color-model to RGB. + + See also: + - http://www.w3.org/TR/css3-color/#hsl-color + """ + + h = h % 1.0 + s = min(max(s, 0.0), 1.0) + l = min(max(l, 0.0), 1.0) + + if l <= 0.5: + m2 = l*(s + 1.0) + else: + m2 = l + s - l*s + m1 = l*2.0 - m2 + r = self._hue_to_rgb(m1, m2, h + 1.0/3.0) + g = self._hue_to_rgb(m1, m2, h) + b = self._hue_to_rgb(m1, m2, h - 1.0/3.0) + + # Apply gamma correction + r **= self.gamma + g **= self.gamma + b **= self.gamma + + return (r, g, b) + + def _hue_to_rgb(self, m1, m2, h): + if h < 0.0: + h += 1.0 + elif h > 1.0: + h -= 1.0 + if h*6 < 1.0: + return m1 + (m2 - m1)*h*6.0 + elif h*2 < 1.0: + return m2 + elif h*3 < 2.0: + return m1 + (m2 - m1)*(2.0/3.0 - h)*6.0 + else: + return m1 + + +TEMPERATURE_COLORMAP = Theme( + mincolor = (2.0/3.0, 0.80, 0.25), # dark blue + maxcolor = (0.0, 1.0, 0.5), # satured red + gamma = 1.0 +) + +PINK_COLORMAP = Theme( + mincolor = (0.0, 1.0, 0.90), # pink + maxcolor = (0.0, 1.0, 0.5), # satured red +) + +GRAY_COLORMAP = Theme( + mincolor = (0.0, 0.0, 0.85), # light gray + maxcolor = (0.0, 0.0, 0.0), # black +) + +BW_COLORMAP = Theme( + minfontsize = 8.0, + maxfontsize = 24.0, + mincolor = (0.0, 0.0, 0.0), # black + maxcolor = (0.0, 0.0, 0.0), # black + minpenwidth = 0.1, + maxpenwidth = 8.0, +) + +PRINT_COLORMAP = Theme( + minfontsize = 18.0, + maxfontsize = 30.0, + fontcolor = "black", + nodestyle = "solid", + mincolor = (0.0, 0.0, 0.0), # black + maxcolor = (0.0, 0.0, 0.0), # black + minpenwidth = 0.1, + maxpenwidth = 8.0, +) + + +themes = { + "color": TEMPERATURE_COLORMAP, + "pink": PINK_COLORMAP, + "gray": GRAY_COLORMAP, + "bw": BW_COLORMAP, + "print": PRINT_COLORMAP, +} + + +def sorted_iteritems(d): + # Used mostly for result reproducibility (while testing.) + keys = compat_keys(d) + keys.sort() + for key in keys: + value = d[key] + yield key, value + + +class DotWriter: + """Writer for the DOT language. 
+ + See also: + - "The DOT Language" specification + http://www.graphviz.org/doc/info/lang.html + """ + + strip = False + wrap = False + + def __init__(self, fp): + self.fp = fp + + def wrap_function_name(self, name): + """Split the function name on multiple lines.""" + + if len(name) > 32: + ratio = 2.0/3.0 + height = max(int(len(name)/(1.0 - ratio) + 0.5), 1) + width = max(len(name)/height, 32) + # TODO: break lines in symbols + name = textwrap.fill(name, width, break_long_words=False) + + # Take away spaces + name = name.replace(", ", ",") + name = name.replace("> >", ">>") + name = name.replace("> >", ">>") # catch consecutive + + return name + + show_function_events = [TOTAL_TIME_RATIO, TIME_RATIO] + show_edge_events = [TOTAL_TIME_RATIO, CALLS] + + def graph(self, profile, theme): + self.begin_graph() + + fontname = theme.graph_fontname() + fontcolor = theme.graph_fontcolor() + nodestyle = theme.node_style() + + self.attr('graph', fontname=fontname, ranksep=0.25, nodesep=0.125) + self.attr('node', fontname=fontname, shape="box", style=nodestyle, fontcolor=fontcolor, width=0, height=0) + self.attr('edge', fontname=fontname) + + for _, function in sorted_iteritems(profile.functions): + labels = [] + if function.process is not None: + labels.append(function.process) + if function.module is not None: + labels.append(function.module) + + if self.strip: + function_name = function.stripped_name() + else: + function_name = function.name + + # dot can't parse quoted strings longer than YY_BUF_SIZE, which + # defaults to 16K. But some annotated C++ functions (e.g., boost, + # https://github.com/jrfonseca/gprof2dot/issues/30) can exceed that + MAX_FUNCTION_NAME = 4096 + if len(function_name) >= MAX_FUNCTION_NAME: + sys.stderr.write('warning: truncating function name with %u chars (%s)\n' % (len(function_name), function_name[:32] + '...')) + function_name = function_name[:MAX_FUNCTION_NAME - 1] + unichr(0x2026) + + if self.wrap: + function_name = self.wrap_function_name(function_name) + labels.append(function_name) + + for event in self.show_function_events: + if event in function.events: + label = event.format(function[event]) + labels.append(label) + if function.called is not None: + labels.append("%u%s" % (function.called, MULTIPLICATION_SIGN)) + + if function.weight is not None: + weight = function.weight + else: + weight = 0.0 + + label = '\n'.join(labels) + self.node(function.id, + label = label, + color = self.color(theme.node_bgcolor(weight)), + fontcolor = self.color(theme.node_fgcolor(weight)), + fontsize = "%.2f" % theme.node_fontsize(weight), + tooltip = function.filename, + ) + + for _, call in sorted_iteritems(function.calls): + callee = profile.functions[call.callee_id] + + labels = [] + for event in self.show_edge_events: + if event in call.events: + label = event.format(call[event]) + labels.append(label) + + if call.weight is not None: + weight = call.weight + elif callee.weight is not None: + weight = callee.weight + else: + weight = 0.0 + + label = '\n'.join(labels) + + self.edge(function.id, call.callee_id, + label = label, + color = self.color(theme.edge_color(weight)), + fontcolor = self.color(theme.edge_color(weight)), + fontsize = "%.2f" % theme.edge_fontsize(weight), + penwidth = "%.2f" % theme.edge_penwidth(weight), + labeldistance = "%.2f" % theme.edge_penwidth(weight), + arrowsize = "%.2f" % theme.edge_arrowsize(weight), + ) + + self.end_graph() + + def begin_graph(self): + self.write('digraph {\n') + + def end_graph(self): + self.write('}\n') + + def attr(self, 
what, **attrs): + self.write("\t") + self.write(what) + self.attr_list(attrs) + self.write(";\n") + + def node(self, node, **attrs): + self.write("\t") + self.id(node) + self.attr_list(attrs) + self.write(";\n") + + def edge(self, src, dst, **attrs): + self.write("\t") + self.id(src) + self.write(" -> ") + self.id(dst) + self.attr_list(attrs) + self.write(";\n") + + def attr_list(self, attrs): + if not attrs: + return + self.write(' [') + first = True + for name, value in sorted_iteritems(attrs): + if value is None: + continue + if first: + first = False + else: + self.write(", ") + self.id(name) + self.write('=') + self.id(value) + self.write(']') + + def id(self, id): + if isinstance(id, (int, float)): + s = str(id) + elif isinstance(id, basestring): + if id.isalnum() and not id.startswith('0x'): + s = id + else: + s = self.escape(id) + else: + raise TypeError + self.write(s) + + def color(self, rgb): + r, g, b = rgb + + def float2int(f): + if f <= 0.0: + return 0 + if f >= 1.0: + return 255 + return int(255.0*f + 0.5) + + return "#" + "".join(["%02x" % float2int(c) for c in (r, g, b)]) + + def escape(self, s): + if not PYTHON_3: + s = s.encode('utf-8') + s = s.replace('\\', r'\\') + s = s.replace('\n', r'\n') + s = s.replace('\t', r'\t') + s = s.replace('"', r'\"') + return '"' + s + '"' + + def write(self, s): + self.fp.write(s) + + + +######################################################################## +# Main program + + +def naturalJoin(values): + if len(values) >= 2: + return ', '.join(values[:-1]) + ' or ' + values[-1] + + else: + return ''.join(values) + + +def main(argv=sys.argv[1:]): + """Main program.""" + + global totalMethod + + formatNames = list(formats.keys()) + formatNames.sort() + + themeNames = list(themes.keys()) + themeNames.sort() + + labelNames = list(labels.keys()) + labelNames.sort() + + optparser = optparse.OptionParser( + usage="\n\t%prog [options] [file] ...") + optparser.add_option( + '-o', '--output', metavar='FILE', + type="string", dest="output", + help="output filename [stdout]") + optparser.add_option( + '-n', '--node-thres', metavar='PERCENTAGE', + type="float", dest="node_thres", default=0.5, + help="eliminate nodes below this threshold [default: %default]") + optparser.add_option( + '-e', '--edge-thres', metavar='PERCENTAGE', + type="float", dest="edge_thres", default=0.1, + help="eliminate edges below this threshold [default: %default]") + optparser.add_option( + '-f', '--format', + type="choice", choices=formatNames, + dest="format", default="prof", + help="profile format: %s [default: %%default]" % naturalJoin(formatNames)) + optparser.add_option( + '--total', + type="choice", choices=('callratios', 'callstacks'), + dest="totalMethod", default=totalMethod, + help="preferred method of calculating total time: callratios or callstacks (currently affects only perf format) [default: %default]") + optparser.add_option( + '-c', '--colormap', + type="choice", choices=themeNames, + dest="theme", default="color", + help="color map: %s [default: %%default]" % naturalJoin(themeNames)) + optparser.add_option( + '-s', '--strip', + action="store_true", + dest="strip", default=False, + help="strip function parameters, template parameters, and const modifiers from demangled C++ function names") + optparser.add_option( + '--color-nodes-by-selftime', + action="store_true", + dest="color_nodes_by_selftime", default=False, + help="color nodes by self time, rather than by total time (sum of self and descendants)") + optparser.add_option( + 
'--colour-nodes-by-selftime', + action="store_true", + dest="color_nodes_by_selftime", + help=optparse.SUPPRESS_HELP) + optparser.add_option( + '-w', '--wrap', + action="store_true", + dest="wrap", default=False, + help="wrap function names") + optparser.add_option( + '--show-samples', + action="store_true", + dest="show_samples", default=False, + help="show function samples") + optparser.add_option( + '--node-label', metavar='MEASURE', + type='choice', choices=labelNames, + action='append', + dest='node_labels', + help="measurements to on show the node (can be specified multiple times): %s [default: %s]" % ( + naturalJoin(labelNames), ', '.join(defaultLabelNames))) + # add option to show information on available entries () + optparser.add_option( + '--list-functions', + type="string", + dest="list_functions", default=None, + help="""\ +list functions available for selection in -z or -l, requires selector argument +( use '+' to select all). +Recall that the selector argument is used with Unix/Bash globbing/pattern matching, +and that entries are formatted '::'. When argument starts +with '%', a dump of all available information is performed for selected entries, + after removal of leading '%'. +""") + # add option to create subtree or show paths + optparser.add_option( + '-z', '--root', + type="string", + dest="root", default="", + help="prune call graph to show only descendants of specified root function") + optparser.add_option( + '-l', '--leaf', + type="string", + dest="leaf", default="", + help="prune call graph to show only ancestors of specified leaf function") + optparser.add_option( + '--depth', + type="int", + dest="depth", default=-1, + help="prune call graph to show only descendants or ancestors until specified depth") + # add a new option to control skew of the colorization curve + optparser.add_option( + '--skew', + type="float", dest="theme_skew", default=1.0, + help="skew the colorization curve. Values < 1.0 give more variety to lower percentages. Values > 1.0 give less variety to lower percentages") + # add option for filtering by file path + optparser.add_option( + '-p', '--path', action="append", + type="string", dest="filter_paths", + help="Filter all modules not in a specified path") + (options, args) = optparser.parse_args(argv) + + if len(args) > 1 and options.format != 'pstats': + optparser.error('incorrect number of arguments') + + try: + theme = themes[options.theme] + except KeyError: + optparser.error('invalid colormap \'%s\'' % options.theme) + + # set skew on the theme now that it has been picked. 
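+    # Descriptive note (added): skew < 1.0 spreads more colour variation across
+    # low percentages, while skew > 1.0 reserves it for the high end; see the
+    # --skew option help text above.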
+ if options.theme_skew: + theme.skew = options.theme_skew + + totalMethod = options.totalMethod + + try: + Format = formats[options.format] + except KeyError: + optparser.error('invalid format \'%s\'' % options.format) + + if Format.stdinInput: + if not args: + fp = sys.stdin + elif PYTHON_3: + fp = open(args[0], 'rt', encoding='UTF-8') + else: + fp = open(args[0], 'rt') + parser = Format(fp) + elif Format.multipleInput: + if not args: + optparser.error('at least a file must be specified for %s input' % options.format) + parser = Format(*args) + else: + if len(args) != 1: + optparser.error('exactly one file must be specified for %s input' % options.format) + parser = Format(args[0]) + + profile = parser.parse() + + if options.output is None: + if PYTHON_3: + output = open(sys.stdout.fileno(), mode='wt', encoding='UTF-8', closefd=False) + else: + output = sys.stdout + else: + if PYTHON_3: + output = open(options.output, 'wt', encoding='UTF-8') + else: + output = open(options.output, 'wt') + + dot = DotWriter(output) + dot.strip = options.strip + dot.wrap = options.wrap + + labelNames = options.node_labels or defaultLabelNames + dot.show_function_events = [labels[l] for l in labelNames] + if options.show_samples: + dot.show_function_events.append(SAMPLES) + + profile = profile + profile.prune(options.node_thres/100.0, options.edge_thres/100.0, options.filter_paths, options.color_nodes_by_selftime) + + if options.list_functions: + profile.printFunctionIds(selector=options.list_functions) + sys.exit(0) + + if options.root: + rootIds = profile.getFunctionIds(options.root) + if not rootIds: + sys.stderr.write('root node ' + options.root + ' not found (might already be pruned : try -e0 -n0 flags)\n') + sys.exit(1) + profile.prune_root(rootIds, options.depth) + if options.leaf: + leafIds = profile.getFunctionIds(options.leaf) + if not leafIds: + sys.stderr.write('leaf node ' + options.leaf + ' not found (maybe already pruned : try -e0 -n0 flags)\n') + sys.exit(1) + profile.prune_leaf(leafIds, options.depth) + + dot.graph(profile, theme) + + +if __name__ == '__main__': + main() diff --git a/proj.py b/proj.py index de09db79..c967cdfb 100755 --- a/proj.py +++ b/proj.py @@ -66,8 +66,15 @@ def build(): _run('python3', 'setup.py', 'build_ext', '--inplace') +@command +def build_fuzzing(): + _run('python3', 'setup.py', 'build_ext', '--inplace', + env={'TEST_QUESTDB_FUZZING': '1'}) + + @command def test(all=False, patch_path='1', *args): + _run('cargo', 'test', cwd=PROJ_ROOT / 'pystr-to-utf8') env = {'TEST_QUESTDB_PATCH_PATH': patch_path} if _arg2bool(all): env['TEST_QUESTDB_INTEGRATION'] = '1' @@ -75,6 +82,74 @@ def test(all=False, patch_path='1', *args): env=env) +@command +def test_fuzzing(*args): + import atheris + import pathlib + lib_path = pathlib.Path(atheris.path()) / 'asan_with_fuzzer.so' + if not lib_path.exists(): + sys.stderr.write(f'WARNING: {lib_path} not found\n') + sys.exit(42) + ld_preload = os.environ.get('LD_PRELOAD', '') + if ld_preload: + ld_preload += ':' + ld_preload += str(lib_path) + cmd = [ + 'python3', + 'test/test_dataframe_fuzz.py'] + list(args) + if not args: + cmd.extend([ + '-detect_leaks=0', + '-rss_limit_mb=32768', + '-artifact_prefix=fuzz-artifact/', + '-create_missing_dirs=1']) + _run(*cmd, env={'LD_PRELOAD': ld_preload}) + + +@command +def benchmark(*args): + env = {'TEST_QUESTDB_PATCH_PATH': '1'} + _run('python3', 'test/benchmark.py', '-v', *args, env=env) + + +@command +def gdb_test(*args): + env = {'TEST_QUESTDB_PATCH_PATH': '1'} + _run('gdb', '-ex', 'r', '--args', 
'python3', 'test/test.py', '-v', *args, + env=env) + + +@command +def rr_test(*args): + """ + Linux-only reverse debugger. + https://github.com/rr-debugger/rr + https://www.youtube.com/watch?v=61kD3x4Pu8I + + Install rr: + $ sudo apt install rr + $ sudo vim /proc/sys/kernel/perf_event_paranoid # set to -1 + """ + env = {'TEST_QUESTDB_PATCH_PATH': '1'} + try: + _run('rr', 'record', 'python3', 'test/test.py', '-v', *args, + env=env) + finally: + sys.stdout.flush() + sys.stderr.flush() + red = '\033[01;31m' + reset = '\033[0m' + sys.stderr.write(f'''\n{red} + Now first re-run marking stdout/stderr events with a unique ID: + $ rr -M replay -a + + Then re-run inside GDB, running up to a specific event: + $ rr replay -g $EVENT_ID + (rr) break ingress.c:9999 + (rr) continue # or step, next, etc.{reset}\n\n''') + + + @command def doc(http_serve=False, port=None): _run('python3', '-m', 'sphinx.cmd.build', @@ -112,6 +187,17 @@ def cibuildwheel(*args): *args) +@command +def repl(*args): + _run('python3', env={'PYTHONPATH': str(PROJ_ROOT / 'src')}) + + +@command +def example(name, *args): + _run('python3', 'examples/' + name + '.py', *args, + env={'PYTHONPATH': str(PROJ_ROOT / 'src')}) + + @command def cw(*args): cibuildwheel(args) @@ -128,6 +214,7 @@ def clean(): _rmtree(PROJ_ROOT / 'dist') _rmtree(PROJ_ROOT / 'c-questdb-client' / 'questdb-rs-ffi' / 'target') _rmtree(PROJ_ROOT / 'c-questdb-client' / 'build') + _rmtree(PROJ_ROOT / 'pystr-to-utf8' / 'target') _rmtree(PROJ_ROOT / 'src' / 'questdb.egg-info') _rmtree(PROJ_ROOT / 'venv') _rmtree(PROJ_ROOT / 'wheelhouse') diff --git a/pyproject.toml b/pyproject.toml index e01713a6..c7cb32e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ requires = [ # See: https://cibuildwheel.readthedocs.io/en/stable/options/#configuration-file build-verbosity = "3" before-build = "python {project}/install_rust.py" +before-test = "python {project}/ci/pip_install_deps.py" test-command = "python {project}/test/test.py -v" skip = [ # No 32-bit musl C native tool chain for Rust. diff --git a/pystr-to-utf8/Cargo.lock b/pystr-to-utf8/Cargo.lock new file mode 100644 index 00000000..5210f8ca --- /dev/null +++ b/pystr-to-utf8/Cargo.lock @@ -0,0 +1,242 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "cbindgen" +version = "0.24.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6358dedf60f4d9b8db43ad187391afe959746101346fe51bb978126bec61dfb" +dependencies = [ + "heck", + "indexmap", + "log", + "proc-macro2", + "quote", + "serde", + "serde_json", + "syn", + "tempfile", + "toml", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "fastrand" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" +dependencies = [ + "instant", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "heck" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + +[[package]] +name = "indexmap" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" +dependencies = [ + "autocfg", + "hashbrown", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "itoa" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc" + +[[package]] +name = "libc" +version = "0.2.137" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7fcc620a3bff7cdd7a365be3376c97191aeaccc2a603e600951e452615bf89" + +[[package]] +name = "log" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "proc-macro2" +version = "1.0.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pystr-to-utf8" +version = "0.1.0" +dependencies = [ + "cbindgen", +] + +[[package]] +name = "quote" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "redox_syscall" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +dependencies = [ + "bitflags", +] + 
+[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi", +] + +[[package]] +name = "ryu" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" + +[[package]] +name = "serde" +version = "1.0.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f1d362ca8fc9c3e3a7484440752472d68a6caa98f1ab81d99b5dfe517cec852" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce777b7b150d76b9cf60d28b55f5847135a003f7d7350c6be7a773508ce7d45" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "1.0.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a864042229133ada95abf3b54fdc62ef5ccabe9515b64717bcb9a1919e59445d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" +dependencies = [ + "cfg-if", + "fastrand", + "libc", + "redox_syscall", + "remove_dir_all", + "winapi", +] + +[[package]] +name = "toml" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d82e1a7758622a465f8cee077614c73484dac5b836c02ff6a40d5d1010324d7" +dependencies = [ + "serde", +] + +[[package]] +name = "unicode-ident" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/pystr-to-utf8/Cargo.toml b/pystr-to-utf8/Cargo.toml new file mode 100644 index 00000000..4eb8f445 --- /dev/null +++ b/pystr-to-utf8/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "pystr-to-utf8" +version = "0.1.0" +edition = "2021" +publish = false + +[lib] +name = "pystr_to_utf8" +crate-type = ["staticlib"] + +[build-dependencies] +cbindgen = { version = "0.24.3", optional = true, default-features = false } \ No newline at end of file diff --git a/pystr-to-utf8/README.md b/pystr-to-utf8/README.md new file mode 100644 index 00000000..607f98b1 --- /dev/null +++ 
b/pystr-to-utf8/README.md @@ -0,0 +1,11 @@ +By default, when compiling, we don't re-generate the `.h` and `.pxd` files. +This is to speed up compile time. + +If you've updated the API, regenerate them by running: + +``` +$ cargo clean +$ cargo build --features cbindgen +``` + +Then make sure to commit the updated generated files. diff --git a/pystr-to-utf8/build.rs b/pystr-to-utf8/build.rs new file mode 100644 index 00000000..a93cb918 --- /dev/null +++ b/pystr-to-utf8/build.rs @@ -0,0 +1,51 @@ +#[cfg(feature = "cbindgen")] +extern crate cbindgen; + +#[cfg(feature = "cbindgen")] +const BAD_PXD: &str = " +cdef extern from *: + ctypedef bint bool + ctypedef struct va_list"; + +#[cfg(feature = "cbindgen")] +fn main() -> Result<(), Box> { + let crate_dir = std::env::var("CARGO_MANIFEST_DIR")?; + let bindings = cbindgen::generate(&crate_dir)?; + bindings.write_to_file("include/pystr_to_utf8.h"); + + let config = cbindgen::Config { + language: cbindgen::Language::Cython, + documentation: true, + cython: cbindgen::CythonConfig { + header: Some("\"pystr_to_utf8.h\"".to_owned()), + cimports: std::collections::BTreeMap::new()}, + usize_is_size_t: true, + ..Default::default() + }; + + let bindings = cbindgen::Builder::new() + .with_crate(&crate_dir) + .with_config(config) + .generate()?; + + // Instead of just writing out the file: + // bindings.write_to_file("include/pystr_to_utf8.pxd"); + // We need to do some post-processing to make it work our code. + // The default output is too opinionated and has unwanted typedefs. + let mut pxd = Vec::new(); + bindings.write(&mut pxd); + let pxd = String::from_utf8(pxd)?; + if !pxd.contains(BAD_PXD) { + panic!("cbindgen generated unexpected pxd: {}", pxd); + } + let pxd = pxd.replace(BAD_PXD, ""); + let pxd = pxd.replace("bool", "bint"); + let pxd = pxd.replace(";", ""); + // println!("{}", &pxd); + std::fs::write("../src/questdb/pystr_to_utf8.pxd", &pxd)?; + Ok(()) +} + +#[cfg(not(feature = "cbindgen"))] +fn main() {} + diff --git a/pystr-to-utf8/cbindgen.toml b/pystr-to-utf8/cbindgen.toml new file mode 100644 index 00000000..d9f4e5d0 --- /dev/null +++ b/pystr-to-utf8/cbindgen.toml @@ -0,0 +1,59 @@ +language = "C" + +header = """/******************************************************************************* + * ___ _ ____ ____ + * / _ \\ _ _ ___ ___| |_| _ \\| __ ) + * | | | | | | |/ _ \\/ __| __| | | | _ \\ + * | |_| | |_| | __/\\__ \\ |_| |_| | |_) | + * \\__\\_\\\\__,_|\\___||___/\\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2022 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/""" + +pragma_once = true + +autogen_warning = "// This header is auto-generated. Do not edit directly!" 
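+
+# Note: this file configures the generated C header (include/pystr_to_utf8.h).
+# The Cython .pxd binding is generated separately from build.rs, which sets
+# the cbindgen language to Cython and post-processes the output.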
+ +# A list of sys headers to #include (with angle brackets) +# default: [] +sys_includes = ["stdint.h", "stddef.h", "stdbool.h"] + +# A list of headers to #include (with quotes) +# default: [] +includes = [] # ["my_great_lib.h"] + +# Suppress cbindgen's default includes. +no_includes = true + +# #ifdef __cplusplus +# extern "C" { +# #endif // __cplusplus +cpp_compat = true + +# Code Style Options +braces = "NextLine" +line_length = 79 +tab_width = 4 +documentation = true +documentation_style = "doxy" + +# Codegen Options +style = "both" +usize_is_size_t = true + +[fn] +args = "vertical" diff --git a/pystr-to-utf8/include/pystr_to_utf8.h b/pystr-to-utf8/include/pystr_to_utf8.h new file mode 100644 index 00000000..cf783284 --- /dev/null +++ b/pystr-to-utf8/include/pystr_to_utf8.h @@ -0,0 +1,113 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2022 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +#pragma once + +// This header is auto-generated. Do not edit directly! + +#include +#include +#include + +typedef struct qdb_pystr_buf qdb_pystr_buf; + +typedef struct qdb_pystr_pos +{ + size_t chain; + size_t string; +} qdb_pystr_pos; + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/** + * Prepare a new buffer. The buffer must be freed with `qdb_pystr_free`. + * The `qdb_ucsX_to_utf8` functions will write to this buffer. + */ +struct qdb_pystr_buf *qdb_pystr_buf_new(void); + +/** + * Get current position. Use in conjunction with `truncate`. + */ +struct qdb_pystr_pos qdb_pystr_buf_tell(const struct qdb_pystr_buf *b); + +/** + * Trim the buffer to the given position. Use in conjunction with `tell`. + */ +void qdb_pystr_buf_truncate(struct qdb_pystr_buf *b, + struct qdb_pystr_pos pos); + +/** + * Reset the converter's buffer to zero length. + */ +void qdb_pystr_buf_clear(struct qdb_pystr_buf *b); + +/** + * Free the buffer. Must be called after `qdb_pystr_buf_new`. + */ +void qdb_pystr_buf_free(struct qdb_pystr_buf *b); + +/** + * Convert a Py_UCS1 string to UTF-8. + * Returns a `buf_out` borrowed ptr of `size_out` len. + * The buffer is borrowed from `b`. + */ +void qdb_ucs1_to_utf8(struct qdb_pystr_buf *b, + size_t count, + const uint8_t *input, + size_t *size_out, + const char **buf_out); + +/** + * Convert a Py_UCS2 string to UTF-8. + * Returns a `buf_out` borrowed ptr of `size_out` len. + * The buffer is borrowed from `b`. + * In case of errors, returns `false` and bad_codepoint_out is set to the + * offending codepoint. 
+ */ +bool qdb_ucs2_to_utf8(struct qdb_pystr_buf *b, + size_t count, + const uint16_t *input, + size_t *size_out, + const char **buf_out, + uint32_t *bad_codepoint_out); + +/** + * Convert a Py_UCS4 string to UTF-8. + * Returns a `buf_out` borrowed ptr of `size_out` len. + * The buffer is borrowed from `b`. + * In case of errors, returns `false` and bad_codepoint_out is set to the + * offending codepoint. + */ +bool qdb_ucs4_to_utf8(struct qdb_pystr_buf *b, + size_t count, + const uint32_t *input, + size_t *size_out, + const char **buf_out, + uint32_t *bad_codepoint_out); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/pystr-to-utf8/src/lib.rs b/pystr-to-utf8/src/lib.rs new file mode 100644 index 00000000..aa58ac7c --- /dev/null +++ b/pystr-to-utf8/src/lib.rs @@ -0,0 +1,286 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2022 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +use std::ffi::c_char; +use std::slice::from_raw_parts; + +#[allow(non_camel_case_types)] +pub struct qdb_pystr_buf(Vec); + +#[repr(C)] +#[allow(non_camel_case_types)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct qdb_pystr_pos { + pub chain: usize, + pub string: usize +} + +/// Prepare a new buffer. The buffer must be freed with `qdb_pystr_free`. +/// The `qdb_ucsX_to_utf8` functions will write to this buffer. +#[no_mangle] +pub unsafe extern "C" fn qdb_pystr_buf_new() -> *mut qdb_pystr_buf { + Box::into_raw(Box::new(qdb_pystr_buf(Vec::new()))) +} + +/// Get current position. Use in conjunction with `truncate`. +#[no_mangle] +pub unsafe extern "C" fn qdb_pystr_buf_tell( + b: *const qdb_pystr_buf) -> qdb_pystr_pos { + let b = &*b; + let chain_pos = b.0.len(); + let string_pos = if chain_pos > 0 { + b.0[chain_pos - 1].len() + } else { + 0 + }; + qdb_pystr_pos { chain: chain_pos, string: string_pos } +} + +/// Trim the buffer to the given position. Use in conjunction with `tell`. +#[no_mangle] +pub unsafe extern "C" fn qdb_pystr_buf_truncate( + b: *mut qdb_pystr_buf, pos: qdb_pystr_pos) { + let b = &mut *b; + b.0.truncate(pos.chain); + if !b.0.is_empty() { + b.0[pos.chain - 1].truncate(pos.string); + } +} + +/// Reset the converter's buffer to zero length. +#[no_mangle] +pub unsafe extern "C" fn qdb_pystr_buf_clear(b: *mut qdb_pystr_buf) { + let b = &mut *b; + if !b.0.is_empty() { + b.0.truncate(1); + b.0[0].clear(); + } +} + +/// Free the buffer. Must be called after `qdb_pystr_buf_new`. 
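+/// Calling this with a `NULL` pointer is a no-op.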
+#[no_mangle] +pub unsafe extern "C" fn qdb_pystr_buf_free(b: *mut qdb_pystr_buf) { + if !b.is_null() { + drop(Box::from_raw(b)); + } +} + +const MIN_BUF_LEN: usize = 1024; + +/// A carefully crafted buffer with spare capacity for `len` bytes. +/// This is necessary to return "stable" addresses and avoid segfaults. +/// Rust is unaware we are borrowing its memory and could try to free it as +/// part of a reallocation if we were to use a `String` directly. +fn get_dest(chain: &mut Vec, len: usize) -> &mut String { + if !chain.is_empty() { + let last = chain.last_mut().unwrap(); + if last.capacity() - last.len() >= len { + return chain.last_mut().unwrap(); + } + } + chain.push(String::with_capacity(std::cmp::max(len, MIN_BUF_LEN))); + chain.last_mut().unwrap() +} + +#[inline(always)] +fn encode_loop<'a, 'b, T, F>( + utf8_mult: usize, + chain: &'a mut Vec, + buf: &'b [T], + get_char: F) -> Result<&'a str, u32> + where + F: Fn(T) -> Option, + T: Copy + Into +{ + let dest = get_dest(chain, utf8_mult * buf.len()); + let last = dest.len(); + // for &b in buf.iter() { + // // Checking for validity is not optional: + // // >>> for n in range(2 ** 16): + // // >>> chr(n).encode('utf-8') + // // UnicodeEncodeError: 'utf-8' codec can't encode character '\ud800' + // // in position 0: surrogates not allowed + // match get_char(b) { + // Some(c) => dest.push(c), + // None => { + // dest.truncate(last); + // return Err(b.into()); + // } + // } + // } + // Ok(&dest[last..]) + unsafe { + let v = dest.as_mut_vec(); + v.set_len(v.capacity()); + let mut index = last; + + for &b in buf.iter() { + let c = match get_char(b) { + Some(c) => c, + None => { + v.set_len(last); + return Err(b.into()) + } + }; + let utf_c_len = c.len_utf8(); + match utf_c_len { + 1 => { + v[index] = c as u8; + }, + 2 => { + let mut codepoint_buf = [0; 4]; + let bytes = c + .encode_utf8(&mut codepoint_buf).as_bytes(); + *v.get_unchecked_mut(index) = + *bytes.get_unchecked(0); + *v.get_unchecked_mut(index + 1) = + *bytes.get_unchecked(1); + }, + 3 => { + let mut codepoint_buf = [0; 4]; + let bytes = c + .encode_utf8(&mut codepoint_buf).as_bytes(); + *v.get_unchecked_mut(index) = + *bytes.get_unchecked(0); + *v.get_unchecked_mut(index + 1) = + *bytes.get_unchecked(1); + *v.get_unchecked_mut(index + 2) = + *bytes.get_unchecked(2); + }, + 4 => { + let mut codepoint_buf = [0; 4]; + let bytes = c + .encode_utf8(&mut codepoint_buf).as_bytes(); + *v.get_unchecked_mut(index) = + *bytes.get_unchecked(0); + *v.get_unchecked_mut(index + 1) = + *bytes.get_unchecked(1); + *v.get_unchecked_mut(index + 2) = + *bytes.get_unchecked(2); + *v.get_unchecked_mut(index + 3) = + *bytes.get_unchecked(3); + }, + _ => unreachable!() + } + index += utf_c_len; + } + v.set_len(index); + } + Ok(&dest[last..]) +} + +/// Convert a Py_UCS1 string to UTF-8. +/// Returns a `buf_out` borrowed ptr of `size_out` len. +/// The buffer is borrowed from `b`. +#[no_mangle] +pub unsafe extern "C" fn qdb_ucs1_to_utf8( + b: *mut qdb_pystr_buf, + count: usize, input: *const u8, + size_out: *mut usize, buf_out: *mut *const c_char) { + let b = &mut *b; + let i = from_raw_parts(input, count); + + // len(chr(2 ** 8 - 1).encode('utf-8')) == 2 + let utf8_mult = 2; + let res = encode_loop( + utf8_mult, + &mut b.0, + i, + |c| Some(c as char)).unwrap(); + *size_out = res.len(); + *buf_out = res.as_ptr() as *const c_char; +} + +/// Convert a Py_UCS2 string to UTF-8. +/// Returns a `buf_out` borrowed ptr of `size_out` len. +/// The buffer is borrowed from `b`. 
+/// In case of errors, returns `false` and bad_codepoint_out is set to the +/// offending codepoint. +#[no_mangle] +pub unsafe extern "C" fn qdb_ucs2_to_utf8(b: *mut qdb_pystr_buf, + count: usize, + input: *const u16, + size_out: *mut usize, + buf_out: *mut *const c_char, + bad_codepoint_out: *mut u32) -> bool { + let b = &mut *b; + let i = from_raw_parts(input, count); + + // len(chr(2 ** 16 - 1).encode('utf-8')) == 3 + let utf8_mult = 3; + let res = encode_loop( + utf8_mult, + &mut b.0, + i, + |c| char::from_u32(c as u32)); + match res { + Ok(s) => { + *size_out = s.len(); + *buf_out = s.as_ptr() as *const c_char; + true + } + Err(bad) => { + *bad_codepoint_out = bad; + false + } + } +} + +/// Convert a Py_UCS4 string to UTF-8. +/// Returns a `buf_out` borrowed ptr of `size_out` len. +/// The buffer is borrowed from `b`. +/// In case of errors, returns `false` and bad_codepoint_out is set to the +/// offending codepoint. +#[no_mangle] +pub unsafe extern "C" fn qdb_ucs4_to_utf8(b: *mut qdb_pystr_buf, + count: usize, + input: *const u32, + size_out: *mut usize, + buf_out: *mut *const c_char, + bad_codepoint_out: *mut u32) -> bool { + let b = &mut *b; + let i = from_raw_parts(input, count); + + // Max 4 bytes allowed by RFC: https://www.rfc-editor.org/rfc/rfc3629#page-4 + let utf8_mult = 4; + let res = encode_loop( + utf8_mult, + &mut b.0, + i, + |c| char::from_u32(c)); + match res { + Ok(s) => { + *size_out = s.len(); + *buf_out = s.as_ptr() as *const c_char; + true + } + Err(bad) => { + *bad_codepoint_out = bad; + false + } + } +} + +#[cfg(test)] +mod tests; diff --git a/pystr-to-utf8/src/tests.rs b/pystr-to-utf8/src/tests.rs new file mode 100644 index 00000000..68da613e --- /dev/null +++ b/pystr-to-utf8/src/tests.rs @@ -0,0 +1,315 @@ +use super::*; + +struct Buf { + buf: *mut qdb_pystr_buf, +} + +impl Buf { + fn new() -> Self { + Self { + buf: unsafe { qdb_pystr_buf_new() }, + } + } + + fn chain(&self) -> &Vec { + unsafe { &(*self.buf).0 } + } + + fn chain_mut(&mut self) -> &mut Vec { + unsafe { &mut (*self.buf).0 } + } + + fn clear(&mut self) { + unsafe { qdb_pystr_buf_clear(self.buf) } + } + + fn tell(&self) -> qdb_pystr_pos { + unsafe { qdb_pystr_buf_tell(self.buf) } + } + + fn truncate(&mut self, pos: qdb_pystr_pos) { + unsafe { qdb_pystr_buf_truncate(self.buf, pos) } + } + + fn ucs1_to_utf8(&mut self, input: &[u8]) -> &'static str { + let mut size_out = 0; + let mut buf_out = std::ptr::null(); + unsafe { + qdb_ucs1_to_utf8( + self.buf, + input.len(), + input.as_ptr(), + &mut size_out, + &mut buf_out); + } + let slice = unsafe { + from_raw_parts(buf_out as *const u8, size_out) }; + std::str::from_utf8(slice).unwrap() + } + + fn ucs2_to_utf8(&mut self, input: &[u16]) -> Result<&'static str, u32> { + let mut size_out = 0; + let mut buf_out = std::ptr::null(); + let mut bad_codepoint = 0u32; + let ok = unsafe { + qdb_ucs2_to_utf8( + self.buf, + input.len(), + input.as_ptr(), + &mut size_out, + &mut buf_out, + &mut bad_codepoint) + }; + if ok { + let slice = unsafe { + from_raw_parts(buf_out as *const u8, size_out) }; + let msg = std::str::from_utf8(slice).unwrap(); + Ok(msg) + } else { + Err(bad_codepoint) + } + } + + fn ucs4_to_utf8(&mut self, input: &[u32]) -> Result<&'static str, u32> { + let mut size_out = 0; + let mut buf_out = std::ptr::null(); + let mut bad_codepoint = 0u32; + let ok = unsafe { + qdb_ucs4_to_utf8( + self.buf, + input.len(), + input.as_ptr(), + &mut size_out, + &mut buf_out, + &mut bad_codepoint) + }; + if ok { + let slice = unsafe { + from_raw_parts(buf_out as 
*const u8, size_out) }; + let msg = std::str::from_utf8(slice).unwrap(); + Ok(msg) + } else { + Err(bad_codepoint) + } + } +} + +impl Drop for Buf { + fn drop(&mut self) { + unsafe { + qdb_pystr_buf_free(self.buf); + } + } +} + +#[test] +fn test_empty() { + let b = Buf::new(); + assert_eq!(b.chain().len(), 0); + let pos = b.tell(); + assert_eq!(pos.chain, 0); + assert_eq!(pos.string, 0); +} + +#[test] +fn test_ucs1() { + let mut b = Buf::new(); + let s1 = b.ucs1_to_utf8(b"hello"); + assert_eq!(s1, "hello"); + assert_eq!(b.chain_mut().len(), 1); + assert_eq!(b.chain_mut()[0].as_str().as_ptr(), s1.as_ptr()); + assert_eq!(b.chain()[0], "hello"); + assert_eq!(b.tell().chain, 1); + assert_eq!(b.tell().string, 5); + b.clear(); + assert_eq!(b.chain().len(), 1); + assert_eq!(b.chain()[0], ""); + let s2 = b.ucs1_to_utf8(b""); + assert_eq!(s2, ""); + assert_eq!(b.tell(), qdb_pystr_pos { chain: 1, string: 0 }); + assert_eq!(s2.as_ptr(), b.chain()[0].as_str().as_ptr()); + let s3 = b.ucs1_to_utf8(b"10\xb5"); + assert_eq!(s3, "10µ"); + assert_eq!(s3.len(), 4); // 3 bytes in UCS-1, 4 bytes in UTF-8. + assert_eq!(b.chain().len(), 1); + assert_eq!(s3.as_ptr(), unsafe { + b.chain()[0].as_str().as_ptr().add(s2.len()) + }); + assert_eq!(b.tell(), qdb_pystr_pos { + chain: 1, string: s2.len() + s3.len() }); +} + +#[test] +fn test_resize_and_truncate() { + let mut b = Buf::new(); + let s1 = b.ucs1_to_utf8(b"abcdefghijklmnopqrstuvwxyz"); + assert_eq!(s1, "abcdefghijklmnopqrstuvwxyz"); + assert_eq!(b.chain_mut().len(), 1); + assert_eq!(b.chain_mut()[0].as_str().as_ptr(), s1.as_ptr()); + + let big_string = "hello world".repeat(1000); + assert!(big_string.len() > MIN_BUF_LEN); + let s2 = b.ucs1_to_utf8(big_string.as_bytes()); + assert_eq!(s2, big_string); + assert_eq!(b.chain_mut().len(), 2); + assert_eq!(b.chain_mut()[0].as_str().as_ptr(), s1.as_ptr()); + assert_eq!(b.chain_mut()[1].as_str().as_ptr(), s2.as_ptr()); + assert_eq!(b.tell(), qdb_pystr_pos { chain: 2, string: 11000 }); + b.truncate(b.tell()); + assert_eq!(b.tell(), qdb_pystr_pos { chain: 2, string: 11000 }); + + let spare = b.chain_mut()[1].capacity() - b.chain_mut()[1].len(); + assert!(spare > 4); + + let test_string = "ab"; + let s3 = b.ucs1_to_utf8(test_string.as_bytes()); + assert_eq!(s3, test_string); + assert_eq!(b.chain_mut().len(), 2); + assert_eq!(b.chain_mut()[0].as_str().as_ptr(), s1.as_ptr()); + assert_eq!(b.chain_mut()[1].as_str().as_ptr(), s2.as_ptr()); + assert_eq!(b.tell(), qdb_pystr_pos { + chain: 2, string: 11000 + test_string.len() }); +} + +#[test] +fn test_ucs2() { + let mut b = Buf::new(); + + // We first check code points within the ASCII range. + let s1 = b.ucs2_to_utf8( + &[0x61, 0x62, 0x63, 0x64, 0x65]).unwrap(); + assert_eq!(s1, "abcde"); + assert_eq!(s1.len(), 5); + + // Now chars outside ASCII range, but within UCS-1 range. + // These will yield two bytes each in UTF-8. + let s2 = b.ucs2_to_utf8( + &[0x00f0, 0x00e3, 0x00b5, 0x00b6]) + .unwrap(); + assert_eq!(s2, "ðãµ¶"); + assert_eq!(s2.len(), 8); + + // Now chars that actually require two bytes in UCS-2, but also fit in + // two bytes in UTF-8. + let s3 = b.ucs2_to_utf8( + &[0x0100, 0x069c]) + .unwrap(); + assert_eq!(s3, "Āڜ"); + assert_eq!(s3.len(), 4); + + // Now chars that require two bytes in UCS-2 and 3 bytes in UTF-8. + let s4 = b.ucs2_to_utf8( + &[0x569c, 0xa4c2]) + .unwrap(); + assert_eq!(s4, "嚜꓂"); + assert_eq!(s4.len(), 6); + + // Quick check that we're just writing to the same buffer. 
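+    // (s1..s4 total 5 + 8 + 4 + 6 = 23 bytes, well under MIN_BUF_LEN == 1024,
+    // so everything is still in the first chunk.)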
+ assert_eq!(b.tell(), qdb_pystr_pos { + chain: 1, + string: [s1, s2, s3, s4].iter().map(|s| s.len()).sum() }); + + // Now we finally check that errors are captured. + // For this, we use a code point which is valid in a Python string + // (in UCS-2), but which is not valid when encoded as UTF-8. + // >>> chr(0xd800).encode('utf-8') + // Traceback (most recent call last): + // File "", line 1, in + // UnicodeEncodeError: 'utf-8' codec can't encode character '\ud800' + // in position 0: surrogates not allowed + let before_pos = b.tell(); + let s5 = b.ucs2_to_utf8(&[0x061, 0xd800]); + assert!(s5.is_err()); + assert_eq!(s5.unwrap_err(), 0xd800 as u32); + + // Even though 0x061 (ASCII char 'a') was valid and successfully encoded, + // we also want to be sure that the buffer was not modified and appended to. + assert_eq!(b.tell(), before_pos); + + // Now we check that the buffer is still in a valid state. + let s6 = b.ucs2_to_utf8(&[0x062, 0x063]).unwrap(); + assert_eq!(s6, "bc"); + assert_eq!(b.tell(), qdb_pystr_pos { + chain: 1, + string: [s1, s2, s3, s4, s6].iter().map(|s| s.len()).sum() }); +} + +#[test] +fn test_ucs4() { + let mut b = Buf::new(); + + // We first check code points within the ASCII range. + let s1 = b.ucs4_to_utf8( + &[0x61, 0x62, 0x63, 0x64, 0x65]).unwrap(); + assert_eq!(s1, "abcde"); + assert_eq!(s1.len(), 5); + + // Now chars outside ASCII range, but within UCS-1 range. + // These will yield two bytes each in UTF-8. + let s2 = b.ucs4_to_utf8( + &[0x00f0, 0x00e3, 0x00b5, 0x00b6]) + .unwrap(); + assert_eq!(s2, "ðãµ¶"); + assert_eq!(s2.len(), 8); + + // Now chars that actually require two bytes in UCS-2, but also fit in + // two bytes in UTF-8. + let s3 = b.ucs4_to_utf8( + &[0x0100, 0x069c]) + .unwrap(); + assert_eq!(s3, "Āڜ"); + assert_eq!(s3.len(), 4); + + // Now chars that require two bytes in UCS-2 and 3 bytes in UTF-8. + let s4 = b.ucs4_to_utf8( + &[0x569c, 0xa4c2]) + .unwrap(); + assert_eq!(s4, "嚜꓂"); + assert_eq!(s4.len(), 6); + + // Now chars that require four bytes in UCS-4 and 4 bytes in UTF-8. + let s5 = b.ucs4_to_utf8( + &[0x1f4a9, 0x1f99e]) + .unwrap(); + assert_eq!(s5, "💩🦞"); + assert_eq!(s5.len(), 8); + + // Quick check that we're just writing to the same buffer. + assert_eq!(b.tell(), qdb_pystr_pos { + chain: 1, + string: [s1, s2, s3, s4, s5].iter().map(|s| s.len()).sum() }); + + // Now we finally check that errors are captured. + // For this, we use a code point which is valid in a Python string + // (in UCS-4), but which is not valid when encoded as UTF-8. + // >>> chr(0xd800).encode('utf-8') + // Traceback (most recent call last): + // File "", line 1, in + // UnicodeEncodeError: 'utf-8' codec can't encode character '\ud800' + // in position 0: surrogates not allowed + let before_pos = b.tell(); + let s6 = b.ucs4_to_utf8(&[0x061, 0xd800]); + assert!(s6.is_err()); + assert_eq!(s6.unwrap_err(), 0xd800 as u32); + + // Even though 0x061 (ASCII char 'a') was valid and successfully encoded, + // we also want to be sure that the buffer was not modified and appended to. + assert_eq!(b.tell(), before_pos); + + // We repeat the same with chars with code points higher than the u16 type. + let before_pos = b.tell(); + let s7 = b.ucs4_to_utf8(&[0x061, 0x110000]); + assert!(s7.is_err()); + assert_eq!(s7.unwrap_err(), 0x110000); + + // Even though 0x061 (ASCII char 'a') was valid and successfully encoded, + // we also want to be sure that the buffer was not modified and appended to. 
+ assert_eq!(b.tell(), before_pos); + + // Now we check that the buffer is still in a valid state. + let s8 = b.ucs4_to_utf8(&[0x062, 0x063]).unwrap(); + assert_eq!(s8, "bc"); + assert_eq!(b.tell(), qdb_pystr_pos { + chain: 1, + string: [s1, s2, s3, s4, s5, s8].iter().map(|s| s.len()).sum() }); +} \ No newline at end of file diff --git a/setup.py b/setup.py index 3b0e93d1..f871e695 100755 --- a/setup.py +++ b/setup.py @@ -21,8 +21,18 @@ WIN_32BIT_CARGO_TARGET = 'i686-pc-windows-msvc' +INSTRUMENT_FUZZING = False +if os.environ.get('TEST_QUESTDB_FUZZING') == '1': + INSTRUMENT_FUZZING = True + ORIG_CC = os.environ.get('CC') + os.environ['CC'] = "clang" + ORIG_CXX = os.environ.get('CXX') + os.environ['CXX'] = "clang++" + + def ingress_extension(): - lib_name = None + lib_prefix = '' + lib_suffix = '' lib_paths = [] libraries = [] extra_compile_args = [] @@ -30,31 +40,50 @@ def ingress_extension(): extra_objects = [] questdb_rs_ffi_dir = PROJ_ROOT / 'c-questdb-client' / 'questdb-rs-ffi' + pystr_to_utf8_dir = PROJ_ROOT / 'pystr-to-utf8' questdb_client_lib_dir = None + pystr_to_utf8_lib_dir = None if PLATFORM == 'win32' and MODE == '32bit': questdb_client_lib_dir = \ questdb_rs_ffi_dir / 'target' / WIN_32BIT_CARGO_TARGET / 'release' + pystr_to_utf8_lib_dir = \ + pystr_to_utf8_dir / 'target' / WIN_32BIT_CARGO_TARGET / 'release' else: questdb_client_lib_dir = questdb_rs_ffi_dir / 'target' / 'release' + pystr_to_utf8_lib_dir = pystr_to_utf8_dir / 'target' / 'release' + if INSTRUMENT_FUZZING: + extra_compile_args.append('-fsanitize=fuzzer-no-link') + extra_link_args.append('-fsanitize=fuzzer-no-link') + else: + extra_compile_args.append('-flto') + extra_link_args.append('-flto') if PLATFORM == 'darwin': - lib_name = 'libquestdb_client.a' - extra_objects = [str(questdb_client_lib_dir / lib_name)] + lib_prefix = 'lib' + lib_suffix = '.a' extra_link_args.extend(['-framework', 'Security']) elif PLATFORM == 'win32': - lib_name = 'questdb_client.lib' - extra_objects = [str(questdb_client_lib_dir / lib_name)] + lib_prefix = '' + lib_suffix = '.lib' libraries.extend(['wsock32', 'ws2_32', 'AdvAPI32', 'bcrypt', 'UserEnv']) elif PLATFORM == 'linux': - lib_name = 'libquestdb_client.a' - extra_objects = [str(questdb_client_lib_dir / lib_name)] + lib_prefix = 'lib' + lib_suffix = '.a' else: raise NotImplementedError(f'Unsupported platform: {PLATFORM}') + extra_objects = [ + str(loc / f'{lib_prefix}{name}{lib_suffix}') + for loc, name in ( + (questdb_client_lib_dir, 'questdb_client'), + (pystr_to_utf8_lib_dir, 'pystr_to_utf8'))] + return Extension( "questdb.ingress", ["src/questdb/ingress.pyx"], - include_dirs=["c-questdb-client/include"], + include_dirs=[ + "c-questdb-client/include", + "pystr-to-utf8/include"], library_dirs=lib_paths, libraries=libraries, extra_compile_args=extra_compile_args, @@ -98,9 +127,25 @@ def cargo_build(): if PLATFORM == 'win32' and MODE == '32bit': cargo_args.append(f'--target={WIN_32BIT_CARGO_TARGET}') + env = os.environ.copy() + if INSTRUMENT_FUZZING: + if ORIG_CC is not None: + env['CC'] = ORIG_CC + else: + del env['CC'] + if ORIG_CXX is not None: + env['CXX'] = ORIG_CXX + else: + del env['CXX'] + subprocess.check_call( + cargo_args, + cwd=str(PROJ_ROOT / 'c-questdb-client' / 'questdb-rs-ffi'), + env=env) + subprocess.check_call( cargo_args, - cwd=str(PROJ_ROOT / 'c-questdb-client' / 'questdb-rs-ffi')) + cwd=str(PROJ_ROOT / 'pystr-to-utf8'), + env=env) class questdb_build_ext(build_ext): diff --git a/src/questdb/arrow_c_data_interface.h b/src/questdb/arrow_c_data_interface.h new file 
mode 100644 index 00000000..d58417e6 --- /dev/null +++ b/src/questdb/arrow_c_data_interface.h @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef ARROW_C_DATA_INTERFACE +#define ARROW_C_DATA_INTERFACE + +#define ARROW_FLAG_DICTIONARY_ORDERED 1 +#define ARROW_FLAG_NULLABLE 2 +#define ARROW_FLAG_MAP_KEYS_SORTED 4 + +struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; +}; + +struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_DATA_INTERFACE + +#ifndef ARROW_C_STREAM_INTERFACE +#define ARROW_C_STREAM_INTERFACE + +struct ArrowArrayStream { + // Callback to get the stream type + // (will be the same for all arrays in the stream). + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowSchema must be released independently from the stream. + int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowArray must be released independently from the stream. + int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. + // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this stream + // (including release). + const char* (*get_last_error)(struct ArrowArrayStream*); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. 
+ void (*release)(struct ArrowArrayStream*); + + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_STREAM_INTERFACE + +#ifdef __cplusplus +} +#endif diff --git a/src/questdb/arrow_c_data_interface.pxd b/src/questdb/arrow_c_data_interface.pxd new file mode 100644 index 00000000..8c0b5472 --- /dev/null +++ b/src/questdb/arrow_c_data_interface.pxd @@ -0,0 +1,38 @@ +from libc.stdint cimport int64_t + +cdef extern from "arrow_c_data_interface.h": + + cdef int ARROW_FLAG_DICTIONARY_ORDERED + cdef int ARROW_FLAG_NULLABLE + cdef int ARROW_FLAG_MAP_KEYS_SORTED + + cdef struct ArrowSchema: + # Array type description + const char* format + const char* name + const char* metadata + int64_t flags + int64_t n_children + ArrowSchema** children + ArrowSchema* dictionary + + # Release callback + void (*release)(ArrowSchema*) + # Opaque producer-specific data + void* private_data + + cdef struct ArrowArray: + # Array data description + int64_t length + int64_t null_count + int64_t offset + int64_t n_buffers + int64_t n_children + const void** buffers + ArrowArray** children + ArrowArray* dictionary + + # Release callback + void (*release)(ArrowArray*) + # Opaque producer-specific data + void* private_data diff --git a/src/questdb/dataframe.md b/src/questdb/dataframe.md new file mode 100644 index 00000000..70928f93 --- /dev/null +++ b/src/questdb/dataframe.md @@ -0,0 +1,661 @@ +# Pandas Integration High-level Overview + +## Goal + +We want to access data in a pandas dataframe from Cython efficiently. +To do this, we need to access its raw memory to traverse it efficiently. +The data held by a dataframe is organized in a columnar fashion. +Each column is a Series object in Python. +Each series object can be backed up by either a Numpy data-structure or +by an Arrow data-structure. + +## Accessing raw Numpy data +To access Numpy data we take the series, call its `.to_numpy()` method +and then access the numpy data as a `Py_buffer`. +* https://docs.python.org/3/c-api/buffer.html +* http://jakevdp.github.io/blog/2014/05/05/introduction-to-the-python-buffer-protocol/ + +## Accessing raw Arrow data +To access Arrow data we first need to unpack each chunk of data at the +Python level giving us `pyarrow` wrapper Array objects. +Each Arrow object in `pyarrow` has a `._export_to_c(..)` python method where we +can pass a Python ints with the addresses to a pre-allocated `ArrowArray` and +`ArrowSchema` C structures. +* https://arrow.apache.org/docs/python/integration/python_java.html + (Ignore the Java part, we just use the same approach for Python to C.) +* https://arrow.apache.org/docs/format/CDataInterface.html +* https://arrow.apache.org/docs/format/Columnar.html#format-columnar + +## Consolidating data access +Now that we've obtained all the pointers we can traverse through the data +without the aid of the Python interpreter (until we hit a Python string in a +Numpy array that is). + +The trouble is, though, that we're dealing with so many potential column types +numpy strides, arrow dictionaries and nullables that we risk having an +unmaintainable spaghetti mess of conditionals, special cases and downright +untestability. + +To tame this and maintain one's sanity we need to remember that we +don't need to support every type, data-structure et cetera that pandas, numpy +and arrow can throw at us: Instead we approach this by only accepting +one-dimensional arrays that support our basic ILP supported types _only_. 
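+
+As a rough, purely illustrative sketch (the real checks live in the Cython
+code and are more granular), the per-column gate amounts to something like
+the following; `_SUPPORTED_DTYPES` and `_check_series` are hypothetical names:
+
+```python
+import pandas as pd
+
+# Hypothetical whitelist; the real per-dtype mapping lives in the Cython code.
+_SUPPORTED_DTYPES = {
+    'bool', 'boolean', 'int64', 'Int64', 'float64', 'Float64',
+    'object', 'string', 'category', 'datetime64[ns]',
+}
+
+def _check_series(name, series: pd.Series):
+    # Reject anything that is not a 1-D column of an ILP-compatible type.
+    values = series.to_numpy()
+    if values.ndim != 1:
+        raise TypeError(f'column {name!r}: only 1-D data is supported')
+    if str(series.dtype) not in _SUPPORTED_DTYPES:
+        raise TypeError(f'column {name!r}: unsupported dtype {series.dtype}')
+```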
+ +We can also further simplify iteration via the introduction of a cursor: +a struct that is a mishmash of the simplified subsets of arrow and py buffers +that we actually care about. + +## Cherry-picking `Py_buffer` and `ArrowArray` features + +First, off the bat, we can exclude supporting some of these structs' fields: + +### `Py_buffer` +_Always one single `Py_buffer` per column. Not chunked._ + +* `void *buf`: Points to the start of our data. **`[NEEDED]`** +* `PyObject *obj`: No need to access Py object again. **`[IGNORED]`** +* `int readonly`: We never write. **`[IGNORED]`** +* `Py_ssize_t len`: We already have the row-count. **`[IGNORED]`** +* `Py_ssize_t itemsize`: It's enough to know our stride. **`[IGNORED]`** +* `int ndim`: We only support 1-D data. **`[VALIDATED]`** +* `Py_ssize_t *shape`: We only support 1-D data. **`[IGNORED]`** +* `Py_ssize_t *strides`: We only need the first value **`[SIMPLIFIED]`** +* `Py_ssize_t *suboffsets`: Numpy shouldn't be using this. **`[VALIDATED]`** +* `void *internal`: Says on the tin. **`[IGNORED]`** + +### `ArrowArray` +_Multiple of these `ArrowArray` structs per column. Chunked._ + +* `int64_t length`: We need it for the length of the chunk. **`[NEEDED]`** +* `int64_t null_count`: Needed as if == 0, null col may be NULL. **`[NEEDED]`** +* `int64_t offset`: Needed to determine number of skipped rows. **`[NEEDED]`** +* `int64_t n_buffers`: A function of the type, not needed. **`[IGNORED]`** +* `int64_t n_children`: A function of the type, not needed. **`[IGNORED]`** +* `const void** buffers`: Data, e.g. buffers[0] is validity bitvec. **`[NEEDED]`** +* `ArrowArray** children`: Needed only for strings where: **`[NEEDED]`** + * `buffers[0]` is nulls bitvec + * `buffers[1]` is int32 offsets buffer + * `children[0]` is ArrowArray of int8 + * See: https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout +* `ArrowArray* dictionary`: Needed to support Pandas categories. + * This ends up being an array of strings, whilst the index is kept in the + parent `buffers[1]` with `buffers[0]` (possibly) as the validity bitmap. + +## Mapping Datatypes + +We can now start to remind ourselves of the destination data-types that we +actually need to support, and see how these map from source column data-types +in both of Numpy and Arrow. + +We need to extract: +* booleans +* 64-bit signed integers +* 64-bit floats +* UTF-8 string buffers +* Nanosecond-precision UTC unix epoch 64-bit signed int timestamps + +```python +import pandas as pd +import pyarrow as pa +``` + +### Booleans + +```python +>>> df = pd.DataFrame({ +... 'bool_col': [True, False, False, True], +... 'obj_bool_col': [True, False, None, False], +... 'nullable_bool_col': pd.array( +... [True, False, None, False], dtype="boolean")}) +``` + +#### Numpy-native representation. +```python +>>> df.dtypes['bool_col'] +dtype('bool') +>>> type(df.dtypes['bool_col']).mro() +[, , ] +>>> df.bool_col.to_numpy().dtype +dtype('bool') +``` + +#### Bools as Python objects +```python +>>> df.obj_bool_col +0 True +1 False +2 None +3 False +Name: obj_bool_col, dtype: object +``` + +It's unclear if this should be supported or not. We might want to and error out +as soon as we encounter either a `None` or a `pandas.NA` object. + +```python +>>> df.obj_bool_col.astype('bool') +0 True +1 False +2 False +3 False +Name: obj_bool_col, dtype: bool +``` + +Lastly, we have what appears to be an Arrow-backed representation. 
+```python +>>> df.dtypes['nullable_bool_col'] +BooleanDtype +>>> type(df.dtypes['nullable_bool_col']).mro() +[, , , ] +``` + +We can convert it and then access its contents: +``` +>>> arr1 = pa.Array.from_pandas(df.nullable_bool_col) +>>> arr1 + +[ + true, + false, + null, + false +] +>>> arr1._export_to_c(.... pointer_refs to ArrowArray and ArrowSchema) +``` + +This last type is represented as two bitmaps. +See: https://docs.rs/arrow-array/26.0.0/src/arrow_array/array/boolean_array.rs.html#107 + +We want to support this representation, but skip out on nulls. +We want to error out as soon as we see a `null`. + +### 64-bit signed integers + +From Numpy's side, we've got a fair few to deal with: +https://numpy.org/doc/stable/user/basics.types.html + +This is all your usual signed/unsigned integers with 8, 16, 32 and 64 bit width. + +The good news is that the default _is_ `int64`: + +```python +>>> df = pd.DataFrame({'n': [1, 2, 3, 4, 5]}) +>>> df.n +0 1 +1 2 +2 3 +3 4 +4 5 +Name: n, dtype: int64 + +>>> df.dtypes['n'] +dtype('int64') + +>>> type(df.dtypes['n']).mro() +[, , ] +``` + +Some of these are going to be in bounds, others out of bounds of 64-bit signed: +Validation needed. + +Pandas also provides its own (arrow-based) nullable integers. + +```python +>>> df2 = pd.DataFrame({'nullable_n': pd.array([1, 2, None], dtype=pd.Int64Dtype())}) +>>> df2.nullable_n +0 1 +1 2 +2 +Name: nullable_n, dtype: Int64 +>>> type(df2.dtypes['nullable_n']).mro() +[, , , , , ] +``` + +We also need to validate against potential byte-order issues as we're not going +to support this until someone asks: +https://pandas.pydata.org/pandas-docs/version/0.19.1/gotchas.html#byte-ordering-issues + +```python +>>> df3 = pd.DataFrame({'big_e': np.array([1, 2, 3, 4]).astype('>u4')}) +>>> df3.big_e +0 1 +1 2 +2 3 +3 4 +Name: big_e, dtype: uint32 +>>> type(df3.dtypes['big_e']).mro() +[, , ] +>>> df3.dtypes['big_e'].byteorder +'>' +``` + + +### 64-bit floats + +32-bit and 64-bit floats. They all support nullability. We will disallow 16-bit +floats. + +64-bit is default. + +```python +>>> df = pd.DataFrame({'a': [None, 1.0, 1.5, 2.0], 'b': pd.Series([None, 1.0, 1.5, 2.0], dtype='float32'), 'c': pd.Series([None, 1.0, 1.5, 2.0], dtype='float64')}) +>>> df + a b c +0 NaN NaN NaN +1 1.0 1.0 1.0 +2 1.5 1.5 1.5 +3 2.0 2.0 2.0 +>>> df.a +0 NaN +1 1.0 +2 1.5 +3 2.0 +Name: a, dtype: float64 +>>> df.b +0 NaN +1 1.0 +2 1.5 +3 2.0 +Name: b, dtype: float32 +>>> df.c +0 NaN +1 1.0 +2 1.5 +3 2.0 +Name: c, dtype: float64 +``` + +#### Arrow floats + +Pandas also has arrow-compatible floats. +These have an additional bitvector to represent nulls. + + + +#### 16-bit floats + +16-bit floats _do exist_ in Pandas, but we will disallow them: + +```python +>>> df = pd.DataFrame({'a': pd.Series([1.0, 1.5, 2.0], dtype='float16')}) +>>> df + a +0 1.0 +1 1.5 +2 2.0 +>>> df.a +0 1.0 +1 1.5 +2 2.0 +Name: a, dtype: float16 +``` + +### UTF-8 string buffers + +Strings are.. hard. Strings in dataframes are harder. + +#### Python Strings + +Numpy usually holds strings as Python objects. + +```python +>>> df = pd.DataFrame({'a': [ +... 'Strings', 'in', 'Pandas', 'are', 'objects', 'by', 'default']}) +>>> df.dtypes['a'] +dtype('O') +>>> type(df.dtypes['a']).mro() +[, , ] +``` + +Ouch. + +Python string objects internally hold buffers that, depending on need are +encoded as one of UCS-1, UCS-2 or UCS-4. These are variable-length arrays of +codepoints. One codepoint per array element. 
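+
+The width is picked per string, based on the largest codepoint it contains
+(CPython's PEP 393 flexible representation). A quick pure-Python way to see
+which bucket a string lands in:
+
+```python
+>>> max(map(ord, 'hello'))   # all codepoints <= 255: stored as UCS-1
+111
+>>> max(map(ord, 'héllo'))   # still <= 255: UCS-1
+233
+>>> max(map(ord, 'h🦞llo'))  # > 0xFFFF: stored as UCS-4
+129438
+```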
+ +In UCS-1 that's 1-byte elements - effectively `uint8_t`, so the highest code +point is `2 ** 8 - 1 == 255`, or in other words: + +```python +>>> chr(255) +'ÿ' +``` + +If a string contains a codepoint with a numeric value higher than this, it would +need UCS-2 or UCS-4. Such representations are backed by `uint16_t` or `uint32_t` +arrays. + +For example, the codepoint for a lobster is 129438. + +```python +>>> ord('🦞') +129438 +``` + +We _could_ ask Python to convert strings to UTF-8 for us, + +```python +>>> '🦞'.encode('utf-8') +b'\xf0\x9f\xa6\x9e' +``` + +but this would require invoking the Python interpreter and the creation of a +gargantuan amount of little temporary objects. + +This is such a common use case that we do the encoding in a supporting Rust +library. See `pystr-to-utf8` in the source tree. + +It accumulates strings in a address-stable buffer (internally a `Vec`) +and allows us to borrow its memory. + +As a side-note, we should also be ready to handle nulls here: + +```python +>>> df = pd.DataFrame({'a': ['interspersed', None, 'in', None, 'data']}) +>>> type(df.a[1]) + +``` + +#### Fixed-length strings + +Numpy also has some fixed-length strings via two datatypes: +* `S`: Bytes +* `U`: Unicode + +```python +>>> df = pd.DataFrame({ +... 'a': np.array(['fixed', 'len', 'strings'], dtype='S'), +... 'b': np.array(['example', 'with', 'unicode 🦞'], dtype='U')}) +>>> df + a b +0 b'fixed' example +1 b'len' with +2 b'strings' unicode 🦞 +``` + +It doesn't really matter much though. Their Pandas datatype is actually just +`'O'` (object). + +```python +>>> df.dtypes['a'] +dtype('O') +>>> df.dtypes['b'] +dtype('O') +>>> type(df.dtypes['b']) + +``` + +We should: +* reject the first one (because in Python3 bytes aren't strings) - We lack the powers to guess which text encoding was used. It's usually `latin-1`, but was it? + ```python + >>> type(df.a[0]) + + ``` +* Accept the second one without further optimisations: + ```python + >>> type(df.b[0]) + + ``` + +#### Pandas `string[object]` dtype + +Since the `'O'` dtype could hold anything (not just strings), Pandas introduced a new column type that ensures the column only holds strings. + +```python +>>> df = pd.DataFrame({'a': pd.Series(['another', None, 'str', 'example'], dtype='string')}) +>>> df + a +0 another +1 +2 str +3 example +>>> df.dtypes['a'] +string[python] +>>> type(df.dtypes['a']).mro() +[, , , ] +``` + +Note that by default the storage is still Python objects (sigh), +so our Rust-based conversion will come handy here as well. + +Note however that we need to handle nulls not as `None` objects, +but as `pandas.NA` objects. + +```python +>>> df.a[1] + +``` + +At other times, we end up with `nan` python float objects to represent nulls. +_Yay!_. + +#### Arrow-backed Strings + +Finally - as we would expect when obtaining a frame from something like Parquet - there's string columns in UTF-8-native format backed by Arrow. + +_note the different `dtype`:_ + +```python +df = pd.DataFrame({'a': pd.Series(['arrow', None, 'str', 'example'], dtype='string[pyarrow]')}) +``` + +``` +>>> df = pd.DataFrame({'a': pd.Series(['arrow', None, 'str', 'example'], dtype='string[pyarrow]')}) +>>> df + a +0 arrow +1 +2 str +3 example +>>> df.dtypes['a'] +string[pyarrow] +>>> type(df.dtypes['a']).mro() +[, , , ] +``` + +Note that these strings will always have indices based on `int32_t`. + +Arrow also has a `pyarrow.large_string()` type, but +pandas doesn't support it. + +#### Symbol-like Categorical Data + +Pandas supports categories. 
These are backed by Arrow. + +```python +>>> df = pd.DataFrame({'a': pd.Series( +... ['symbol', 'like', 'type', 'symbol', 'like', 'like', 'like', None], +... dtype='category')}) +>>> df + a +0 symbol +1 like +2 type +3 symbol +4 like +5 like +6 like +7 NaN +>>> df.dtypes['a'] +CategoricalDtype(categories=['like', 'symbol', 'type'], ordered=False) +>>> type(df.dtypes['a']).mro() +[, , , ] +``` + +This is how it's represented: + +```python +>>> pa.Array.from_pandas(df.a) + + +-- dictionary: + [ + "like", + "symbol", + "type" + ] +-- indices: + [ + 1, + 0, + 2, + 1, + 0, + 0, + 0, + null + ] +``` + +For this, we need the `dictionary` field in the `ArrowArray` struct. + +What's also neat is that we know the categories in advance _before_ running the +encoding. This means we can build up our `line_sender_utf8` objects in advance, +though they are all UTF-8 buffers already so.. little gain. + + +### Nanosecond-precision UTC unix epoch 64-bit signed int timestamps + +#### Timezone-free timestamp + +```python +>>> n1 = pd.Timestamp(dt.datetime.utcnow()) +>>> n2 = pd.Timestamp(dt.datetime.utcnow()) +>>> df = pd.DataFrame({'a': [n1, n2]}) +>>> df + a +0 2022-11-15 17:47:23.131445 +1 2022-11-15 17:47:26.943899 +``` + +The data is held as nanos since unix epoch as a 64-bit int. +```python +>>> df.dtypes['a'] +dtype('>> type(df.dtypes['a']).mro() +[, , ] +``` + +This matches our own designated timestamp representation and we just need to convert to micros for the rest of the columns. + +Null values _are_ supported. + +```python +>>> df = pd.DataFrame({'a': [n1, n2, None]}) +>>> df + a +0 2022-11-15 17:47:23.131445 +1 2022-11-15 17:47:26.943899 +2 NaT +``` + +Unclear what the sentinel value for `NaT` is yet, but we want to map it internally to 0 for the designated timestamp and to recognise it +and skip the column otherwise. + +#### Additionally, we can also have datetimes with a timezone + +```python +>>> ts = pd.Timestamp( +... year=2020, month=1, day=1, hour=12, minute=0, second=0, +... tz=zoneinfo.ZoneInfo('America/Los_Angeles')) +>>> df = pd.DataFrame({'a': [ts]}) +>>> df.dtypes['a'] +datetime64[ns, America/Los_Angeles] +>>> type(_) + +>>> df.dtypes['a'].tz +zoneinfo.ZoneInfo(key='America/Los_Angeles') +``` + +The good news here is that the timestamp is still held as UTC (regardless of +timezone), so no timezone conversion logic is required here. + +```python +>>> pa.Array.from_pandas(df.a) + +[ + 2020-01-01 20:00:00.000000000 +] +``` + +**Note**: We need PyArrow to access the buffer, or we need to convert to +`datetime64[ns]`. + + +## Strided Numpy Arrays + +Numpy arrays need not be contiguous. In Pandas, however, we +need not worry about this. + +If we construct a `(4, 3)`-shaped 2D numpy array + +```python +>>> import numpy as np +>>> a1 = np.array([[1, 10, 100], [2, 20, 200], [3, 30, 300], [4, 40, 400]]) +>>> a1 +array([[ 1, 10, 100], + [ 2, 20, 200], + [ 3, 30, 300], + [ 4, 40, 400]]) +>>> a1.dtype +dtype('int64') +``` + +and then select it's second column + +```python +>>> a2 = a1[:, 1] +>>> a2 +array([10, 20, 30, 40]) +``` + +We encounter a non-contiguous array. 
+ +```python +>>> a2.data + +>>> a2.data.contiguous +False +>>> a2.data.strides +(24,) +``` + +If we then wrap up the array in a dataframe and convert the series back to numpy + +```python +>>> df = pd.DataFrame({'a': a2}) +>>> df + a +0 10 +1 20 +2 30 +3 40 +>>> df.a +0 10 +1 20 +2 30 +3 40 +Name: a, dtype: int64 +>>> a3 = df.a.to_numpy() +``` + +We see that we get a new object back, and that the new object actually _is_ +contiguous. + +```python +>>> id(a2) +140389455034672 +>>> id(a3) +140388032511696 +>>> a3.data + +>>> a3.data.contiguous +True +``` + +For this reason, supporting strides is not necessary. + + +## Unified Cursor + +TO BE CONTINUED diff --git a/src/questdb/dataframe.pxi b/src/questdb/dataframe.pxi new file mode 100644 index 00000000..62cfd365 --- /dev/null +++ b/src/questdb/dataframe.pxi @@ -0,0 +1,2275 @@ +# See: dataframe.md for technical overview. + +cdef struct auto_flush_t: + line_sender* sender + size_t watermark + + +cdef auto_flush_t auto_flush_blank(): + cdef auto_flush_t af + af.sender = NULL + af.watermark = 0 + return af + + +cdef struct col_chunks_t: + size_t n_chunks + ArrowArray* chunks # We calloc `n_chunks + 1` of these. + + +cdef struct col_cursor_t: + ArrowArray* chunk # Current chunk. + size_t chunk_index + size_t offset # i.e. the element index (not byte offset) + + +cdef enum col_target_t: + col_target_skip = 0 + col_target_table = 1 + col_target_symbol = 2 + col_target_column_bool = 3 + col_target_column_i64 = 4 + col_target_column_f64 = 5 + col_target_column_str = 6 + col_target_column_ts = 7 + col_target_at = 8 + + +cdef dict _TARGET_NAMES = { + col_target_t.col_target_skip: "skipped", + col_target_t.col_target_table: "table name", + col_target_t.col_target_symbol: "symbol", + col_target_t.col_target_column_bool: "boolean", + col_target_t.col_target_column_i64: "integer", + col_target_t.col_target_column_f64: "float", + col_target_t.col_target_column_str: "string", + col_target_t.col_target_column_ts: "timestamp", + col_target_t.col_target_at: "designated timestamp", +} + + +cdef enum col_source_t: + # Note: Hundreds digit set to 1 if GIL is required. + col_source_nulls = 0 + col_source_bool_pyobj = 101100 + col_source_bool_numpy = 102000 + col_source_bool_arrow = 103000 + col_source_int_pyobj = 201100 + col_source_u8_numpy = 202000 + col_source_i8_numpy = 203000 + col_source_u16_numpy = 204000 + col_source_i16_numpy = 205000 + col_source_u32_numpy = 206000 + col_source_i32_numpy = 207000 + col_source_u64_numpy = 208000 + col_source_i64_numpy = 209000 + col_source_u8_arrow = 210000 + col_source_i8_arrow = 211000 + col_source_u16_arrow = 212000 + col_source_i16_arrow = 213000 + col_source_u32_arrow = 214000 + col_source_i32_arrow = 215000 + col_source_u64_arrow = 216000 + col_source_i64_arrow = 217000 + col_source_float_pyobj = 301100 + col_source_f32_numpy = 302000 + col_source_f64_numpy = 303000 + col_source_f32_arrow = 304000 + col_source_f64_arrow = 305000 + col_source_str_pyobj = 401100 + col_source_str_arrow = 402000 + col_source_str_i8_cat = 403000 + col_source_str_i16_cat = 404000 + col_source_str_i32_cat = 405000 + col_source_dt64ns_numpy = 501000 + col_source_dt64ns_tz_arrow = 502000 + + +cdef bint col_source_needs_gil(col_source_t source): + # Check if hundreds digit is 1. 
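+    # e.g. col_source_bool_pyobj == 101100 -> 101100 // 100 % 10 == 1 (GIL),
+    #      col_source_bool_numpy == 102000 -> 102000 // 100 % 10 == 0 (no GIL).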
+ return source // 100 % 10 == 1 + + +cdef set _STR_SOURCES = { + col_source_t.col_source_str_pyobj, + col_source_t.col_source_str_arrow, + col_source_t.col_source_str_i8_cat, + col_source_t.col_source_str_i16_cat, + col_source_t.col_source_str_i32_cat, +} + + +cdef dict _PYOBJ_SOURCE_DESCR = { + col_source_t.col_source_bool_pyobj: "bool", + col_source_t.col_source_int_pyobj: "int", + col_source_t.col_source_float_pyobj: "float", + col_source_t.col_source_str_pyobj: "str", +} + + +cdef dict _TARGET_TO_SOURCES = { + col_target_t.col_target_skip: { + col_source_t.col_source_nulls, + }, + col_target_t.col_target_table: { + col_source_t.col_source_str_pyobj, + col_source_t.col_source_str_arrow, + col_source_t.col_source_str_i8_cat, + col_source_t.col_source_str_i16_cat, + col_source_t.col_source_str_i32_cat, + }, + col_target_t.col_target_symbol: { + col_source_t.col_source_str_pyobj, + col_source_t.col_source_str_arrow, + col_source_t.col_source_str_i8_cat, + col_source_t.col_source_str_i16_cat, + col_source_t.col_source_str_i32_cat, + }, + col_target_t.col_target_column_bool: { + col_source_t.col_source_bool_pyobj, + col_source_t.col_source_bool_numpy, + col_source_t.col_source_bool_arrow, + }, + col_target_t.col_target_column_i64: { + col_source_t.col_source_int_pyobj, + col_source_t.col_source_u8_numpy, + col_source_t.col_source_i8_numpy, + col_source_t.col_source_u16_numpy, + col_source_t.col_source_i16_numpy, + col_source_t.col_source_u32_numpy, + col_source_t.col_source_i32_numpy, + col_source_t.col_source_u64_numpy, + col_source_t.col_source_i64_numpy, + col_source_t.col_source_u8_arrow, + col_source_t.col_source_i8_arrow, + col_source_t.col_source_u16_arrow, + col_source_t.col_source_i16_arrow, + col_source_t.col_source_u32_arrow, + col_source_t.col_source_i32_arrow, + col_source_t.col_source_u64_arrow, + col_source_t.col_source_i64_arrow, + }, + col_target_t.col_target_column_f64: { + col_source_t.col_source_float_pyobj, + col_source_t.col_source_f32_numpy, + col_source_t.col_source_f64_numpy, + col_source_t.col_source_f32_arrow, + col_source_t.col_source_f64_arrow, + }, + col_target_t.col_target_column_str: { + col_source_t.col_source_str_pyobj, + col_source_t.col_source_str_arrow, + col_source_t.col_source_str_i8_cat, + col_source_t.col_source_str_i16_cat, + col_source_t.col_source_str_i32_cat, + }, + col_target_t.col_target_column_ts: { + col_source_t.col_source_dt64ns_numpy, + col_source_t.col_source_dt64ns_tz_arrow, + }, + col_target_t.col_target_at: { + col_source_t.col_source_dt64ns_numpy, + col_source_t.col_source_dt64ns_tz_arrow, + }, +} + + +# Targets associated with col_meta_target.field. +cdef tuple _FIELD_TARGETS = ( + col_target_t.col_target_skip, + col_target_t.col_target_column_bool, + col_target_t.col_target_column_i64, + col_target_t.col_target_column_f64, + col_target_t.col_target_column_str, + col_target_t.col_target_column_ts) + + +# Targets that map directly from a meta target. +cdef set _DIRECT_META_TARGETS = { + col_target_t.col_target_table, + col_target_t.col_target_symbol, + col_target_t.col_target_at, +} + + +# This is verbose, but.. +# * Enums give us constants. +# * Constants allow unfolding `if` statements into `switch` +# * Switch statements can be more heavily optimized by the C compiler. 
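+# Worked example of the composition used below:
+#   col_target_column_f64 (5) + col_source_f64_numpy (303000) == 303005,
+#   which is the value behind col_dispatch_code_column_f64__f64_numpy.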
+cdef enum col_dispatch_code_t: + col_dispatch_code_skip_nulls = \ + col_target_t.col_target_skip + col_source_t.col_source_nulls + + col_dispatch_code_table__str_pyobj = \ + col_target_t.col_target_table + col_source_t.col_source_str_pyobj + col_dispatch_code_table__str_arrow = \ + col_target_t.col_target_table + col_source_t.col_source_str_arrow + col_dispatch_code_table__str_i8_cat = \ + col_target_t.col_target_table + col_source_t.col_source_str_i8_cat + col_dispatch_code_table__str_i16_cat = \ + col_target_t.col_target_table + col_source_t.col_source_str_i16_cat + col_dispatch_code_table__str_i32_cat = \ + col_target_t.col_target_table + col_source_t.col_source_str_i32_cat + + col_dispatch_code_symbol__str_pyobj = \ + col_target_t.col_target_symbol + col_source_t.col_source_str_pyobj + col_dispatch_code_symbol__str_arrow = \ + col_target_t.col_target_symbol + col_source_t.col_source_str_arrow + col_dispatch_code_symbol__str_i8_cat = \ + col_target_t.col_target_symbol + col_source_t.col_source_str_i8_cat + col_dispatch_code_symbol__str_i16_cat = \ + col_target_t.col_target_symbol + col_source_t.col_source_str_i16_cat + col_dispatch_code_symbol__str_i32_cat = \ + col_target_t.col_target_symbol + col_source_t.col_source_str_i32_cat + + col_dispatch_code_column_bool__bool_pyobj = \ + col_target_t.col_target_column_bool + col_source_t.col_source_bool_pyobj + col_dispatch_code_column_bool__bool_numpy = \ + col_target_t.col_target_column_bool + col_source_t.col_source_bool_numpy + col_dispatch_code_column_bool__bool_arrow = \ + col_target_t.col_target_column_bool + col_source_t.col_source_bool_arrow + + col_dispatch_code_column_i64__int_pyobj = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_int_pyobj + col_dispatch_code_column_i64__u8_numpy = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_u8_numpy + col_dispatch_code_column_i64__i8_numpy = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_i8_numpy + col_dispatch_code_column_i64__u16_numpy = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_u16_numpy + col_dispatch_code_column_i64__i16_numpy = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_i16_numpy + col_dispatch_code_column_i64__u32_numpy = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_u32_numpy + col_dispatch_code_column_i64__i32_numpy = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_i32_numpy + col_dispatch_code_column_i64__u64_numpy = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_u64_numpy + col_dispatch_code_column_i64__i64_numpy = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_i64_numpy + col_dispatch_code_column_i64__u8_arrow = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_u8_arrow + col_dispatch_code_column_i64__i8_arrow = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_i8_arrow + col_dispatch_code_column_i64__u16_arrow = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_u16_arrow + col_dispatch_code_column_i64__i16_arrow = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_i16_arrow + col_dispatch_code_column_i64__u32_arrow = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_u32_arrow + col_dispatch_code_column_i64__i32_arrow = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_i32_arrow + col_dispatch_code_column_i64__u64_arrow = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_u64_arrow + 
col_dispatch_code_column_i64__i64_arrow = \ + col_target_t.col_target_column_i64 + col_source_t.col_source_i64_arrow + + col_dispatch_code_column_f64__float_pyobj = \ + col_target_t.col_target_column_f64 + col_source_t.col_source_float_pyobj + col_dispatch_code_column_f64__f32_numpy = \ + col_target_t.col_target_column_f64 + col_source_t.col_source_f32_numpy + col_dispatch_code_column_f64__f64_numpy = \ + col_target_t.col_target_column_f64 + col_source_t.col_source_f64_numpy + col_dispatch_code_column_f64__f32_arrow = \ + col_target_t.col_target_column_f64 + col_source_t.col_source_f32_arrow + col_dispatch_code_column_f64__f64_arrow = \ + col_target_t.col_target_column_f64 + col_source_t.col_source_f64_arrow + + col_dispatch_code_column_str__str_pyobj = \ + col_target_t.col_target_column_str + col_source_t.col_source_str_pyobj + col_dispatch_code_column_str__str_arrow = \ + col_target_t.col_target_column_str + col_source_t.col_source_str_arrow + col_dispatch_code_column_str__str_i8_cat = \ + col_target_t.col_target_column_str + col_source_t.col_source_str_i8_cat + col_dispatch_code_column_str__str_i16_cat = \ + col_target_t.col_target_column_str + col_source_t.col_source_str_i16_cat + col_dispatch_code_column_str__str_i32_cat = \ + col_target_t.col_target_column_str + col_source_t.col_source_str_i32_cat + + col_dispatch_code_column_ts__dt64ns_numpy = \ + col_target_t.col_target_column_ts + col_source_t.col_source_dt64ns_numpy + col_dispatch_code_column_ts__dt64ns_tz_arrow = \ + col_target_t.col_target_column_ts + \ + col_source_t.col_source_dt64ns_tz_arrow + + col_dispatch_code_at__dt64ns_numpy = \ + col_target_t.col_target_at + col_source_t.col_source_dt64ns_numpy + col_dispatch_code_at__dt64ns_tz_arrow = \ + col_target_t.col_target_at + col_source_t.col_source_dt64ns_tz_arrow + + +# Int values in order for sorting (as needed for API's sequential coupling). +cdef enum meta_target_t: + meta_target_table = col_target_t.col_target_table + meta_target_symbol = col_target_t.col_target_symbol + meta_target_field = col_target_t.col_target_column_bool + meta_target_at = col_target_t.col_target_at + + +cdef struct col_setup_t: + col_chunks_t chunks + size_t orig_index + Py_buffer pybuf + ArrowSchema arrow_schema # Schema of first chunk. + col_source_t source + meta_target_t meta_target + col_target_t target + + +cdef struct col_t: + col_dispatch_code_t dispatch_code # source + target. Determines serializer. + line_sender_column_name name + col_cursor_t cursor + col_setup_t* setup # Grouping to reduce size of struct. + + +cdef void col_t_release(col_t* col): + """ + Release a (possibly) initialized column. + + col_t objects are `calloc`ed, so uninitialized (or partially) initialized + objects will have their pointers and other values set to 0. + """ + cdef size_t chunk_index + cdef ArrowArray* chunk + + if Py_buffer_obj_is_set(&col.setup.pybuf): + PyBuffer_Release(&col.setup.pybuf) # Note: Sets `.pybuf.obj` to NULL. + + for chunk_index in range(col.setup.chunks.n_chunks): + chunk = &col.setup.chunks.chunks[chunk_index] + if chunk.release != NULL: + chunk.release(chunk) + memset(chunk, 0, sizeof(ArrowArray)) + + if col.setup.arrow_schema.release != NULL: + col.setup.arrow_schema.release(&col.setup.arrow_schema) + + free(col.setup.chunks.chunks) + col.setup.chunks.chunks = NULL + col.setup.chunks.n_chunks = 0 + + free(col.setup) + col.setup = NULL + + +# Calloc'd array of col_t. 
+cdef struct col_t_arr: + size_t size + col_t* d + + +cdef col_t_arr col_t_arr_blank(): + cdef col_t_arr arr + arr.size = 0 + arr.d = NULL + return arr + + +cdef col_t_arr col_t_arr_new(size_t size): + cdef col_t_arr arr + cdef size_t index + arr.size = size + arr.d = calloc(size, sizeof(col_t)) + for index in range(size): + arr.d[index].setup = calloc(1, sizeof(col_setup_t)) + return arr + + +cdef void col_t_arr_release(col_t_arr* arr): + cdef size_t index + if arr.d: + for index in range(arr.size): + col_t_release(&arr.d[index]) + free(arr.d) + arr.size = 0 + arr.d = NULL + + +cdef object _NUMPY = None # module object +cdef object _NUMPY_BOOL = None +cdef object _NUMPY_UINT8 = None +cdef object _NUMPY_INT8 = None +cdef object _NUMPY_UINT16 = None +cdef object _NUMPY_INT16 = None +cdef object _NUMPY_UINT32 = None +cdef object _NUMPY_INT32 = None +cdef object _NUMPY_UINT64 = None +cdef object _NUMPY_INT64 = None +cdef object _NUMPY_FLOAT32 = None +cdef object _NUMPY_FLOAT64 = None +cdef object _NUMPY_DATETIME64_NS = None +cdef object _NUMPY_OBJECT = None +cdef object _PANDAS = None # module object +cdef object _PANDAS_NA = None # pandas.NA +cdef object _PYARROW = None # module object, if available or None + +cdef int64_t _NAT = INT64_MIN # pandas NaT + + +cdef object _dataframe_may_import_deps(): + """" + Lazily import module dependencies on first use to avoid startup overhead. + + $ cat imp_test.py + import numpy + import pandas + import pyarrow + + $ time python3 ./imp_test.py + python3 ./imp_test.py 0.56s user 1.60s system 852% cpu 0.254 total + """ + global _NUMPY, _PANDAS, _PYARROW, _PANDAS_NA + global _NUMPY_BOOL + global _NUMPY_UINT8 + global _NUMPY_INT8 + global _NUMPY_UINT16 + global _NUMPY_INT16 + global _NUMPY_UINT32 + global _NUMPY_INT32 + global _NUMPY_UINT64 + global _NUMPY_INT64 + global _NUMPY_FLOAT32 + global _NUMPY_FLOAT64 + global _NUMPY_DATETIME64_NS + global _NUMPY_OBJECT + if _NUMPY is not None: + return + try: + import pandas + import numpy + import pyarrow + except ImportError as ie: + raise ImportError( + 'Missing dependencies: `pandas`, `numpy` and `pyarrow` must all ' + + 'be installed to use the `.dataframe()` method. ' + + 'See: https://py-questdb-client.readthedocs.io/' + + 'en/latest/installation.html.') from ie + _NUMPY = numpy + _NUMPY_BOOL = type(_NUMPY.dtype('bool')) + _NUMPY_UINT8 = type(_NUMPY.dtype('uint8')) + _NUMPY_INT8 = type(_NUMPY.dtype('int8')) + _NUMPY_UINT16 = type(_NUMPY.dtype('uint16')) + _NUMPY_INT16 = type(_NUMPY.dtype('int16')) + _NUMPY_UINT32 = type(_NUMPY.dtype('uint32')) + _NUMPY_INT32 = type(_NUMPY.dtype('int32')) + _NUMPY_UINT64 = type(_NUMPY.dtype('uint64')) + _NUMPY_INT64 = type(_NUMPY.dtype('int64')) + _NUMPY_FLOAT32 = type(_NUMPY.dtype('float32')) + _NUMPY_FLOAT64 = type(_NUMPY.dtype('float64')) + _NUMPY_DATETIME64_NS = type(_NUMPY.dtype('datetime64[ns]')) + _NUMPY_OBJECT = type(_NUMPY.dtype('object')) + _PANDAS = pandas + _PANDAS_NA = pandas.NA + _PYARROW = pyarrow + + +cdef object _dataframe_check_is_dataframe(object df): + if not isinstance(df, _PANDAS.DataFrame): + raise IngressError( + IngressErrorCode.InvalidApiCall, + f'Bad argument `df`: Expected {_fqn(_PANDAS.DataFrame)}, ' + + f'not an object of type {_fqn(type(df))}.') + + +cdef ssize_t _dataframe_resolve_table_name( + qdb_pystr_buf* b, + object df, + list pandas_cols, + col_t_arr* cols, + object table_name, + object table_name_col, + size_t col_count, + line_sender_table_name* name_out) except -2: + """ + Resolve the table name string or column. 
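+
+    Resolution order: an explicit `table_name` string wins, else
+    `table_name_col` (given as a column name or index), else the
+    dataframe's index name (`df.index.name`).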
+ + Returns -1 if the table name is a string, otherwise the column index. + """ + cdef size_t col_index = 0 + cdef PandasCol pandas_col + cdef col_t* col + if table_name is not None: + if table_name_col is not None: + raise ValueError( + 'Can specify only one of `table_name` or `table_name_col`.') + if isinstance(table_name, str): + try: + str_to_table_name_copy(b, table_name, name_out) + return -1 # Magic value for "no column index". + except IngressError as ie: + raise ValueError( + f'Bad argument `table_name`: {ie}') + else: + raise TypeError('Bad argument `table_name`: Must be str.') + elif table_name_col is not None: + if isinstance(table_name_col, str): + _dataframe_get_loc(df, table_name_col, 'table_name_col', &col_index) + elif isinstance(table_name_col, int): + _bind_col_index( + 'table_name_col', table_name_col, col_count, &col_index) + else: + raise TypeError( + 'Bad argument `table_name_col`: ' + + 'must be a column name (str) or index (int).') + pandas_col = pandas_cols[col_index] + col = &cols.d[col_index] + _dataframe_check_column_is_str( + 'Bad argument `table_name_col`: ', + pandas_col, + col.setup.source) + col.setup.meta_target = meta_target_t.meta_target_table + name_out.len = 0 + name_out.buf = NULL + return col_index + elif df.index.name: + if not isinstance(df.index.name, str): + raise TypeError( + 'Bad dataframe index name as table name: Expected str, ' + + f'not an object of type {_fqn(type(df.index.name))}.') + + # If the index has a name, use that as the table name. + try: + str_to_table_name_copy(b, df.index.name, name_out) + return -1 # Magic value for "no column index". + except IngressError as ie: + raise ValueError( + f'Bad dataframe index name as table name: {ie}') + else: + raise ValueError( + 'Must specify at least one of `table_name` or `table_name_col`, ' + + 'or set the dataframe index name (df.index.name = \'tbl_name\').') + + +cdef void_int _bind_col_index( + str arg_name, int col_num, size_t col_count, + size_t* col_index) except -1: + """ + Validate that `col_index` is in bounds for `col_count`. + This function also converts negative indicies (e.g. -1 for last column) to + positive indicies. + """ + cdef bint bad = False + cdef int orig_col_num = col_num + if col_num < 0: + col_num += col_count # Try convert negative offsets to positive ones. 
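+        # e.g. col_num == -1 with col_count == 4 maps to column index 3 (the last column).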
+ if col_num < 0: + bad = True + if (not bad) and (col_num >= col_count): + bad = True + if bad: + raise IndexError( + f'Bad argument `{arg_name}`: {orig_col_num} index out of range') + col_index[0] = col_num + + +cdef void_int _dataframe_check_column_is_str( + str err_msg_prefix, + PandasCol pandas_col, + col_source_t source) except -1: + cdef str inferred_descr = "" + if not source in _STR_SOURCES: + if isinstance(pandas_col.dtype, _NUMPY_OBJECT): + inferred_descr = f' (inferred type: {_PYOBJ_SOURCE_DESCR[source]})' + raise IngressError( + IngressErrorCode.BadDataFrame, + err_msg_prefix + + f'Bad dtype `{pandas_col.dtype}`{inferred_descr} for the ' + + f'{pandas_col.name!r} column: Must be a strings column.') + + +@cython.internal +cdef class PandasCol: + """Python object representing a column to parse .dataframe() arguments.""" + cdef str name + cdef object dtype + cdef object series + + def __init__( + self, + str name, + object dtype, + object series): + self.name = name + self.dtype = dtype + self.series = series + + +cdef void_int _dataframe_resolve_symbols( + object df, + list pandas_cols, + col_t_arr* cols, + ssize_t table_name_col, + ssize_t at_col, + object symbols) except -1: + cdef size_t col_index = 0 + cdef object symbol + cdef PandasCol pandas_col + cdef col_t* col + if symbols == 'auto': + for col_index in range(cols.size): + pandas_col = pandas_cols[col_index] + col = &cols.d[col_index] + if col.setup.meta_target == meta_target_t.meta_target_field: + if isinstance(pandas_col.dtype, _PANDAS.CategoricalDtype): + col.setup.meta_target = meta_target_t.meta_target_symbol + elif symbols is False: + pass + elif symbols is True: + for col_index in range(cols.size): + col = &cols.d[col_index] + if col.setup.source in _STR_SOURCES: + pandas_col = pandas_cols[col_index] + if col.setup.meta_target == meta_target_t.meta_target_field: + col.setup.meta_target = meta_target_t.meta_target_symbol + else: + if not isinstance(symbols, (tuple, list)): + raise TypeError( + f'Bad argument `symbols`: Must be a bool or a tuple or list '+ + 'of column names (str) or indices (int).') + for symbol in symbols: + if isinstance(symbol, str): + _dataframe_get_loc(df, symbol, 'symbols', &col_index) + elif isinstance(symbol, int): + _bind_col_index('symbol', symbol, cols.size, &col_index) + else: + raise TypeError( + f'Bad argument `symbols`: Elements must ' + + 'be a column name (str) or index (int).') + if (table_name_col >= 0) and (col_index == table_name_col): + raise ValueError( + f'Bad argument `symbols`: Cannot use the same column ' + + f'{symbol!r} as both the table_name and as a symbol.') + if (at_col >= 0) and (col_index == at_col): + raise ValueError( + f'Bad argument `symbols`: Cannot use the `at` column ' + + f'({df.columns[at_col]!r}) as a symbol column.') + pandas_col = pandas_cols[col_index] + col = &cols.d[col_index] + _dataframe_check_column_is_str( + 'Bad argument `symbols`: ', + pandas_col, + col.setup.source) + col.setup.meta_target = meta_target_t.meta_target_symbol + + +cdef void_int _dataframe_get_loc( + object df, str col_name, str arg_name, + size_t* col_index_out) except -1: + """ + Return the column index for `col_name`. + """ + try: + col_index_out[0] = df.columns.get_loc(col_name) + except KeyError: + raise KeyError( + f'Bad argument `{arg_name}`: ' + + f'Column {col_name!r} not found in the dataframe.') + + +# The values -2 and -1 are safe to use as a sentinel because the TimestampNanos +# type already validates that the value is >= 0. 
+cdef int64_t _AT_IS_SERVER_NOW = -2 +cdef int64_t _AT_IS_SET_BY_COLUMN = -1 + + +cdef str _SUPPORTED_DATETIMES = 'datetime64[ns] or datetime64[ns, tz]' + + +cdef object _dataframe_is_supported_datetime(object dtype): + if (isinstance(dtype, _NUMPY_DATETIME64_NS) and + (str(dtype) == 'datetime64[ns]')): + return True + if isinstance(dtype, _PANDAS.DatetimeTZDtype): + return dtype.unit == 'ns' + return False + + +cdef ssize_t _dataframe_resolve_at( + object df, + col_t_arr* cols, + object at, + size_t col_count, + int64_t* at_value_out) except -2: + cdef size_t col_index + cdef object dtype + cdef PandasCol pandas_col + cdef TimestampNanos at_nanos + if at is None: + at_value_out[0] = _AT_IS_SERVER_NOW + return -1 + elif isinstance(at, TimestampNanos): + at_nanos = at + at_value_out[0] = at_nanos._value + return -1 + elif isinstance(at, datetime): + if at.timestamp() < 0: + raise ValueError( + 'Bad argument `at`: Cannot use a datetime before the ' + + 'Unix epoch (1970-01-01 00:00:00).') + at_value_out[0] = datetime_to_nanos(at) + return -1 + elif isinstance(at, str): + _dataframe_get_loc(df, at, 'at', &col_index) + elif isinstance(at, int): + _bind_col_index('at', at, col_count, &col_index) + else: + raise TypeError( + f'Bad argument `at`: Unsupported type {_fqn(type(at))}. ' + + 'Must be one of: None, TimestampNanos, datetime, ' + + 'int (column index), str (colum name)') + dtype = df.dtypes[col_index] + if _dataframe_is_supported_datetime(dtype): + at_value_out[0] = _AT_IS_SET_BY_COLUMN + col = &cols.d[col_index] + col.setup.meta_target = meta_target_t.meta_target_at + return col_index + else: + raise TypeError( + f'Bad argument `at`: Bad dtype `{dtype}` ' + + f'for the {at!r} column: Must be a {_SUPPORTED_DATETIMES} column.') + + +cdef void_int _dataframe_alloc_chunks( + size_t n_chunks, col_t* col) except -1: + col.setup.chunks.n_chunks = n_chunks + col.setup.chunks.chunks = calloc( + col.setup.chunks.n_chunks + 1, # See `_dataframe_col_advance` on why +1. + sizeof(ArrowArray)) + if col.setup.chunks.chunks == NULL: + raise MemoryError() + + +cdef void _dataframe_free_mapped_arrow(ArrowArray* arr): + free(arr.buffers) + arr.buffers = NULL + arr.release = NULL + + +cdef void_int _dataframe_series_as_pybuf( + PandasCol pandas_col, col_t* col, str fallback_dtype=None) except -1: + cdef object nparr = pandas_col.series.to_numpy(dtype=fallback_dtype) + cdef ArrowArray* mapped + cdef int get_buf_ret + if not PyObject_CheckBuffer(nparr): + raise TypeError( + f'Bad column {pandas_col.name!r}: Expected a buffer, got ' + + f'{pandas_col.series!r} ({_fqn(type(pandas_col.series))})') + try: + # Note! We don't need to support numpy strides since Pandas doesn't. + # Also note that this guarantees a 1D buffer. + get_buf_ret = PyObject_GetBuffer(nparr, &col.setup.pybuf, PyBUF_SIMPLE) + except ValueError as ve: + raise IngressError( + IngressErrorCode.BadDataFrame, + f'Bad column {pandas_col.name!r}: {ve}') from ve + except BufferError as be: + raise IngressError( + IngressErrorCode.BadDataFrame, + f'Bad column {pandas_col.name!r}: Expected a buffer, got ' + + f'{pandas_col.series!r} ({_fqn(type(pandas_col.series))})') from be + _dataframe_alloc_chunks(1, col) + mapped = &col.setup.chunks.chunks[0] + + # Total number of elements. 
+ mapped.length = ( + col.setup.pybuf.len // col.setup.pybuf.itemsize) + mapped.null_count = 0 + mapped.offset = 0 + mapped.n_buffers = 2 + mapped.n_children = 0 + mapped.buffers = calloc(2, sizeof(const void*)) + mapped.buffers[0] = NULL + mapped.buffers[1] = col.setup.pybuf.buf + mapped.children = NULL + mapped.dictionary = NULL + mapped.release = _dataframe_free_mapped_arrow # to cleanup allocated array. + + +cdef void_int _dataframe_series_as_arrow( + PandasCol pandas_col, + col_t* col) except -1: + cdef object array + cdef list chunks + cdef size_t n_chunks + cdef size_t chunk_index + array = _PYARROW.Array.from_pandas(pandas_col.series) + if isinstance(array, _PYARROW.ChunkedArray): + chunks = array.chunks + else: + chunks = [array] + + n_chunks = len(chunks) + _dataframe_alloc_chunks(n_chunks, col) + + for chunk_index in range(n_chunks): + array = chunks[chunk_index] + if chunk_index == 0: + chunks[chunk_index]._export_to_c( + &col.setup.chunks.chunks[chunk_index], + &col.setup.arrow_schema) + else: + chunks[chunk_index]._export_to_c( + &col.setup.chunks.chunks[chunk_index]) + + +cdef const char* _ARROW_FMT_INT8 = "c" +cdef const char* _ARROW_FMT_INT16 = "s" +cdef const char* _ARROW_FMT_INT32 = "i" +cdef const char* _ARROW_FMT_SML_STR = "u" + + +cdef void_int _dataframe_category_series_as_arrow( + PandasCol pandas_col, col_t* col) except -1: + cdef const char* format + _dataframe_series_as_arrow(pandas_col, col) + format = col.setup.arrow_schema.format + if strncmp(format, _ARROW_FMT_INT8, 1) == 0: + col.setup.source = col_source_t.col_source_str_i8_cat + elif strncmp(format, _ARROW_FMT_INT16, 1) == 0: + col.setup.source = col_source_t.col_source_str_i16_cat + elif strncmp(format, _ARROW_FMT_INT32, 1) == 0: + col.setup.source = col_source_t.col_source_str_i32_cat + else: + raise IngressError( + IngressErrorCode.BadDataFrame, + f'Bad column {pandas_col.name!r}: ' + + 'Unsupported arrow category index type. ' + + f'Got {(format).decode("utf-8")!r}.') + + format = col.setup.arrow_schema.dictionary.format + if strncmp(format, _ARROW_FMT_SML_STR, 1) != 0: + raise IngressError( + IngressErrorCode.BadDataFrame, + f'Bad column {pandas_col.name!r}: ' + + 'Expected a category of strings, ' + + f'got a category of {pandas_col.series.dtype.categories.dtype}.') + + +cdef inline bint _dataframe_is_float_nan(PyObject* obj): + return PyFloat_CheckExact(obj) and isnan(PyFloat_AS_DOUBLE(obj)) + + +cdef inline bint _dataframe_is_null_pyobj(PyObject* obj): + return ( + (obj == Py_None) or + (obj == _PANDAS_NA) or + _dataframe_is_float_nan(obj)) + + +cdef void_int _dataframe_series_sniff_pyobj( + PandasCol pandas_col, col_t* col) except -1: + """ + Deduct the type of the object column. + Object columns can contain pretty much anything, but they usually don't. + We make an educated guess by finding the first non-null value in the column. 
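+    If every value is null, the column is marked `col_source_nulls` and the
+    serializer later skips it.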
+ """ + cdef size_t el_index + cdef size_t n_elements = len(pandas_col.series) + cdef PyObject** obj_arr + cdef PyObject* obj + _dataframe_series_as_pybuf(pandas_col, col) + obj_arr = (col.setup.pybuf.buf) + for el_index in range(n_elements): + obj = obj_arr[el_index] + if not _dataframe_is_null_pyobj(obj): + if PyBool_Check(obj): + col.setup.source = col_source_t.col_source_bool_pyobj + elif PyLong_CheckExact(obj): + col.setup.source = col_source_t.col_source_int_pyobj + elif PyFloat_CheckExact(obj): + col.setup.source = col_source_t.col_source_float_pyobj + elif PyUnicode_CheckExact(obj): + col.setup.source = col_source_t.col_source_str_pyobj + elif PyBytes_CheckExact(obj): + raise IngressError( + IngressErrorCode.BadDataFrame, + f'Bad column {pandas_col.name!r}: ' + + 'Unsupported object column containing bytes.' + + 'If this is a string column, decode it first. ' + + 'See: https://stackoverflow.com/questions/40389764/') + else: + raise IngressError( + IngressErrorCode.BadDataFrame, + f'Bad column {pandas_col.name!r}: ' + + f'Unsupported object column containing an object of type ' + + _fqn(type(obj)) + '.') + return 0 + + # We haven't returned yet, so we've hit an object column that + # exclusively has null values. We will just skip this column. + col.setup.source = col_source_t.col_source_nulls + + +cdef void_int _dataframe_resolve_source_and_buffers( + PandasCol pandas_col, col_t* col) except -1: + cdef object dtype = pandas_col.dtype + if isinstance(dtype, _NUMPY_BOOL): + col.setup.source = col_source_t.col_source_bool_numpy + _dataframe_series_as_pybuf(pandas_col, col) + elif isinstance(dtype, _PANDAS.BooleanDtype): + col.setup.source = col_source_t.col_source_bool_arrow + _dataframe_series_as_arrow(pandas_col, col) + elif isinstance(dtype, _NUMPY_UINT8): + col.setup.source = col_source_t.col_source_u8_numpy + _dataframe_series_as_pybuf(pandas_col, col) + elif isinstance(dtype, _NUMPY_INT8): + col.setup.source = col_source_t.col_source_i8_numpy + _dataframe_series_as_pybuf(pandas_col, col) + elif isinstance(dtype, _NUMPY_UINT16): + col.setup.source = col_source_t.col_source_u16_numpy + _dataframe_series_as_pybuf(pandas_col, col) + elif isinstance(dtype, _NUMPY_INT16): + col.setup.source = col_source_t.col_source_i16_numpy + _dataframe_series_as_pybuf(pandas_col, col) + elif isinstance(dtype, _NUMPY_UINT32): + col.setup.source = col_source_t.col_source_u32_numpy + _dataframe_series_as_pybuf(pandas_col, col) + elif isinstance(dtype, _NUMPY_INT32): + col.setup.source = col_source_t.col_source_i32_numpy + _dataframe_series_as_pybuf(pandas_col, col) + elif isinstance(dtype, _NUMPY_UINT64): + col.setup.source = col_source_t.col_source_u64_numpy + _dataframe_series_as_pybuf(pandas_col, col) + elif isinstance(dtype, _NUMPY_INT64): + col.setup.source = col_source_t.col_source_i64_numpy + _dataframe_series_as_pybuf(pandas_col, col) + elif isinstance(dtype, _PANDAS.UInt8Dtype): + col.setup.source = col_source_t.col_source_u8_arrow + _dataframe_series_as_arrow(pandas_col, col) + elif isinstance(dtype, _PANDAS.Int8Dtype): + col.setup.source = col_source_t.col_source_i8_arrow + _dataframe_series_as_arrow(pandas_col, col) + elif isinstance(dtype, _PANDAS.UInt16Dtype): + col.setup.source = col_source_t.col_source_u16_arrow + _dataframe_series_as_arrow(pandas_col, col) + elif isinstance(dtype, _PANDAS.Int16Dtype): + col.setup.source = col_source_t.col_source_i16_arrow + _dataframe_series_as_arrow(pandas_col, col) + elif isinstance(dtype, _PANDAS.UInt32Dtype): + col.setup.source = 
col_source_t.col_source_u32_arrow + _dataframe_series_as_arrow(pandas_col, col) + elif isinstance(dtype, _PANDAS.Int32Dtype): + col.setup.source = col_source_t.col_source_i32_arrow + _dataframe_series_as_arrow(pandas_col, col) + elif isinstance(dtype, _PANDAS.UInt64Dtype): + col.setup.source = col_source_t.col_source_u64_arrow + _dataframe_series_as_arrow(pandas_col, col) + elif isinstance(dtype, _PANDAS.Int64Dtype): + col.setup.source = col_source_t.col_source_i64_arrow + _dataframe_series_as_arrow(pandas_col, col) + elif isinstance(dtype, _NUMPY_FLOAT32): + col.setup.source = col_source_t.col_source_f32_numpy + _dataframe_series_as_pybuf(pandas_col, col) + elif isinstance(dtype, _NUMPY_FLOAT64): + col.setup.source = col_source_t.col_source_f64_numpy + _dataframe_series_as_pybuf(pandas_col, col) + elif isinstance(dtype, _PANDAS.Float32Dtype): + col.setup.source = col_source_t.col_source_f32_arrow + _dataframe_series_as_arrow(pandas_col, col) + elif isinstance(dtype, _PANDAS.Float64Dtype): + col.setup.source = col_source_t.col_source_f64_arrow + _dataframe_series_as_arrow(pandas_col, col) + elif isinstance(dtype, _PANDAS.StringDtype): + if dtype.storage == 'pyarrow': + col.setup.source = col_source_t.col_source_str_arrow + _dataframe_series_as_arrow(pandas_col, col) + elif dtype.storage == 'python': + col.setup.source = col_source_t.col_source_str_pyobj + _dataframe_series_as_pybuf(pandas_col, col) + else: + raise IngressError( + IngressErrorCode.BadDataFrame, + f'Unknown string dtype storage: f{dtype.storage} ' + + f'for column {pandas_col.name} of dtype {dtype}.') + elif isinstance(dtype, _PANDAS.CategoricalDtype): + _dataframe_category_series_as_arrow(pandas_col, col) + elif (isinstance(dtype, _NUMPY_DATETIME64_NS) and + _dataframe_is_supported_datetime(dtype)): + col.setup.source = col_source_t.col_source_dt64ns_numpy + _dataframe_series_as_pybuf(pandas_col, col) + elif (isinstance(dtype, _PANDAS.DatetimeTZDtype) and + _dataframe_is_supported_datetime(dtype)): + col.setup.source = col_source_t.col_source_dt64ns_tz_arrow + _dataframe_series_as_arrow(pandas_col, col) + elif isinstance(dtype, _NUMPY_OBJECT): + _dataframe_series_sniff_pyobj(pandas_col, col) + else: + raise IngressError( + IngressErrorCode.BadDataFrame, + f'Unsupported dtype {dtype} for column {pandas_col.name!r}. 
' + + 'Raise an issue if you think it should be supported: ' + + 'https://github.com/questdb/py-questdb-client/issues.') + + +cdef void_int _dataframe_resolve_target( + PandasCol pandas_col, col_t* col) except -1: + cdef col_target_t target + cdef set target_sources + if col.setup.meta_target in _DIRECT_META_TARGETS: + col.setup.target = col.setup.meta_target + return 0 + for target in _FIELD_TARGETS: + target_sources = _TARGET_TO_SOURCES[target] + if col.setup.source in target_sources: + col.setup.target = target + return 0 + raise IngressError( + IngressErrorCode.BadDataFrame, + f'Could not map column source type (code {col.setup.source} for ' + + f'column {pandas_col.name!r} ' + + f' ({pandas_col.dtype}) to any ILP type.') + + +cdef void _dataframe_init_cursor(col_t* col): + col.cursor.chunk = col.setup.chunks.chunks + col.cursor.chunk_index = 0 + col.cursor.offset = col.cursor.chunk.offset + + +cdef void_int _dataframe_resolve_cols( + qdb_pystr_buf* b, + list pandas_cols, + col_t_arr* cols, + bint* any_cols_need_gil_out) except -1: + cdef size_t index + cdef size_t len_dataframe_cols = len(pandas_cols) + cdef PandasCol pandas_col + cdef col_t* col + any_cols_need_gil_out[0] = False + for index in range(len_dataframe_cols): + pandas_col = pandas_cols[index] + col = &cols.d[index] + + # The target is resolved in stages: + # * We first assign all column `.meta_target`s to be fields. + # * Then, depending on argument parsing some/none of the columns + # obtain a meta-target of "table", "symbol" or "at". + # * Finally, based on the source, any remaining "meta_target_field" + # columns are converted to the appropriate target. + # See: _dataframe_resolve_col_targets_and_dc(..). + col.setup.meta_target = meta_target_t.meta_target_field + + # We will sort columns later. The index will be used to achieve a stable + # sort among columns with the same `.meta_target`. + col.setup.orig_index = index + + _dataframe_resolve_source_and_buffers(pandas_col, col) + _dataframe_init_cursor(col) + if col_source_needs_gil(col.setup.source): + any_cols_need_gil_out[0] = True + + +cdef void_int _dataframe_resolve_cols_target_name_and_dc( + qdb_pystr_buf* b, + list pandas_cols, + col_t_arr* cols) except -1: + cdef size_t index + cdef col_t* col + cdef PandasCol pandas_col + for index in range(cols.size): + col = &cols.d[index] + pandas_col = pandas_cols[index] + _dataframe_resolve_target(pandas_col, col) + if col.setup.source not in _TARGET_TO_SOURCES[col.setup.target]: + raise ValueError( + f'Bad value: Column {pandas_col.name!r} ' + + f'({pandas_col.dtype}) is not ' + + f'supported as a {_TARGET_NAMES[col.setup.target]} column.') + col.dispatch_code = ( + col.setup.source + col.setup.target) + + # Since we don't need to send the column names for 'table' and + # 'at' columns, we don't need to validate and encode them as + # column names. This allows unsupported names for these columns. 
+ if ((col.setup.meta_target != meta_target_t.meta_target_table) and + (col.setup.meta_target != meta_target_t.meta_target_at)): + str_to_column_name_copy(b, pandas_col.name, &col.name) + + +cdef int _dataframe_compare_cols(const void* lhs, const void* rhs) nogil: + cdef col_t* lhs_col = lhs + cdef col_t* rhs_col = rhs + cdef int source_diff = lhs_col.setup.meta_target - rhs_col.setup.meta_target + if source_diff != 0: + return source_diff + return lhs_col.setup.orig_index - rhs_col.setup.orig_index + + +cdef void_int _dataframe_resolve_args( + object df, + object table_name, + object table_name_col, + object symbols, + object at, + qdb_pystr_buf* b, + size_t col_count, + line_sender_table_name* c_table_name_out, + int64_t* at_value_out, + col_t_arr* cols, + bint* any_cols_need_gil_out) except -1: + cdef ssize_t name_col + cdef ssize_t at_col + + cdef list pandas_cols = [ + PandasCol(name, df.dtypes[index], series) + for index, (name, series) in enumerate(df.items())] + _dataframe_resolve_cols(b, pandas_cols, cols, any_cols_need_gil_out) + name_col = _dataframe_resolve_table_name( + b, + df, + pandas_cols, + cols, + table_name, + table_name_col, + col_count, + c_table_name_out) + at_col = _dataframe_resolve_at(df, cols, at, col_count, at_value_out) + _dataframe_resolve_symbols(df, pandas_cols, cols, name_col, at_col, symbols) + _dataframe_resolve_cols_target_name_and_dc(b, pandas_cols, cols) + qsort(cols.d, col_count, sizeof(col_t), _dataframe_compare_cols) + + +cdef inline bint _dataframe_arrow_get_bool(col_cursor_t* cursor): + return ( + (cursor.chunk.buffers[1])[cursor.offset // 8] & + (1 << (cursor.offset % 8))) + + +cdef inline bint _dataframe_arrow_is_valid(col_cursor_t* cursor): + """Check if the value is set according to the validity bitmap.""" + return ( + cursor.chunk.null_count == 0 or + ( + (cursor.chunk.buffers[0])[cursor.offset // 8] & + (1 << (cursor.offset % 8)))) + + +cdef inline void _dataframe_arrow_get_cat_value( + col_cursor_t* cursor, + size_t key, + size_t* len_out, + const char** buf_out): + cdef int32_t* value_index_access + cdef int32_t value_begin + cdef uint8_t* value_char_access + value_index_access = cursor.chunk.dictionary.buffers[1] + value_begin = value_index_access[key] + len_out[0] = value_index_access[key + 1] - value_begin + value_char_access = cursor.chunk.dictionary.buffers[2] + buf_out[0] = &value_char_access[value_begin] + + +cdef inline bint _dataframe_arrow_get_cat_i8( + col_cursor_t* cursor, size_t* len_out, const char** buf_out): + cdef bint valid = _dataframe_arrow_is_valid(cursor) + cdef int8_t* key_access + cdef int8_t key + if valid: + key_access = cursor.chunk.buffers[1] + key = key_access[cursor.offset] + _dataframe_arrow_get_cat_value(cursor, key, len_out, buf_out) + return valid + + +cdef inline bint _dataframe_arrow_get_cat_i16( + col_cursor_t* cursor, size_t* len_out, const char** buf_out): + cdef bint valid = _dataframe_arrow_is_valid(cursor) + cdef int16_t* key_access + cdef int16_t key + if valid: + key_access = cursor.chunk.buffers[1] + key = key_access[cursor.offset] + _dataframe_arrow_get_cat_value(cursor, key, len_out, buf_out) + return valid + + +cdef inline bint _dataframe_arrow_get_cat_i32( + col_cursor_t* cursor, size_t* len_out, const char** buf_out): + cdef bint valid = _dataframe_arrow_is_valid(cursor) + cdef int32_t* key_access + cdef int32_t key + if valid: + key_access = cursor.chunk.buffers[1] + key = key_access[cursor.offset] + _dataframe_arrow_get_cat_value(cursor, key, len_out, buf_out) + return valid + + +cdef 
inline bint _dataframe_arrow_str( + col_cursor_t* cursor, + size_t* len_out, + const char** buf_out): + cdef int32_t* index_access + cdef uint8_t* char_access + cdef int32_t begin + cdef bint valid = _dataframe_arrow_is_valid(cursor) + if valid: + index_access = cursor.chunk.buffers[1] + char_access = cursor.chunk.buffers[2] + begin = index_access[cursor.offset] + len_out[0] = index_access[cursor.offset + 1] - begin + buf_out[0] = &char_access[begin] + return valid + + +cdef inline void_int _dataframe_cell_str_pyobj_to_utf8( + qdb_pystr_buf* b, + col_cursor_t* cursor, + bint* valid_out, + line_sender_utf8* utf8_out) except -1: + cdef PyObject** access = cursor.chunk.buffers[1] + cdef PyObject* cell = access[cursor.offset] + if PyUnicode_CheckExact(cell): + str_to_utf8(b, cell, utf8_out) + valid_out[0] = True + elif _dataframe_is_null_pyobj(cell): + valid_out[0] = False + else: + raise ValueError( + 'Expected a string, ' + + f'got an object of type {_fqn(type(cell))}.') + + +cdef void_int _dataframe_serialize_cell_table__str_pyobj( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col) except -1: + cdef line_sender_error* err = NULL + cdef PyObject** access = col.cursor.chunk.buffers[1] + cdef PyObject* cell = access[col.cursor.offset] + cdef line_sender_table_name c_table_name + if not PyUnicode_CheckExact(cell): + if _dataframe_is_null_pyobj(cell): + raise ValueError('Expected a table name, got a null value') + else: + raise ValueError( + 'Expected a table name (str object), ' + + f'got an object of type {_fqn(type(cell))}.') + str_to_table_name(b, cell, &c_table_name) + if not line_sender_buffer_table(ls_buf, c_table_name, &err): + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_table__str_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef size_t c_len + cdef const char* buf + cdef line_sender_table_name c_table_name + if _dataframe_arrow_str(&col.cursor, &c_len, &buf): + if not line_sender_table_name_init(&c_table_name, c_len, buf, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + if not line_sender_buffer_table(ls_buf, c_table_name, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + else: + _ensure_has_gil(gs) + raise ValueError('Table name cannot be null') + + +cdef void_int _dataframe_serialize_cell_table__str_i8_cat( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef size_t c_len + cdef const char* c_buf + cdef line_sender_table_name c_table_name + if _dataframe_arrow_get_cat_i8(&col.cursor, &c_len, &c_buf): + if not line_sender_table_name_init(&c_table_name, c_len, c_buf, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + if not line_sender_buffer_table(ls_buf, c_table_name, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + else: + _ensure_has_gil(gs) + raise ValueError('Table name cannot be null') + + +cdef void_int _dataframe_serialize_cell_table__str_i16_cat( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef size_t c_len + cdef const char* c_buf + cdef line_sender_table_name c_table_name + if _dataframe_arrow_get_cat_i16(&col.cursor, &c_len, &c_buf): + if not line_sender_table_name_init(&c_table_name, c_len, c_buf, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + if not line_sender_buffer_table(ls_buf, c_table_name, &err): + _ensure_has_gil(gs) + 
raise c_err_to_py(err) + else: + _ensure_has_gil(gs) + raise ValueError('Table name cannot be null') + + +cdef void_int _dataframe_serialize_cell_table__str_i32_cat( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef size_t c_len + cdef const char* c_buf + cdef line_sender_table_name c_table_name + if _dataframe_arrow_get_cat_i32(&col.cursor, &c_len, &c_buf): + if not line_sender_table_name_init(&c_table_name, c_len, c_buf, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + if not line_sender_buffer_table(ls_buf, c_table_name, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + else: + _ensure_has_gil(gs) + raise ValueError('Table name cannot be null') + + +cdef void_int _dataframe_serialize_cell_symbol__str_pyobj( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col) except -1: + cdef line_sender_error* err = NULL + cdef bint valid = False + cdef line_sender_utf8 utf8 + _dataframe_cell_str_pyobj_to_utf8(b, &col.cursor, &valid, &utf8) + if valid and not line_sender_buffer_symbol(ls_buf, col.name, utf8, &err): + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_symbol__str_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef line_sender_utf8 utf8 + if _dataframe_arrow_str(&col.cursor, &utf8.len, &utf8.buf): + if not line_sender_buffer_symbol(ls_buf, col.name, utf8, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_symbol__str_i8_cat( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef line_sender_utf8 utf8 + if _dataframe_arrow_get_cat_i8(&col.cursor, &utf8.len, &utf8.buf): + if not line_sender_buffer_symbol(ls_buf, col.name, utf8, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_symbol__str_i16_cat( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef line_sender_utf8 utf8 + if _dataframe_arrow_get_cat_i16(&col.cursor, &utf8.len, &utf8.buf): + if not line_sender_buffer_symbol(ls_buf, col.name, utf8, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_symbol__str_i32_cat( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef line_sender_utf8 utf8 + if _dataframe_arrow_get_cat_i32(&col.cursor, &utf8.len, &utf8.buf): + if not line_sender_buffer_symbol(ls_buf, col.name, utf8, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_bool__bool_pyobj( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col) except -1: + cdef line_sender_error* err = NULL + cdef PyObject** access = col.cursor.chunk.buffers[1] + cdef PyObject* cell = access[col.cursor.offset] + if PyBool_Check(cell): + if not line_sender_buffer_column_bool( + ls_buf, col.name, cell == Py_True, &err): + raise c_err_to_py(err) + elif _dataframe_is_null_pyobj(cell): + raise ValueError('Cannot insert null values into a boolean column.') + else: + raise ValueError( + 'Expected an object of type bool, got a ' + + _fqn(type(cell)) + '.') + + +cdef void_int _dataframe_serialize_cell_column_bool__bool_numpy( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + 
col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef uint8_t* access = col.cursor.chunk.buffers[1] + cdef uint8_t cell = access[col.cursor.offset] + if not line_sender_buffer_column_bool(ls_buf, col.name, not not cell, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_bool__bool_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef bint valid = _dataframe_arrow_is_valid(&col.cursor) + cdef bint value + if valid: + value = _dataframe_arrow_get_bool(&col.cursor) + if not line_sender_buffer_column_bool(ls_buf, col.name, value, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + else: + _ensure_has_gil(gs) + raise ValueError('Cannot insert null values into a boolean column.') + + +cdef void_int _dataframe_serialize_cell_column_i64__int_pyobj( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col) except -1: + cdef line_sender_error* err = NULL + cdef PyObject** access = col.cursor.chunk.buffers[1] + cdef PyObject* cell = access[col.cursor.offset] + cdef int64_t value + if PyLong_CheckExact(cell): + value = PyLong_AsLongLong(cell) + if not line_sender_buffer_column_i64(ls_buf, col.name, value, &err): + raise c_err_to_py(err) + elif _dataframe_is_null_pyobj(cell): + pass + else: + raise ValueError( + 'Expected an object of type int, got an object of type ' + + _fqn(type(cell)) + '.') + + +cdef void_int _dataframe_serialize_cell_column_i64__u8_numpy( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef uint8_t* access = col.cursor.chunk.buffers[1] + cdef uint8_t cell = access[col.cursor.offset] + if not line_sender_buffer_column_i64(ls_buf, col.name, cell, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_i64__i8_numpy( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef int8_t* access = col.cursor.chunk.buffers[1] + cdef int8_t cell = access[col.cursor.offset] + if not line_sender_buffer_column_i64(ls_buf, col.name, cell, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_i64__u16_numpy( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef uint16_t* access = col.cursor.chunk.buffers[1] + cdef uint16_t cell = access[col.cursor.offset] + if not line_sender_buffer_column_i64(ls_buf, col.name, cell, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_i64__i16_numpy( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef int16_t* access = col.cursor.chunk.buffers[1] + cdef int16_t cell = access[col.cursor.offset] + if not line_sender_buffer_column_i64(ls_buf, col.name, cell, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_i64__u32_numpy( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef uint32_t* access = col.cursor.chunk.buffers[1] + cdef uint32_t cell = access[col.cursor.offset] + if not line_sender_buffer_column_i64(ls_buf, col.name, cell, &err): + 
_ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_i64__i32_numpy( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef int32_t* access = col.cursor.chunk.buffers[1] + cdef int32_t cell = access[col.cursor.offset] + if not line_sender_buffer_column_i64(ls_buf, col.name, cell, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_i64__u64_numpy( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef uint64_t* access = col.cursor.chunk.buffers[1] + cdef uint64_t cell = access[col.cursor.offset] + if cell > INT64_MAX: + _ensure_has_gil(gs) + raise OverflowError('uint64 value too large for int64 column type.') + if not line_sender_buffer_column_i64(ls_buf, col.name, cell, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_i64__i64_numpy( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef int64_t* access = col.cursor.chunk.buffers[1] + cdef int64_t cell = access[col.cursor.offset] + if not line_sender_buffer_column_i64(ls_buf, col.name, cell, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_i64__u8_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef bint valid = _dataframe_arrow_is_valid(&col.cursor) + cdef uint8_t* access + if valid: + access = col.cursor.chunk.buffers[1] + if not line_sender_buffer_column_i64( + ls_buf, + col.name, + access[col.cursor.offset], + &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_i64__i8_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef bint valid = _dataframe_arrow_is_valid(&col.cursor) + cdef int8_t* access + if valid: + access = col.cursor.chunk.buffers[1] + if not line_sender_buffer_column_i64( + ls_buf, + col.name, + access[col.cursor.offset], + &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_i64__u16_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef bint valid = _dataframe_arrow_is_valid(&col.cursor) + cdef uint16_t* access + if valid: + access = col.cursor.chunk.buffers[1] + if not line_sender_buffer_column_i64( + ls_buf, + col.name, + access[col.cursor.offset], + &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_i64__i16_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef bint valid = _dataframe_arrow_is_valid(&col.cursor) + cdef int16_t* access + if valid: + access = col.cursor.chunk.buffers[1] + if not line_sender_buffer_column_i64( + ls_buf, + col.name, + access[col.cursor.offset], + &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_i64__u32_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + 
cdef bint valid = _dataframe_arrow_is_valid(&col.cursor) + cdef uint32_t* access + if valid: + access = col.cursor.chunk.buffers[1] + if not line_sender_buffer_column_i64( + ls_buf, + col.name, + access[col.cursor.offset], + &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_i64__i32_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef bint valid = _dataframe_arrow_is_valid(&col.cursor) + cdef int32_t* access + if valid: + access = col.cursor.chunk.buffers[1] + if not line_sender_buffer_column_i64( + ls_buf, + col.name, + access[col.cursor.offset], + &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_i64__u64_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef bint valid = _dataframe_arrow_is_valid(&col.cursor) + cdef uint64_t* access + cdef uint64_t cell + if valid: + access = col.cursor.chunk.buffers[1] + cell = access[col.cursor.offset] + if cell > INT64_MAX: + _ensure_has_gil(gs) + raise OverflowError('uint64 value too large for int64 column type.') + if not line_sender_buffer_column_i64( + ls_buf, + col.name, + cell, + &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_i64__i64_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef bint valid = _dataframe_arrow_is_valid(&col.cursor) + cdef int64_t* access + if valid: + access = col.cursor.chunk.buffers[1] + if not line_sender_buffer_column_i64( + ls_buf, + col.name, + access[col.cursor.offset], + &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_f64__float_pyobj( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col) except -1: + cdef line_sender_error* err = NULL + cdef PyObject** access = col.cursor.chunk.buffers[1] + cdef PyObject* cell = access[col.cursor.offset] + cdef double value + if PyFloat_CheckExact(cell): + value = PyFloat_AS_DOUBLE(cell) + if not line_sender_buffer_column_f64(ls_buf, col.name, value, &err): + raise c_err_to_py(err) + elif _dataframe_is_null_pyobj(cell): + pass + else: + raise ValueError( + 'Expected an object of type float, got an object of type ' + + _fqn(type(cell)) + '.') + + +cdef void_int _dataframe_serialize_cell_column_f64__f32_numpy( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + # Note: This is the C `float` type, not the Python `float` type. 
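+    # (A 32-bit float widens losslessly to the 64-bit ILP float column.)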
+ cdef float* access = col.cursor.chunk.buffers[1] + cdef float cell = access[col.cursor.offset] + if not line_sender_buffer_column_f64(ls_buf, col.name, cell, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_f64__f64_numpy( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef double* access = col.cursor.chunk.buffers[1] + cdef double cell = access[col.cursor.offset] + if not line_sender_buffer_column_f64(ls_buf, col.name, cell, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_f64__f32_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef bint valid = _dataframe_arrow_is_valid(&col.cursor) + cdef float* access + if valid: + access = col.cursor.chunk.buffers[1] + if not line_sender_buffer_column_f64( + ls_buf, + col.name, + access[col.cursor.offset], + &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_f64__f64_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef bint valid = _dataframe_arrow_is_valid(&col.cursor) + cdef double* access + if valid: + access = col.cursor.chunk.buffers[1] + if not line_sender_buffer_column_f64( + ls_buf, + col.name, + access[col.cursor.offset], + &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_str__str_pyobj( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col) except -1: + cdef line_sender_error* err = NULL + cdef bint valid = False + cdef line_sender_utf8 utf8 + _dataframe_cell_str_pyobj_to_utf8(b, &col.cursor, &valid, &utf8) + if valid and not line_sender_buffer_column_str( + ls_buf, col.name, utf8, &err): + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_str__str_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef line_sender_utf8 utf8 + if _dataframe_arrow_str(&col.cursor, &utf8.len, &utf8.buf): + if not line_sender_buffer_column_str(ls_buf, col.name, utf8, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_str__str_i8_cat( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef line_sender_utf8 utf8 + if _dataframe_arrow_get_cat_i8(&col.cursor, &utf8.len, &utf8.buf): + if not line_sender_buffer_column_str(ls_buf, col.name, utf8, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_str__str_i16_cat( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef line_sender_utf8 utf8 + if _dataframe_arrow_get_cat_i16(&col.cursor, &utf8.len, &utf8.buf): + if not line_sender_buffer_column_str(ls_buf, col.name, utf8, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_str__str_i32_cat( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef line_sender_utf8 utf8 + if _dataframe_arrow_get_cat_i32(&col.cursor, &utf8.len, &utf8.buf): 
+ if not line_sender_buffer_column_str(ls_buf, col.name, utf8, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_ts__dt64ns_numpy( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef int64_t* access = col.cursor.chunk.buffers[1] + cdef int64_t cell = access[col.cursor.offset] + if cell != _NAT: + cell //= 1000 # Convert from nanoseconds to microseconds. + if not line_sender_buffer_column_ts(ls_buf, col.name, cell, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_column_ts__dt64ns_tz_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef bint valid = _dataframe_arrow_is_valid(&col.cursor) + cdef int64_t cell + cdef int64_t* access + if valid: + access = col.cursor.chunk.buffers[1] + cell = access[col.cursor.offset] + cell //= 1000 # Convert from nanoseconds to microseconds. + if not line_sender_buffer_column_ts(ls_buf, col.name, cell, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_at_dt64ns_numpy( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef int64_t* access = col.cursor.chunk.buffers[1] + cdef int64_t cell = access[col.cursor.offset] + if cell == _NAT: + if not line_sender_buffer_at_now(ls_buf, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + else: + # Note: ls_buf will validate against negative numbers. + if not line_sender_buffer_at(ls_buf, cell, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell_at_dt64ns_tz_arrow( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef line_sender_error* err = NULL + cdef bint valid = _dataframe_arrow_is_valid(&col.cursor) + cdef int64_t* access + cdef int64_t cell + if valid: + access = col.cursor.chunk.buffers[1] + cell = access[col.cursor.offset] + # Note: ls_buf will validate against negative numbers. + if not line_sender_buffer_at(ls_buf, cell, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + else: + if not line_sender_buffer_at_now(ls_buf, &err): + _ensure_has_gil(gs) + raise c_err_to_py(err) + + +cdef void_int _dataframe_serialize_cell( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col, + PyThreadState** gs) except -1: + cdef col_dispatch_code_t dc = col.dispatch_code + # Note!: Code below will generate a `switch` statement. + # Ensure this happens! Don't break the `dc == ...` pattern. + if dc == col_dispatch_code_t.col_dispatch_code_skip_nulls: + pass # We skip a null column. Nothing to do. 
+ elif dc == col_dispatch_code_t.col_dispatch_code_table__str_pyobj: + _dataframe_serialize_cell_table__str_pyobj(ls_buf, b, col) + elif dc == col_dispatch_code_t.col_dispatch_code_table__str_arrow: + _dataframe_serialize_cell_table__str_arrow(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_table__str_i8_cat: + _dataframe_serialize_cell_table__str_i8_cat(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_table__str_i16_cat: + _dataframe_serialize_cell_table__str_i16_cat(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_table__str_i32_cat: + _dataframe_serialize_cell_table__str_i32_cat(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_symbol__str_pyobj: + _dataframe_serialize_cell_symbol__str_pyobj(ls_buf, b, col) + elif dc == col_dispatch_code_t.col_dispatch_code_symbol__str_arrow: + _dataframe_serialize_cell_symbol__str_arrow(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_symbol__str_i8_cat: + _dataframe_serialize_cell_symbol__str_i8_cat(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_symbol__str_i16_cat: + _dataframe_serialize_cell_symbol__str_i16_cat(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_symbol__str_i32_cat: + _dataframe_serialize_cell_symbol__str_i32_cat(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_bool__bool_pyobj: + _dataframe_serialize_cell_column_bool__bool_pyobj(ls_buf, b, col) + elif dc == col_dispatch_code_t.col_dispatch_code_column_bool__bool_numpy: + _dataframe_serialize_cell_column_bool__bool_numpy(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_bool__bool_arrow: + _dataframe_serialize_cell_column_bool__bool_arrow(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__int_pyobj: + _dataframe_serialize_cell_column_i64__int_pyobj(ls_buf, b, col) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__u8_numpy: + _dataframe_serialize_cell_column_i64__u8_numpy(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__i8_numpy: + _dataframe_serialize_cell_column_i64__i8_numpy(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__u16_numpy: + _dataframe_serialize_cell_column_i64__u16_numpy(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__i16_numpy: + _dataframe_serialize_cell_column_i64__i16_numpy(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__u32_numpy: + _dataframe_serialize_cell_column_i64__u32_numpy(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__i32_numpy: + _dataframe_serialize_cell_column_i64__i32_numpy(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__u64_numpy: + _dataframe_serialize_cell_column_i64__u64_numpy(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__i64_numpy: + _dataframe_serialize_cell_column_i64__i64_numpy(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__u8_arrow: + _dataframe_serialize_cell_column_i64__u8_arrow(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__i8_arrow: + _dataframe_serialize_cell_column_i64__i8_arrow(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__u16_arrow: + _dataframe_serialize_cell_column_i64__u16_arrow(ls_buf, b, col, gs) + elif dc == 
col_dispatch_code_t.col_dispatch_code_column_i64__i16_arrow: + _dataframe_serialize_cell_column_i64__i16_arrow(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__u32_arrow: + _dataframe_serialize_cell_column_i64__u32_arrow(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__i32_arrow: + _dataframe_serialize_cell_column_i64__i32_arrow(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__u64_arrow: + _dataframe_serialize_cell_column_i64__u64_arrow(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_i64__i64_arrow: + _dataframe_serialize_cell_column_i64__i64_arrow(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_f64__float_pyobj: + _dataframe_serialize_cell_column_f64__float_pyobj(ls_buf, b, col) + elif dc == col_dispatch_code_t.col_dispatch_code_column_f64__f32_numpy: + _dataframe_serialize_cell_column_f64__f32_numpy(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_f64__f64_numpy: + _dataframe_serialize_cell_column_f64__f64_numpy(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_f64__f32_arrow: + _dataframe_serialize_cell_column_f64__f32_arrow(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_f64__f64_arrow: + _dataframe_serialize_cell_column_f64__f64_arrow(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_str__str_pyobj: + _dataframe_serialize_cell_column_str__str_pyobj(ls_buf, b, col) + elif dc == col_dispatch_code_t.col_dispatch_code_column_str__str_arrow: + _dataframe_serialize_cell_column_str__str_arrow(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_str__str_i8_cat: + _dataframe_serialize_cell_column_str__str_i8_cat(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_str__str_i16_cat: + _dataframe_serialize_cell_column_str__str_i16_cat(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_str__str_i32_cat: + _dataframe_serialize_cell_column_str__str_i32_cat(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_ts__dt64ns_numpy: + _dataframe_serialize_cell_column_ts__dt64ns_numpy(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_ts__dt64ns_tz_arrow: + _dataframe_serialize_cell_column_ts__dt64ns_tz_arrow(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_at__dt64ns_numpy: + _dataframe_serialize_cell_at_dt64ns_numpy(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_at__dt64ns_tz_arrow: + _dataframe_serialize_cell_at_dt64ns_tz_arrow(ls_buf, b, col, gs) + else: + _ensure_has_gil(gs) + raise RuntimeError(f"Unknown column dispatch code: {dc}") + # See earlier note about switch statement generation. + # Don't add complex conditions above! + + +cdef void _dataframe_col_advance(col_t* col): + # Branchless version of: + # cdef bint new_chunk = cursor.offset == cursor.chunk.length + # if new_chunk == 0: + # cursor.chunk_index += 1 + # cursor.chunk += 1 # pointer advance + # + # if new_chunk: + # cursor.offset = cursor.chunk.offset + # else: + # cursor.offset += 1 + # + # (Checked with Godbolt, GCC -O3 code was rather "jumpy") + cdef col_cursor_t* cursor = &col.cursor + cdef size_t new_chunk # disguised bint. Either 0 or 1. 
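+    # For example, with a chunk of length 3 starting at offset 0: successive
+    # calls yield offsets 1 and 2; on the third call `offset == chunk.length`,
+    # so `new_chunk` is 1, the cursor steps onto the next chunk and `offset`
+    # resets to that chunk's starting offset. The extra blank chunk allocated
+    # at the end keeps that final step in bounds.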
+ cursor.offset += 1 + new_chunk = cursor.offset == cursor.chunk.length + cursor.chunk_index += new_chunk + cursor.chunk += new_chunk + # Note: We get away with this because we've allocated one extra blank chunk. + # This ensures that accessing `cursor.chunk.offset` doesn't segfault. + cursor.offset = ( + (new_chunk * cursor.chunk.offset) + + ((not new_chunk) * cursor.offset)) + + +cdef void_int _dataframe_handle_auto_flush( + auto_flush_t af, + line_sender_buffer* ls_buf, + PyThreadState** gs) except -1: + cdef line_sender_error* flush_err + cdef line_sender_error* marker_err + cdef bint flush_ok + cdef bint marker_ok + if (af.sender == NULL) or (line_sender_buffer_size(ls_buf) < af.watermark): + return 0 + + # Always temporarily release GIL during a flush. + had_gil = _ensure_doesnt_have_gil(gs) + flush_ok = line_sender_flush(af.sender, ls_buf, &flush_err) + if not flush_ok: + # To avoid flush reattempt on Sender.__exit__. + line_sender_buffer_clear(ls_buf) + + # Flushing will have cleared the marker: We need to set it again + # We need this also on error due to our error handling logic which will + # try to rewind the buffer on error and fail if the marker is unset. + marker_ok = line_sender_buffer_set_marker(ls_buf, &marker_err) + + if had_gil or (not flush_ok) or (not marker_ok): + _ensure_has_gil(gs) + + if not flush_ok: + raise c_err_to_py_fmt(flush_err, _FLUSH_FMT) + + # The flush error takes precedence over the marker error. + if not marker_ok: + raise c_err_to_py(marker_err) + + +# Every how many cells to release and re-acquire the Python GIL. +# +# We've done some perf testing with some mixed column dtypes. +# On a modern CPU we're doing over 8 million pandas cells per second. +# By default, `sys.getswitchinterval()` is 0.005 seconds. +# To accomodate this, we'd need to release the GIL every 40,000 cells. +# This will be divided by the column count to get the row gil blip interval. +cdef size_t _CELL_GIL_BLIP_INTERVAL = 40000 + + +cdef void_int _dataframe( + auto_flush_t af, + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + object df, + object table_name, + object table_name_col, + object symbols, + object at) except -1: + cdef size_t col_count + cdef line_sender_table_name c_table_name + cdef int64_t at_value = _AT_IS_SET_BY_COLUMN + cdef col_t_arr cols = col_t_arr_blank() + cdef bint any_cols_need_gil = False + cdef qdb_pystr_pos str_buf_marker + cdef size_t row_count + cdef line_sender_error* err = NULL + cdef size_t row_index + cdef size_t col_index + cdef col_t* col + cdef size_t row_gil_blip_interval + cdef PyThreadState* gs = NULL # GIL state. NULL means we have the GIL. + cdef bint had_gil + cdef bint was_serializing_cell = False + + _dataframe_may_import_deps() + _dataframe_check_is_dataframe(df) + row_count = len(df) + col_count = len(df.columns) + if (col_count == 0) or (row_count == 0): + return 0 # Nothing to do. + + try: + qdb_pystr_buf_clear(b) + cols = col_t_arr_new(col_count) + _dataframe_resolve_args( + df, + table_name, + table_name_col, + symbols, + at, + b, + col_count, + &c_table_name, + &at_value, + &cols, + &any_cols_need_gil) + + # We've used the str buffer up to a point for the headers. + # Instead of clearing it (which would clear the headers' memory) + # we will truncate (rewind) back to this position. + str_buf_marker = qdb_pystr_buf_tell(b) + line_sender_buffer_clear_marker(ls_buf) + + # On error, undo all added lines. 
+ if not line_sender_buffer_set_marker(ls_buf, &err): + raise c_err_to_py(err) + + row_gil_blip_interval = _CELL_GIL_BLIP_INTERVAL // col_count + if row_gil_blip_interval < 400: # ceiling reached at 100 columns + row_gil_blip_interval = 400 + try: + # Don't move this logic up! We need the GIL to execute a `try`. + # Also we can't have any other `try` blocks between here and the + # `finally` block. + if not any_cols_need_gil: + _ensure_doesnt_have_gil(&gs) + + for row_index in range(row_count): + if (gs == NULL) and (row_index % row_gil_blip_interval == 0): + # Release and re-acquire the GIL every so often. + # This is to allow other python threads to run. + # If we hold the GIL for too long, we can starve other + # threads, for example timing out network activity. + _ensure_doesnt_have_gil(&gs) + _ensure_has_gil(&gs) + + qdb_pystr_buf_truncate(b, str_buf_marker) + + # Table-name from `table_name` arg in Python. + if c_table_name.buf != NULL: + if not line_sender_buffer_table(ls_buf, c_table_name, &err): + _ensure_has_gil(&gs) + raise c_err_to_py(err) + + # Serialize columns cells. + # Note: Columns are sorted: table name, symbols, fields, at. + was_serializing_cell = True + for col_index in range(col_count): + col = &cols.d[col_index] + _dataframe_serialize_cell(ls_buf, b, col, &gs) # may raise + _dataframe_col_advance(col) + was_serializing_cell = False + + # Fixed "at" value (not from a column). + if at_value == _AT_IS_SERVER_NOW: + if not line_sender_buffer_at_now(ls_buf, &err): + _ensure_has_gil(&gs) + raise c_err_to_py(err) + elif at_value >= 0: + if not line_sender_buffer_at(ls_buf, at_value, &err): + _ensure_has_gil(&gs) + raise c_err_to_py(err) + + _dataframe_handle_auto_flush(af, ls_buf, &gs) + except Exception as e: + # It would be an internal bug for this to raise. + if not line_sender_buffer_rewind_to_marker(ls_buf, &err): + raise c_err_to_py(err) + + if (isinstance(e, IngressError) and + (e.code == IngressErrorCode.InvalidApiCall)): + # TODO: This should be allowed by the database. + # It currently isn't so we have to raise an error. + raise IngressError( + IngressErrorCode.BadDataFrame, + f'Bad dataframe row at index {row_index}: ' + + 'All values are nulls. '+ + 'Ensure at least one column is not null.') from e + elif was_serializing_cell: + raise IngressError( + IngressErrorCode.BadDataFrame, + 'Failed to serialize value of column ' + + repr(df.columns[col.setup.orig_index]) + + f' at row index {row_index} (' + + repr(df.iloc[row_index, col.setup.orig_index]) + + f'): {e} [dc={col.dispatch_code}]') from e + else: + raise + except Exception as e: + if not isinstance(e, IngressError): + raise IngressError( + IngressErrorCode.InvalidApiCall, + str(e)) from e + else: + raise + finally: + _ensure_has_gil(&gs) # Note: We need the GIL for cleanup. + line_sender_buffer_clear_marker(ls_buf) + col_t_arr_release(&cols) + qdb_pystr_buf_clear(b) diff --git a/src/questdb/extra_cpython.pxd b/src/questdb/extra_cpython.pxd new file mode 100644 index 00000000..3e794566 --- /dev/null +++ b/src/questdb/extra_cpython.pxd @@ -0,0 +1,70 @@ +# Custom definitions that aren't provided in the standard `cpython` module. 
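+# They give the Cython serializer borrowed, zero-copy access to CPython's
+# unicode internals (UCS1/UCS2/UCS4 buffers) and expose the GIL save/restore
+# calls used to release the GIL during serialization and flushing.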
+ +from libc.stdint cimport uint8_t, uint16_t, uint32_t +from cpython.object cimport PyObject + +cdef extern from "Python.h": + cdef PyObject* Py_None + cdef PyObject* Py_True + + ctypedef uint8_t Py_UCS1 # unicodeobject.h + ctypedef uint16_t Py_UCS2 + ctypedef uint32_t Py_UCS4 + + ctypedef unsigned int uint + + cdef enum PyUnicode_Kind: + PyUnicode_1BYTE_KIND + PyUnicode_2BYTE_KIND + PyUnicode_4BYTE_KIND + + # Note: Returning an `object` rather than `PyObject` as the function + # returns a new reference rather than borrowing an existing one. + object PyUnicode_FromKindAndData( + int kind, const void* buffer, Py_ssize_t size) + + # Ditto, see comment on why not returning a `PyObject` above. + str PyUnicode_FromStringAndSize( + const char* u, Py_ssize_t size) + + # Must be called before accessing data or is compact check. + int PyUnicode_READY(PyObject* o) except -1 + + # Is UCS1 and ascii (and therefore valid UTF-8). + bint PyUnicode_IS_COMPACT_ASCII(PyObject* o) + + # Get length. + Py_ssize_t PyUnicode_GET_LENGTH(PyObject* o) + + # Zero-copy access to string buffer. + int PyUnicode_KIND(PyObject* o) + Py_UCS1* PyUnicode_1BYTE_DATA(PyObject* o) + Py_UCS2* PyUnicode_2BYTE_DATA(PyObject* o) + Py_UCS4* PyUnicode_4BYTE_DATA(PyObject* o) + + Py_ssize_t PyBytes_GET_SIZE(object o) + + bint PyBytes_CheckExact(PyObject* o) + + char* PyBytes_AsString(object o) + + bint PyUnicode_CheckExact(PyObject* o) + + bint PyBool_Check(PyObject* o) + + bint PyLong_CheckExact(PyObject* o) + + bint PyFloat_CheckExact(PyObject* o) + + double PyFloat_AS_DOUBLE(PyObject* o) + + long long PyLong_AsLongLong(PyObject* o) except? -1 + + PyObject* PyErr_Occurred() + + ctypedef struct PyThreadState: + pass + + PyThreadState* PyEval_SaveThread() + + void PyEval_RestoreThread(PyThreadState* tstate) diff --git a/src/questdb/ingress.pyx b/src/questdb/ingress.pyx index 4100899e..b676400c 100644 --- a/src/questdb/ingress.pyx +++ b/src/questdb/ingress.pyx @@ -30,58 +30,67 @@ API for fast data ingestion into QuestDB. """ -from libc.stdint cimport uint8_t, uint64_t, int64_t +# For prototypes: https://github.com/cython/cython/tree/master/Cython/Includes +from libc.stdint cimport uint8_t, uint64_t, int64_t, uint32_t, uintptr_t, \ + INT64_MAX, INT64_MIN +from libc.stdlib cimport malloc, calloc, realloc, free, abort, qsort +from libc.string cimport strncmp, memset +from libc.math cimport isnan +from libc.errno cimport errno from cpython.datetime cimport datetime -from cpython.bool cimport bool, PyBool_Check +from cpython.bool cimport bool from cpython.weakref cimport PyWeakref_NewRef, PyWeakref_GetObject from cpython.object cimport PyObject -from cpython.float cimport PyFloat_Check -from cpython.int cimport PyInt_Check -from cpython.unicode cimport PyUnicode_Check +from cpython.buffer cimport Py_buffer, PyObject_CheckBuffer, \ + PyObject_GetBuffer, PyBuffer_Release, PyBUF_SIMPLE +from cpython.memoryview cimport PyMemoryView_FromMemory from .line_sender cimport * +from .pystr_to_utf8 cimport * +from .arrow_c_data_interface cimport * +from .extra_cpython cimport * +from .ingress_helper cimport * -cdef extern from "Python.h": - ctypedef uint8_t Py_UCS1 # unicodeobject.h +# An int we use only for error reporting. +# 0 is success. +# -1 is failure. 
+ctypedef int void_int - ctypedef unsigned int uint +import cython +include "dataframe.pxi" - cdef enum PyUnicode_Kind: - PyUnicode_1BYTE_KIND - PyUnicode_2BYTE_KIND - PyUnicode_4BYTE_KIND - # Note: Returning an `object` rather than `PyObject` as the function - # returns a new reference rather than borrowing an existing one. - object PyUnicode_FromKindAndData( - int kind, const void* buffer, Py_ssize_t size) +from enum import Enum +from typing import List, Tuple, Dict, Union, Any, Optional, Callable, \ + Iterable +import pathlib - # Ditto, see comment on why not returning a `PyObject` above. - str PyUnicode_FromStringAndSize( - const char* u, Py_ssize_t size) +import sys - # Must be called before accessing data or is compact check. - int PyUnicode_READY(object o) except -1 +# For `get_time_now_ns` and `get_time_now_us` functions. +IF UNAME_SYSNAME == 'Windows': + import time +ELSE: + from posix.time cimport timespec, clock_gettime, CLOCK_REALTIME - # Is UCS1 and ascii (and therefore valid UTF-8). - bint PyUnicode_IS_COMPACT_ASCII(object o) - # Get length. - Py_ssize_t PyUnicode_GET_LENGTH(object o) +cdef bint _has_gil(PyThreadState** gs): + return gs[0] == NULL - # Zero-copy access to buffer. - Py_UCS1* PyUnicode_1BYTE_DATA(object o) - Py_ssize_t PyBytes_GET_SIZE(object o) +cdef bint _ensure_doesnt_have_gil(PyThreadState** gs): + """Returns True if previously had the GIL, False otherwise.""" + if _has_gil(gs): + gs[0] = PyEval_SaveThread() + return True + return False - char* PyBytes_AsString(object o) +cdef void _ensure_has_gil(PyThreadState** gs): + if not _has_gil(gs): + PyEval_RestoreThread(gs[0]) + gs[0] = NULL -from enum import Enum -from typing import List, Tuple, Dict, Union, Any, Optional, Callable, Iterable -import pathlib - -import sys class IngressErrorCode(Enum): """Category of Error.""" @@ -93,6 +102,7 @@ class IngressErrorCode(Enum): InvalidTimestamp = line_sender_error_invalid_timestamp AuthError = line_sender_error_auth_error TlsError = line_sender_error_tls_error + BadDataFrame = line_sender_error_tls_error + 1 def __str__(self) -> str: """Return the name of the enum.""" @@ -160,53 +170,173 @@ cdef inline object c_err_to_py_fmt(line_sender_error* err, str fmt): return IngressError(tup[0], fmt.format(tup[1])) -cdef bytes str_to_utf8(str string, line_sender_utf8* utf8_out): +cdef object _utf8_decode_error( + PyObject* string, uint32_t bad_codepoint): + cdef str s = string + return IngressError( + IngressErrorCode.InvalidUtf8, + f'Invalid codepoint 0x{bad_codepoint:x} in string {s!r}: ' + + 'Cannot be encoded as UTF-8.') + + +cdef str _fqn(type obj): + if obj.__module__ == 'builtins': + return obj.__qualname__ + else: + return f'{obj.__module__}.{obj.__qualname__}' + + +cdef inline void_int _encode_utf8( + qdb_pystr_buf* b, + PyObject* string, + line_sender_utf8* utf8_out) except -1: + cdef uint32_t bad_codepoint = 0 + cdef size_t count = (PyUnicode_GET_LENGTH(string)) + cdef int kind = PyUnicode_KIND(string) + if kind == PyUnicode_1BYTE_KIND: + # No error handling for UCS1: All code points translate into valid UTF8. + qdb_ucs1_to_utf8( + b, + count, + PyUnicode_1BYTE_DATA(string), + &utf8_out.len, + &utf8_out.buf) + elif kind == PyUnicode_2BYTE_KIND: + if not qdb_ucs2_to_utf8( + b, + count, + PyUnicode_2BYTE_DATA(string), + &utf8_out.len, + &utf8_out.buf, + &bad_codepoint): + raise _utf8_decode_error(string, bad_codepoint) + elif kind == PyUnicode_4BYTE_KIND: + if not qdb_ucs4_to_utf8( + b, + count, + + # This cast is required and is possibly a Cython compiler bug. 
+ # It doesn't recognize that `const Py_UCS4*` + # is the same as `const uint32_t*`. + PyUnicode_4BYTE_DATA(string), + + &utf8_out.len, + &utf8_out.buf, + &bad_codepoint): + raise _utf8_decode_error(string, bad_codepoint) + else: + raise ValueError(f'Unknown UCS kind: {kind}.') + + +cdef void_int str_to_utf8( + qdb_pystr_buf* b, + PyObject* string, + line_sender_utf8* utf8_out) except -1: """ - Init the `utf8_out` object from the `string`. - If the string is held as a UCS1 and is purely ascii, then - the memory is borrowed. - Otherwise the string is first encoded to UTF-8 into a bytes object - and such bytes object is returned to transfer ownership and extend - the lifetime of the buffer pointed to by `utf8_out`. + Convert a Python string to a UTF-8 borrowed buffer. + This is done without allocating new Python `bytes` objects. + In case the string is an ASCII string, it's also generally zero-copy. + The `utf8_out` param will point to (borrow from) either the ASCII buffer + inside the original Python object or a part of memory allocated inside the + `b` buffer. + + If you need to use `utf8_out` without the GIL, call `qdb_pystr_buf_copy`. """ - # Note that we bypass `line_sender_utf8_init`. - cdef bytes owner = None + if not PyUnicode_CheckExact(string): + raise TypeError( + 'Expected a str object, not an object of type ' + + _fqn(type(string))) PyUnicode_READY(string) + + # We optimize the common case of ASCII strings. + # This avoid memory allocations and copies altogether. + # We get away with this because ASCII is a subset of UTF-8. if PyUnicode_IS_COMPACT_ASCII(string): utf8_out.len = (PyUnicode_GET_LENGTH(string)) utf8_out.buf = (PyUnicode_1BYTE_DATA(string)) - return owner - else: - owner = string.encode('utf-8') - utf8_out.len = (PyBytes_GET_SIZE(owner)) - utf8_out.buf = (PyBytes_AsString(owner)) - return owner + return 0 + + _encode_utf8(b, string, utf8_out) -cdef bytes str_to_table_name(str string, line_sender_table_name* name_out): + +cdef void_int str_to_utf8_copy( + qdb_pystr_buf* b, + PyObject* string, + line_sender_utf8* utf8_out) except -1: + """ + Variant of `str_to_utf8` that always copies the string to a new buffer. + + The resulting `utf8_out` can be used when not holding the GIL: + The pointed-to memory is owned by `b`. + """ + if not PyUnicode_CheckExact(string): + raise TypeError( + 'Expected a str object, not an object of type ' + + _fqn(type(string))) + + PyUnicode_READY(string) + _encode_utf8(b, string, utf8_out) + + +cdef void_int str_to_table_name( + qdb_pystr_buf* b, + PyObject* string, + line_sender_table_name* name_out) except -1: """ Python string to borrowed C table name. Also see `str_to_utf8`. """ cdef line_sender_error* err = NULL cdef line_sender_utf8 utf8 - cdef bytes owner = str_to_utf8(string, &utf8) + str_to_utf8(b, string, &utf8) + if not line_sender_table_name_init(name_out, utf8.len, utf8.buf, &err): + raise c_err_to_py(err) + + +cdef void_int str_to_table_name_copy( + qdb_pystr_buf* b, + PyObject* string, + line_sender_table_name* name_out) except -1: + """ + Python string to copied C table name. + Also see `str_to_utf8_copy`. 
+ """ + cdef line_sender_error* err = NULL + cdef line_sender_utf8 utf8 + str_to_utf8_copy(b, string, &utf8) if not line_sender_table_name_init(name_out, utf8.len, utf8.buf, &err): raise c_err_to_py(err) - return owner -cdef bytes str_to_column_name(str string, line_sender_column_name* name_out): +cdef void_int str_to_column_name( + qdb_pystr_buf* b, + str string, + line_sender_column_name* name_out) except -1: """ Python string to borrowed C column name. Also see `str_to_utf8`. """ cdef line_sender_error* err = NULL cdef line_sender_utf8 utf8 - cdef bytes owner = str_to_utf8(string, &utf8) + str_to_utf8(b, string, &utf8) + if not line_sender_column_name_init(name_out, utf8.len, utf8.buf, &err): + raise c_err_to_py(err) + + +cdef void_int str_to_column_name_copy( + qdb_pystr_buf* b, + str string, + line_sender_column_name* name_out) except -1: + """ + Python string to copied C column name. + Also see `str_to_utf8_copy`. + """ + cdef line_sender_error* err = NULL + cdef line_sender_utf8 utf8 + str_to_utf8_copy(b, string, &utf8) if not line_sender_column_name_init(name_out, utf8.len, utf8.buf, &err): raise c_err_to_py(err) - return owner cdef int64_t datetime_to_micros(datetime dt): @@ -229,40 +359,79 @@ cdef int64_t datetime_to_nanos(datetime dt): (dt.microsecond * 1000)) +cdef int64_t _US_SEC = 1000000 +cdef int64_t _NS_US = 1000 + + +cdef int64_t get_time_now_us() except -1: + """ + Get the current time in microseconds. + """ + IF UNAME_SYSNAME == 'Windows': + return time.time_ns() // 1000 + ELSE: + # Note: Y2K38 bug on 32-bit systems, but we don't care. + cdef timespec ts + if clock_gettime(CLOCK_REALTIME, &ts) != 0: + raise OSError(errno, 'clock_gettime(CLOCK_REALTIME, &ts) failed') + return (ts.tv_sec) * _US_SEC + (ts.tv_nsec) // _NS_US + + +cdef int64_t _NS_SEC = 1000000000 + + +cdef int64_t get_time_now_ns() except -1: + """ + Get the current time in nanoseconds. + """ + IF UNAME_SYSNAME == 'Windows': + return time.time_ns() + ELSE: + # Note: Y2K38 bug on 32-bit systems, but we don't care. + cdef timespec ts + if clock_gettime(CLOCK_REALTIME, &ts) != 0: + raise OSError(errno, 'clock_gettime(CLOCK_REALTIME, &ts) failed') + return (ts.tv_sec) * _NS_SEC + (ts.tv_nsec) + + cdef class TimestampMicros: """ - A timestamp in microseconds since the UNIX epoch. + A timestamp in microseconds since the UNIX epoch (UTC). - You may construct a ``TimestampMicros`` from an integer or a ``datetime``. + You may construct a ``TimestampMicros`` from an integer or a + ``datetime.datetime``, or simply call the :func:`TimestampMicros.now` + method. .. code-block:: python - # Can't be negative. - TimestampMicros(1657888365426838016) - - # Careful with the timezeone! - TimestampMicros.from_datetime(datetime.datetime.utcnow()) + # Recommended way to get the current timestamp. + TimestampMicros.now() - When constructing from a ``datetime``, you should take extra care - to ensure that the timezone is correct. + # The above is equivalent to: + TimestampMicros(time.time_ns() // 1000) - For example, ``datetime.now()`` implies the `local` timezone which - is probably not what you want. + # You can provide a numeric timestamp too. It can't be negative. + TimestampMicros(1657888365426838) - When constructing the ``datetime`` object explicity, you pass in the - timezone to use. + ``TimestampMicros`` can also be constructed from a ``datetime.datetime`` + object. .. 
code-block:: python

        TimestampMicros.from_datetime(
-            datetime.datetime(2000, 1, 1, tzinfo=datetime.timezone.utc))
+            datetime.datetime.now(tz=datetime.timezone.utc))

+    We recommend that when using ``datetime`` objects, you explicitly pass in
+    the timezone to use. This is because ``datetime`` objects without an
+    associated timezone are assumed to be in the local timezone and it is easy
+    to make mistakes (e.g. passing ``datetime.datetime.utcnow()`` is a likely
+    bug).
     """
     cdef int64_t _value

     def __cinit__(self, value: int):
         if value < 0:
-            raise ValueError('value must positive integer.')
+            raise ValueError('value must be a positive integer.')
         self._value = value

     @classmethod
@@ -274,46 +443,60 @@ cdef class TimestampMicros:
             raise TypeError('dt must be a datetime object.')
         return cls(datetime_to_micros(dt))

+    @classmethod
+    def now(cls):
+        """
+        Construct a ``TimestampMicros`` from the current time as UTC.
+        """
+        cdef int64_t value = get_time_now_us()
+        return cls(value)
+
     @property
     def value(self) -> int:
-        """Number of microseconds."""
+        """Number of microseconds (Unix epoch timestamp, UTC)."""
         return self._value

+    def __repr__(self):
+        return f'TimestampMicros({self._value})'
+

 cdef class TimestampNanos:
     """
-    A timestamp in nanoseconds since the UNIX epoch.
+    A timestamp in nanoseconds since the UNIX epoch (UTC).

-    You may construct a ``TimestampNanos`` from an integer or a ``datetime``.
+    You may construct a ``TimestampNanos`` from an integer or a
+    ``datetime.datetime``, or simply call the :func:`TimestampNanos.now`
+    method.

     .. code-block:: python

-        # Can't be negative.
-        TimestampNanos(1657888365426838016)
-
-        # Careful with the timezeone!
-        TimestampNanos.from_datetime(datetime.datetime.utcnow())
+        # Recommended way to get the current timestamp.
+        TimestampNanos.now()

-    When constructing from a ``datetime``, you should take extra care
-    to ensure that the timezone is correct.
+        # The above is equivalent to:
+        TimestampNanos(time.time_ns())

-    For example, ``datetime.now()`` implies the `local` timezone which
-    is probably not what you want.
+        # You can provide a numeric timestamp too. It can't be negative.
+        TimestampNanos(1657888365426838016)

-    When constructing the ``datetime`` object explicity, you pass in the
-    timezone to use.
+    ``TimestampNanos`` can also be constructed from a ``datetime`` object.

     .. code-block:: python

-        TimestampMicros.from_datetime(
-            datetime.datetime(2000, 1, 1, tzinfo=datetime.timezone.utc))
+        TimestampNanos.from_datetime(
+            datetime.datetime.now(tz=datetime.timezone.utc))

+    We recommend that when using ``datetime`` objects, you explicitly pass in
+    the timezone to use. This is because ``datetime`` objects without an
+    associated timezone are assumed to be in the local timezone and it is easy
+    to make mistakes (e.g. passing ``datetime.datetime.utcnow()`` is a likely
+    bug).
     """
     cdef int64_t _value

     def __cinit__(self, value: int):
         if value < 0:
-            raise ValueError('value must positive integer.')
+            raise ValueError('value must be a positive integer.')
         self._value = value

     @classmethod
@@ -325,17 +508,28 @@ cdef class TimestampNanos:
             raise TypeError('dt must be a datetime object.')
         return cls(datetime_to_nanos(dt))

+    @classmethod
+    def now(cls):
+        """
+        Construct a ``TimestampNanos`` from the current time as UTC.
+ """ + cdef int64_t value = get_time_now_ns() + return cls(value) + @property def value(self) -> int: - """Number of nanoseconds.""" + """Number of nanoseconds (Unix epoch timestamp, UTC).""" return self._value + def __repr__(self): + return f'TimestampNanos({self.value})' + cdef class Sender cdef class Buffer -cdef int may_flush_on_row_complete(Buffer buffer, Sender sender) except -1: +cdef void_int may_flush_on_row_complete(Buffer buffer, Sender sender) except -1: if sender._auto_flush_enabled: if len(buffer) >= sender._auto_flush_watermark: sender.flush(buffer) @@ -406,6 +600,7 @@ cdef class Buffer: """ cdef line_sender_buffer* _impl + cdef qdb_pystr_buf* _b cdef size_t _init_capacity cdef size_t _max_name_len cdef object _row_complete_sender @@ -420,6 +615,7 @@ cdef class Buffer: cdef inline _cinit_impl(self, size_t init_capacity, size_t max_name_len): self._impl = line_sender_buffer_with_max_name_len(max_name_len) + self._b = qdb_pystr_buf_new() line_sender_buffer_reserve(self._impl, init_capacity) self._init_capacity = init_capacity self._max_name_len = max_name_len @@ -427,6 +623,7 @@ cdef class Buffer: def __dealloc__(self): self._row_complete_sender = None + qdb_pystr_buf_free(self._b) line_sender_buffer_free(self._impl) @property @@ -473,6 +670,7 @@ cdef class Buffer: ``sender.flush(buffer, clear=False)``. """ line_sender_buffer_clear(self._impl) + qdb_pystr_buf_clear(self._b) def __len__(self) -> int: """ @@ -491,12 +689,12 @@ cdef class Buffer: cdef const char* utf8 = line_sender_buffer_peek(self._impl, &size) return PyUnicode_FromStringAndSize(utf8, size) - cdef inline int _set_marker(self) except -1: + cdef inline void_int _set_marker(self) except -1: cdef line_sender_error* err = NULL if not line_sender_buffer_set_marker(self._impl, &err): raise c_err_to_py(err) - cdef inline int _rewind_to_marker(self) except -1: + cdef inline void_int _rewind_to_marker(self) except -1: cdef line_sender_error* err = NULL if not line_sender_buffer_rewind_to_marker(self._impl, &err): raise c_err_to_py(err) @@ -504,84 +702,82 @@ cdef class Buffer: cdef inline _clear_marker(self): line_sender_buffer_clear_marker(self._impl) - cdef inline int _table(self, str table_name) except -1: + cdef inline void_int _table(self, str table_name) except -1: cdef line_sender_error* err = NULL cdef line_sender_table_name c_table_name - cdef bytes owner = str_to_table_name(table_name, &c_table_name) + str_to_table_name( + self._cleared_b(), table_name, &c_table_name) if not line_sender_buffer_table(self._impl, c_table_name, &err): raise c_err_to_py(err) - return 0 - cdef inline int _symbol(self, str name, str value) except -1: + cdef inline qdb_pystr_buf* _cleared_b(self): + qdb_pystr_buf_clear(self._b) + return self._b + + cdef inline void_int _symbol(self, str name, str value) except -1: cdef line_sender_error* err = NULL cdef line_sender_column_name c_name cdef line_sender_utf8 c_value - cdef bytes owner_name = str_to_column_name(name, &c_name) - cdef bytes owner_value = str_to_utf8(value, &c_value) + str_to_column_name(self._cleared_b(), name, &c_name) + str_to_utf8(self._b, value, &c_value) if not line_sender_buffer_symbol(self._impl, c_name, c_value, &err): raise c_err_to_py(err) - return 0 - cdef inline int _column_bool( + cdef inline void_int _column_bool( self, line_sender_column_name c_name, bint value) except -1: cdef line_sender_error* err = NULL if not line_sender_buffer_column_bool(self._impl, c_name, value, &err): raise c_err_to_py(err) - return 0 - cdef inline int _column_i64( + cdef inline 
void_int _column_i64( self, line_sender_column_name c_name, int64_t value) except -1: cdef line_sender_error* err = NULL if not line_sender_buffer_column_i64(self._impl, c_name, value, &err): raise c_err_to_py(err) return 0 - cdef inline int _column_f64( + cdef inline void_int _column_f64( self, line_sender_column_name c_name, double value) except -1: cdef line_sender_error* err = NULL if not line_sender_buffer_column_f64(self._impl, c_name, value, &err): raise c_err_to_py(err) - return 0 - cdef inline int _column_str( + cdef inline void_int _column_str( self, line_sender_column_name c_name, str value) except -1: cdef line_sender_error* err = NULL cdef line_sender_utf8 c_value - cdef bytes owner_value = str_to_utf8(value, &c_value) + str_to_utf8(self._b, value, &c_value) if not line_sender_buffer_column_str(self._impl, c_name, c_value, &err): raise c_err_to_py(err) - return 0 - cdef inline int _column_ts( + cdef inline void_int _column_ts( self, line_sender_column_name c_name, TimestampMicros ts) except -1: cdef line_sender_error* err = NULL if not line_sender_buffer_column_ts(self._impl, c_name, ts._value, &err): raise c_err_to_py(err) - return 0 - cdef inline int _column_dt( + cdef inline void_int _column_dt( self, line_sender_column_name c_name, datetime dt) except -1: cdef line_sender_error* err = NULL if not line_sender_buffer_column_ts( self._impl, c_name, datetime_to_micros(dt), &err): raise c_err_to_py(err) - return 0 - cdef inline int _column(self, str name, object value) except -1: + cdef inline void_int _column(self, str name, object value) except -1: cdef line_sender_column_name c_name - cdef bytes owner_name = str_to_column_name(name, &c_name) - if PyBool_Check(value): - return self._column_bool(c_name, value) - elif PyInt_Check(value): - return self._column_i64(c_name, value) - elif PyFloat_Check(value): - return self._column_f64(c_name, value) - elif PyUnicode_Check(value): - return self._column_str(c_name, value) + str_to_column_name(self._cleared_b(), name, &c_name) + if PyBool_Check(value): + self._column_bool(c_name, value) + elif PyLong_CheckExact(value): + self._column_i64(c_name, value) + elif PyFloat_CheckExact(value): + self._column_f64(c_name, value) + elif PyUnicode_CheckExact(value): + self._column_str(c_name, value) elif isinstance(value, TimestampMicros): - return self._column_ts(c_name, value) + self._column_ts(c_name, value) elif isinstance(value, datetime): - return self._column_dt(c_name, value) + self._column_dt(c_name, value) else: valid = ', '.join(( 'bool', @@ -591,9 +787,9 @@ cdef class Buffer: 'TimestampMicros', 'datetime.datetime')) raise TypeError( - f'Unsupported type: {type(value)}. Must be one of: {valid}') + f'Unsupported type: {_fqn(type(value))}. 
Must be one of: {valid}') - cdef inline int _may_trigger_row_complete(self) except -1: + cdef inline void_int _may_trigger_row_complete(self) except -1: cdef line_sender_error* err = NULL cdef PyObject* sender = NULL if self._row_complete_sender != None: @@ -601,38 +797,35 @@ cdef class Buffer: if sender != NULL: may_flush_on_row_complete(self, sender) - cdef inline int _at_ts(self, TimestampNanos ts) except -1: + cdef inline void_int _at_ts(self, TimestampNanos ts) except -1: cdef line_sender_error* err = NULL if not line_sender_buffer_at(self._impl, ts._value, &err): raise c_err_to_py(err) - return 0 - cdef inline int _at_dt(self, datetime dt) except -1: + cdef inline void_int _at_dt(self, datetime dt) except -1: cdef int64_t value = datetime_to_nanos(dt) cdef line_sender_error* err = NULL if not line_sender_buffer_at(self._impl, value, &err): raise c_err_to_py(err) - return 0 - cdef inline int _at_now(self) except -1: + cdef inline void_int _at_now(self) except -1: cdef line_sender_error* err = NULL if not line_sender_buffer_at_now(self._impl, &err): raise c_err_to_py(err) - return 0 - cdef inline int _at(self, object ts) except -1: + cdef inline void_int _at(self, object ts) except -1: if ts is None: - return self._at_now() + self._at_now() elif isinstance(ts, TimestampNanos): - return self._at_ts(ts) + self._at_ts(ts) elif isinstance(ts, datetime): - return self._at_dt(ts) + self._at_dt(ts) else: raise TypeError( - f'Unsupported type: {type(ts)}. Must be one of: ' + + f'Unsupported type: {_fqn(type(ts))}. Must be one of: ' + 'TimestampNanos, datetime, None') - cdef int _row( + cdef void_int _row( self, str table_name, dict symbols=None, @@ -708,7 +901,7 @@ cdef class Buffer: columns={ 'temperature': 24.5, 'humidity': 0.5}, - at=datetime.datetime.utcnow()) + at=datetime.datetime.now(tz=datetime.timezone.utc)) Python strings passed as values to ``symbols`` are going to be encoded @@ -764,181 +957,284 @@ cdef class Buffer: self._row(table_name, symbols, columns, at) return self - # def tabular( - # self, - # table_name: str, - # data: Iterable[Iterable[Union[ - # bool, int, float, str, - # TimestampMicros, TimestampNanos, datetime]]], - # *, - # header: Optional[List[Optional[str]]]=None, - # symbols: Union[bool, List[int]]=False, - # at: Union[None, TimestampNanos, datetime]=None): - # """ - # Add multiple rows as an iterable of iterables (e.g. list of lists) to - # the buffer. - - # **Data and header** - - # The ``data`` argument specifies rows which must all be for the same - # table. Column names are provided as the ``header``. - - # .. code-block:: python - - # buffer.tabular( - # 'table_name', - # [[True, 123, 3.14, 'xyz'], - # [False, 456, 6.28, 'abc'], - # [True, 789, 9.87, 'def']], - # header=['col1', 'col2', 'col3', 'col4']) - - # **Designated Timestamp Column** - - # QuestDB supports a special `designated timestamp - # `_ column that it - # uses to sort the rows by timestamp. - - # If the data section contains the same number of columns as the header, - # then the designated is going to be - # assigned by the server, unless specified for all columns the `at` - # argument as either an integer wrapped in a ``TimestampNanos`` object - # representing nanoseconds since unix epoch (1970-01-01 00:00:00 UTC) or - # as a ``datetime.datetime`` object. - - # .. 
code-block:: python - - # buffer.tabular( - # 'table_name', - # [[True, None, 3.14, 'xyz'], - # [False, 123, 6.28, 'abc'], - # [True, 456, 9.87, 'def']], - # header=['col1', 'col2', 'col3', 'col4'], - # at=datetime.datetime.utcnow()) - - # # or ... - # # at=TimestampNanos(1657386397157631000)) - - # If the rows need different `designated timestamp - # `_ values across - # different rows, you can provide them as an additional unlabeled column. - # An unlabled column is one that has its name set to ``None``. - - # .. code-block:: python - - # ts1 = datetime.datetime.utcnow() - # ts2 = ( - # datetime.datetime.utcnow() + - # datetime.timedelta(microseconds=1)) - # buffer.tabular( - # 'table_name', - # [[True, 123, ts1], - # [False, 456, ts2]], - # header=['col1', 'col2', None]) - - # Like the ``at`` argument, the designated timestamp column may also be - # specified as ``TimestampNanos`` objects. - - # .. code-block:: python - - # buffer.tabular( - # 'table_name', - # [[True, 123, TimestampNanos(1657386397157630000)], - # [False, 456, TimestampNanos(1657386397157631000)]], - # header=['col1', 'col2', None]) - - # The designated timestamp column may appear anywhere positionally. - - # .. code-block:: python - - # ts1 = datetime.datetime.utcnow() - # ts2 = ( - # datetime.datetime.utcnow() + - # datetime.timedelta(microseconds=1)) - # buffer.tabular( - # 'table_name', - # [[1000, ts1, 123], - # [2000, ts2, 456]], - # header=['col1', None, 'col2']) - - # **Other timestamp columns** - - # Other columns may also contain timestamps. These columns can take - # ``datetime.datetime`` objects or ``TimestampMicros`` (*not nanos*) - # objects. - - # .. code-block:: python - - # ts1 = datetime.datetime.utcnow() - # ts2 = ( - # datetime.datetime.utcnow() + - # datetime.timedelta(microseconds=1)) - # buffer.tabular( - # 'table_name', - # [[1000, ts1, 123], - # [2000, ts2, 456]], - # header=['col1', 'col2', 'col3'], - # at=datetime.datetime.utcnow()) - - # **Symbol Columns** - - # QuestDB can represent strings via the ``STRING`` or ``SYMBOL`` types. - - # If all the columns of type ``str`` are to be treated as ``STRING``, then - # specify ``symbols=False`` (default - see exaples above). - - # If all need to be treated as ``SYMBOL`` specify ``symbols=True``. - - # .. code-block:: python - - # buffer.tabular( - # 'table_name', - # [['abc', 123, 3.14, 'xyz'], - # ['def', 456, None, 'abc'], - # ['ghi', 789, 9.87, 'def']], - # header=['col1', 'col2', 'col3', 'col4'], - # symbols=True) # `col1` and `col4` are SYMBOL columns. - - # Whilst if only a select few are to be treated as ``SYMBOL``, specify a - # list of column indices to the ``symbols`` arg. - - # .. code-block:: python - - # buffer.tabular( - # 'table_name', - # [['abc', 123, 3.14, 'xyz'], - # ['def', 456, 6.28, 'abc'], - # ['ghi', 789, 9.87, 'def']], - # header=['col1', 'col2', 'col3', 'col4'], - # symbols=[0]) # `col1` is SYMBOL; 'col4' is STRING. - - # Alternatively, you can specify a list of symbol column names. - - # .. code-block:: python - - # buffer.tabular( - # 'table_name', - # [['abc', 123, 3.14, 'xyz'], - # ['def', 456, 6.28, 'abc'], - # ['ghi', 789, 9.87, 'def']], - # header=['col1', 'col2', 'col3', 'col4'], - # symbols=['col1']) # `col1` is SYMBOL; 'col4' is STRING. - - # Note that column indices are 0-based and negative indices are counted - # from the end. 
-    #     """
-    #     raise ValueError('nyi')
-
-    # def pandas(
-    #         self,
-    #         table_name: str,
-    #         data: pd.DataFrame,
-    #         *,
-    #         symbols: Union[bool, List[int]]=False,
-    #         at: Union[None, TimestampNanos, datetime]=None):
-    #     """
-    #     Add a pandas DataFrame to the buffer.
-    #     """
-    #     raise ValueError('nyi')
+    def dataframe(
+            self,
+            df,  # : pd.DataFrame
+            *,
+            table_name: Optional[str] = None,
+            table_name_col: Union[None, int, str] = None,
+            symbols: Union[str, bool, List[int], List[str]] = 'auto',
+            at: Union[None, int, str, TimestampNanos, datetime] = None):
+        """
+        Add a pandas DataFrame to the buffer.
+
+        Also see the :func:`Sender.dataframe` method if you're
+        not using the buffer explicitly. It supports the same parameters
+        and also supports auto-flushing.
+
+        This feature requires the ``pandas``, ``numpy`` and ``pyarrow``
+        packages to be installed.
+
+        :param df: The pandas DataFrame to serialize to the buffer.
+        :type df: pandas.DataFrame
+
+        :param table_name: The name of the table to which the rows belong.
+
+            If ``None``, the table name is taken from the ``table_name_col``
+            parameter. If both ``table_name`` and ``table_name_col`` are
+            ``None``, the table name is taken from the DataFrame's index
+            name (``df.index.name`` attribute).
+        :type table_name: str or None
+
+        :param table_name_col: The name or index of the column in the DataFrame
+            that contains the table name.
+
+            If ``None``, the table name is taken
+            from the ``table_name`` parameter. If both ``table_name`` and
+            ``table_name_col`` are ``None``, the table name is taken from the
+            DataFrame's index name (``df.index.name`` attribute).
+
+            If ``table_name_col`` is an integer, it is interpreted as the index
+            of the column starting from ``0``. The index of the column can be
+            negative, in which case it is interpreted as an offset from the end
+            of the DataFrame. E.g. ``-1`` is the last column.
+        :type table_name_col: str or int or None
+
+        :param symbols: The columns to be serialized as symbols.
+
+            If ``'auto'`` (default), all columns of dtype ``'categorical'`` are
+            serialized as symbols. If ``True``, all ``str`` columns are
+            serialized as symbols. If ``False``, no columns are serialized as
+            symbols.
+
+            The list of symbols can also be specified explicitly as a ``list``
+            of column names (``str``) or indices (``int``). Integer indices
+            start at ``0`` and can be negative, offset from the end of the
+            DataFrame. E.g. ``-1`` is the last column.
+
+            Only columns containing strings can be serialized as symbols.
+
+        :type symbols: str or bool or list of str or list of int
+
+        :param at: The designated timestamp of the rows.
+
+            You can specify a single value for all rows, or a column name or
+            index.
+            If ``None``, the timestamp is assigned by the server for all rows.
+            To pass in a timestamp explicitly as an integer use the
+            ``TimestampNanos`` wrapper type. To get the current timestamp,
+            use ``TimestampNanos.now()``.
+            When passing a ``datetime.datetime`` object, the timestamp is
+            converted to nanoseconds.
+            A ``datetime`` object is assumed to be in the local timezone unless
+            one is specified explicitly (so call
+            ``datetime.datetime.now(tz=datetime.timezone.utc)`` instead
+            of ``datetime.datetime.utcnow()`` for the current timestamp to
+            avoid bugs).
+ + To specify a different timestamp for each row, pass in a column name + (``str``) or index (``int``, 0-based index, negative index + supported): In this case, the column needs to be of dtype + ``datetime64[ns]`` (assumed to be in the **UTC timezone** and not + local, due to differences in Pandas and Python datetime handling) or + ``datetime64[ns, tz]``. When a timezone is specified in the column, + it is converted to UTC automatically. + + A timestamp column can also contain ``None`` values. The server will + assign the current timestamp to those rows. + + **Note**: All timestamps are always converted to nanoseconds and in + the UTC timezone. Timezone information is dropped before sending and + QuestDB will not store any timezone information. + :type at: TimestampNanos, datetime.datetime, int or str or None + + **Note**: It is an error to specify both ``table_name`` and + ``table_name_col``. + + **Note**: The "index" column of the DataFrame is never serialized, + even if it is named. + + Example: + + .. code-block:: python + + import pandas as pd + import questdb.ingress as qi + + buf = qi.Buffer() + # ... + + df = pd.DataFrame({ + 'location': ['London', 'Managua', 'London'], + 'temperature': [24.5, 35.0, 25.5], + 'humidity': [0.5, 0.6, 0.45], + 'ts': pd.date_range('2021-07-01', periods=3)}) + buf.dataframe( + df, table_name='weather', at='ts', symbols=['location']) + + # ... + sender.flush(buf) + + **Pandas to ILP datatype mappings** + + .. seealso:: https://questdb.io/docs/reference/api/ilp/columnset-types/ + + .. list-table:: Pandas Mappings + :header-rows: 1 + + * - Pandas ``dtype`` + - Nulls + - ILP Datatype + * - ``'bool'`` + - N + - ``BOOLEAN`` + * - ``'boolean'`` + - N **α** + - ``BOOLEAN`` + * - ``'object'`` (``bool`` objects) + - N **α** + - ``BOOLEAN`` + * - ``'uint8'`` + - N + - ``INTEGER`` + * - ``'int8'`` + - N + - ``INTEGER`` + * - ``'uint16'`` + - N + - ``INTEGER`` + * - ``'int16'`` + - N + - ``INTEGER`` + * - ``'uint32'`` + - N + - ``INTEGER`` + * - ``'int32'`` + - N + - ``INTEGER`` + * - ``'uint64'`` + - N + - ``INTEGER`` **β** + * - ``'int64'`` + - N + - ``INTEGER`` + * - ``'UInt8'`` + - Y + - ``INTEGER`` + * - ``'Int8'`` + - Y + - ``INTEGER`` + * - ``'UInt16'`` + - Y + - ``INTEGER`` + * - ``'Int16'`` + - Y + - ``INTEGER`` + * - ``'UInt32'`` + - Y + - ``INTEGER`` + * - ``'Int32'`` + - Y + - ``INTEGER`` + * - ``'UInt64'`` + - Y + - ``INTEGER`` **β** + * - ``'Int64'`` + - Y + - ``INTEGER`` + * - ``'object'`` (``int`` objects) + - Y + - ``INTEGER`` **β** + * - ``'float32'`` **γ** + - Y (``NaN``) + - ``FLOAT`` + * - ``'float64'`` + - Y (``NaN``) + - ``FLOAT`` + * - ``'object'`` (``float`` objects) + - Y (``NaN``) + - ``FLOAT`` + * - ``'string'`` (``str`` objects) + - Y + - ``STRING`` (default), ``SYMBOL`` via ``symbols`` arg. **δ** + * - ``'string[pyarrow]'`` + - Y + - ``STRING`` (default), ``SYMBOL`` via ``symbols`` arg. **δ** + * - ``'category'`` (``str`` objects) **ε** + - Y + - ``SYMBOL`` (default), ``STRING`` via ``symbols`` arg. **δ** + * - ``'object'`` (``str`` objects) + - Y + - ``STRING`` (default), ``SYMBOL`` via ``symbols`` arg. **δ** + * - ``'datetime64[ns]'`` + - Y + - ``TIMESTAMP`` **ζ** + * - ``'datetime64[ns, tz]'`` + - Y + - ``TIMESTAMP`` **ζ** + + .. note:: + + * **α**: Note some pandas dtypes allow nulls (e.g. ``'boolean'``), + where the QuestDB database does not. + + * **β**: The valid range for integer values is -2^63 to 2^63-1. 
+              Any ``'uint64'``, ``'UInt64'`` or python ``int`` object values
+              outside this range will raise an error during serialization.
+
+            * **γ**: Upcast to 64-bit float during serialization.
+
+            * **δ**: Columns containing strings can also be used to specify the
+              table name. See ``table_name_col``.
+
+            * **ε**: We only support categories containing strings. If the
+              category contains non-string values, an error will be raised.
+
+            * **ζ**: The ``dataframe()`` method only supports datetimes with
+              nanosecond precision. The designated timestamp column (see ``at``
+              parameter) maintains the nanosecond precision, whilst values
+              stored as columns have their precision truncated to microseconds.
+              All dates are sent as UTC and any additional timezone information
+              is dropped. If no timezone is specified, we follow
+              the pandas convention of assuming the timezone is UTC.
+              Datetimes before 1970-01-01 00:00:00 UTC are not supported.
+              If a datetime value is specified as ``None`` (``NaT``), it is
+              interpreted as the current QuestDB server time set on receipt of
+              the message.
+
+        **Error Handling and Recovery**
+
+        In case an exception is raised during dataframe serialization, the
+        buffer is left in its previous state.
+        The buffer remains in a valid state and can be used for further calls
+        even after an error.
+
+        For clarification, as an example, if an invalid ``None``
+        value appears at the 3rd row for a ``bool`` column, neither the 3rd nor
+        the preceding rows are added to the buffer.
+
+        **Note**: This differs from the :func:`Sender.dataframe` method, which
+        modifies this guarantee due to its ``auto_flush`` logic.
+
+        **Performance Considerations**
+
+        The Python GIL is released during serialization if it is not needed.
+        If any column requires the GIL, the entire serialization is done whilst
+        holding the GIL.
+
+        Column types that require the GIL are:
+
+        * Columns of ``str``, ``int`` or ``float`` Python objects.
+        * The ``'string[python]'`` dtype.
+ """ + _dataframe( + auto_flush_blank(), + self._impl, + self._b, + df, + table_name, + table_name_col, + symbols, + at) _FLUSH_FMT = ('{} - See https://py-questdb-client.readthedocs.io/en/' @@ -1100,15 +1396,12 @@ cdef class Sender: cdef line_sender_error* err = NULL cdef line_sender_utf8 host_utf8 - cdef bytes host_owner cdef str port_str cdef line_sender_utf8 port_utf8 - cdef bytes port_owner cdef str interface_str cdef line_sender_utf8 interface_utf8 - cdef bytes interface_owner cdef str a_key_id cdef bytes a_key_id_owner @@ -1126,27 +1419,36 @@ cdef class Sender: cdef bytes a_pub_key_y_owner cdef line_sender_utf8 a_pub_key_y_utf8 - cdef bytes ca_owner cdef line_sender_utf8 ca_utf8 + cdef qdb_pystr_buf* b + self._opts = NULL self._impl = NULL - self._buffer = None - if PyInt_Check(port): + self._init_capacity = init_capacity + self._max_name_len = max_name_len + + self._buffer = Buffer( + init_capacity=init_capacity, + max_name_len=max_name_len) + + b = self._buffer._b + + if PyLong_CheckExact(port): port_str = str(port) - elif PyUnicode_Check(port): + elif PyUnicode_CheckExact(port): port_str = port else: raise TypeError( - f'port must be an integer or a string, not {type(port)}') + f'port must be an int or a str, not {_fqn(type(port))}') - host_owner = str_to_utf8(host, &host_utf8) - port_owner = str_to_utf8(port_str, &port_utf8) + str_to_utf8(b, host, &host_utf8) + str_to_utf8(b, port_str, &port_utf8) self._opts = line_sender_opts_new_service(host_utf8, port_utf8) if interface is not None: - interface_owner = str_to_utf8(interface, &interface_utf8) + str_to_utf8(b, interface, &interface_utf8) line_sender_opts_net_interface(self._opts, interface_utf8) if auth is not None: @@ -1154,10 +1456,10 @@ cdef class Sender: a_priv_key, a_pub_key_x, a_pub_key_y) = auth - a_key_id_owner = str_to_utf8(a_key_id, &a_key_id_utf8) - a_priv_key_owner = str_to_utf8(a_priv_key, &a_priv_key_utf8) - a_pub_key_x_owner = str_to_utf8(a_pub_key_x, &a_pub_key_x_utf8) - a_pub_key_y_owner = str_to_utf8(a_pub_key_y, &a_pub_key_y_utf8) + str_to_utf8(b, a_key_id, &a_key_id_utf8) + str_to_utf8(b, a_priv_key, &a_priv_key_utf8) + str_to_utf8(b, a_pub_key_x, &a_pub_key_x_utf8) + str_to_utf8(b, a_pub_key_y, &a_pub_key_y_utf8) line_sender_opts_auth( self._opts, a_key_id_utf8, @@ -1172,27 +1474,20 @@ cdef class Sender: if tls == 'insecure_skip_verify': line_sender_opts_tls_insecure_skip_verify(self._opts) else: - ca_owner = str_to_utf8(tls, &ca_utf8) + str_to_utf8(b, tls, &ca_utf8) line_sender_opts_tls_ca(self._opts, ca_utf8) elif isinstance(tls, pathlib.Path): tls = str(tls) - ca_owner = str_to_utf8(tls, &ca_utf8) + str_to_utf8(b, tls, &ca_utf8) line_sender_opts_tls_ca(self._opts, ca_utf8) else: raise TypeError( 'tls must be a bool, a path or string pointing to CA file ' - f'or "insecure_skip_verify", not {type(tls)}') + f'or "insecure_skip_verify", not {_fqn(type(tls))}') if read_timeout is not None: line_sender_opts_read_timeout(self._opts, read_timeout) - self._init_capacity = init_capacity - self._max_name_len = max_name_len - - self._buffer = Buffer( - init_capacity=init_capacity, - max_name_len=max_name_len) - self._auto_flush_enabled = not not auto_flush self._auto_flush_watermark = int(auto_flush) \ if self._auto_flush_enabled else 0 @@ -1200,6 +1495,8 @@ cdef class Sender: raise ValueError( 'auto_flush_watermark must be >= 0, ' f'not {self._auto_flush_watermark}') + + qdb_pystr_buf_clear(b) def new_buffer(self): """ @@ -1288,6 +1585,68 @@ cdef class Sender: """ self._buffer.row(table_name, symbols=symbols, 
                         columns=columns, at=at)
+    def dataframe(
+            self,
+            df,  # : pd.DataFrame
+            *,
+            table_name: Optional[str] = None,
+            table_name_col: Union[None, int, str] = None,
+            symbols: Union[str, bool, List[int], List[str]] = 'auto',
+            at: Union[None, int, str, TimestampNanos, datetime] = None):
+        """
+        Write a Pandas DataFrame to the internal buffer.
+
+        Example:
+
+        .. code-block:: python
+
+            import pandas as pd
+            import questdb.ingress as qi
+
+            df = pd.DataFrame({
+                'car': pd.Categorical(['Nic 42', 'Eddi', 'Nic 42', 'Eddi']),
+                'position': [1, 2, 1, 2],
+                'speed': [89.3, 98.2, 3, 4],
+                'lat_gforce': [0.1, -0.2, -0.6, 0.4],
+                'acceleration': [0.1, -0.2, 0.6, 4.4],
+                'tyre_pressure': [2.6, 2.5, 2.6, 2.5],
+                'ts': [
+                    pd.Timestamp('2022-08-09 13:56:00'),
+                    pd.Timestamp('2022-08-09 13:56:01'),
+                    pd.Timestamp('2022-08-09 13:56:02'),
+                    pd.Timestamp('2022-08-09 13:56:03')]})
+
+            with qi.Sender('localhost', 9009) as sender:
+                sender.dataframe(df, table_name='race_metrics', at='ts')
+
+        This method builds on top of the :func:`Buffer.dataframe` method.
+        See its documentation for details on arguments.
+
+        Additionally, this method also supports auto-flushing the buffer
+        as specified in the ``Sender``'s ``auto_flush`` constructor argument.
+        Auto-flushing is implemented incrementally, meaning that when
+        calling ``sender.dataframe(df)`` with a large ``df``, the sender may
+        already have sent some of the rows to the server, whilst the remaining
+        rows will be sent at the next auto-flush or the next explicit call
+        to :func:`Sender.flush`.
+
+        In case of data errors with auto-flushing enabled, some of the rows
+        may have been transmitted to the server already.
+        """
+        cdef auto_flush_t af = auto_flush_blank()
+        if self._auto_flush_enabled:
+            af.sender = self._impl
+            af.watermark = self._auto_flush_watermark
+        _dataframe(
+            af,
+            self._buffer._impl,
+            self._buffer._b,
+            df,
+            table_name,
+            table_name_col,
+            symbols,
+            at)
+
     cpdef flush(self, Buffer buffer=None, bint clear=True):
         """
         If called with no arguments, immediately flushes the internal buffer.
@@ -1307,13 +1666,19 @@ cdef class Sender:
             If ``False``, the flushed buffer is left in the internal buffer.
             Note that ``clear=False`` is only supported if ``buffer`` is also
             specified.
+
+        The Python GIL is released during the network IO operation.
         """
+        cdef line_sender* sender = self._impl
+        cdef line_sender_error* err = NULL
+        cdef line_sender_buffer* c_buf = NULL
+        cdef PyThreadState* gs = NULL  # GIL state. NULL means we have the GIL.
+        cdef bint ok = False
+
         if buffer is None and not clear:
             raise ValueError('The internal buffer must always be cleared.')
 
-        cdef line_sender_error* err = NULL
-        cdef line_sender_buffer* c_buf = NULL
-        if self._impl == NULL:
+        if sender == NULL:
             raise IngressError(
                 IngressErrorCode.InvalidApiCall,
                 'flush() can\'t be called: Not connected.')
@@ -1324,20 +1689,21 @@ cdef class Sender:
         if line_sender_buffer_size(c_buf) == 0:
             return
 
-        try:
-            if clear:
-                if not line_sender_flush(self._impl, c_buf, &err):
-                    raise c_err_to_py_fmt(err, _FLUSH_FMT)
-            else:
-                if not line_sender_flush_and_keep(self._impl, c_buf, &err):
-                    raise c_err_to_py_fmt(err, _FLUSH_FMT)
-        except:
-            # Prevent a follow-up call to `.close(flush=True)` (as is usually
-            # called from `__exit__`) to raise after the sender entered an error
-            # state following a failed call to `.flush()`.
+        # We might be blocking on IO, so temporarily release the GIL.
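+        # Note: the GIL has to be re-acquired (via `_ensure_has_gil` below)
+        # before a Python exception can be raised on the error path.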
+ _ensure_doesnt_have_gil(&gs) + if clear: + ok = line_sender_flush(sender, c_buf, &err) + else: + ok = line_sender_flush_and_keep(sender, c_buf, &err) + _ensure_has_gil(&gs) + if not ok: if c_buf == self._buffer._impl: + # Prevent a follow-up call to `.close(flush=True)` (as is + # usually called from `__exit__`) to raise after the sender + # entered an error state following a failed call to `.flush()`. + # Note: In this case `clear` is always `True`. line_sender_buffer_clear(c_buf) - raise + raise c_err_to_py_fmt(err, _FLUSH_FMT) cdef _close(self): self._buffer = None diff --git a/src/questdb/ingress_helper.inc b/src/questdb/ingress_helper.inc new file mode 100644 index 00000000..b726a913 --- /dev/null +++ b/src/questdb/ingress_helper.inc @@ -0,0 +1,14 @@ +#pragma once +// This file is included into `ingress.c`. + + +// Cython idiosyncrasy workaround. +// If we do this in Cython it treats `buf.obj` as +// a ref-counted `object` instead of a `PyObject*`, +// so we can't check it for NULL. +// Since `Py_buffer` is a Cython built-in we can't actually +// just re-define it in `extra_cpython.pxd`. +static int Py_buffer_obj_is_set(Py_buffer* buf) +{ + return buf->obj != NULL; +} diff --git a/src/questdb/ingress_helper.pxd b/src/questdb/ingress_helper.pxd new file mode 100644 index 00000000..9d3651d5 --- /dev/null +++ b/src/questdb/ingress_helper.pxd @@ -0,0 +1,2 @@ +cdef extern from "ingress_helper.inc": + bint Py_buffer_obj_is_set(Py_buffer* buf) \ No newline at end of file diff --git a/src/questdb/pystr_to_utf8.pxd b/src/questdb/pystr_to_utf8.pxd new file mode 100644 index 00000000..1822e26c --- /dev/null +++ b/src/questdb/pystr_to_utf8.pxd @@ -0,0 +1,60 @@ +from libc.stdint cimport int8_t, int16_t, int32_t, int64_t, intptr_t +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, uintptr_t + +cdef extern from "pystr_to_utf8.h": + + cdef struct qdb_pystr_buf: + pass + + cdef struct qdb_pystr_pos: + size_t chain + size_t string + + # Prepare a new buffer. The buffer must be freed with `qdb_pystr_free`. + # The `qdb_ucsX_to_utf8` functions will write to this buffer. + qdb_pystr_buf *qdb_pystr_buf_new() + + # Get current position. Use in conjunction with `truncate`. + qdb_pystr_pos qdb_pystr_buf_tell(const qdb_pystr_buf *b) + + # Trim the buffer to the given position. Use in conjunction with `tell`. + void qdb_pystr_buf_truncate(qdb_pystr_buf *b, qdb_pystr_pos pos) + + # Reset the converter's buffer to zero length. + void qdb_pystr_buf_clear(qdb_pystr_buf *b) + + # Free the buffer. Must be called after `qdb_pystr_buf_new`. + void qdb_pystr_buf_free(qdb_pystr_buf *b) + + # Convert a Py_UCS1 string to UTF-8. + # Returns a `buf_out` borrowed ptr of `size_out` len. + # The buffer is borrowed from `b`. + void qdb_ucs1_to_utf8(qdb_pystr_buf *b, + size_t count, + const uint8_t *input, + size_t *size_out, + const char **buf_out) + + # Convert a Py_UCS2 string to UTF-8. + # Returns a `buf_out` borrowed ptr of `size_out` len. + # The buffer is borrowed from `b`. + # In case of errors, returns `false` and bad_codepoint_out is set to the + # offending codepoint. + bint qdb_ucs2_to_utf8(qdb_pystr_buf *b, + size_t count, + const uint16_t *input, + size_t *size_out, + const char **buf_out, + uint32_t *bad_codepoint_out) + + # Convert a Py_UCS4 string to UTF-8. + # Returns a `buf_out` borrowed ptr of `size_out` len. + # The buffer is borrowed from `b`. + # In case of errors, returns `false` and bad_codepoint_out is set to the + # offending codepoint. 
+ bint qdb_ucs4_to_utf8(qdb_pystr_buf *b, + size_t count, + const uint32_t *input, + size_t *size_out, + const char **buf_out, + uint32_t *bad_codepoint_out) diff --git a/test/benchmark.py b/test/benchmark.py new file mode 100644 index 00000000..fa62b514 --- /dev/null +++ b/test/benchmark.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 + +import sys +import os +sys.dont_write_bytecode = True +import unittest +import time +import numpy as np +import pandas as pd +from concurrent.futures import ThreadPoolExecutor + +import patch_path +import questdb.ingress as qi + + +def _tp(buf, t0, t1): + tp = len(buf) / (t1 - t0) / 1024 / 1024 + return f'{tp:.2f} MiB/s' + + +class TestBenchmarkPandas(unittest.TestCase): + def test_pystr_i64_10m(self): + # This is a benchmark, not a test. + # It is useful to run it manually to check performance. + slist = [f's{i:09}' for i in range(10_000_000)] + df = pd.DataFrame({ + 'a': slist, + 'b': list(range(len(slist)))}) + + buf = qi.Buffer() + + # Warm up and pre-size buffer + buf.dataframe(df, table_name='tbl1', symbols=True) + buf.clear() + + # Run + t0 = time.monotonic() + buf.dataframe(df, table_name='tbl1', symbols=True) + t1 = time.monotonic() + print(f'Time: {t1 - t0}, size: {len(buf)}, tp: {_tp(buf, t0, t1)}') + + def test_mixed_10m(self): + # This is a benchmark, not a test. + # It is useful to run it manually to check performance. + count = 10_000_000 + slist = [f's{i:09}' for i in range(count)] + df = pd.DataFrame({ + 'col1': pd.Series(slist, dtype='string[pyarrow]'), + 'col2': list(range(len(slist))), + 'col3': [float(i / 2) for i in range(len(slist))], + 'col4': [float(i / 2) + 1.0 for i in range(len(slist))], + 'col5': pd.Categorical( + ['a', 'b', 'c', 'a', None, 'c', 'a', float('nan')] * + (count // 8))}) + + buf = qi.Buffer() + + # Warm up and pre-size buffer + buf.dataframe(df, table_name='tbl1', symbols=True) + buf.clear() + + # Run + t0 = time.monotonic() + buf.dataframe(df, table_name='tbl1', symbols=True) + t1 = time.monotonic() + print(f'Time: {t1 - t0}, size: {len(buf)}, tp: {_tp(buf, t0, t1)}') + + def test_string_escaping_10m(self): + count = 10_000_000 + slist = [f's={i:09}==abc \\' for i in range(count)] + series = pd.Series(slist, dtype='string[pyarrow]') + df = pd.DataFrame({ + 'col1': series, + 'col2': series, + 'col3': series, + 'col4': series, + 'col5': series, + 'col6': series}) + + buf = qi.Buffer() + + # Warm up and pre-size buffer + buf.dataframe(df, table_name='tbl1', symbols=True) + buf.clear() + + # Run + t0 = time.monotonic() + buf.dataframe(df, table_name='tbl1', symbols=True) + t1 = time.monotonic() + print(f'Time: {t1 - t0}, size: {len(buf)}, tp: {_tp(buf, t0, t1)}') + + def test_string_encoding_10m(self): + count = 10_000_000 + strs = ['a', # ASCII + 'q❤️p', # Mixed ASCII and UCS-2 + '❤️' * 12 , # UCS-2 + 'Questo è un qualcosa', # Non-ASCII UCS-1 + 'щось', # UCS-2, 2 bytes for UTF-8. + '', # Empty string + '嚜꓂', # UCS-2, 3 bytes for UTF-8. + '𐀀a𐀀b𐀀💩🦞c𐀀d𐀀ef'] # UCS-4, 4 bytes for UTF-8. 
+ slist = strs * (count // len(strs)) + self.assertEqual(len(slist), count) + + df = pd.DataFrame({ + 'col1': slist, + 'col2': slist, + 'col3': slist, + 'col4': slist, + 'col5': slist}) + + buf = qi.Buffer() + + # Warm up and pre-size buffer + buf.dataframe(df, table_name='tbl1', symbols=False) + buf.clear() + + # Run + t0 = time.monotonic() + buf.dataframe(df, table_name='tbl1', symbols=False) + t1 = time.monotonic() + print(f'Time: {t1 - t0}, size: {len(buf)}, tp: {_tp(buf, t0, t1)}') + + def _test_gil_release_10m(self, threads): + count = 10_000_000 + series = pd.Series(np.arange(count), dtype='int64') + df = pd.DataFrame({ + 'col1': series, + 'col2': series, + 'col3': series, + 'col4': series, + 'col5': series, + 'col6': series}) + + tpe = ThreadPoolExecutor(max_workers=threads) + bufs = [qi.Buffer() for _ in range(threads)] + + def benchmark_run(buf): + t0 = time.monotonic() + buf.dataframe(df, table_name='tbl1', symbols=True) + t1 = time.monotonic() + return buf, (t0, t1) + + # Warm up and pre-size buffer + futs = [ + tpe.submit(benchmark_run, buf) + for buf in bufs] + for fut in futs: + fut.result() # Wait for completion + for buf in bufs: + buf.clear() + + # Run + futs = [ + tpe.submit(benchmark_run, buf) + for buf in bufs] + results = [ + fut.result() + for fut in futs] + print(f'\nSize: {len(bufs[0])}') + total_time = 0 + min_time = 2 ** 64 -1 # Bigger than any `time.monotonic()` value + max_time = 0 + print('Per-thread times:') + for index, (_, (t0, t1)) in enumerate(results): + if t0 < min_time: + min_time = t0 + if t1 > max_time: + max_time = t1 + elapsed = t1 - t0 + print(f' [{index:02}]: Time: {elapsed}') + total_time += elapsed + avg_time = total_time / len(results) + print(f'Avg time: {avg_time}') + tp = (len(bufs[0]) * len(bufs)) / (max_time - min_time) / 1024 / 1024 + print(f'Wall time: {max_time - min_time}, tp: {tp:.2f} MiB/s') + + def test_gil_release_10m_1t(self): + self._test_gil_release_10m(1) + + def test_gil_release_10m_10t(self): + self._test_gil_release_10m(10) + + def test_gil_release_10m_16t(self): + self._test_gil_release_10m(16) + + def test_gil_release_10m_32t(self): + self._test_gil_release_10m(32) + + +if __name__ == '__main__': + if os.environ.get('TEST_QUESTDB_PROFILE') == '1': + import cProfile + cProfile.run('unittest.main()', sort='cumtime') + else: + unittest.main() diff --git a/test/patch_path.py b/test/patch_path.py index 8137bcd1..64868881 100644 --- a/test/patch_path.py +++ b/test/patch_path.py @@ -9,5 +9,8 @@ import pathlib PROJ_ROOT = pathlib.Path(__file__).parent.parent -if os.environ.get('TEST_QUESTDB_PATCH_PATH') == '1': +def patch(): sys.path.append(str(PROJ_ROOT / 'src')) + +if os.environ.get('TEST_QUESTDB_PATCH_PATH') == '1': + patch() \ No newline at end of file diff --git a/test/system_test.py b/test/system_test.py index 900fff03..e0eb7d5a 100755 --- a/test/system_test.py +++ b/test/system_test.py @@ -12,6 +12,14 @@ from fixture import QuestDbFixture, install_questdb, CA_PATH, AUTH +try: + import pandas as pd + import numpy + import pyarrow +except ImportError: + pd = None + + import questdb.ingress as qi @@ -123,6 +131,44 @@ def test_auth_tls_ca(self): def test_auth_tls_ca_str(self): self._test_scenario(self.qdb_auth, AUTH, str(CA_PATH)) + @unittest.skipIf(not pd, 'pandas not installed') + def test_basic_dataframe(self): + port = self.qdb_plain.line_tcp_port + pending = None + table_name = uuid.uuid4().hex + df = pd.DataFrame({ + 'col_a': [1, 2, 3], + 'col_b': ['a', 'b', 'c'], + 'col_c': [True, False, True], + 'col_d': [1.5, 2.5, 
3.5], + 'col_e': pd.Categorical(['A', 'B', 'C']), + 'col_f': [ + numpy.datetime64('2021-01-01'), + numpy.datetime64('2021-01-02'), + numpy.datetime64('2021-01-03')]}) + df.index.name = table_name + with qi.Sender('localhost', port) as sender: + sender.dataframe(df) + pending = str(sender) + + resp = self.qdb_plain.retry_check_table( + table_name, min_rows=3, log_ctx=pending) + exp_columns = [ + {'name': 'col_e', 'type': 'SYMBOL'}, + {'name': 'col_a', 'type': 'LONG'}, + {'name': 'col_b', 'type': 'STRING'}, + {'name': 'col_c', 'type': 'BOOLEAN'}, + {'name': 'col_d', 'type': 'DOUBLE'}, + {'name': 'col_f', 'type': 'TIMESTAMP'}, + {'name': 'timestamp', 'type': 'TIMESTAMP'}] + self.assertEqual(resp['columns'], exp_columns) + + exp_dataset = [ # Comparison excludes timestamp column. + ['A', 1, 'a', True, 1.5, '2021-01-01T00:00:00.000000Z'], + ['B', 2, 'b', False, 2.5, '2021-01-02T00:00:00.000000Z'], + ['C', 3, 'c', True, 3.5, '2021-01-03T00:00:00.000000Z']] + scrubbed_dataset = [row[:-1] for row in resp['dataset']] + self.assertEqual(scrubbed_dataset, exp_dataset) if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/test/test.py b/test/test.py index 392839f0..136f91f1 100755 --- a/test/test.py +++ b/test/test.py @@ -10,11 +10,30 @@ import patch_path from mock_server import Server + import questdb.ingress as qi if os.environ.get('TEST_QUESTDB_INTEGRATION') == '1': from system_test import TestWithDatabase +try: + import pandas as pd + import numpy + import pyarrow +except ImportError: + pd = None + + +if pd is not None: + from test_dataframe import TestPandas +else: + class TestNoPandas(unittest.TestCase): + def test_no_pandas(self): + buf = qi.Buffer() + exp = 'Missing.*`pandas.*pyarrow`.*readthedocs.*installation.html.' + with self.assertRaisesRegex(ImportError, exp): + buf.dataframe(None) + class TestBuffer(unittest.TestCase): def test_new(self): @@ -106,8 +125,43 @@ def test_no_symbol_or_col_args(self): def test_unicode(self): buf = qi.Buffer() - buf.row('tbl1', symbols={'questdb1': '❤️'}, columns={'questdb2': '❤️'}) - self.assertEqual(str(buf), 'tbl1,questdb1=❤️ questdb2="❤️"\n') + buf.row( + 'tbl1', # ASCII + symbols={'questdb1': 'q❤️p'}, # Mixed ASCII and UCS-2 + columns={'questdb2': '❤️' * 1200}) # Over the 1024 buffer prealloc. + buf.row( + 'tbl1', + symbols={ + 'Questo è il nome di una colonna': # Non-ASCII UCS-1 + 'Це символьне значення'}, # UCS-2, 2 bytes for UTF-8. + columns={ + 'questdb1': '', # Empty string + 'questdb2': '嚜꓂', # UCS-2, 3 bytes for UTF-8. + 'questdb3': '💩🦞'}) # UCS-4, 4 bytes for UTF-8. + self.assertEqual(str(buf), + f'tbl1,questdb1=q❤️p questdb2="{"❤️" * 1200}"\n' + + 'tbl1,Questo\\ è\\ il\\ nome\\ di\\ una\\ colonna=' + + 'Це\\ символьне\\ значення ' + + 'questdb1="",questdb2="嚜꓂",questdb3="💩🦞"\n') + + buf.clear() + buf.row('tbl1', symbols={'questdb1': 'q❤️p'}) + self.assertEqual(str(buf), 'tbl1,questdb1=q❤️p\n') + + # A bad char in Python. + with self.assertRaisesRegex( + qi.IngressError, + '.*codepoint 0xd800 in string .*'): + buf.row('tbl1', symbols={'questdb1': 'a\ud800'}) + + # Strong exception safety: no partial writes. + # Ensure we can continue using the buffer after an error. + buf.row('tbl1', symbols={'questdb1': 'another line of input'}) + self.assertEqual( + str(buf), + 'tbl1,questdb1=q❤️p\n' + + # Note: No partially written failed line here. 
+ 'tbl1,questdb1=another\\ line\\ of\\ input\n') def test_float(self): buf = qi.Buffer() @@ -139,7 +193,6 @@ def test_int_range(self): buf.row('tbl1', columns={'num': -2**63-1}) - class TestSender(unittest.TestCase): def test_basic(self): with Server() as server, qi.Sender('localhost', server.port) as sender: @@ -362,6 +415,55 @@ def test_dont_flush_on_exception(self): msgs = server.recv() self.assertEqual(msgs, []) + @unittest.skipIf(not pd, 'pandas not installed') + def test_dataframe(self): + with Server() as server: + with qi.Sender('localhost', server.port) as sender: + server.accept() + df = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]}) + sender.dataframe(df, table_name='tbl1') + msgs = server.recv() + self.assertEqual( + msgs, + [b'tbl1 a=1i,b=3.0', + b'tbl1 a=2i,b=4.0']) + + @unittest.skipIf(not pd, 'pandas not installed') + def test_dataframe_auto_flush(self): + with Server() as server: + # An auto-flush size of 20 bytes is enough to auto-flush the first + # row, but not the second. + with qi.Sender('localhost', server.port, auto_flush=20) as sender: + server.accept() + df = pd.DataFrame({'a': [100000, 2], 'b': [3.0, 4.0]}) + sender.dataframe(df, table_name='tbl1') + msgs = server.recv() + self.assertEqual( + msgs, + [b'tbl1 a=100000i,b=3.0']) + + # The second row is still pending send. + self.assertEqual(len(sender), 16) + + # So we give it some more data and we should see it flush. + sender.row('tbl1', columns={'a': 3, 'b': 5.0}) + msgs = server.recv() + self.assertEqual( + msgs, + [b'tbl1 a=2i,b=4.0', + b'tbl1 a=3i,b=5.0']) + + self.assertEqual(len(sender), 0) + + # We can now disconnect the server and see auto flush failing. + server.close() + + exp_err = 'Could not flush buffer.* - See https' + with self.assertRaisesRegex(qi.IngressError, exp_err): + for _ in range(1000): + time.sleep(0.01) + sender.dataframe(df.head(1), table_name='tbl1') + def test_new_buffer(self): sender = qi.Sender( host='localhost', @@ -393,5 +495,64 @@ def test_bad_init_args(self): qi.Sender(host='localhost', port=9009, max_name_len=-1) +class TestBases: + class Timestamp(unittest.TestCase): + def test_from_int(self): + ns = 1670857929778202000 + num = ns // self.ns_scale + ts = self.timestamp_cls(num) + self.assertEqual(ts.value, num) + + ts0 = self.timestamp_cls(0) + self.assertEqual(ts0.value, 0) + + with self.assertRaisesRegex(ValueError, 'value must be a positive'): + self.timestamp_cls(-1) + + def test_from_datetime(self): + utc = datetime.timezone.utc + + dt1 = datetime.datetime(2022, 1, 1, 12, 0, 0, 0, tzinfo=utc) + ts1 = self.timestamp_cls.from_datetime(dt1) + self.assertEqual(ts1.value, 1641038400000000000 // self.ns_scale) + self.assertEqual( + ts1.value, + int(dt1.timestamp() * 1000000000 // self.ns_scale)) + + dt2 = datetime.datetime(1970, 1, 1, tzinfo=utc) + ts2 = self.timestamp_cls.from_datetime(dt2) + self.assertEqual(ts2.value, 0) + + with self.assertRaisesRegex(ValueError, 'value must be a positive'): + self.timestamp_cls.from_datetime( + datetime.datetime(1969, 12, 31, tzinfo=utc)) + + dt_naive = datetime.datetime(2022, 1, 1, 12, 0, 0, 0, + tzinfo=utc).astimezone(None).replace(tzinfo=None) + ts3 = self.timestamp_cls.from_datetime(dt_naive) + self.assertEqual(ts3.value, 1641038400000000000 // self.ns_scale) + + def test_now(self): + expected = time.time_ns() // self.ns_scale + actual = self.timestamp_cls.now().value + delta = abs(expected - actual) + one_sec = 1000000000 // self.ns_scale + self.assertLess(delta, one_sec) + + +class TestTimestampMicros(TestBases.Timestamp): + 
timestamp_cls = qi.TimestampMicros + ns_scale = 1000 + + +class TestTimestampNanos(TestBases.Timestamp): + timestamp_cls = qi.TimestampNanos + ns_scale = 1 + + if __name__ == '__main__': - unittest.main() + if os.environ.get('TEST_QUESTDB_PROFILE') == '1': + import cProfile + cProfile.run('unittest.main()', sort='cumtime') + else: + unittest.main() diff --git a/test/test_dataframe.py b/test/test_dataframe.py new file mode 100644 index 00000000..42e310c7 --- /dev/null +++ b/test/test_dataframe.py @@ -0,0 +1,1575 @@ +#!/usr/bin/env python3 + +import sys +import os +sys.dont_write_bytecode = True +import unittest +import datetime as dt +import functools +import tempfile +import pathlib + +BROKEN_TIMEZONES = True + +try: + import zoneinfo + _TZ = zoneinfo.ZoneInfo('America/New_York') + BROKEN_TIMEZONES = os.name == 'nt' +except ImportError: + import pytz + _TZ = pytz.timezone('America/New_York') + +import patch_path + +import questdb.ingress as qi +import pandas as pd +import numpy as np +import pyarrow as pa + +try: + import fastparquet +except ImportError: + fastparquet = None + + +def _dataframe(*args, **kwargs): + buf = qi.Buffer() + buf.dataframe(*args, **kwargs) + return str(buf) + + +DF1 = pd.DataFrame({ + 'A': [1.0, 2.0, 3.0], + 'B': [1, 2, 3], + 'C': [ + pd.Timestamp('20180310'), + pd.Timestamp('20180311'), + pd.Timestamp('20180312')], + 'D': [True, 'foo', 'bar']}) + + +DF2 = pd.DataFrame({ + 'T': ['t1', 't2', 't1'], + 'A': ['a1', 'a2', 'a3'], + 'B': ['b1', None, 'b3'], + 'C': pd.Series(['b1', None, 'b3'], dtype='string'), + 'D': pd.Series(['a1', 'a2', 'a3'], dtype='string'), + 'E': [1.0, 2.0, 3.0], + 'F': [1, 2, 3], + 'G': [ + pd.Timestamp('20180310'), + pd.Timestamp('20180311'), + pd.Timestamp('20180312')]}) + + +def with_tmp_dir(func): + @functools.wraps(func) + def wrapper(self, *args, **kwargs): + with tempfile.TemporaryDirectory(prefix='py-questdb-client_') as tmpdir: + return func(self, *args, pathlib.Path(tmpdir), **kwargs) + return wrapper + + +class TestPandas(unittest.TestCase): + def test_bad_dataframe(self): + with self.assertRaisesRegex(qi.IngressError, + 'Expected pandas'): + _dataframe([]) + + def test_no_table_name(self): + with self.assertRaisesRegex(qi.IngressError, + 'Must specify at least one of'): + _dataframe(DF1) + + def test_bad_table_name_type(self): + with self.assertRaisesRegex(qi.IngressError, 'Must be str'): + _dataframe(DF1, table_name=1.5) + + def test_invalid_table_name(self): + with self.assertRaisesRegex(qi.IngressError, + '`table_name`: Bad string "."'): + _dataframe(DF1, table_name='.') + + def test_invalid_column_dtype(self): + with self.assertRaisesRegex(qi.IngressError, + '`table_name_col`: Bad dtype'): + _dataframe(DF1, table_name_col='B') + with self.assertRaisesRegex(qi.IngressError, + '`table_name_col`: Bad dtype'): + _dataframe(DF1, table_name_col=1) + with self.assertRaisesRegex(qi.IngressError, + '`table_name_col`: Bad dtype'): + _dataframe(DF1, table_name_col=-3) + with self.assertRaisesRegex(qi.IngressError, + '`table_name_col`: -5 index'): + _dataframe(DF1, table_name_col=-5) + + def test_bad_str_obj_col(self): + with self.assertRaisesRegex(qi.IngressError, + "`table_name_col`: Bad.*`object`.*bool.*'D'.*Must.*strings"): + _dataframe(DF1, table_name_col='D') + with self.assertRaisesRegex(qi.IngressError, + "`table_name_col`: Bad.*`object`.*bool.*'D'.*Must.*strings"): + _dataframe(DF1, table_name_col=3) + with self.assertRaisesRegex(qi.IngressError, + "`table_name_col`: Bad.*`object`.*bool.*'D'.*Must.*strings"): + _dataframe(DF1, 
table_name_col=-1) + + def test_bad_symbol(self): + with self.assertRaisesRegex(qi.IngressError, + '`symbols`.*bool.*tuple.*list'): + _dataframe(DF1, table_name='tbl1', symbols=0) + with self.assertRaisesRegex(qi.IngressError, + '`symbols`.*bool.*tuple.*list'): + _dataframe(DF1, table_name='tbl1', symbols={}) + with self.assertRaisesRegex(qi.IngressError, + '`symbols`.*bool.*tuple.*list'): + _dataframe(DF1, table_name='tbl1', symbols=None) + with self.assertRaisesRegex(qi.IngressError, + "`symbols`: Bad dtype `float64`.*'A'.*Must.*strings col"): + _dataframe(DF1, table_name='tbl1', symbols=(0,)) + with self.assertRaisesRegex(qi.IngressError, + "`symbols`: Bad dtype `int64`.*'B'.*Must be a strings column."): + _dataframe(DF1, table_name='tbl1', symbols=[1]) + + def test_bad_at(self): + with self.assertRaisesRegex(qi.IngressError, + '`at`.*2018.*not found in the'): + _dataframe(DF1, table_name='tbl1', at='2018-03-10T00:00:00Z') + with self.assertRaisesRegex(qi.IngressError, + '`at`.*float64.*be a datetime'): + _dataframe(DF1, table_name='tbl1', at='A') + with self.assertRaisesRegex(qi.IngressError, + '`at`.*int64.*be a datetime'): + _dataframe(DF1, table_name='tbl1', at=1) + with self.assertRaisesRegex(qi.IngressError, + '`at`.*object.*be a datetime'): + _dataframe(DF1, table_name='tbl1', at=-1) + + def test_empty_dataframe(self): + buf = _dataframe(pd.DataFrame(), table_name='tbl1') + self.assertEqual(buf, '') + + def test_zero_row_dataframe(self): + buf = _dataframe(pd.DataFrame(columns=['A', 'B']), table_name='tbl1') + self.assertEqual(buf, '') + + def test_zero_column_dataframe(self): + df = pd.DataFrame(index=[0, 1, 2]) + self.assertEqual(len(df), 3) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual(buf, '') + + def test_basic(self): + buf = _dataframe( + DF2, + table_name_col='T', + symbols=['A', 'B', 'C', 'D'], + at=-1) + self.assertEqual( + buf, + 't1,A=a1,B=b1,C=b1,D=a1 E=1.0,F=1i 1520640000000000000\n' + + 't2,A=a2,D=a2 E=2.0,F=2i 1520726400000000000\n' + + 't1,A=a3,B=b3,C=b3,D=a3 E=3.0,F=3i 1520812800000000000\n') + + def test_named_dataframe(self): + df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': ['a', 'b', 'c']}) + df.index.name = 'table_name' + buf = _dataframe(df) + self.assertEqual( + buf, + 'table_name a=1i,b="a"\n' + + 'table_name a=2i,b="b"\n' + + 'table_name a=3i,b="c"\n') + + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i,b="a"\n' + + 'tbl1 a=2i,b="b"\n' + + 'tbl1 a=3i,b="c"\n') + + buf = _dataframe(df, table_name_col='b') + self.assertEqual( + buf, + 'a a=1i\n' + + 'b a=2i\n' + + 'c a=3i\n') + + df.index.name = 42 # bad type, not str + with self.assertRaisesRegex(qi.IngressError, + 'Bad dataframe index name as table.*: Expected str, not.*int.'): + _dataframe(df) + + @unittest.skipIf(BROKEN_TIMEZONES, 'requires accurate timezones') + def test_at_good(self): + df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': ['a', 'b', 'c']}) + df.index.name = 'test_at_good' + with self.assertRaisesRegex(qi.IngressError, + 'Bad argument `at`: Column .2018-03.* not found .* dataframe.'): + _dataframe(df, at='2018-03-10T00:00:00Z') + + # Same timestamp, specified in various ways. 
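+        # (t1..t7 below all resolve to 2018-03-10 00:00:00 UTC, i.e.
+        # 1520640000000000000 ns since the Unix epoch, as asserted further
+        # down.)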
+ t1_setup = dt.datetime(2018, 3, 10, 0, 0, 0, tzinfo=dt.timezone.utc) + t1 = t1_setup.astimezone(tz=None).replace(tzinfo=None) # naive, local + t2 = dt.datetime(2018, 3, 10, 0, 0, 0, tzinfo=dt.timezone.utc) + t3 = dt.datetime(2018, 3, 9, 19, 0, 0, tzinfo=_TZ) + t4 = qi.TimestampNanos(1520640000000000000) + t5 = qi.TimestampNanos.from_datetime(t1) + t6 = qi.TimestampNanos.from_datetime(t2) + t7 = qi.TimestampNanos.from_datetime(t3) + timestamps = [t1, t2, t3, t4, t5, t6, t7] + for ts in timestamps: + buf = _dataframe(df, table_name='tbl1', at=ts) + self.assertEqual( + buf, + 'tbl1 a=1i,b="a" 1520640000000000000\n' + + 'tbl1 a=2i,b="b" 1520640000000000000\n' + + 'tbl1 a=3i,b="c" 1520640000000000000\n') + + @unittest.skipIf(BROKEN_TIMEZONES, 'requires accurate timezones') + def test_at_neg(self): + n1 = dt.datetime(1965, 1, 1, 0, 0, 0, tzinfo=dt.timezone.utc) + n2 = dt.datetime(1965, 1, 1, 0, 0, 0, tzinfo=_TZ) + n3 = dt.datetime(1965, 1, 1, 0, 0, 0) + neg_timestamps = [n1, n2, n3] + for ts in neg_timestamps: + with self.assertRaisesRegex(qi.IngressError, + 'Bad.*`at`: Cannot .* before the Unix epoch .1970-01-01.*'): + _dataframe(DF2, at=ts, table_name='test_at_neg') + + @unittest.skipIf(BROKEN_TIMEZONES, 'requires accurate timezones') + def test_at_ts_0(self): + df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': ['a', 'b', 'c']}) + df.index.name = 'test_at_ts_0' + + # Epoch 0, specified in various ways. + e1_setup = dt.datetime(1970, 1, 1, 0, 0, 0, tzinfo=dt.timezone.utc) + e1 = e1_setup.astimezone(tz=None).replace(tzinfo=None) # naive, local + e2 = dt.datetime(1970, 1, 1, 0, 0, 0, tzinfo=dt.timezone.utc) + e3 = dt.datetime(1969, 12, 31, 19, 0, 0, tzinfo=_TZ) + e4 = qi.TimestampNanos(0) + e5 = qi.TimestampNanos.from_datetime(e1) + e6 = qi.TimestampNanos.from_datetime(e2) + e7 = qi.TimestampNanos.from_datetime(e3) + edge_timestamps = [e1, e2, e3, e4, e5, e6, e7] + + for ts in edge_timestamps: + buf = _dataframe(df, table_name='tbl1', at=ts) + self.assertEqual( + buf, + 'tbl1 a=1i,b="a" 0\n' + + 'tbl1 a=2i,b="b" 0\n' + + 'tbl1 a=3i,b="c" 0\n') + + def test_single_at_col(self): + df = pd.DataFrame({'timestamp': pd.to_datetime(['2023-01-01'])}) + with self.assertRaisesRegex(qi.IngressError, + 'Bad dataframe row at index 0: All values are nulls.'): + _dataframe(df, table_name='tbl1', at='timestamp') + + def test_row_of_nulls(self): + df = pd.DataFrame({'a': ['a1', None, 'a3']}) + with self.assertRaisesRegex( + qi.IngressError, 'Bad dataframe row.*1: All values are nulls.'): + _dataframe(df, table_name='tbl1', symbols=['a']) + + def test_u8_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ + 1, 2, 3, + 0, + 255], # u8 max + dtype='uint8')}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i\n' + + 'tbl1 a=2i\n' + + 'tbl1 a=3i\n' + + 'tbl1 a=0i\n' + + 'tbl1 a=255i\n') + + def test_i8_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ + 1, 2, 3, + -128, # i8 min + 127, # i8 max + 0], dtype='int8')}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i\n' + + 'tbl1 a=2i\n' + + 'tbl1 a=3i\n' + + 'tbl1 a=-128i\n' + + 'tbl1 a=127i\n' + + 'tbl1 a=0i\n') + + def test_u16_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ + 1, 2, 3, + 0, + 65535], # u16 max + dtype='uint16')}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i\n' + + 'tbl1 a=2i\n' + + 'tbl1 a=3i\n' + + 'tbl1 a=0i\n' + + 'tbl1 a=65535i\n') + + def test_i16_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ + 1, 2, 3, + -32768, # i16 min + 
32767, # i16 max + 0], dtype='int16')}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i\n' + + 'tbl1 a=2i\n' + + 'tbl1 a=3i\n' + + 'tbl1 a=-32768i\n' + + 'tbl1 a=32767i\n' + + 'tbl1 a=0i\n') + + def test_u32_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ + 1, 2, 3, + 0, + 4294967295], # u32 max + dtype='uint32')}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i\n' + + 'tbl1 a=2i\n' + + 'tbl1 a=3i\n' + + 'tbl1 a=0i\n' + + 'tbl1 a=4294967295i\n') + + def test_i32_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ + 1, 2, 3, + -2147483648, # i32 min + 0, + 2147483647], # i32 max + dtype='int32')}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i\n' + + 'tbl1 a=2i\n' + + 'tbl1 a=3i\n' + + 'tbl1 a=-2147483648i\n' + + 'tbl1 a=0i\n' + + 'tbl1 a=2147483647i\n') + + def test_u64_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ + 1, 2, 3, + 0, + 9223372036854775807], # i64 max + dtype='uint64')}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i\n' + + 'tbl1 a=2i\n' + + 'tbl1 a=3i\n' + + 'tbl1 a=0i\n' + + 'tbl1 a=9223372036854775807i\n') + + buf = qi.Buffer() + buf.dataframe(pd.DataFrame({'b': [.5, 1.0, 1.5]}), table_name='tbl2') + exp1 = ( + 'tbl2 b=0.5\n' + + 'tbl2 b=1.0\n' + + 'tbl2 b=1.5\n') + self.assertEqual( + str(buf), + exp1) + df2 = pd.DataFrame({'a': pd.Series([ + 1, 2, 3, + 0, + 9223372036854775808], # i64 max + 1 + dtype='uint64')}) + with self.assertRaisesRegex( + qi.IngressError, + 'serialize .* column .a. .* 4 .9223372036854775808.*int64'): + buf.dataframe(df2, table_name='tbl1') + + self.assertEqual( + str(buf), + exp1) # No partial write of `df2`. + + def test_i64_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ + 1, 2, 3, + -9223372036854775808, # i64 min + 0, + 9223372036854775807], # i64 max + dtype='int64')}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i\n' + + 'tbl1 a=2i\n' + + 'tbl1 a=3i\n' + + 'tbl1 a=-9223372036854775808i\n' + + 'tbl1 a=0i\n' + + 'tbl1 a=9223372036854775807i\n') + + def test_f32_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ + 1.0, 2.0, 3.0, + 0.0, + float('inf'), + float('-inf'), + float('nan'), + 3.4028234663852886e38], # f32 max + dtype='float32')}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1.0\n' + + 'tbl1 a=2.0\n' + + 'tbl1 a=3.0\n' + + 'tbl1 a=0.0\n' + + 'tbl1 a=Infinity\n' + + 'tbl1 a=-Infinity\n' + + 'tbl1 a=NaN\n' + + 'tbl1 a=3.4028234663852886e38\n') + + def test_f64_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ + 1.0, 2.0, 3.0, + 0.0, + float('inf'), + float('-inf'), + float('nan'), + 1.7976931348623157e308], # f64 max + dtype='float64')}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1.0\n' + + 'tbl1 a=2.0\n' + + 'tbl1 a=3.0\n' + + 'tbl1 a=0.0\n' + + 'tbl1 a=Infinity\n' + + 'tbl1 a=-Infinity\n' + + 'tbl1 a=NaN\n' + + 'tbl1 a=1.7976931348623157e308\n') + + def test_u8_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + 0, + None, + 255], # u8 max + dtype=pd.UInt8Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f']}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i,b="a"\n' + + 'tbl1 a=2i,b="b"\n' + + 'tbl1 a=3i,b="c"\n' + + 'tbl1 a=0i,b="d"\n' + + 'tbl1 b="e"\n' + + 'tbl1 a=255i,b="f"\n') + + def test_i8_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + -128, # i8 min + 0, + None, + 127], # i8 max + 
dtype=pd.Int8Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i,b="a"\n' + + 'tbl1 a=2i,b="b"\n' + + 'tbl1 a=3i,b="c"\n' + + 'tbl1 a=-128i,b="d"\n' + + 'tbl1 a=0i,b="e"\n' + + 'tbl1 b="f"\n' + + 'tbl1 a=127i,b="g"\n') + + def test_u16_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + 0, + None, + 65535], # u16 max + dtype=pd.UInt16Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f']}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i,b="a"\n' + + 'tbl1 a=2i,b="b"\n' + + 'tbl1 a=3i,b="c"\n' + + 'tbl1 a=0i,b="d"\n' + + 'tbl1 b="e"\n' + + 'tbl1 a=65535i,b="f"\n') + + def test_i16_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + -32768, # i16 min + 0, + None, + 32767], # i16 max + dtype=pd.Int16Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i,b="a"\n' + + 'tbl1 a=2i,b="b"\n' + + 'tbl1 a=3i,b="c"\n' + + 'tbl1 a=-32768i,b="d"\n' + + 'tbl1 a=0i,b="e"\n' + + 'tbl1 b="f"\n' + + 'tbl1 a=32767i,b="g"\n') + + def test_u32_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + 0, + None, + 4294967295], # u32 max + dtype=pd.UInt32Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f']}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i,b="a"\n' + + 'tbl1 a=2i,b="b"\n' + + 'tbl1 a=3i,b="c"\n' + + 'tbl1 a=0i,b="d"\n' + + 'tbl1 b="e"\n' + + 'tbl1 a=4294967295i,b="f"\n') + + def test_i32_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + -2147483648, # i32 min + 0, + None, + 2147483647], # i32 max + dtype=pd.Int32Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i,b="a"\n' + + 'tbl1 a=2i,b="b"\n' + + 'tbl1 a=3i,b="c"\n' + + 'tbl1 a=-2147483648i,b="d"\n' + + 'tbl1 a=0i,b="e"\n' + + 'tbl1 b="f"\n' + + 'tbl1 a=2147483647i,b="g"\n') + + def test_u64_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + 0, + None, + 9223372036854775807], # i64 max + dtype=pd.UInt64Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f']}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i,b="a"\n' + + 'tbl1 a=2i,b="b"\n' + + 'tbl1 a=3i,b="c"\n' + + 'tbl1 a=0i,b="d"\n' + + 'tbl1 b="e"\n' + + 'tbl1 a=9223372036854775807i,b="f"\n') + + df2 = pd.DataFrame({'a': pd.Series([ + 1, 2, 3, + 0, + 9223372036854775808], # i64 max + 1 + dtype=pd.UInt64Dtype())}) + with self.assertRaisesRegex( + qi.IngressError, + 'serialize .* column .a. 
.* 4 .9223372036854775808.*int64'): + _dataframe(df2, table_name='tbl1') + + def test_i64_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + -9223372036854775808, # i64 min + 0, + None, + 9223372036854775807], # i64 max + dtype=pd.Int64Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1i,b="a"\n' + + 'tbl1 a=2i,b="b"\n' + + 'tbl1 a=3i,b="c"\n' + + 'tbl1 a=-9223372036854775808i,b="d"\n' + + 'tbl1 a=0i,b="e"\n' + + 'tbl1 b="f"\n' + + 'tbl1 a=9223372036854775807i,b="g"\n') + + def test_f32_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1.0, 2.0, 3.0, + 0.0, + float('inf'), + float('-inf'), + float('nan'), + 3.4028234663852886e38, # f32 max + None], + dtype=pd.Float32Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1.0,b="a"\n' + + 'tbl1 a=2.0,b="b"\n' + + 'tbl1 a=3.0,b="c"\n' + + 'tbl1 a=0.0,b="d"\n' + + 'tbl1 a=Infinity,b="e"\n' + + 'tbl1 a=-Infinity,b="f"\n' + + 'tbl1 b="g"\n' + # This one is wierd: `nan` gets 0 in the bitmask. + 'tbl1 a=3.4028234663852886e38,b="h"\n' + + 'tbl1 b="i"\n') + + def test_f64_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1.0, 2.0, 3.0, + 0.0, + float('inf'), + float('-inf'), + float('nan'), + 1.7976931348623157e308, # f64 max + None], + dtype=pd.Float64Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1.0,b="a"\n' + + 'tbl1 a=2.0,b="b"\n' + + 'tbl1 a=3.0,b="c"\n' + + 'tbl1 a=0.0,b="d"\n' + + 'tbl1 a=Infinity,b="e"\n' + + 'tbl1 a=-Infinity,b="f"\n' + + 'tbl1 b="g"\n' + # This one is wierd: `nan` gets 0 in the bitmask. + 'tbl1 a=1.7976931348623157e308,b="h"\n' + + 'tbl1 b="i"\n') + + def test_bool_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ + True, False, False, + False, True, False], + dtype='bool')}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=t\n' + + 'tbl1 a=f\n' + + 'tbl1 a=f\n' + + 'tbl1 a=f\n' + + 'tbl1 a=t\n' + + 'tbl1 a=f\n') + + def test_bool_arrow_col(self): + df = pd.DataFrame({'a': pd.Series([ + True, False, False, + False, True, False, + True, True, True, + False, False, False], + dtype='boolean')}) # Note `boolean` != `bool`. + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=t\n' + + 'tbl1 a=f\n' + + 'tbl1 a=f\n' + + 'tbl1 a=f\n' + + 'tbl1 a=t\n' + + 'tbl1 a=f\n' + + 'tbl1 a=t\n' + + 'tbl1 a=t\n' + + 'tbl1 a=t\n' + + 'tbl1 a=f\n' + + 'tbl1 a=f\n' + + 'tbl1 a=f\n') + + df2 = pd.DataFrame({'a': pd.Series([ + True, False, False, + None, True, False], + dtype='boolean')}) + with self.assertRaisesRegex( + qi.IngressError, + 'Failed.*at row index 3 .*.: .*insert null .*boolean col'): + _dataframe(df2, table_name='tbl1') + + def test_bool_obj_col(self): + df = pd.DataFrame({'a': pd.Series([ + True, False, False, + False, True, False], + dtype='object')}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=t\n' + + 'tbl1 a=f\n' + + 'tbl1 a=f\n' + + 'tbl1 a=f\n' + + 'tbl1 a=t\n' + + 'tbl1 a=f\n') + + df2 = pd.DataFrame({'a': pd.Series([ + True, False, 'false'], + dtype='object')}) + with self.assertRaisesRegex( + qi.IngressError, + 'serialize .* column .a. 
.* 2 .*false.*bool'): + _dataframe(df2, table_name='tbl1') + + df3 = pd.DataFrame({'a': pd.Series([ + None, True, False], + dtype='object')}) + with self.assertRaisesRegex( + qi.IngressError, + 'serialize.*\\(None\\): Cannot insert null.*boolean column'): + _dataframe(df3, table_name='tbl1') + + def test_datetime64_numpy_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + pd.Timestamp('2019-01-01 00:00:00'), + pd.Timestamp('2019-01-01 00:00:01'), + pd.Timestamp('2019-01-01 00:00:02'), + pd.Timestamp('2019-01-01 00:00:03'), + pd.Timestamp('2019-01-01 00:00:04'), + pd.Timestamp('2019-01-01 00:00:05'), + None, + float('nan'), + pd.NA], + dtype='datetime64[ns]'), + 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=1546300800000000t,b="a"\n' + + 'tbl1 a=1546300801000000t,b="b"\n' + + 'tbl1 a=1546300802000000t,b="c"\n' + + 'tbl1 a=1546300803000000t,b="d"\n' + + 'tbl1 a=1546300804000000t,b="e"\n' + + 'tbl1 a=1546300805000000t,b="f"\n' + + 'tbl1 b="g"\n' + + 'tbl1 b="h"\n' + + 'tbl1 b="i"\n') + + df = pd.DataFrame({'a': pd.Series([ + pd.Timestamp('1970-01-01 00:00:00'), + pd.Timestamp('1970-01-01 00:00:01'), + pd.Timestamp('1970-01-01 00:00:02')])}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a=0t\n' + + 'tbl1 a=1000000t\n' + + 'tbl1 a=2000000t\n') + + def test_datetime64_tz_arrow_col(self): + df = pd.DataFrame({ + 'a': [ + pd.Timestamp( + year=2019, month=1, day=1, + hour=0, minute=0, second=0, tz=_TZ), + pd.Timestamp( + year=2019, month=1, day=1, + hour=0, minute=0, second=1, tz=_TZ), + None, + pd.Timestamp( + year=2019, month=1, day=1, + hour=0, minute=0, second=3, tz=_TZ)], + 'b': ['sym1', 'sym2', 'sym3', 'sym4']}) + buf = _dataframe(df, table_name='tbl1', symbols=['b']) + self.assertEqual( + buf, + # Note how these are 5hr offset from `test_datetime64_numpy_col`. + 'tbl1,b=sym1 a=1546318800000000t\n' + + 'tbl1,b=sym2 a=1546318801000000t\n' + + 'tbl1,b=sym3\n' + + 'tbl1,b=sym4 a=1546318803000000t\n') + + # Not epoch 0. + df = pd.DataFrame({ + 'a': [ + pd.Timestamp( + year=1970, month=1, day=1, + hour=0, minute=0, second=0, tz=_TZ), + pd.Timestamp( + year=1970, month=1, day=1, + hour=0, minute=0, second=1, tz=_TZ), + pd.Timestamp( + year=1970, month=1, day=1, + hour=0, minute=0, second=2, tz=_TZ)], + 'b': ['sym1', 'sym2', 'sym3']}) + buf = _dataframe(df, table_name='tbl1', symbols=['b']) + self.assertEqual( + buf, + # Note how these are 5hr offset from `test_datetime64_numpy_col`. + 'tbl1,b=sym1 a=18000000000t\n' + + 'tbl1,b=sym2 a=18001000000t\n' + + 'tbl1,b=sym3 a=18002000000t\n') + + # Actual epoch 0. 
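+        # (19:00:00 on 1969-12-31 in America/New_York is UTC-5, i.e. exactly
+        # 1970-01-01 00:00:00 UTC, the Unix epoch.)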
+ df = pd.DataFrame({ + 'a': [ + pd.Timestamp( + year=1969, month=12, day=31, + hour=19, minute=0, second=0, tz=_TZ), + pd.Timestamp( + year=1969, month=12, day=31, + hour=19, minute=0, second=1, tz=_TZ), + pd.Timestamp( + year=1969, month=12, day=31, + hour=19, minute=0, second=2, tz=_TZ)], + 'b': ['sym1', 'sym2', 'sym3']}) + buf = _dataframe(df, table_name='tbl1', symbols=['b']) + self.assertEqual( + buf, + 'tbl1,b=sym1 a=0t\n' + + 'tbl1,b=sym2 a=1000000t\n' + + 'tbl1,b=sym3 a=2000000t\n') + + df2 = pd.DataFrame({ + 'a': [ + pd.Timestamp( + year=1900, month=1, day=1, + hour=0, minute=0, second=0, tz=_TZ)], + 'b': ['sym1']}) + with self.assertRaisesRegex( + qi.IngressError, "Failed.*'a'.*-220897.* is negative."): + _dataframe(df2, table_name='tbl1', symbols=['b']) + return ############################################################### + + def test_datetime64_numpy_at(self): + df = pd.DataFrame({ + 'a': pd.Series([ + pd.Timestamp('2019-01-01 00:00:00'), + pd.Timestamp('2019-01-01 00:00:01'), + pd.Timestamp('2019-01-01 00:00:02'), + pd.Timestamp('2019-01-01 00:00:03'), + pd.Timestamp('2019-01-01 00:00:04'), + pd.Timestamp('2019-01-01 00:00:05'), + float('nan'), + None, + pd.NaT], + dtype='datetime64[ns]'), + 'b': [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + buf = _dataframe(df, table_name='tbl1', at='a') + self.assertEqual( + buf, + 'tbl1 b=1i 1546300800000000000\n' + + 'tbl1 b=2i 1546300801000000000\n' + + 'tbl1 b=3i 1546300802000000000\n' + + 'tbl1 b=4i 1546300803000000000\n' + + 'tbl1 b=5i 1546300804000000000\n' + + 'tbl1 b=6i 1546300805000000000\n' + + 'tbl1 b=7i\n' + + 'tbl1 b=8i\n' + + 'tbl1 b=9i\n') + + df = pd.DataFrame({ + 'a': pd.Series([ + pd.Timestamp('1970-01-01 00:00:00'), + pd.Timestamp('1970-01-01 00:00:01'), + pd.Timestamp('1970-01-01 00:00:02')], + dtype='datetime64[ns]'), + 'b': [1, 2, 3]}) + buf = _dataframe(df, table_name='tbl1', at='a') + self.assertEqual( + buf, + 'tbl1 b=1i 0\n' + + 'tbl1 b=2i 1000000000\n' + + 'tbl1 b=3i 2000000000\n') + + def test_datetime64_tz_arrow_at(self): + df = pd.DataFrame({ + 'a': [ + pd.Timestamp( + year=2019, month=1, day=1, + hour=0, minute=0, second=0, tz=_TZ), + pd.Timestamp( + year=2019, month=1, day=1, + hour=0, minute=0, second=1, tz=_TZ), + None, + pd.Timestamp( + year=2019, month=1, day=1, + hour=0, minute=0, second=3, tz=_TZ)], + 'b': ['sym1', 'sym2', 'sym3', 'sym4']}) + buf = _dataframe(df, table_name='tbl1', symbols=['b'], at='a') + self.assertEqual( + buf, + # Note how these are 5hr offset from `test_datetime64_numpy_col`. + 'tbl1,b=sym1 1546318800000000000\n' + + 'tbl1,b=sym2 1546318801000000000\n' + + 'tbl1,b=sym3\n' + + 'tbl1,b=sym4 1546318803000000000\n') + + df2 = pd.DataFrame({ + 'a': [ + pd.Timestamp( + year=1900, month=1, day=1, + hour=0, minute=0, second=0, tz=_TZ)], + 'b': ['sym1']}) + with self.assertRaisesRegex( + qi.IngressError, "Failed.*'a'.*-220897.* is neg"): + _dataframe(df2, table_name='tbl1', symbols=['b'], at='a') + + def _test_pyobjstr_table(self, dtype): + df = pd.DataFrame({ + '../bad col name/../it does not matter...': + pd.Series([ + 'a', # ASCII + 'b' * 127, # Max table name length. + 'q❤️p', # Mixed ASCII and UCS-2 + '嚜꓂', # UCS-2, 3 bytes for UTF-8. + '💩🦞'], # UCS-4, 4 bytes for UTF-8. 
+ dtype=dtype), + 'b': [1, 2, 3, 4, 5]}) + buf = _dataframe(df, table_name_col=0) + self.assertEqual( + buf, + 'a b=1i\n' + + ('b' * 127) + ' b=2i\n' + + 'q❤️p b=3i\n' + + '嚜꓂ b=4i\n' + + '💩🦞 b=5i\n') + + with self.assertRaisesRegex( + qi.IngressError, "Too long"): + _dataframe( + pd.DataFrame({'a': pd.Series(['b' * 128], dtype=dtype)}), + table_name_col='a') + + with self.assertRaisesRegex( + qi.IngressError, 'Failed.*Expected a table name, got a null.*'): + _dataframe( + pd.DataFrame({ + '.': pd.Series(['x', None], dtype=dtype), + 'b': [1, 2]}), + table_name_col='.') + + with self.assertRaisesRegex( + qi.IngressError, 'Failed.*Expected a table name, got a null.*'): + _dataframe( + pd.DataFrame({ + '.': pd.Series(['x', float('nan')], dtype=dtype), + 'b': [1, 2]}), + table_name_col='.') + + with self.assertRaisesRegex( + qi.IngressError, 'Failed.*Expected a table name, got a null.*'): + _dataframe( + pd.DataFrame({ + '.': pd.Series(['x', pd.NA], dtype=dtype), + 'b': [1, 2]}), + table_name_col='.') + + with self.assertRaisesRegex( + qi.IngressError, "''.*must have a non-zero length"): + _dataframe( + pd.DataFrame({ + '/': pd.Series([''], dtype=dtype), + 'b': [1]}), + table_name_col='/') + + with self.assertRaisesRegex( + qi.IngressError, "'tab..1'.*invalid dot `\\.` at position 4"): + _dataframe( + pd.DataFrame({ + '/': pd.Series(['tab..1'], dtype=dtype), + 'b': [1]}), + table_name_col='/') + + def test_obj_str_table(self): + self._test_pyobjstr_table('object') + + with self.assertRaisesRegex( + qi.IngressError, 'table name .*got an object of type int'): + _dataframe( + pd.DataFrame({ + '.': pd.Series(['x', 42], dtype='object'), + 'z': [1, 2]}), + table_name_col='.') + + def test_obj_string_table(self): + self._test_pyobjstr_table('string') + + self.assertEqual( + _dataframe( + pd.DataFrame({ + '.': pd.Series(['x', 42], dtype='string'), + 'z': [1, 2]}), + table_name_col='.'), + 'x z=1i\n' + + '42 z=2i\n') + + def _test_pyobjstr_numpy_symbol(self, dtype): + df = pd.DataFrame({'a': pd.Series([ + 'a', # ASCII + 'q❤️p', # Mixed ASCII and UCS-2 + '❤️' * 1200, # Over the 1024 buffer prealloc. + 'Questo è un qualcosa', # Non-ASCII UCS-1 + 'щось', # UCS-2, 2 bytes for UTF-8. + '', # Empty string + '嚜꓂', # UCS-2, 3 bytes for UTF-8. + '💩🦞'], # UCS-4, 4 bytes for UTF-8. 
+ dtype=dtype)}) + buf = _dataframe(df, table_name='tbl1', symbols=True) + self.assertEqual( + buf, + 'tbl1,a=a\n' + + 'tbl1,a=q❤️p\n' + + 'tbl1,a=' + ('❤️' * 1200) + '\n' + + 'tbl1,a=Questo\\ è\\ un\\ qualcosa\n' + + 'tbl1,a=щось\n' + + 'tbl1,a=\n' + + 'tbl1,a=嚜꓂\n' + + 'tbl1,a=💩🦞\n') + + for null_obj in (None, float('nan'), pd.NA): + self.assertEqual( + _dataframe( + pd.DataFrame({ + 'x': pd.Series(['a', null_obj], dtype=dtype), + 'y': [1, 2]}), + table_name='tbl1', symbols=[0]), + 'tbl1,x=a y=1i\n' + + 'tbl1 y=2i\n') + + def test_obj_str_numpy_symbol(self): + self._test_pyobjstr_numpy_symbol('object') + + with self.assertRaisesRegex( + qi.IngressError, 'Expected a string, got an .* type int'): + _dataframe( + pd.DataFrame({ + 'x': pd.Series(['x', 42], dtype='object'), + 'y': [1, 2]}), + table_name='tbl1', symbols=[0]) + + def test_obj_string_numpy_symbol(self): + self._test_pyobjstr_numpy_symbol('string') + + self.assertEqual( + _dataframe( + pd.DataFrame({ + 'x': pd.Series(['x', 42], dtype='string'), + 'y': [1, 2]}), + table_name='tbl1', symbols=[0]), + 'tbl1,x=x y=1i\n' + + 'tbl1,x=42 y=2i\n') + + def test_str_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ + 'a', # ASCII + 'q❤️p', # Mixed ASCII and UCS-2 + '❤️' * 1200, # Over the 1024 buffer prealloc. + 'Questo è un qualcosa', # Non-ASCII UCS-1 + 'щось', # UCS-2, 2 bytes for UTF-8. + '', # Empty string + '嚜꓂', # UCS-2, 3 bytes for UTF-8. + '💩🦞'], # UCS-4, 4 bytes for UTF-8. + dtype='str')}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 a="a"\n' + + 'tbl1 a="q❤️p"\n' + + 'tbl1 a="' + ('❤️' * 1200) + '"\n' + + 'tbl1 a="Questo è un qualcosa"\n' + + 'tbl1 a="щось"\n' + + 'tbl1 a=""\n' + + 'tbl1 a="嚜꓂"\n' + + 'tbl1 a="💩🦞"\n') + + def test_str_arrow_table(self): + df = pd.DataFrame({ + '../bad col name/../it does not matter...': pd.Series([ + 'a', # ASCII + 'b' * 127, # Max table name length. + 'q❤️p', # Mixed ASCII and UCS-2 + '嚜꓂', # UCS-2, 3 bytes for UTF-8. + '💩🦞'], # UCS-4, 4 bytes for UTF-8. + dtype='string[pyarrow]'), + 'b': [1, 2, 3, 4, 5]}) + buf = _dataframe(df, table_name_col=0) + self.assertEqual( + buf, + 'a b=1i\n' + + ('b' * 127) + ' b=2i\n' + + 'q❤️p b=3i\n' + + '嚜꓂ b=4i\n' + + '💩🦞 b=5i\n') + + with self.assertRaisesRegex( + qi.IngressError, "Too long"): + _dataframe( + pd.DataFrame({ + 'a': pd.Series(['b' * 128], dtype='string[pyarrow]')}), + table_name_col='a') + + with self.assertRaisesRegex( + qi.IngressError, "Failed .*.*Table name cannot be null"): + _dataframe( + pd.DataFrame({ + '.': pd.Series(['x', None], dtype='string[pyarrow]'), + 'b': [1, 2]}), + table_name_col='.') + + with self.assertRaisesRegex( + qi.IngressError, "''.*must have a non-zero length"): + _dataframe( + pd.DataFrame({ + '/': pd.Series([''], dtype='string[pyarrow]')}), + table_name_col='/') + + with self.assertRaisesRegex( + qi.IngressError, "'tab..1'.*invalid dot `\\.` at position 4"): + _dataframe( + pd.DataFrame({ + '/': pd.Series(['tab..1'], dtype='string[pyarrow]')}), + table_name_col='/') + + def test_str_arrow_symbol(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 'a', # ASCII + 'q❤️p', # Mixed ASCII and UCS-2 + '❤️' * 1200, # Over the 1024 buffer prealloc. + 'Questo è un qualcosa', # Non-ASCII UCS-1 + 'щось', # UCS-2, 2 bytes for UTF-8. + '', # Empty string + None, + '嚜꓂', # UCS-2, 3 bytes for UTF-8. + '💩🦞'], # UCS-4, 4 bytes for UTF-8. 
+ dtype='string[pyarrow]'), + 'b': [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + buf = _dataframe(df, table_name='tbl1', symbols=True) + self.assertEqual( + buf, + 'tbl1,a=a b=1i\n' + + 'tbl1,a=q❤️p b=2i\n' + + 'tbl1,a=' + ('❤️' * 1200) + ' b=3i\n' + + 'tbl1,a=Questo\\ è\\ un\\ qualcosa b=4i\n' + + 'tbl1,a=щось b=5i\n' + + 'tbl1,a= b=6i\n' + + 'tbl1 b=7i\n' + + 'tbl1,a=嚜꓂ b=8i\n' + + 'tbl1,a=💩🦞 b=9i\n') + + def test_str_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 'a', # ASCII + 'q❤️p', # Mixed ASCII and UCS-2 + '❤️' * 1200, # Over the 1024 buffer prealloc. + 'Questo è un qualcosa', # Non-ASCII UCS-1 + 'щось', # UCS-2, 2 bytes for UTF-8. + '', # Empty string + None, + '嚜꓂', # UCS-2, 3 bytes for UTF-8. + '💩🦞'], # UCS-4, 4 bytes for UTF-8. + dtype='string[pyarrow]'), + 'b': [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + buf = _dataframe(df, table_name='tbl1', symbols=False) + self.assertEqual( + buf, + 'tbl1 a="a",b=1i\n' + + 'tbl1 a="q❤️p",b=2i\n' + + 'tbl1 a="' + ('❤️' * 1200) + '",b=3i\n' + + 'tbl1 a="Questo è un qualcosa",b=4i\n' + + 'tbl1 a="щось",b=5i\n' + + 'tbl1 a="",b=6i\n' + + 'tbl1 b=7i\n' + + 'tbl1 a="嚜꓂",b=8i\n' + + 'tbl1 a="💩🦞",b=9i\n') + + def test_pyobj_int_col(self): + int64_min = -2**63 + int64_max = 2**63 - 1 + self.assertEqual( + _dataframe( + pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, None, float('nan'), pd.NA, 7, + 0, + int64_min, + int64_max], dtype='object'), + 'b': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}), + table_name='tbl1'), + 'tbl1 a=1i,b=1i\n' + + 'tbl1 a=2i,b=2i\n' + + 'tbl1 a=3i,b=3i\n' + + 'tbl1 b=4i\n' + + 'tbl1 b=5i\n' + + 'tbl1 b=6i\n' + + 'tbl1 a=7i,b=7i\n' + + 'tbl1 a=0i,b=8i\n' + + 'tbl1 a=' + str(int64_min) + 'i,b=9i\n' + + 'tbl1 a=' + str(int64_max) + 'i,b=10i\n') + + with self.assertRaisesRegex( + qi.IngressError, "1 \\('STRING'\\): .*type int, got.*str\\."): + _dataframe( + pd.DataFrame({ + 'a': pd.Series([1, 'STRING'], dtype='object'), + 'b': [1, 2]}), + table_name='tbl1') + + out_of_range = [int64_min - 1, int64_max + 1] + for num in out_of_range: + with self.assertRaisesRegex( + qi.IngressError, "index 1 .*922337203685477.*int too big"): + _dataframe( + pd.DataFrame({ + 'a': pd.Series([1, num], dtype='object'), + 'b': [1, 2]}), + table_name='tbl1') + + def test_pyobj_float_col(self): + self.assertEqual( + _dataframe( + pd.DataFrame({ + 'a': pd.Series( + [1.0, 2.0, 3.0, None, float('nan'), pd.NA, 7.0], + dtype='object'), + 'b': [1, 2, 3, 4, 5, 6, 7]}), + table_name='tbl1'), + 'tbl1 a=1.0,b=1i\n' + + 'tbl1 a=2.0,b=2i\n' + + 'tbl1 a=3.0,b=3i\n' + + 'tbl1 b=4i\n' + + 'tbl1 a=NaN,b=5i\n' + + 'tbl1 b=6i\n' + + 'tbl1 a=7.0,b=7i\n') + + with self.assertRaisesRegex( + qi.IngressError, "1 \\('STRING'\\): .*type float, got.*str\\."): + _dataframe( + pd.DataFrame({ + 'a': pd.Series([1.0, 'STRING'], dtype='object'), + 'b': [1, 2]}), + table_name='tbl1') + + def test_bad_category(self): + # We only support string categories + # (unless anyone asks for additional ones). + # We want to test others are rejected. 
+ with self.assertRaisesRegex( + qi.IngressError, "Bad column 'a'.*got a category of .*int64"): + _dataframe( + pd.DataFrame({'a': pd.Series([1, 2, 3, 2], dtype='category')}), + table_name='tbl1') + + def _test_cat_table(self, count): + slist = [f's{i}' for i in range(count)] + + df = pd.DataFrame({ + 'a': pd.Series(slist, dtype='category'), + 'b': list(range(len(slist)))}) + + buf = _dataframe(df, table_name_col=0) + exp = ''.join( + f'{s} b={i}i\n' + for i, s in enumerate(slist)) + self.assertEqual(buf, exp) + + slist[2] = None + df2 = pd.DataFrame({ + 'a': pd.Series(slist, dtype='category'), + 'b': list(range(len(slist)))}) + with self.assertRaisesRegex( + qi.IngressError, 'Table name cannot be null'): + _dataframe(df2, table_name_col=0) + + def test_cat_i8_table(self): + self._test_cat_table(30) + self._test_cat_table(127) + + def test_cat_i16_table(self): + self._test_cat_table(128) + self._test_cat_table(4000) + self._test_cat_table(32767) + + def test_cat_i32_table(self): + self._test_cat_table(32768) + self._test_cat_table(40000) + + def _test_cat_symbol(self, count): + slist = [f's{i}' for i in range(count)] + + df = pd.DataFrame({ + 'a': pd.Series(slist, dtype='category'), + 'b': list(range(len(slist)))}) + + buf = _dataframe(df, table_name='tbl1', symbols=True) + exp = ''.join( + f'tbl1,a={s} b={i}i\n' + for i, s in enumerate(slist)) + self.assertEqual(buf, exp) + + slist[2] = None + df2 = pd.DataFrame({ + 'a': pd.Series(slist, dtype='category'), + 'b': list(range(len(slist)))}) + + exp2 = exp.replace('tbl1,a=s2 b=2i\n', 'tbl1 b=2i\n') + buf2 = _dataframe(df2, table_name='tbl1', symbols=True) + self.assertEqual(buf2, exp2) + + def test_cat_i8_symbol(self): + self._test_cat_symbol(30) + self._test_cat_symbol(127) + + def test_cat_i16_symbol(self): + self._test_cat_symbol(128) + self._test_cat_symbol(4000) + self._test_cat_symbol(32767) + + def test_cat_i32_symbol(self): + self._test_cat_symbol(32768) + self._test_cat_symbol(40000) + + def _test_cat_str(self, count): + slist = [f's{i}' for i in range(count)] + + df = pd.DataFrame({ + 'a': pd.Series(slist, dtype='category'), + 'b': list(range(len(slist)))}) + + buf = _dataframe(df, table_name='tbl1', symbols=False) + exp = ''.join( + f'tbl1 a="{s}",b={i}i\n' + for i, s in enumerate(slist)) + self.assertEqual(buf, exp) + + slist[2] = None + df2 = pd.DataFrame({ + 'a': pd.Series(slist, dtype='category'), + 'b': list(range(len(slist)))}) + + exp2 = exp.replace('tbl1 a="s2",b=2i\n', 'tbl1 b=2i\n') + buf2 = _dataframe(df2, table_name='tbl1', symbols=False) + self.assertEqual(buf2, exp2) + + def test_cat_i8_str(self): + self._test_cat_str(30) + self._test_cat_str(127) + + def test_cat_i16_str(self): + self._test_cat_str(128) + self._test_cat_str(4000) + self._test_cat_str(32767) + + def test_cat_i32_str(self): + self._test_cat_str(32768) + self._test_cat_str(40000) + + def test_all_nulls_pyobj_col(self): + df = pd.DataFrame({ + 'a': [None, pd.NA, float('nan')], + 'b': [1, 2, 3]}) + buf = _dataframe(df, table_name='tbl1') + self.assertEqual( + buf, + 'tbl1 b=1i\n' + + 'tbl1 b=2i\n' + + 'tbl1 b=3i\n') + + def test_strided_numpy_column(self): + two_d = np.array([ + [1, 10], + [2, 20], + [3, 30]], dtype='int64') + col2 = two_d[:, 1] + col2.flags['WRITEABLE'] = False + + # Checking our test case setup. 
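+        # (`col2` is a view over every second int64 of `two_d`, so its only
+        # stride is 16 bytes instead of the 8 a contiguous column would have.)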
+        mv = memoryview(col2)
+        self.assertEqual(mv.contiguous, False)
+        self.assertEqual(mv.strides, (16,))
+
+        df = pd.DataFrame(col2, copy=False)
+        df.columns = ['a']
+
+        with self.assertRaisesRegex(
+                qi.IngressError, "Bad column 'a': .*not.*contiguous"):
+            _dataframe(df, table_name='tbl1')
+
+    def test_serializing_in_chunks(self):
+        df = pd.DataFrame({
+            'a': pd.Series(np.arange(30), dtype='int64'),
+            'b': pd.Series(np.arange(30), dtype='Int64')})
+        parts = [
+            df.iloc[:10],
+            df.iloc[10:20],
+            df.iloc[20:]]
+        for index, part in enumerate(parts):
+            buf = _dataframe(part, table_name='tbl1')
+            exp = ''.join(
+                f'tbl1 a={i}i,b={i}i\n'
+                for i in range(index * 10, (index + 1) * 10))
+            self.assertEqual(buf, exp)
+
+    def test_arrow_chunked_array(self):
+        # We build a table with chunked arrow arrays as columns.
+        chunks_a = [
+            pa.array([1, 2, 3], type=pa.int16()),
+            pa.array([4, 5, 6], type=pa.int16()),
+            pa.array([], type=pa.int16()),
+            pa.array([7, 8, 9], type=pa.int16())]
+        chunked_a = pa.chunked_array(chunks_a)
+        chunks_b = [
+            pa.array([10, 20], type=pa.int32()),
+            pa.array([], type=pa.int32()),
+            pa.array([30, 40, 50, 60], type=pa.int32()),
+            pa.array([70, 80, 90], type=pa.int32())]
+        chunked_b = pa.chunked_array(chunks_b)
+        arr_tab = pa.Table.from_arrays([chunked_a, chunked_b], names=['a', 'b'])
+
+        # NOTE!
+        # This does *not* preserve the chunking of the arrow arrays.
+        df = arr_tab.to_pandas()
+        buf = _dataframe(df, table_name='tbl1')
+        exp = (
+            'tbl1 a=1i,b=10i\n' +
+            'tbl1 a=2i,b=20i\n' +
+            'tbl1 a=3i,b=30i\n' +
+            'tbl1 a=4i,b=40i\n' +
+            'tbl1 a=5i,b=50i\n' +
+            'tbl1 a=6i,b=60i\n' +
+            'tbl1 a=7i,b=70i\n' +
+            'tbl1 a=8i,b=80i\n' +
+            'tbl1 a=9i,b=90i\n')
+        self.assertEqual(buf, exp)
+
+        if not hasattr(pd, 'ArrowDtype'):
+            # We don't have pandas ArrowDtype, so we can't test the rest.
+            return
+
+        # To preserve the chunking we need to use a special pandas type:
+        pandarrow_a = pd.array(chunked_a, dtype='int16[pyarrow]')
+        pandarrow_b = pd.array(chunked_b, dtype='int32[pyarrow]')
+        df = pd.DataFrame({'a': pandarrow_a, 'b': pandarrow_b})
+
+        # Note that this dtype is currently experimental, so we don't support
+        # it yet. We have everything in place should we need to; for now we
+        # just test that we raise a nice error.
+ with self.assertRaisesRegex( + qi.IngressError, + "Unsupported dtype int16\[pyarrow\] for column 'a'.*github"): + _dataframe(df, table_name='tbl1') + + @unittest.skipIf(not fastparquet, 'fastparquet not installed') + @with_tmp_dir + def test_parquet_roundtrip(self, tmpdir): + pa_parquet_path = tmpdir / 'test_pa.parquet' + fp_parquet_path = tmpdir / 'test_fp.parquet' + df = pd.DataFrame({ + 's': pd.Categorical(['a', 'b', 'a', 'c', 'a']), + 'a': pd.Series([1, 2, 3, 4, 5], dtype='int16'), + 'b': pd.Series([10, 20, 30, None, 50], dtype='UInt8'), + 'c': [0.5, float('nan'), 2.5, 3.5, None]}) + df.to_parquet(pa_parquet_path, engine='pyarrow') + df.to_parquet(fp_parquet_path, engine='fastparquet') + pa2pa_df = pd.read_parquet(pa_parquet_path, engine='pyarrow') + pa2fp_df = pd.read_parquet(pa_parquet_path, engine='fastparquet') + fp2pa_df = pd.read_parquet(fp_parquet_path, engine='pyarrow') + fp2fp_df = pd.read_parquet(fp_parquet_path, engine='fastparquet') + + exp_dtypes = ['category', 'int16', 'UInt8', 'float64'] + self.assertEqual(list(df.dtypes), exp_dtypes) + + def df_eq(exp_df, deser_df, exp_dtypes): + self.assertEqual(list(deser_df.dtypes), exp_dtypes) + if not exp_df.equals(deser_df): + print('\nexp_df:') + print(exp_df) + print('\ndeser_df:') + print(deser_df) + self.assertTrue(exp_df.equals(deser_df)) + + # fastparquet doesn't roundtrip with pyarrow parquet properly. + # It decays categories to object and UInt8 to float64. + # We need to set up special case expected results for that. + fallback_exp_dtypes = [ + np.dtype('O'), + np.dtype('int16'), + np.dtype('float64'), + np.dtype('float64')] + fallback_df = df.astype({'s': 'object', 'b': 'float64'}) + + df_eq(df, pa2pa_df, exp_dtypes) + df_eq(df, pa2fp_df, exp_dtypes) + df_eq(fallback_df, fp2pa_df, fallback_exp_dtypes) + df_eq(df, fp2fp_df, exp_dtypes) + + exp = ( + 'tbl1,s=a a=1i,b=10i,c=0.5\n' + + 'tbl1,s=b a=2i,b=20i,c=NaN\n' + + 'tbl1,s=a a=3i,b=30i,c=2.5\n' + + 'tbl1,s=c a=4i,c=3.5\n' + + 'tbl1,s=a a=5i,b=50i,c=NaN\n') + + fallback_exp = ( + 'tbl1 s="a",a=1i,b=10.0,c=0.5\n' + + 'tbl1 s="b",a=2i,b=20.0,c=NaN\n' + + 'tbl1 s="a",a=3i,b=30.0,c=2.5\n' + + 'tbl1 s="c",a=4i,b=NaN,c=3.5\n' + + 'tbl1 s="a",a=5i,b=50.0,c=NaN\n') + + self.assertEqual(_dataframe(df, table_name='tbl1'), exp) + self.assertEqual(_dataframe(pa2pa_df, table_name='tbl1'), exp) + self.assertEqual(_dataframe(pa2fp_df, table_name='tbl1'), exp) + self.assertEqual(_dataframe(fp2pa_df, table_name='tbl1'), fallback_exp) + self.assertEqual(_dataframe(fp2fp_df, table_name='tbl1'), exp) + + +if __name__ == '__main__': + if os.environ.get('TEST_QUESTDB_PROFILE') == '1': + import cProfile + cProfile.run('unittest.main()', sort='cumtime') + else: + unittest.main() diff --git a/test/test_dataframe_fuzz.py b/test/test_dataframe_fuzz.py new file mode 100644 index 00000000..991c5d9e --- /dev/null +++ b/test/test_dataframe_fuzz.py @@ -0,0 +1,192 @@ +""" +# On Linux, ensure `clang` is installed. 
+pyenv shell 3.10
+./proj clean
+./proj build_fuzzing
+./proj test_fuzzing
+"""
+
+import sys
+import struct
+import patch_path
+patch_path.patch()
+import numpy as np
+from numpy.random import Generator, PCG64
+import pandas as pd
+import pyarrow as pa
+import re
+import atheris
+
+
+with atheris.instrument_imports():
+    import questdb.ingress as qi
+
+
+@atheris.instrument_func
+def get_test_alphabet():
+    include_ranges = [
+        (0x0021, 0x0021),
+        (0x0023, 0x0026),
+        (0x0028, 0x007E),
+        (0x00A1, 0x00AC),
+        (0x00AE, 0x00FF),
+        (0x0100, 0x017F),
+        (0x0180, 0x024F),
+        (0x2C60, 0x2C7F),
+        (0x16A0, 0x16F0),
+        (0x0370, 0x0377),
+        (0x037A, 0x037E),
+        (0x0384, 0x038A),
+        (0x038C, 0x038C)]
+    return [
+        chr(code_point)
+        for current_range in include_ranges
+        for code_point in range(current_range[0], current_range[1] + 1)]
+
+
+TEST_ALPHABET = get_test_alphabet()
+
+
+def get_random_unicode(rand, length, none_val_prob=0):
+    """
+    Adapted from https://stackoverflow.com/questions/1477294
+    """
+    if none_val_prob and (rand.random() < none_val_prob):
+        return None
+    return ''.join(rand.choice(TEST_ALPHABET) for _ in range(length))
+
+
+@atheris.instrument_func
+def gen_string_series(rand, n_rows, none_val_prob, length, dtype):
+    series_n_rows = n_rows
+    if dtype == 'category':
+        series_n_rows //= 4
+    data = [
+        get_random_unicode(rand, length, none_val_prob)
+        for _ in range(series_n_rows)]
+    if dtype == 'category':
+        data = data * 6
+        data = data[:n_rows]
+        rand.shuffle(data)
+    return pd.Series(data, dtype=dtype)
+
+
+def gen_numpy_series(rand, n_rows, dtype):
+    return pd.Series(
+        rand.integers(
+            np.iinfo(dtype).min,
+            np.iinfo(dtype).max,
+            size=n_rows,
+            dtype=dtype))
+
+
+@atheris.instrument_func
+def gen_series_i8_numpy(rand, n_rows, none_val_prob):
+    return gen_numpy_series(rand, n_rows, np.int8)
+
+
+@atheris.instrument_func
+def gen_series_pyobj_str(rand, n_rows, none_val_prob):
+    return gen_string_series(rand, n_rows, none_val_prob, 6, 'object')
+
+
+# TODO: Test all datatypes
+# TODO: Include None, NA and NaN.
+series_generators = [
+    gen_series_i8_numpy,
+    # gen_series_i16_numpy,
+    gen_series_pyobj_str]
+
+
+
+@atheris.instrument_func
+def parse_input_bytes(input_bytes):
+    fdp = atheris.FuzzedDataProvider(input_bytes)
+    rand_seed = fdp.ConsumeUInt(1)
+    none_val_prob = fdp.ConsumeProbability()
+    table_name_type = fdp.ConsumeIntInRange(0, 4)
+    table_name_len = fdp.ConsumeIntInRange(1, 32)
+    n_cols = fdp.ConsumeIntInRange(10, 40)
+    col_generators = [
+        series_generators[fdp.ConsumeIntInRange(0, len(series_generators) - 1)]
+        for _ in range(n_cols)]
+    n_rows = fdp.ConsumeIntInRange(10, 5000)
+    rand = Generator(PCG64(rand_seed))
+    series_list = []
+    col_name = lambda: f'{get_random_unicode(rand, 4)}_{len(series_list)}'
+    table_name = None
+    table_name_col = None
+    symbols = 'auto'
+    at = None
+    if table_name_type == 0:
+        table_name = get_random_unicode(rand, table_name_len)
+    else:
+        table_name_col = col_name()
+        dtype = {
+            1: 'object',
+            2: 'string',
+            3: 'string[pyarrow]',
+            4: 'category'}[table_name_type]
+        series = gen_string_series(rand, n_rows, 0, table_name_len, dtype)
+        series_list.append((table_name_col, series))
+
+    for index in range(n_cols):
+        name = col_name()
+        series = col_generators[index](rand, n_rows, none_val_prob)
+        series_list.append((name, series))
+    rand.shuffle(series_list)
+    series = dict([
+        (name, series)
+        for name, series in series_list])
+    df = pd.DataFrame(series)
+    return df, table_name, table_name_col, symbols, at
+
+
+@atheris.instrument_func
+def test_dataframe(input_bytes):
+    # print(f'input_bytes: {input_bytes}')
+    params = parse_input_bytes(input_bytes)
+    df, table_name, table_name_col, symbols, at = params
+
+    try:
+        BUF = qi.Buffer()
+        BUF.clear()
+        try:
+            BUF.dataframe(
+                df,
+                table_name=table_name,
+                table_name_col=table_name_col,
+                symbols=symbols,
+                at=at)
+        except Exception as e:
+            if isinstance(e, (qi.IngressError)):
+                msg = str(e)
+                if 'Bad argument `table_name`' in msg:
+                    return
+                if re.search(r'Failed .*Bad string.*', msg):
+                    return
+                if re.search(r'Bad string .*: Column names', msg):
+                    return
+            if 'Ensure at least one column is not null.' in msg:
+                return
+            raise e
+    except:
+        print('>>>>>>>>>')
+        print(f'input_bytes: {input_bytes!r}')
+        print(f'df: {df}')
+        print(f'table_name: {table_name}')
+        print(f'table_name_col: {table_name_col}')
+        print(f'symbols: {symbols}')
+        print(f'at: {at}')
+        print('<<<<<<<<<')
+        raise
+
+
+def main():
+    args = list(sys.argv)
+    atheris.Setup(args, test_dataframe)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/test_dataframe_leaks.py b/test/test_dataframe_leaks.py
new file mode 100644
index 00000000..2f33313c
--- /dev/null
+++ b/test/test_dataframe_leaks.py
@@ -0,0 +1,43 @@
+import patch_path
+patch_path.patch()
+
+import pandas as pd
+import questdb.ingress as qi
+
+import os, psutil
+process = psutil.Process(os.getpid())
+
+def get_rss():
+    return process.memory_info().rss
+
+
+def serialize_and_cleanup():
+    # qi.Buffer().row(
+    #     'table_name',
+    #     symbols={'x': 'a', 'y': 'b'},
+    #     columns={'a': 1, 'b': 2, 'c': 3})
+    df = pd.DataFrame({
+        'a': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+        'b': [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
+        'c': [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]})
+    qi.Buffer().dataframe(df, table_name='test')
+
+
+def main():
+    warmup_count = 0
+    for n in range(1000000):
+        if n % 1000 == 0:
+            print(f'[iter: {n:09}, RSS: {get_rss():010}]')
+        if n > warmup_count:
+            before = get_rss()
+        serialize_and_cleanup()
+        if n > warmup_count:
+            after = get_rss()
+            if after != before:
+                msg = f'RSS changed from {before} to {after} after {n} iters'
+                print(msg)
+
+
+if __name__ == '__main__':
+    main()
+    
\ No newline at end of file