Make numpy and pandas optional for ~7 times smaller deps (#153)

jkbrzt · hallacy · web-flow · commit ede088293965 · 2023-01-06T12:23:29.000-08:00
* Make `numpy` and `pandas` optional dependencies

* Cleanup

* Cleanup

* Cleanup

* Cleanup

* Cleanup

* Cleanup

* Move `openpyxl` to `datalib` extras

* Improve errors and instructions

* Add “Optional dependencies” to README

* Polish README.md

* Polish README.md

Co-authored-by: hallacy <hallacy@openai.com>
diff --git a/README.md b/README.md
@@ -25,6 +25,26 @@ Install from source with:
 python setup.py install
 ```
 
+### Optional dependencies
+
+Install dependencies for [`openai.embeddings_utils`](openai/embeddings_utils.py):
+
+```sh
+pip install openai[embeddings]
+```
+
+Install support for [Weights & Biases](https://wandb.me/openai-docs):
+
+```sh
+pip install openai[wandb]
+```
+
+Data libraries like `numpy` and `pandas` are not installed by default due to their size. They’re needed for some functionality of this library, but generally not for talking to the API. If you encounter a `MissingDependencyError`, install them with:
+
+```sh
+pip install openai[datalib]
+```
+
 ## Usage
 
 The library needs to be configured with your account's secret key which is available on the [website](https://beta.openai.com/account/api-keys). Either set it as the `OPENAI_API_KEY` environment variable before using the library:
diff --git a/openai/api_resources/embedding.py b/openai/api_resources/embedding.py
@@ -1,11 +1,10 @@
 import base64
 import time
 
-import numpy as np
 
 from openai import util
-from openai.api_resources.abstract import DeletableAPIResource, ListableAPIResource
 from openai.api_resources.abstract.engine_api_resource import EngineAPIResource
+from openai.datalib import numpy as np, assert_has_numpy
 from openai.error import TryAgain
 
 
@@ -40,6 +39,7 @@ def create(cls, *args, **kwargs):
 
                         # If an engine isn't using this optimization, don't do anything
                         if type(data["embedding"]) == str:
+                            assert_has_numpy()
                             data["embedding"] = np.frombuffer(
                                 base64.b64decode(data["embedding"]), dtype="float32"
                             ).tolist()
diff --git a/openai/datalib.py b/openai/datalib.py
@@ -0,0 +1,56 @@
+"""
+This module helps make data libraries like `numpy` and `pandas` optional dependencies.
+
+The libraries add up to 130MB+, which makes it challenging to deploy applications
+using this library in environments with code size constraints, like AWS Lambda.
+
+This module serves as an import proxy and provides a few utilities for dealing with the optionality.
+
+Since the primary use case of this library (talking to the OpenAI API) doesn’t generally require data libraries,
+it’s safe to make them optional. The rare case when data libraries are needed in the client is handled through
+assertions with instructive error messages.
+
+See also `setup.py`.
+
+"""
+try:
+    import numpy
+except ImportError:
+    numpy = None  # placeholder so `from openai.datalib import numpy` still succeeds without numpy
+
+try:
+    import pandas
+except ImportError:
+    pandas = None  # placeholder so `from openai.datalib import pandas` still succeeds without pandas
+
+HAS_NUMPY = bool(numpy)  # False when the import above fell back to None
+HAS_PANDAS = bool(pandas)  # False when the import above fell back to None
+
+INSTRUCTIONS = """
+
+OpenAI error: 
+
+    missing `{library}` 
+
+This feature requires additional dependencies:
+
+    $ pip install openai[datalib]
+
+"""  # template; `{library}` is filled in per library just below
+
+NUMPY_INSTRUCTIONS = INSTRUCTIONS.format(library="numpy")  # message raised by assert_has_numpy()
+PANDAS_INSTRUCTIONS = INSTRUCTIONS.format(library="pandas")  # message raised by assert_has_pandas()
+
+
+class MissingDependencyError(Exception):  # raised when numpy/pandas is needed but not installed
+    pass
+
+
+def assert_has_numpy():  # guard: raises MissingDependencyError if numpy is not installed
+    if not HAS_NUMPY:
+        raise MissingDependencyError(NUMPY_INSTRUCTIONS)
+
+
+def assert_has_pandas():  # guard: raises MissingDependencyError if pandas is not installed
+    if not HAS_PANDAS:
+        raise MissingDependencyError(PANDAS_INSTRUCTIONS)
diff --git a/openai/embeddings_utils.py b/openai/embeddings_utils.py
@@ -2,8 +2,6 @@
 from typing import List, Optional
 
 import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
 import plotly.express as px
 from scipy import spatial
 from sklearn.decomposition import PCA
@@ -12,6 +10,8 @@
 from tenacity import retry, stop_after_attempt, wait_random_exponential
 
 import openai
+from openai.datalib import numpy as np
+from openai.datalib import pandas as pd
 
 
 @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
diff --git a/openai/tests/test_long_examples_validator.py b/openai/tests/test_long_examples_validator.py
@@ -2,9 +2,14 @@
 import subprocess
 from tempfile import NamedTemporaryFile
 
+import pytest
+
+from openai.datalib import HAS_PANDAS, HAS_NUMPY, NUMPY_INSTRUCTIONS, PANDAS_INSTRUCTIONS
 
-def test_long_examples_validator() -> None:
 
+@pytest.mark.skipif(not HAS_PANDAS, reason=PANDAS_INSTRUCTIONS)
+@pytest.mark.skipif(not HAS_NUMPY, reason=NUMPY_INSTRUCTIONS)
+def test_long_examples_validator() -> None:
     """
     Ensures that long_examples_validator() handles previously applied recommendations,
     namely dropped duplicates, without resulting in a KeyError.
@@ -43,5 +48,5 @@ def test_long_examples_validator() -> None:
     assert prepared_data_cmd_output.stderr == ""
     # validate get_long_indexes() applied during optional_fn() call in long_examples_validator()
     assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout
-
+    
     return prepared_data_cmd_output.stdout
diff --git a/openai/validators.py b/openai/validators.py
@@ -2,7 +2,7 @@
 import sys
 from typing import Any, Callable, NamedTuple, Optional
 
-import pandas as pd
+from openai.datalib import pandas as pd, assert_has_pandas
 
 
 class Remediation(NamedTuple):
@@ -474,6 +474,7 @@ def read_any_format(fname, fields=["prompt", "completion"]):
      - for .xlsx it will read the first sheet
      - for .txt it will assume completions and split on newline
     """
+    assert_has_pandas()
     remediation = None
     necessary_msg = None
     immediate_msg = None
diff --git a/openai/wandb_logger.py b/openai/wandb_logger.py
@@ -13,10 +13,9 @@
     import re
     from pathlib import Path
 
-    import numpy as np
-    import pandas as pd
-
     from openai import File, FineTune
+    from openai.datalib import numpy as np
+    from openai.datalib import pandas as pd
 
 
 class WandbLogger:
diff --git a/setup.py b/setup.py
@@ -12,6 +12,15 @@
 with open("README.md", "r") as fh:
     long_description = fh.read()
 
+
+DATA_LIBRARIES = [
+    # These libraries are optional because of their size. See `openai/datalib.py`.
+    "numpy",
+    "pandas>=1.2.3",  # Needed for CLI fine-tuning data preparation tool
+    "pandas-stubs>=1.1.0.11",  # Needed for type hints for mypy
+    "openpyxl>=3.0.7",  # Needed for CLI fine-tuning data preparation tool xlsx format
+]
+
 setup(
     name="openai",
     description="Python client library for the OpenAI API",
@@ -21,22 +30,23 @@
     install_requires=[
         "requests>=2.20",  # to get the patch for CVE-2018-18074
         "tqdm",  # Needed for progress bars
-        "pandas>=1.2.3",  # Needed for CLI fine-tuning data preparation tool
-        "pandas-stubs>=1.1.0.11",  # Needed for type hints for mypy
-        "openpyxl>=3.0.7",  # Needed for CLI fine-tuning data preparation tool xlsx format
-        "numpy",
         'typing_extensions;python_version<"3.8"',  # Needed for type hints for mypy
         "aiohttp",  # Needed for async support
     ],
     extras_require={
         "dev": ["black~=21.6b0", "pytest==6.*", "pytest-asyncio", "pytest-mock"],
-        "wandb": ["wandb"],
+        "datalib": DATA_LIBRARIES,
+        "wandb": [
+            "wandb",
+            *DATA_LIBRARIES,
+        ],
         "embeddings": [
             "scikit-learn>=1.0.2",  # Needed for embedding utils, versions >= 1.1 require python 3.8
             "tenacity>=8.0.1",
             "matplotlib",
             "sklearn",
             "plotly",
+            *DATA_LIBRARIES,
         ],
     },
     python_requires=">=3.7.1",