diff --git a/README.md b/README.md index 53bab3ab2a..61f38a2afb 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,26 @@ Install from source with: python setup.py install ``` +### Optional dependencies + +Install dependencies for [`openai.embeddings_utils`](openai/embeddings_utils.py): + +```sh +pip install openai[embeddings] +``` + +Install support for [Weights & Biases](https://wandb.me/openai-docs): + +``` +pip install openai[wandb] +``` + +Data libraries like `numpy` and `pandas` are not installed by default due to their size. They’re needed for some functionality of this library, but generally not for talking to the API. If you encounter a `MissingDependencyError`, install them with: + +```sh +pip install openai[datalib] +``` + ## Usage The library needs to be configured with your account's secret key which is available on the [website](https://beta.openai.com/account/api-keys). Either set it as the `OPENAI_API_KEY` environment variable before using the library: diff --git a/openai/api_resources/embedding.py b/openai/api_resources/embedding.py index 679f97973b..5f1cfe5609 100644 --- a/openai/api_resources/embedding.py +++ b/openai/api_resources/embedding.py @@ -1,11 +1,10 @@ import base64 import time -import numpy as np from openai import util -from openai.api_resources.abstract import DeletableAPIResource, ListableAPIResource from openai.api_resources.abstract.engine_api_resource import EngineAPIResource +from openai.datalib import numpy as np, assert_has_numpy from openai.error import TryAgain @@ -40,6 +39,7 @@ def create(cls, *args, **kwargs): # If an engine isn't using this optimization, don't do anything if type(data["embedding"]) == str: + assert_has_numpy() data["embedding"] = np.frombuffer( base64.b64decode(data["embedding"]), dtype="float32" ).tolist() diff --git a/openai/datalib.py b/openai/datalib.py new file mode 100644 index 0000000000..2781cfc4db --- /dev/null +++ b/openai/datalib.py @@ -0,0 +1,56 @@ +""" +This module helps make data libraries 
like `numpy` and `pandas` optional dependencies. + +The libraries add up to 130MB+, which makes it challenging to deploy applications +using this library in environments with code size constraints, like AWS Lambda. + +This module serves as an import proxy and provides a few utilities for dealing with the optionality. + +Since the primary use case of this library (talking to the OpenAI API) doesn’t generally require data libraries, +it’s safe to make them optional. The rare case when data libraries are needed in the client is handled through +assertions with instructive error messages. + +See also `setup.py`. + +""" +try: + import numpy +except ImportError: + numpy = None + +try: + import pandas +except ImportError: + pandas = None + +HAS_NUMPY = bool(numpy) +HAS_PANDAS = bool(pandas) + +INSTRUCTIONS = """ + +OpenAI error: + + missing `{library}` + +This feature requires additional dependencies: + + $ pip install openai[datalib] + +""" + +NUMPY_INSTRUCTIONS = INSTRUCTIONS.format(library="numpy") +PANDAS_INSTRUCTIONS = INSTRUCTIONS.format(library="pandas") + + +class MissingDependencyError(Exception): + pass + + +def assert_has_numpy(): + if not HAS_NUMPY: + raise MissingDependencyError(NUMPY_INSTRUCTIONS) + + +def assert_has_pandas(): + if not HAS_PANDAS: + raise MissingDependencyError(PANDAS_INSTRUCTIONS) diff --git a/openai/embeddings_utils.py b/openai/embeddings_utils.py index c4e8a2f448..056c2065c1 100644 --- a/openai/embeddings_utils.py +++ b/openai/embeddings_utils.py @@ -2,8 +2,6 @@ from typing import List, Optional import matplotlib.pyplot as plt -import numpy as np -import pandas as pd import plotly.express as px from scipy import spatial from sklearn.decomposition import PCA @@ -12,6 +10,8 @@ from tenacity import retry, stop_after_attempt, wait_random_exponential import openai +from openai.datalib import numpy as np +from openai.datalib import pandas as pd @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6)) diff --git 
a/openai/tests/test_long_examples_validator.py b/openai/tests/test_long_examples_validator.py index 6346b25a02..a9334d4f75 100644 --- a/openai/tests/test_long_examples_validator.py +++ b/openai/tests/test_long_examples_validator.py @@ -2,9 +2,14 @@ import subprocess from tempfile import NamedTemporaryFile +import pytest + +from openai.datalib import HAS_PANDAS, HAS_NUMPY, NUMPY_INSTRUCTIONS, PANDAS_INSTRUCTIONS -def test_long_examples_validator() -> None: +@pytest.mark.skipif(not HAS_PANDAS, reason=PANDAS_INSTRUCTIONS) +@pytest.mark.skipif(not HAS_NUMPY, reason=NUMPY_INSTRUCTIONS) +def test_long_examples_validator() -> None: """ Ensures that long_examples_validator() handles previously applied recommendations, namely dropped duplicates, without resulting in a KeyError. @@ -43,5 +48,5 @@ def test_long_examples_validator() -> None: assert prepared_data_cmd_output.stderr == "" # validate get_long_indexes() applied during optional_fn() call in long_examples_validator() assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout - + return prepared_data_cmd_output.stdout diff --git a/openai/validators.py b/openai/validators.py index 23ff525495..0329ed5c7d 100644 --- a/openai/validators.py +++ b/openai/validators.py @@ -2,7 +2,7 @@ import sys from typing import Any, Callable, NamedTuple, Optional -import pandas as pd +from openai.datalib import pandas as pd, assert_has_pandas class Remediation(NamedTuple): @@ -474,6 +474,7 @@ def read_any_format(fname, fields=["prompt", "completion"]): - for .xlsx it will read the first sheet - for .txt it will assume completions and split on newline """ + assert_has_pandas() remediation = None necessary_msg = None immediate_msg = None diff --git a/openai/wandb_logger.py b/openai/wandb_logger.py index 6dd7614ca2..ba650d1fe4 100644 --- a/openai/wandb_logger.py +++ b/openai/wandb_logger.py @@ -13,10 +13,9 @@ import re from pathlib import Path - import numpy as np - import pandas as pd - from openai import File, 
FineTune + from openai.datalib import numpy as np + from openai.datalib import pandas as pd class WandbLogger: diff --git a/setup.py b/setup.py index aa112f7931..e431d26ccd 100644 --- a/setup.py +++ b/setup.py @@ -12,6 +12,15 @@ with open("README.md", "r") as fh: long_description = fh.read() + +DATA_LIBRARIES = [ + # These libraries are optional because of their size. See `openai/datalib.py`. + "numpy", + "pandas>=1.2.3", # Needed for CLI fine-tuning data preparation tool + "pandas-stubs>=1.1.0.11", # Needed for type hints for mypy + "openpyxl>=3.0.7", # Needed for CLI fine-tuning data preparation tool xlsx format +] + setup( name="openai", description="Python client library for the OpenAI API", @@ -21,22 +30,23 @@ install_requires=[ "requests>=2.20", # to get the patch for CVE-2018-18074 "tqdm", # Needed for progress bars - "pandas>=1.2.3", # Needed for CLI fine-tuning data preparation tool - "pandas-stubs>=1.1.0.11", # Needed for type hints for mypy - "openpyxl>=3.0.7", # Needed for CLI fine-tuning data preparation tool xlsx format - "numpy", 'typing_extensions;python_version<"3.8"', # Needed for type hints for mypy "aiohttp", # Needed for async support ], extras_require={ "dev": ["black~=21.6b0", "pytest==6.*", "pytest-asyncio", "pytest-mock"], - "wandb": ["wandb"], + "datalib": DATA_LIBRARIES, + "wandb": [ + "wandb", + *DATA_LIBRARIES, + ], "embeddings": [ "scikit-learn>=1.0.2", # Needed for embedding utils, versions >= 1.1 require python 3.8 "tenacity>=8.0.1", "matplotlib", "sklearn", "plotly", + *DATA_LIBRARIES, ], }, python_requires=">=3.7.1",