Skip to content

Commit ede0882

Browse files
jkbrzt and hallacy authored
Make numpy and pandas optional for ~7 times smaller deps (#153)
* Make `numpy` and `pandas` optional dependencies * Cleanup * Cleanup * Cleanup * Cleanup * Cleanup * Cleanup * Move `openpyxl` to `datalib` extras * Improve errors and instructions * Add “Optional dependencies” to README * Polish README.md * Polish README.md Co-authored-by: hallacy <[email protected]>
1 parent 9678e15 commit ede0882

File tree

8 files changed

+106
-15
lines changed

8 files changed

+106
-15
lines changed

README.md

+20
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,26 @@ Install from source with:
2525
python setup.py install
2626
```
2727

28+
### Optional dependencies
29+
30+
Install dependencies for [`openai.embeddings_utils`](openai/embeddings_utils.py):
31+
32+
```sh
33+
pip install openai[embeddings]
34+
```
35+
36+
Install support for [Weights & Biases](https://wandb.me/openai-docs):
37+
38+
```sh
39+
pip install openai[wandb]
40+
```
41+
42+
Data libraries like `numpy` and `pandas` are not installed by default due to their size. They’re needed for some functionality of this library, but generally not for talking to the API. If you encounter a `MissingDependencyError`, install them with:
43+
44+
```sh
45+
pip install openai[datalib]
46+
```
47+
2848
## Usage
2949

3050
The library needs to be configured with your account's secret key which is available on the [website](https://beta.openai.com/account/api-keys). Either set it as the `OPENAI_API_KEY` environment variable before using the library:

openai/api_resources/embedding.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
import base64
22
import time
33

4-
import numpy as np
54

65
from openai import util
7-
from openai.api_resources.abstract import DeletableAPIResource, ListableAPIResource
86
from openai.api_resources.abstract.engine_api_resource import EngineAPIResource
7+
from openai.datalib import numpy as np, assert_has_numpy
98
from openai.error import TryAgain
109

1110

@@ -40,6 +39,7 @@ def create(cls, *args, **kwargs):
4039

4140
# If an engine isn't using this optimization, don't do anything
4241
if type(data["embedding"]) == str:
42+
assert_has_numpy()
4343
data["embedding"] = np.frombuffer(
4444
base64.b64decode(data["embedding"]), dtype="float32"
4545
).tolist()

openai/datalib.py

+56
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
"""
2+
This module helps make data libraries like `numpy` and `pandas` optional dependencies.
3+
4+
The libraries add up to 130MB+, which makes it challenging to deploy applications
5+
using this library in environments with code size constraints, like AWS Lambda.
6+
7+
This module serves as an import proxy and provides a few utilities for dealing with the optionality.
8+
9+
Since the primary use case of this library (talking to the OpenAI API) doesn’t generally require data libraries,
10+
it’s safe to make them optional. The rare case when data libraries are needed in the client is handled through
11+
assertions with instructive error messages.
12+
13+
See also `setup.py`.
14+
15+
"""
16+
try:
17+
import numpy
18+
except ImportError:
19+
numpy = None
20+
21+
try:
22+
import pandas
23+
except ImportError:
24+
pandas = None
25+
26+
HAS_NUMPY = bool(numpy)
27+
HAS_PANDAS = bool(pandas)
28+
29+
INSTRUCTIONS = """
30+
31+
OpenAI error:
32+
33+
missing `{library}`
34+
35+
This feature requires additional dependencies:
36+
37+
$ pip install openai[datalib]
38+
39+
"""
40+
41+
NUMPY_INSTRUCTIONS = INSTRUCTIONS.format(library="numpy")
42+
PANDAS_INSTRUCTIONS = INSTRUCTIONS.format(library="pandas")
43+
44+
45+
class MissingDependencyError(Exception):
46+
pass
47+
48+
49+
def assert_has_numpy():
50+
if not HAS_NUMPY:
51+
raise MissingDependencyError(NUMPY_INSTRUCTIONS)
52+
53+
54+
def assert_has_pandas():
55+
if not HAS_PANDAS:
56+
raise MissingDependencyError(PANDAS_INSTRUCTIONS)

openai/embeddings_utils.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
from typing import List, Optional
33

44
import matplotlib.pyplot as plt
5-
import numpy as np
6-
import pandas as pd
75
import plotly.express as px
86
from scipy import spatial
97
from sklearn.decomposition import PCA
@@ -12,6 +10,8 @@
1210
from tenacity import retry, stop_after_attempt, wait_random_exponential
1311

1412
import openai
13+
from openai.datalib import numpy as np
14+
from openai.datalib import pandas as pd
1515

1616

1717
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))

openai/tests/test_long_examples_validator.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,14 @@
22
import subprocess
33
from tempfile import NamedTemporaryFile
44

5+
import pytest
6+
7+
from openai.datalib import HAS_PANDAS, HAS_NUMPY, NUMPY_INSTRUCTIONS, PANDAS_INSTRUCTIONS
58

6-
def test_long_examples_validator() -> None:
79

10+
@pytest.mark.skipif(not HAS_PANDAS, reason=PANDAS_INSTRUCTIONS)
11+
@pytest.mark.skipif(not HAS_NUMPY, reason=NUMPY_INSTRUCTIONS)
12+
def test_long_examples_validator() -> None:
813
"""
914
Ensures that long_examples_validator() handles previously applied recommendations,
1015
namely dropped duplicates, without resulting in a KeyError.
@@ -43,5 +48,5 @@ def test_long_examples_validator() -> None:
4348
assert prepared_data_cmd_output.stderr == ""
4449
# validate get_long_indexes() applied during optional_fn() call in long_examples_validator()
4550
assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout
46-
51+
4752
return prepared_data_cmd_output.stdout

openai/validators.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import sys
33
from typing import Any, Callable, NamedTuple, Optional
44

5-
import pandas as pd
5+
from openai.datalib import pandas as pd, assert_has_pandas
66

77

88
class Remediation(NamedTuple):
@@ -474,6 +474,7 @@ def read_any_format(fname, fields=["prompt", "completion"]):
474474
- for .xlsx it will read the first sheet
475475
- for .txt it will assume completions and split on newline
476476
"""
477+
assert_has_pandas()
477478
remediation = None
478479
necessary_msg = None
479480
immediate_msg = None

openai/wandb_logger.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,9 @@
1313
import re
1414
from pathlib import Path
1515

16-
import numpy as np
17-
import pandas as pd
18-
1916
from openai import File, FineTune
17+
from openai.datalib import numpy as np
18+
from openai.datalib import pandas as pd
2019

2120

2221
class WandbLogger:

setup.py

+15-5
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,15 @@
1212
with open("README.md", "r") as fh:
1313
long_description = fh.read()
1414

15+
16+
DATA_LIBRARIES = [
17+
# These libraries are optional because of their size. See `openai/datalib.py`.
18+
"numpy",
19+
"pandas>=1.2.3", # Needed for CLI fine-tuning data preparation tool
20+
"pandas-stubs>=1.1.0.11", # Needed for type hints for mypy
21+
"openpyxl>=3.0.7", # Needed for CLI fine-tuning data preparation tool xlsx format
22+
]
23+
1524
setup(
1625
name="openai",
1726
description="Python client library for the OpenAI API",
@@ -21,22 +30,23 @@
2130
install_requires=[
2231
"requests>=2.20", # to get the patch for CVE-2018-18074
2332
"tqdm", # Needed for progress bars
24-
"pandas>=1.2.3", # Needed for CLI fine-tuning data preparation tool
25-
"pandas-stubs>=1.1.0.11", # Needed for type hints for mypy
26-
"openpyxl>=3.0.7", # Needed for CLI fine-tuning data preparation tool xlsx format
27-
"numpy",
2833
'typing_extensions;python_version<"3.8"', # Needed for type hints for mypy
2934
"aiohttp", # Needed for async support
3035
],
3136
extras_require={
3237
"dev": ["black~=21.6b0", "pytest==6.*", "pytest-asyncio", "pytest-mock"],
33-
"wandb": ["wandb"],
38+
"datalib": DATA_LIBRARIES,
39+
"wandb": [
40+
"wandb",
41+
*DATA_LIBRARIES,
42+
],
3443
"embeddings": [
3544
"scikit-learn>=1.0.2", # Needed for embedding utils, versions >= 1.1 require python 3.8
3645
"tenacity>=8.0.1",
3746
"matplotlib",
3847
"sklearn",
3948
"plotly",
49+
*DATA_LIBRARIES,
4050
],
4151
},
4252
python_requires=">=3.7.1",

0 commit comments

Comments
 (0)