Skip to content

Commit be6d20d

Browse files
committed
feat(gist): fsspec file system for GitHub gists (resolves #888)
1 parent ac7031b commit be6d20d

File tree

5 files changed

+322
-2
lines changed

5 files changed

+322
-2
lines changed

docs/source/api.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ Built-in Implementations
117117
fsspec.implementations.dbfs.DatabricksFileSystem
118118
fsspec.implementations.dirfs.DirFileSystem
119119
fsspec.implementations.ftp.FTPFileSystem
120+
fsspec.implementations.gist.GistFileSystem
120121
fsspec.implementations.git.GitFileSystem
121122
fsspec.implementations.github.GithubFileSystem
122123
fsspec.implementations.http.HTTPFileSystem
@@ -162,6 +163,9 @@ Built-in Implementations
162163
.. autoclass:: fsspec.implementations.ftp.FTPFileSystem
163164
:members: __init__
164165

166+
.. autoclass:: fsspec.implementations.gist.GistFileSystem
167+
:members: __init__
168+
165169
.. autoclass:: fsspec.implementations.git.GitFileSystem
166170
:members: __init__
167171

fsspec/implementations/gist.py

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
import requests
2+
3+
from ..spec import AbstractFileSystem
4+
from ..utils import infer_storage_options
5+
from .memory import MemoryFile
6+
7+
8+
class GistFileSystem(AbstractFileSystem):
    """
    Interface to files in a single GitHub Gist.

    Provides read-only access to a gist's files. Gists do not contain
    subdirectories, so file listing is straightforward.

    Parameters
    ----------
    gist_id : str
        The ID of the gist you want to access (the long hex value from the URL).
    filenames : list[str] (optional)
        If provided, the file system only exposes these files. The gist
        metadata is still fetched, and ``FileNotFoundError`` is raised at
        construction time if any requested name is not part of the gist.
    sha : str (optional)
        If provided, fetch a particular revision of the gist. If omitted,
        the latest revision is used.
    username : str (optional)
        GitHub username for authentication. A username requires ``token``.
    token : str (optional)
        GitHub personal access token. Together with ``username`` it is sent
        as HTTP basic auth; on its own it is sent as an
        ``Authorization: Bearer`` header.
    timeout : (float, float) or float, optional
        Connect and read timeouts for requests (default 60s each).
    kwargs : dict
        Stored on `self.request_kw` and passed to `requests.get` when fetching Gist
        metadata or reading ("opening") a file.
    """

    protocol = "gist"
    gist_url = "https://api.github.com/gists/{gist_id}"
    gist_rev_url = "https://api.github.com/gists/{gist_id}/{sha}"

    def __init__(
        self,
        gist_id,
        filenames=None,
        sha=None,
        username=None,
        token=None,
        timeout=None,
        **kwargs,
    ):
        """
        Store the connection settings and eagerly load the gist's file list.

        Raises
        ------
        ValueError
            If ``username`` is given without ``token``.
        FileNotFoundError
            If the gist (or revision) does not exist, or one of ``filenames``
            is not part of it.
        """
        super().__init__()
        self.gist_id = gist_id
        self.filenames = filenames
        self.sha = sha  # revision of the gist (optional)
        # A username alone cannot authenticate. A token alone is accepted so
        # that the "gist://:TOKEN@<gist_id>/file" URL form is usable.
        if username and token is None:
            raise ValueError("Auth with a username also requires a token.")
        self.username = username
        self.token = token
        self.request_kw = kwargs
        # Default timeouts to 60s connect/read if none provided
        self.timeout = timeout if timeout is not None else (60, 60)

        # We use a single-level "directory" cache, because a gist is essentially flat
        self.dircache[""] = self._fetch_file_list()

    @property
    def kw(self):
        """
        Keyword arguments passed to every ``requests.get`` call.

        Combines ``self.request_kw`` with credentials: basic auth when both
        username and token are set, a Bearer header when only a token is set.
        Values supplied by the caller in ``request_kw`` take precedence.
        """
        if self.username is not None and self.token is not None:
            return {"auth": (self.username, self.token), **self.request_kw}
        if self.token:
            # Build a fresh headers dict so self.request_kw is never mutated.
            headers = {"Authorization": f"Bearer {self.token}"}
            headers.update(self.request_kw.get("headers") or {})
            return {**self.request_kw, "headers": headers}
        return self.request_kw

    def _fetch_gist_metadata(self):
        """
        Fetch the JSON metadata for this gist (possibly for a specific revision).

        Raises ``FileNotFoundError`` on HTTP 404 and lets ``requests`` raise
        for any other error status.
        """
        if self.sha:
            url = self.gist_rev_url.format(gist_id=self.gist_id, sha=self.sha)
        else:
            url = self.gist_url.format(gist_id=self.gist_id)

        r = requests.get(url, timeout=self.timeout, **self.kw)
        if r.status_code == 404:
            raise FileNotFoundError(
                f"Gist not found: {self.gist_id}@{self.sha or 'latest'}"
            )
        r.raise_for_status()
        return r.json()

    def _fetch_file_list(self):
        """
        Returns a list of dicts describing each file in the gist. These get stored
        in self.dircache[""].

        Each entry has the keys ``name``, ``type`` (always ``"file"``),
        ``size`` and ``raw_url``. When ``self.filenames`` is set, only those
        files are returned and a missing one raises ``FileNotFoundError``.
        """
        meta = self._fetch_gist_metadata()
        available_files = meta.get("files", {})
        if self.filenames:
            files = {}
            for fn in self.filenames:
                if fn not in available_files:
                    raise FileNotFoundError(fn)
                files[fn] = available_files[fn]
        else:
            files = available_files

        out = []
        for fname, finfo in files.items():
            if finfo is None:
                # Occasionally GitHub returns a file entry with null if it was deleted
                continue
            # Build a directory entry
            out.append(
                {
                    "name": fname,  # file's name
                    "type": "file",  # gists have no subdirectories
                    "size": finfo.get("size", 0),  # file size in bytes
                    "raw_url": finfo.get("raw_url"),
                }
            )
        return out

    @classmethod
    def _strip_protocol(cls, path):
        """
        Reduce a path or 'gist://' URL to the bare file name inside the gist.

        The protocol, credentials and gist ID are dropped by
        ``infer_storage_options``. Because a gist is flat, only the final
        path component can name a file, so a leading ``<sha>/`` segment (as
        in ``gist://<gist_id>/<sha>/<file>``) is discarded as well.
        """
        path = infer_storage_options(path).get("path", path)
        return path.strip("/").rsplit("/", 1)[-1]

    @staticmethod
    def _get_kwargs_from_urls(path):
        """
        Parse 'gist://' style URLs into GistFileSystem constructor kwargs.
        For example:
          gist://:TOKEN@<gist_id>/file.txt
          gist://username:TOKEN@<gist_id>/file.txt
          gist://username:TOKEN@<gist_id>/<sha>/file.txt
        """
        so = infer_storage_options(path)
        out = {}
        if so.get("username"):
            out["username"] = so["username"]
        if so.get("password"):
            out["token"] = so["password"]
        if so.get("host"):
            # We interpret 'host' as the gist ID
            out["gist_id"] = so["host"]

        # Extract SHA and filename from path: the last component is the file
        # name and the one before it, when non-empty, is the revision.
        if so.get("path"):
            path_parts = so["path"].rsplit("/", 2)[-2:]
            if len(path_parts) == 2:
                if path_parts[0]:  # SHA present
                    out["sha"] = path_parts[0]
                if path_parts[1]:  # filename also present
                    out["filenames"] = [path_parts[1]]

        return out

    def ls(self, path="", detail=False, **kwargs):
        """
        List files in the gist. Gists are single-level, so any 'path' is basically
        the filename, or empty for all files.

        Parameters
        ----------
        path : str, optional
            The filename to list. If empty, returns all files in the gist.
        detail : bool, default False
            If True, return a list of dicts; if False, return a list of filenames.

        Raises
        ------
        FileNotFoundError
            If ``path`` is non-empty and does not name a file in the gist.
        """
        path = self._strip_protocol(path or "")
        # If path is empty, return all
        if path == "":
            results = self.dircache[""]
        else:
            # We want just the single file with this name
            all_files = self.dircache[""]
            results = [f for f in all_files if f["name"] == path]
            if not results:
                raise FileNotFoundError(path)
        if detail:
            return results
        return sorted(f["name"] for f in results)

    def _open(self, path, mode="rb", block_size=None, **kwargs):
        """
        Read a single file from the gist.

        The whole file is downloaded from its ``raw_url`` and returned as an
        in-memory file object. Only ``mode="rb"`` is supported.

        Raises
        ------
        NotImplementedError
            For any mode other than ``"rb"``.
        FileNotFoundError
            If the file is not in the gist, has no ``raw_url``, or the
            download returns HTTP 404.
        """
        if mode != "rb":
            raise NotImplementedError("GitHub Gist FS is read-only (no write).")

        path = self._strip_protocol(path)
        # Find the file entry in our dircache
        matches = [f for f in self.dircache[""] if f["name"] == path]
        if not matches:
            raise FileNotFoundError(path)
        finfo = matches[0]

        raw_url = finfo.get("raw_url")
        if not raw_url:
            raise FileNotFoundError(f"No raw_url for file: {path}")

        r = requests.get(raw_url, timeout=self.timeout, **self.kw)
        if r.status_code == 404:
            raise FileNotFoundError(path)
        r.raise_for_status()
        # NOTE(review): MemoryFile's first positional argument may be the
        # owning file system rather than the path - confirm against its
        # signature before changing this call.
        return MemoryFile(path, None, r.content)

    def cat(self, path, recursive=False, on_error="raise", **kwargs):
        """
        Return {path: contents} for the given file or files. If 'recursive' is True,
        and path is empty, returns all files in the gist.

        A dict is returned even when a single path is given. ``on_error``
        controls missing files: ``"raise"`` re-raises the
        ``FileNotFoundError``, ``"omit"`` skips the file, and any other value
        stores the exception as that path's value.
        """
        paths = self.expand_path(path, recursive=recursive)
        out = {}
        for p in paths:
            try:
                with self.open(p, "rb") as f:
                    out[p] = f.read()
            except FileNotFoundError as e:
                if on_error == "raise":
                    raise
                elif on_error == "omit":
                    pass  # skip
                else:
                    out[p] = e
        return out

fsspec/implementations/github.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
from ..utils import infer_storage_options
88
from .memory import MemoryFile
99

10-
# TODO: add GIST backend, would be very similar
11-
1210

1311
class GithubFileSystem(AbstractFileSystem):
1412
"""Interface to files in github
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import pytest
2+
3+
import fsspec
4+
from fsspec.implementations.gist import GistFileSystem
5+
6+
7+
@pytest.mark.parametrize(
    "gist_id,sha",
    [("2656908684d3965b80c2", "2fb2f12f332f7e242b1a2af1f41e30ddf99f24c7")],
)
def test_gist_public_all_files(gist_id, sha):
    """A pinned public gist revision lists two files and ``cat`` yields bytes for each."""
    gist_fs = fsspec.filesystem("gist", gist_id=gist_id, sha=sha)

    # Listing
    names = gist_fs.ls("")
    assert len(names) == 2

    # Cat: one entry per listed name, each holding raw bytes
    contents = gist_fs.cat(names)
    assert set(contents) == set(names)
    assert all(isinstance(blob, bytes) for blob in contents.values())
21+
22+
23+
@pytest.mark.parametrize(
    "gist_id,sha,file",
    [
        (
            "2656908684d3965b80c2",
            "2fb2f12f332f7e242b1a2af1f41e30ddf99f24c7",
            "distributed_error_logs_PY3_7-3-2016",
        )
    ],
)
def test_gist_public_one_file(gist_id, sha, file):
    """Restricting the file system to one file name exposes exactly that file."""
    gist_fs = fsspec.filesystem("gist", gist_id=gist_id, sha=sha, filenames=[file])

    # Listing
    names = gist_fs.ls("")
    assert len(names) == 1

    # Cat: one entry per listed name, each holding raw bytes
    contents = gist_fs.cat(names)
    assert set(contents) == set(names)
    assert all(isinstance(blob, bytes) for blob in contents.values())
43+
44+
45+
@pytest.mark.parametrize(
    "gist_id,sha,file",
    [
        (
            "2656908684d3965b80c2",
            "2fb2f12f332f7e242b1a2af1f41e30ddf99f24c7",
            "file-that-doesnt-exist.py",
        )
    ],
)
def test_gist_public_missing_file(gist_id, sha, file):
    """Asking for a file name that is not in the gist fails at construction time."""
    options = {"gist_id": gist_id, "sha": sha, "filenames": [file]}
    with pytest.raises(FileNotFoundError):
        fsspec.filesystem("gist", **options)
58+
59+
60+
@pytest.mark.parametrize(
    "gist_id,sha,file,token,user",
    [
        ("gist-id-123", "sha_hash_a0b1", "a_file.txt", "secret_token", "my-user"),
        ("gist-id-123", "sha_hash_a0b1", "a_file.txt", "secret_token", ""),  # No user
        ("gist-id-123", "", "a_file.txt", "secret_token", "my-user"),  # No SHA
    ],
)
def test_gist_url_parse(gist_id, sha, file, token, user):
    """``gist://`` URLs are split into the matching constructor keyword arguments."""
    url = (
        f"gist://{user}:{token}@{gist_id}/{sha}/{file}"
        if sha
        else f"gist://{user}:{token}@{gist_id}/{file}"
    )

    # Empty URL components (no user, no SHA) must be absent from the result.
    expected = {"gist_id": gist_id, "token": token, "filenames": [file]}
    optional = {"username": user, "sha": sha}
    expected.update({key: value for key, value in optional.items() if value})

    assert GistFileSystem._get_kwargs_from_urls(url) == expected

fsspec/registry.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,10 @@ def register_implementation(name, cls, clobber=False, errtxt=None):
122122
"err": "Please install gdrivefs for access to Google Drive",
123123
},
124124
"generic": {"class": "fsspec.generic.GenericFileSystem"},
125+
"gist": {
126+
"class": "fsspec.implementations.gist.GistFileSystem",
127+
"err": "Install the requests package to use the gist FS",
128+
},
125129
"git": {
126130
"class": "fsspec.implementations.git.GitFileSystem",
127131
"err": "Install pygit2 to browse local git repos",

0 commit comments

Comments
 (0)