Skip to content

Commit 6d5f82d

Browse files
committed
feat(gist): fsspec file system for GitHub gists (resolves #888)
1 parent ac7031b commit 6d5f82d

File tree

5 files changed

+227
-2
lines changed

5 files changed

+227
-2
lines changed

docs/source/api.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ Built-in Implementations
117117
fsspec.implementations.dbfs.DatabricksFileSystem
118118
fsspec.implementations.dirfs.DirFileSystem
119119
fsspec.implementations.ftp.FTPFileSystem
120+
fsspec.implementations.gist.GistFileSystem
120121
fsspec.implementations.git.GitFileSystem
121122
fsspec.implementations.github.GithubFileSystem
122123
fsspec.implementations.http.HTTPFileSystem
@@ -162,6 +163,9 @@ Built-in Implementations
162163
.. autoclass:: fsspec.implementations.ftp.FTPFileSystem
163164
:members: __init__
164165

166+
.. autoclass:: fsspec.implementations.gist.GistFileSystem
167+
:members: __init__
168+
165169
.. autoclass:: fsspec.implementations.git.GitFileSystem
166170
:members: __init__
167171

fsspec/implementations/gist.py

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
import requests
2+
3+
from ..spec import AbstractFileSystem
4+
from ..utils import infer_storage_options
5+
from .memory import MemoryFile
6+
7+
8+
class GistFileSystem(AbstractFileSystem):
    """
    Interface to files in a single GitHub Gist.

    Provides read-only access to a gist's files. Gists do not contain
    subdirectories, so file listing is straightforward. The file list is
    fetched once, when the instance is created, and kept in ``dircache[""]``.

    Parameters
    ----------
    gist_id : str
        The ID of the gist you want to access (the long hex value from the URL).
    sha : str (optional)
        If provided, fetch a particular revision of the gist. If omitted,
        the latest revision is used.
    username : str (optional)
        GitHub username for authentication. Only usable together with
        ``token``; a username without a token is rejected.
    token : str (optional)
        GitHub personal access token. When ``username`` is also given the
        pair is sent as HTTP basic auth; when given alone it is sent as an
        ``Authorization: Bearer`` header.
    timeout : (float, float) or float, optional
        Connect and read timeouts for requests (default 60s each).
    kwargs : dict
        Passed to AbstractFileSystem base class.

    Raises
    ------
    ValueError
        If ``username`` is given without ``token``.
    FileNotFoundError
        If the gist (or the requested revision) does not exist.
    """

    protocol = "gist"
    gist_url = "https://api.github.com/gists/{gist_id}"
    gist_rev_url = "https://api.github.com/gists/{gist_id}/{sha}"

    def __init__(
        self, gist_id, sha=None, username=None, token=None, timeout=None, **kwargs
    ):
        super().__init__(**kwargs)
        self.gist_id = gist_id
        self.sha = sha  # revision of the gist (optional)
        # A username is useless without a token. A token on its own is fine:
        # that is what a URL such as gist://:TOKEN@<gist_id>/file.txt yields.
        if username and token is None:
            raise ValueError("Auth with a username also requires a token.")
        self.username = username
        self.token = token
        # Default timeouts to 60s connect/read if none provided
        self.timeout = timeout if timeout is not None else (60, 60)

        # We use a single-level "directory" cache, because a gist is essentially flat
        self.dircache[""] = self._fetch_file_list()

    @property
    def kw(self):
        """
        Auth-related keyword arguments to pass to ``requests`` calls.

        Returns basic-auth credentials when both username and token are set,
        a bearer-token header when only a token is set, and an empty dict
        when no credentials were given.
        """
        if self.username is not None and self.token is not None:
            return {"auth": (self.username, self.token)}
        if self.token:
            return {"headers": {"Authorization": f"Bearer {self.token}"}}
        return {}

    def _fetch_gist_metadata(self):
        """
        Fetch the JSON metadata for this gist (possibly for a specific revision).

        Returns
        -------
        The decoded JSON body of the API response.

        Raises
        ------
        FileNotFoundError
            If the API answers 404 for this gist/revision.
        requests.HTTPError
            For any other unsuccessful HTTP status.
        """
        if self.sha:
            url = self.gist_rev_url.format(gist_id=self.gist_id, sha=self.sha)
        else:
            url = self.gist_url.format(gist_id=self.gist_id)

        r = requests.get(url, timeout=self.timeout, **self.kw)
        if r.status_code == 404:
            raise FileNotFoundError(
                f"Gist not found: {self.gist_id}@{self.sha or 'latest'}"
            )
        r.raise_for_status()
        return r.json()

    def _fetch_file_list(self):
        """
        Returns a list of dicts describing each file in the gist. These get stored
        in self.dircache[""].

        Each entry has the keys ``name``, ``type`` (always ``"file"``), ``size``
        and ``raw_url``.
        """
        files = self._fetch_gist_metadata().get("files", {})
        return [
            {
                "name": fname,  # file's name
                "type": "file",  # gists have no subdirectories
                "size": finfo.get("size", 0),  # file size in bytes
                "raw_url": finfo.get("raw_url"),
            }
            for fname, finfo in files.items()
            # Occasionally GitHub returns a file entry with null if it was deleted
            if finfo is not None
        ]

    @classmethod
    def _strip_protocol(cls, path):
        """
        Remove 'gist://' (and any credentials / gist ID part) from the path,
        if present, and drop leading slashes so the result is a bare filename.
        """
        # infer_storage_options handles gist://username:token@id/file and
        # gist://id/file; if it reports no "path", fall back to the input.
        path = infer_storage_options(path).get("path", path)
        return path.lstrip("/")

    @staticmethod
    def _get_kwargs_from_urls(path):
        """
        Parse 'gist://' style URLs into GistFileSystem constructor kwargs.
        For example:
          gist://:TOKEN@<gist_id>/file.txt
          gist://username:TOKEN@<gist_id>/file.txt

        Only non-empty URL components are included in the result.
        """
        so = infer_storage_options(path)
        out = {}
        if so.get("username"):
            out["username"] = so["username"]
        if so.get("password"):
            out["token"] = so["password"]
        if so.get("host"):
            # We interpret 'host' as the gist ID
            out["gist_id"] = so["host"]
        return out

    def ls(self, path="", detail=False, **kwargs):
        """
        List files in the gist. Gists are single-level, so any 'path' is basically
        the filename, or empty for all files.

        Parameters
        ----------
        path : str, optional
            The filename to list. If empty, returns all files in the gist.
        detail : bool, default False
            If True, return a list of dicts; if False, return a sorted list
            of filenames.

        Raises
        ------
        FileNotFoundError
            If ``path`` is non-empty and no file of that name is in the gist.
        """
        path = self._strip_protocol(path or "")
        entries = self.dircache[""]
        if path:
            # We want just the single file with this name
            entries = [f for f in entries if f["name"] == path]
            if not entries:
                raise FileNotFoundError(path)
        if detail:
            # Hand out copies so callers cannot corrupt the cached listing
            return [dict(f) for f in entries]
        return sorted(f["name"] for f in entries)

    def _open(self, path, mode="rb", block_size=None, **kwargs):
        """
        Read a single file from the gist.

        The whole file is downloaded from its ``raw_url`` and returned as an
        in-memory file object.

        Raises
        ------
        NotImplementedError
            If ``mode`` is anything other than ``"rb"``.
        FileNotFoundError
            If the file is not in the gist, has no ``raw_url``, or the
            download answers 404.
        """
        if mode != "rb":
            raise NotImplementedError("GitHub Gist FS is read-only (no write).")

        path = self._strip_protocol(path)
        # Find the file entry in our dircache
        matches = [f for f in self.dircache[""] if f["name"] == path]
        if not matches:
            raise FileNotFoundError(path)
        finfo = matches[0]

        raw_url = finfo.get("raw_url")
        if not raw_url:
            raise FileNotFoundError(f"No raw_url for file: {path}")

        r = requests.get(raw_url, timeout=self.timeout, **self.kw)
        if r.status_code == 404:
            raise FileNotFoundError(path)
        r.raise_for_status()
        return MemoryFile(path, None, r.content)

    def cat(self, path, recursive=False, on_error="raise", **kwargs):
        """
        Return {path: contents} for the given file or files. If 'recursive' is True,
        and path is empty, returns all files in the gist.

        A dict is returned even when ``path`` names a single file.
        NOTE(review): the base-class ``cat`` is documented to return bare bytes
        for a single path; confirm whether this difference is intended.

        Parameters
        ----------
        path : str or list of str
            File name(s) to read; expanded with ``expand_path``.
        recursive : bool
            Passed through to ``expand_path``.
        on_error : str
            What to do when a file is missing: ``"raise"`` re-raises the
            FileNotFoundError, ``"omit"`` leaves the path out of the result,
            and any other value stores the exception as that path's value.
        """
        paths = self.expand_path(path, recursive=recursive)
        out = {}
        for p in paths:
            try:
                with self.open(p, "rb") as f:
                    out[p] = f.read()
            except FileNotFoundError as e:
                if on_error == "raise":
                    raise
                elif on_error == "omit":
                    pass  # skip
                else:
                    out[p] = e
        return out

fsspec/implementations/github.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
from ..utils import infer_storage_options
88
from .memory import MemoryFile
99

10-
# TODO: add GIST backend, would be very similar
11-
1210

1311
class GithubFileSystem(AbstractFileSystem):
1412
"""Interface to files in github
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import pytest
2+
3+
import fsspec
4+
5+
6+
@pytest.mark.parametrize(
7+
"gist_id,sha",
8+
[("16bee4256595d3b6814be139ab1bd54e", "760905f9f222ad41b9c3fd8308cbbd016943c65a")],
9+
)
10+
def test_gist_public(gist_id, sha):
11+
fs = fsspec.filesystem("gist", gist_id=gist_id, sha=sha)
12+
# Listing
13+
all_files = fs.ls("")
14+
assert len(all_files) > 0
15+
# Cat
16+
data = fs.cat(all_files)
17+
assert set(data.keys()) == set(all_files)
18+
for v in data.values():
19+
assert isinstance(v, bytes)

fsspec/registry.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,10 @@ def register_implementation(name, cls, clobber=False, errtxt=None):
122122
"err": "Please install gdrivefs for access to Google Drive",
123123
},
124124
"generic": {"class": "fsspec.generic.GenericFileSystem"},
125+
"gist": {
126+
"class": "fsspec.implementations.gist.GistFileSystem",
127+
"err": "Install the requests package to use the gist FS",
128+
},
125129
"git": {
126130
"class": "fsspec.implementations.git.GitFileSystem",
127131
"err": "Install pygit2 to browse local git repos",

0 commit comments

Comments
 (0)