Skip to content

Commit 99ff08f

Browse files
committed
BUG: Gracefully handle all utf-8 characters in urls GH17918
1 parent 77b4bb3 commit 99ff08f

File tree

3 files changed

+12
-2
lines changed

3 files changed

+12
-2
lines changed

doc/source/whatsnew/v0.21.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -998,6 +998,7 @@ I/O
998998
- Bug in :meth:`DataFrame.to_html` in which there was no validation of the ``justify`` parameter (:issue:`17527`)
999999
- Bug in :func:`HDFStore.select` when reading a contiguous mixed-data table featuring VLArray (:issue:`17021`)
10001000
- Bug in :func:`to_json` where several conditions (including objects with unprintable symbols, objects with deep recursion, overlong labels) caused segfaults instead of raising the appropriate exception (:issue:`14256`)
1001+
- Bug in :func:`read_json` where not all UTF-8 characters in a URL were encoded properly when reading JSON data from that URL (:issue:`17918`)
10011002

10021003
Plotting
10031004
^^^^^^^^

pandas/io/common.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929

3030

3131
if compat.PY3:
32-
from urllib.request import urlopen, pathname2url
32+
from urllib.request import urlopen, pathname2url, quote
3333
_urlopen = urlopen
3434
from urllib.parse import urlparse as parse_url
3535
from urllib.parse import (uses_relative, uses_netloc, uses_params,
@@ -187,6 +187,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
187187
filepath_or_buffer = _stringify_path(filepath_or_buffer)
188188

189189
if _is_url(filepath_or_buffer):
190+
filepath_or_buffer = quote(filepath_or_buffer, safe=';/?:@&=+$,')
190191
req = _urlopen(filepath_or_buffer)
191192
content_encoding = req.headers.get('Content-Encoding', None)
192193
if content_encoding == 'gzip':

pandas/tests/io/json/test_pandas.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -845,12 +845,20 @@ def test_round_trip_exception_(self):
845845
index=df.index, columns=df.columns), df)
846846

847847
@network
848-
def test_url(self):
848+
def test_url_encoded(self):
849849
url = 'https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5' # noqa
850850
result = read_json(url, convert_dates=True)
851851
for c in ['created_at', 'closed_at', 'updated_at']:
852852
assert result[c].dtype == 'datetime64[ns]'
853853

854+
@network
855+
def test_url_unencoded(self):
856+
url = ('https://api.github.com/repos/pandas-dev/pandas/issues?per_pag'
857+
'e=5&test=fake parameter')
858+
result = read_json(url, convert_dates=True)
859+
for c in ['created_at', 'closed_at', 'updated_at']:
860+
assert result[c].dtype == 'datetime64[ns]'
861+
854862
def test_timedelta(self):
855863
converter = lambda x: pd.to_timedelta(x, unit='ms')
856864

0 commit comments

Comments
 (0)