BUG: Gracefully handle all utf-8 characters in json urls #17933

Closed · wants to merge 1 commit
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
@@ -998,6 +998,7 @@ I/O
 - Bug in :meth:`DataFrame.to_html` in which there was no validation of the ``justify`` parameter (:issue:`17527`)
 - Bug in :func:`HDFStore.select` when reading a contiguous mixed-data table featuring VLArray (:issue:`17021`)
 - Bug in :func:`to_json` where several conditions (including objects with unprintable symbols, objects with deep recursion, overlong labels) caused segfaults instead of raising the appropriate exception (:issue:`14256`)
+- Bug in :func:`read_json` where utf-8 characters were not encoded properly when reading json data from a url (:issue:`17918`)
Contributor:
I suspect this is generally true for urls (not just for read_json), so maybe amend the note to say so. If you can add a test for more readers, that would be great.


Plotting
^^^^^^^^
5 changes: 3 additions & 2 deletions pandas/io/common.py
@@ -29,7 +29,7 @@


 if compat.PY3:
-    from urllib.request import urlopen, pathname2url
+    from urllib.request import urlopen, pathname2url, quote
     _urlopen = urlopen
     from urllib.parse import urlparse as parse_url
     from urllib.parse import (uses_relative, uses_netloc, uses_params,
@@ -38,7 +38,7 @@
     from http.client import HTTPException  # noqa
 else:
     from urllib2 import urlopen as _urlopen
-    from urllib import urlencode, pathname2url  # noqa
+    from urllib import urlencode, pathname2url, quote  # noqa
     from urlparse import urlparse as parse_url
     from urlparse import uses_relative, uses_netloc, uses_params, urljoin
     from urllib2 import URLError  # noqa
@@ -187,6 +187,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
     filepath_or_buffer = _stringify_path(filepath_or_buffer)

     if _is_url(filepath_or_buffer):
+        filepath_or_buffer = quote(filepath_or_buffer, safe=';/?:@&=+$,')
Contributor:
is there a reason the default is not enough here?

Contributor:
if yes, can you expand the test that exercises that (with another case)?

         req = _urlopen(filepath_or_buffer)
         content_encoding = req.headers.get('Content-Encoding', None)
         if content_encoding == 'gzip':
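The reviewer's question about the default can be illustrated with a small sketch (using a hypothetical URL, not one from the patch): `quote`'s default `safe='/'` would also percent-encode the reserved characters that give a URL its structure, whereas the explicit safe set in the patch preserves them and only escapes characters like the space.

```python
from urllib.parse import quote

# Hypothetical URL with a space in a query parameter value.
url = 'https://example.com/data?x=1&y=a b'

# Default safe='/': ':', '?', '&', '=' are percent-encoded too, mangling
# the URL structure.
print(quote(url))
# -> https%3A//example.com/data%3Fx%3D1%26y%3Da%20b

# The safe set used in the patch keeps reserved characters intact; only
# the space becomes %20.
print(quote(url, safe=';/?:@&=+$,'))
# -> https://example.com/data?x=1&y=a%20b
```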
10 changes: 9 additions & 1 deletion pandas/tests/io/json/test_pandas.py
@@ -845,12 +845,20 @@ def test_round_trip_exception_(self):
                                      index=df.index, columns=df.columns), df)

     @network
-    def test_url(self):
+    def test_url_encoded(self):
         url = 'https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5'  # noqa
         result = read_json(url, convert_dates=True)
         for c in ['created_at', 'closed_at', 'updated_at']:
             assert result[c].dtype == 'datetime64[ns]'

+    @network
+    def test_url_unencoded(self):
+        url = ('https://api.github.com/repos/pandas-dev/pandas/issues?per_pag'
+               'e=5&test=fake parameter')
+        result = read_json(url, convert_dates=True)
+        for c in ['created_at', 'closed_at', 'updated_at']:
+            assert result[c].dtype == 'datetime64[ns]'

     def test_timedelta(self):
         converter = lambda x: pd.to_timedelta(x, unit='ms')

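One subtlety that the already-encoded test case touches on: quoting with this safe set is not idempotent, because `%` is not in the safe set. A sketch (with a hypothetical URL) of what happens if an already-escaped URL were quoted again:

```python
from urllib.parse import quote

safe = ';/?:@&=+$,'

# First pass: the space in this hypothetical URL is escaped to %20.
once = quote('https://example.com/a b', safe=safe)
print(once)   # -> https://example.com/a%20b

# Second pass: '%' is not safe, so the existing escape is re-encoded.
twice = quote(once, safe=safe)
print(twice)  # -> https://example.com/a%2520b
```

This is why a test against a URL that is already percent-encoded (and contains a literal `%`) would be a useful additional case.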