diff --git a/pandas/io/common.py b/pandas/io/common.py
index dc7c483c1fb68..cf4bba6e97afb 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -104,85 +104,6 @@ def __next__(self):
 BaseIterator.next = lambda self: self.__next__()
 
 
-try:
-    from boto.s3 import key
-
-    class BotoFileLikeReader(key.Key):
-        """boto Key modified to be more file-like
-
-        This modification of the boto Key will read through a supplied
-        S3 key once, then stop. The unmodified boto Key object will repeatedly
-        cycle through a file in S3: after reaching the end of the file,
-        boto will close the file. Then the next call to `read` or `next` will
-        re-open the file and start reading from the beginning.
-
-        Also adds a `readline` function which will split the returned
-        values by the `\n` character.
-        """
-
-        def __init__(self, *args, **kwargs):
-            encoding = kwargs.pop("encoding", None)  # Python 2 compat
-            super(BotoFileLikeReader, self).__init__(*args, **kwargs)
-            # Add a flag to mark the end of the read.
-            self.finished_read = False
-            self.buffer = ""
-            self.lines = []
-            if encoding is None and compat.PY3:
-                encoding = "utf-8"
-            self.encoding = encoding
-            self.lines = []
-
-        def next(self):
-            return self.readline()
-
-        __next__ = next
-
-        def read(self, *args, **kwargs):
-            if self.finished_read:
-                return b'' if compat.PY3 else ''
-            return super(BotoFileLikeReader, self).read(*args, **kwargs)
-
-        def close(self, *args, **kwargs):
-            self.finished_read = True
-            return super(BotoFileLikeReader, self).close(*args, **kwargs)
-
-        def seekable(self):
-            """Needed for reading by bz2"""
-            return False
-
-        def readline(self):
-            """Split the contents of the Key by '\n' characters."""
-            if self.lines:
-                retval = self.lines[0]
-                self.lines = self.lines[1:]
-                return retval
-            if self.finished_read:
-                if self.buffer:
-                    retval, self.buffer = self.buffer, ""
-                    return retval
-                else:
-                    raise StopIteration
-
-            if self.encoding:
-                self.buffer = "{}{}".format(
-                    self.buffer, self.read(8192).decode(self.encoding))
-            else:
-                self.buffer = "{}{}".format(self.buffer, self.read(8192))
-
-            split_buffer = self.buffer.split("\n")
-            self.lines.extend(split_buffer[:-1])
-            self.buffer = split_buffer[-1]
-
-            return self.readline()
-except ImportError:
-    # boto is only needed for reading from S3.
-    pass
-except TypeError:
-    # boto/boto3 issues
-    # GH11915
-    pass
-
-
 def _is_url(url):
     """Check to see if a URL has a valid protocol.
 
@@ -319,32 +240,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
         return tuple(to_return)
 
     if _is_s3_url(filepath_or_buffer):
-        try:
-            import boto
-        except:
-            raise ImportError("boto is required to handle s3 files")
-        # Assuming AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_S3_HOST
-        # are environment variables
-        parsed_url = parse_url(filepath_or_buffer)
-        s3_host = os.environ.get('AWS_S3_HOST', 's3.amazonaws.com')
-
-        try:
-            conn = boto.connect_s3(host=s3_host)
-        except boto.exception.NoAuthHandlerFound:
-            conn = boto.connect_s3(host=s3_host, anon=True)
-
-        b = conn.get_bucket(parsed_url.netloc, validate=False)
-        if compat.PY2 and (compression == 'gzip' or
-                           (compression == 'infer' and
-                            filepath_or_buffer.endswith(".gz"))):
-            k = boto.s3.key.Key(b, parsed_url.path)
-            filepath_or_buffer = BytesIO(k.get_contents_as_string(
-                encoding=encoding))
-        else:
-            k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
-            k.open('r')  # Expose read errors immediately
-            filepath_or_buffer = k
-        return filepath_or_buffer, None, compression
+        from pandas.io.s3 import get_filepath_or_buffer
+        return get_filepath_or_buffer(filepath_or_buffer,
+                                      encoding=encoding,
+                                      compression=compression)
 
     # It is a pathlib.Path/py.path.local or string
     filepath_or_buffer = _stringify_path(filepath_or_buffer)
diff --git a/pandas/io/s3.py b/pandas/io/s3.py
new file mode 100644
index 0000000000000..df8f1d9187031
--- /dev/null
+++ b/pandas/io/s3.py
@@ -0,0 +1,111 @@
+""" s3 support for remote file interactivity """
+
+import os
+from pandas import compat
+from pandas.compat import BytesIO
+
+try:
+    import boto
+    from boto.s3 import key
+except (ImportError, TypeError):  # TypeError: boto/boto3 issues, GH11915
+    raise ImportError("boto is required to handle s3 files")
+
+if compat.PY3:
+    from urllib.parse import urlparse as parse_url
+else:
+    from urlparse import urlparse as parse_url
+
+
+class BotoFileLikeReader(key.Key):
+    r"""boto Key modified to be more file-like
+
+    This modification of the boto Key will read through a supplied
+    S3 key once, then stop. The unmodified boto Key object will repeatedly
+    cycle through a file in S3: after reaching the end of the file,
+    boto will close the file. Then the next call to `read` or `next` will
+    re-open the file and start reading from the beginning.
+
+    Also adds a `readline` function which will split the returned
+    values by the `\n` character.
+    """
+
+    def __init__(self, *args, **kwargs):
+        encoding = kwargs.pop("encoding", None)  # Python 2 compat
+        super(BotoFileLikeReader, self).__init__(*args, **kwargs)
+        # Add a flag to mark the end of the read.
+        self.finished_read = False
+        self.buffer = ""
+        self.lines = []
+        if encoding is None and compat.PY3:
+            encoding = "utf-8"
+        self.encoding = encoding
+
+    def next(self):
+        return self.readline()
+
+    __next__ = next
+
+    def read(self, *args, **kwargs):
+        if self.finished_read:
+            return b'' if compat.PY3 else ''
+        return super(BotoFileLikeReader, self).read(*args, **kwargs)
+
+    def close(self, *args, **kwargs):
+        self.finished_read = True
+        return super(BotoFileLikeReader, self).close(*args, **kwargs)
+
+    def seekable(self):
+        """Needed for reading by bz2"""
+        return False
+
+    def readline(self):
+        r"""Split the contents of the Key by '\n' characters."""
+        if self.lines:
+            retval = self.lines[0]
+            self.lines = self.lines[1:]
+            return retval
+        if self.finished_read:
+            if self.buffer:
+                retval, self.buffer = self.buffer, ""
+                return retval
+            else:
+                raise StopIteration
+
+        if self.encoding:
+            self.buffer = "{}{}".format(
+                self.buffer, self.read(8192).decode(self.encoding))
+        else:
+            self.buffer = "{}{}".format(self.buffer, self.read(8192))
+
+        split_buffer = self.buffer.split("\n")
+        self.lines.extend(split_buffer[:-1])
+        self.buffer = split_buffer[-1]
+
+        return self.readline()
+
+
+def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
+                           compression=None):
+
+    # Assuming AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_S3_HOST
+    # are environment variables
+    parsed_url = parse_url(filepath_or_buffer)
+    s3_host = os.environ.get('AWS_S3_HOST', 's3.amazonaws.com')
+
+    try:
+        conn = boto.connect_s3(host=s3_host)
+    except boto.exception.NoAuthHandlerFound:
+        conn = boto.connect_s3(host=s3_host, anon=True)
+
+    b = conn.get_bucket(parsed_url.netloc, validate=False)
+    if compat.PY2 and (compression == 'gzip' or
+                       (compression == 'infer' and
+                        filepath_or_buffer.endswith(".gz"))):
+        k = boto.s3.key.Key(b, parsed_url.path)
+        filepath_or_buffer = BytesIO(k.get_contents_as_string(
+            encoding=encoding))
+    else:
+        k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
+        k.open('r')  # Expose read errors immediately
+        filepath_or_buffer = k
+    return filepath_or_buffer, None, compression
diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py
index d9c09fa788332..6845eb009df5d 100644
--- a/pandas/io/tests/test_data.py
+++ b/pandas/io/tests/test_data.py
@@ -472,9 +472,6 @@ def test_options_source_warning(self):
 
 
 class TestDataReader(tm.TestCase):
-    def test_is_s3_url(self):
-        from pandas.io.common import _is_s3_url
-        self.assertTrue(_is_s3_url("s3://pandas/somethingelse.com"))
 
     @network
     def test_read_yahoo(self):
diff --git a/pandas/io/tests/test_s3.py b/pandas/io/tests/test_s3.py
new file mode 100644
index 0000000000000..8058698a906ea
--- /dev/null
+++ b/pandas/io/tests/test_s3.py
@@ -0,0 +1,14 @@
+import nose
+from pandas.util import testing as tm
+
+from pandas.io.common import _is_s3_url
+
+
+class TestS3URL(tm.TestCase):
+    def test_is_s3_url(self):
+        self.assertTrue(_is_s3_url("s3://pandas/somethingelse.com"))
+        self.assertFalse(_is_s3_url("s4://pandas/somethingelse.com"))
+
+if __name__ == '__main__':
+    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
+                   exit=False)