diff --git a/doc/source/release.rst b/doc/source/release.rst index 47407eedb17bd..49656046129ca 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -285,6 +285,7 @@ Improvements to existing features - Translate ``sep='\s+'`` to ``delim_whitespace=True`` in :func:`read_csv`/:func:`read_table` if no other C-unsupported options specified (:issue:`6607`) +- ``read_excel`` can now read milliseconds in Excel dates and times with xlrd >= 0.9.3. (:issue:`5945`) .. _release.bug_fixes-0.14.0: diff --git a/pandas/io/excel.py b/pandas/io/excel.py index fef5a24e6ea20..f4f40c8be7855 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -18,6 +18,7 @@ import pandas.compat as compat import pandas.core.common as com from warnings import warn +from distutils.version import LooseVersion __all__ = ["read_excel", "ExcelWriter", "ExcelFile"] @@ -250,11 +251,19 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None, convert_float=True, **kwds): - from xlrd import (xldate_as_tuple, XL_CELL_DATE, + import xlrd + from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN, XL_CELL_NUMBER) - datemode = self.book.datemode + epoch1904 = self.book.datemode + + # xlrd >= 0.9.3 can return datetime objects directly. + if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): + xlrd_0_9_3 = True + else: + xlrd_0_9_3 = False + if isinstance(sheetname, compat.string_types): sheet = self.book.sheet_by_name(sheetname) else: # assume an integer if not a string @@ -271,12 +280,29 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0, if parse_cols is None or should_parse[j]: if typ == XL_CELL_DATE: - dt = xldate_as_tuple(value, datemode) - # how to produce this first case? - if dt[0] < datetime.MINYEAR: # pragma: no cover - value = datetime.time(*dt[3:]) + if xlrd_0_9_3: + # Use the newer xlrd datetime handling. + value = xldate.xldate_as_datetime(value, epoch1904) + + # Excel doesn't distinguish between dates and time, + # so we treat dates on the epoch as times only. + # Also, Excel supports 1900 and 1904 epochs. + year = (value.timetuple())[0:3] + if ((not epoch1904 and year == (1899, 12, 31)) + or (epoch1904 and year == (1904, 1, 1))): + value = datetime.time(value.hour, + value.minute, + value.second, + value.microsecond) else: - value = datetime.datetime(*dt) + # Use the xlrd <= 0.9.2 date handling. + dt = xldate.xldate_as_tuple(value, epoch1904) + + if dt[0] < datetime.MINYEAR: + value = datetime.time(*dt[3:]) + else: + value = datetime.datetime(*dt) + elif typ == XL_CELL_ERROR: value = np.nan elif typ == XL_CELL_BOOLEAN: @@ -727,8 +753,9 @@ def __init__(self, path, engine=None, import xlsxwriter super(_XlsxWriter, self).__init__(path, engine=engine, - date_format=date_format, datetime_format=datetime_format, - **engine_kwargs) + date_format=date_format, + datetime_format=datetime_format, + **engine_kwargs) self.book = xlsxwriter.Workbook(path, **engine_kwargs) diff --git a/pandas/io/tests/data/times_1900.xls b/pandas/io/tests/data/times_1900.xls new file mode 100644 index 0000000000000..e9a62b2c25da9 Binary files /dev/null and b/pandas/io/tests/data/times_1900.xls differ diff --git a/pandas/io/tests/data/times_1904.xls b/pandas/io/tests/data/times_1904.xls new file mode 100644 index 0000000000000..ac70787c358a5 Binary files /dev/null and b/pandas/io/tests/data/times_1904.xls differ diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index fde5764993e76..eb245c12c5e30 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -1,8 +1,9 @@ # pylint: disable=E1101 from pandas.compat import u, range, map -from datetime import datetime, date +from datetime import datetime, date, time import os +from distutils.version import LooseVersion import nose @@ -360,6 +361,49 @@ def test_reader_special_dtypes(self): convert_float=False) tm.assert_frame_equal(actual, no_convert_float) + def test_reader_seconds(self): + # Test reading times with and without milliseconds. GH5945. + _skip_if_no_xlrd() + import xlrd + + if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): + # Xlrd >= 0.9.3 can handle Excel milliseconds. + expected = DataFrame.from_items([("Time", + [time(1, 2, 3), + time(2, 45, 56, 100000), + time(4, 29, 49, 200000), + time(6, 13, 42, 300000), + time(7, 57, 35, 400000), + time(9, 41, 28, 500000), + time(11, 25, 21, 600000), + time(13, 9, 14, 700000), + time(14, 53, 7, 800000), + time(16, 37, 0, 900000), + time(18, 20, 54)])]) + else: + # Xlrd < 0.9.3 rounds Excel milliseconds. + expected = DataFrame.from_items([("Time", + [time(1, 2, 3), + time(2, 45, 56), + time(4, 29, 49), + time(6, 13, 42), + time(7, 57, 35), + time(9, 41, 29), + time(11, 25, 22), + time(13, 9, 15), + time(14, 53, 8), + time(16, 37, 1), + time(18, 20, 54)])]) + + epoch_1900 = os.path.join(self.dirpath, 'times_1900.xls') + epoch_1904 = os.path.join(self.dirpath, 'times_1904.xls') + + actual = read_excel(epoch_1900, 'Sheet1') + tm.assert_frame_equal(actual, expected) + + actual = read_excel(epoch_1904, 'Sheet1') + tm.assert_frame_equal(actual, expected) + class ExcelWriterBase(SharedItems): # Base class for test cases to run with different Excel writers. @@ -400,7 +444,7 @@ def test_excel_deprecated_options(self): with ensure_clean(self.ext) as path: with tm.assert_produces_warning(FutureWarning): self.frame.to_excel(path, 'test1', cols=['A', 'B']) - + with tm.assert_produces_warning(False): self.frame.to_excel(path, 'test1', columns=['A', 'B']) @@ -832,9 +876,9 @@ def test_to_excel_output_encoding(self): index=[u('A\u0192'), 'B'], columns=[u('X\u0193'), 'Y', 'Z']) with ensure_clean(filename) as filename: - df.to_excel(filename, sheet_name = 'TestSheet', encoding='utf8') - result = read_excel(filename, 'TestSheet', encoding = 'utf8') - tm.assert_frame_equal(result,df) + df.to_excel(filename, sheet_name='TestSheet', encoding='utf8') + result = read_excel(filename, 'TestSheet', encoding='utf8') + tm.assert_frame_equal(result, df) def test_to_excel_unicode_filename(self):