Skip to content

Commit 4d0f191

Browse files
0x0L0x0L
authored and
0x0L
committed
[ENH] Fama/French
* add get_datasets_famafrench (requires bs4) * complete rewrite of get_data_famafrench
1 parent e0cb350 commit 4d0f191

File tree

1 file changed

+91
-27
lines changed

1 file changed

+91
-27
lines changed

pandas_datareader/data.py

Lines changed: 91 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import datetime as dt
99
import time
1010
import csv
11+
import re
1112

1213
from collections import defaultdict
1314

@@ -581,42 +582,105 @@ def fetch_data(url, name):
581582
return df
582583

583584

584-
# Base URL of Ken French's data library; dataset archives live under
# ftp/<NAME>_CSV.zip relative to this root.
_FAMAFRENCH_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/'
_FF_PREFIX = 'ftp/'
_FF_SUFFIX = '_CSV.zip'


def get_datasets_famafrench():
    """
    Get the list of datasets available from the Fama/French data library.

    Scrapes the library's HTML index page, so the optional ``bs4``
    (BeautifulSoup) dependency is required.

    Returns
    -------
    A list of valid inputs for get_data_famafrench.
    """
    # Imported lazily so the rest of the module works without bs4.
    from bs4 import BeautifulSoup

    with urlopen(_FAMAFRENCH_URL + 'data_library.html') as socket:
        root = BeautifulSoup(socket.read(), 'html.parser')

    hrefs = [a.attrs['href'] for a in root.findAll('a') if 'href' in a.attrs]

    # Dataset names are the hrefs shaped like 'ftp/<NAME>_CSV.zip';
    # strip the prefix/suffix to recover <NAME>.
    return [href[len(_FF_PREFIX):-len(_FF_SUFFIX)] for href in hrefs
            if href.startswith(_FF_PREFIX) and href.endswith(_FF_SUFFIX)]
607+
608+
609+
def _download_data_famafrench(name):
    """
    Download the zipped CSV archive for *name* from the Fama/French
    library and return the contents of its (single) member as a string.

    Parameters
    ----------
    name : str
        A dataset name as returned by get_datasets_famafrench.
    """
    url = ''.join([_FAMAFRENCH_URL, _FF_PREFIX, name, _FF_SUFFIX])
    with urlopen(url) as socket:
        raw = socket.read()

    # Buffer the download to disk so ZipFile can seek within it.
    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            # Close the extracted member promptly instead of leaking it.
            with zf.open(zf.namelist()[0]) as member:
                # NOTE(review): decode() uses the default (UTF-8) codec;
                # looks sufficient for these ASCII-ish CSVs — confirm.
                data = member.read().decode()

    return data
621+
622+
623+
def _parse_date_famafrench(x):
624+
# what's the correct python way to do that ??
625+
x = x.strip()
626+
try: return dt.datetime.strptime(x, '%Y')
627+
except: pass
628+
try: return dt.datetime.strptime(x, '%Y%m')
629+
except: pass
630+
return to_datetime(x)
631+
632+
633+
def get_data_famafrench(name):
    """
    Get data for the given name from the Fama/French data library.

    For annual and monthly data, index is a pandas.PeriodIndex, otherwise
    it's a pandas.DatetimeIndex.

    Returns
    -------
    df : a dictionary of pandas.DataFrame. Tables are accessed by integer keys.
         See df['DESCR'] for a description of the dataset.
    """
    params = {'index_col': 0,
              'parse_dates': [0],
              'date_parser': _parse_date_famafrench}

    # headers in these files are not valid
    if name.endswith('_Breakpoints'):
        c = ['<=0', '>0'] if name.find('-') > -1 else ['Count']
        r = range(0, 105, 5)
        params['names'] = ['Date'] + c + list(zip(r, r[1:]))
        params['skiprows'] = 1 if name != 'Prior_2-12_Breakpoints' else 3

    # Short blank-line-separated chunks are prose documentation; long
    # ones are the data tables themselves.
    doc_chunks, tables = [], []
    data = _download_data_famafrench(name)
    for chunk in data.split(2 * '\r\n'):
        if len(chunk) < 800:
            doc_chunks.append(chunk.replace('\r\n', ' ').strip())
        else:
            tables.append(chunk)

    datasets, table_desc = {}, []
    for i, src in enumerate(tables):
        # The table proper starts at the first line beginning with a comma
        # (the unnamed date column); anything before it is a title.
        # Raw string so '\s' is a regex escape, not a (deprecated)
        # string escape.
        match = re.search(r'^\s*,', src, re.M)
        start = 0 if not match else match.start()

        df = read_csv(StringIO('Date' + src[start:]), **params)
        try:
            # Annual/monthly tables convert cleanly to a PeriodIndex.
            df = df.to_period()
        except Exception:
            # to_period() fails when no frequency can be inferred (e.g.
            # daily data with gaps) — keep the DatetimeIndex as-is.
            # Deliberately broad, but no longer a bare except that would
            # also swallow KeyboardInterrupt/SystemExit.
            pass
        datasets[i] = df

        title = src[:start].replace('\r\n', ' ').strip()
        shape = '({} rows x {} cols)'.format(*df.shape)
        table_desc.append('{} {}'.format(title, shape).strip())

    # Build the human-readable description: dataset name, prose chunks,
    # then a one-line summary per table.
    descr = '{}\n{}\n\n'.format(name.replace('_', ' '), len(name) * '-')
    if doc_chunks:
        descr += ' '.join(doc_chunks).replace(2 * ' ', ' ') + '\n\n'

    table_descr = map(lambda x: '{:3} : {}'.format(*x), enumerate(table_desc))

    datasets['DESCR'] = descr + '\n'.join(table_descr)
    return datasets
621685

622686

0 commit comments

Comments
 (0)