Skip to content

Commit a7ca65c

Browse files
0x0L authored and 0x0L committed
[ENH] Fama/French
* add get_datasets_famafrench (requires bs4) * complete rewrite of get_data_famafrench
1 parent e0cb350 commit a7ca65c

File tree

1 file changed

+104
-27
lines changed

1 file changed

+104
-27
lines changed

pandas_datareader/data.py

Lines changed: 104 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import datetime as dt
99
import time
1010
import csv
11+
import re
1112

1213
from collections import defaultdict
1314

@@ -581,41 +582,117 @@ def fetch_data(url, name):
581582
return df
582583

583584

584-
_FAMAFRENCH_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp'
585+
_FAMAFRENCH_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/'
586+
_FF_PREFIX = 'ftp/'
587+
_FF_SUFFIX = '_CSV.zip'
585588

586589

587-
def get_data_famafrench(name):
588-
# path of zip files
589-
zip_file_path = '{0}/{1}_TXT.zip'.format(_FAMAFRENCH_URL, name)
590+
def get_datasets_famafrench():
    """
    Get the list of datasets available from the Fama/French data library.

    Returns
    -------
    A list of valid inputs for get_data_famafrench.
    """
    # bs4 is an optional dependency, so import it lazily here rather than
    # at module level.
    from bs4 import BeautifulSoup

    with urlopen(_FAMAFRENCH_URL + 'data_library.html') as socket:
        root = BeautifulSoup(socket.read(), 'html.parser')

    # Every downloadable CSV archive is linked as 'ftp/<name>_CSV.zip';
    # keep only those hrefs and strip the prefix/suffix to get the name.
    hrefs = [a.attrs['href'] for a in root.findAll('a') if 'href' in a.attrs]
    return [h[len(_FF_PREFIX):-len(_FF_SUFFIX)] for h in hrefs
            if h.startswith(_FF_PREFIX) and h.endswith(_FF_SUFFIX)]
607+
608+
609+
def _download_data_famafrench(name):
    """
    Download the zipped CSV archive for *name* and return its text contents.

    Parameters
    ----------
    name : str
        A dataset name as returned by get_datasets_famafrench.

    Returns
    -------
    str : the decoded contents of the first file in the zip archive.
    """
    url = '{0}{1}{2}{3}'.format(_FAMAFRENCH_URL, _FF_PREFIX, name, _FF_SUFFIX)
    with urlopen(url) as socket:
        raw = socket.read()

    # ZipFile needs a seekable file object, so spool the download to a
    # temporary file first.
    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            # Close the archive member explicitly instead of leaking the
            # handle (the original left zf.open(...) unclosed).
            with zf.open(zf.namelist()[0]) as member:
                data = member.read().decode()

    return data
621+
622+
623+
def _parse_date_famafrench(x):
624+
# what's the correct python way to do that ??
625+
x = x.strip()
626+
try: return dt.datetime.strptime(x, '%Y')
627+
except: pass
628+
try: return dt.datetime.strptime(x, '%Y%m')
629+
except: pass
630+
return to_datetime(x)
631+
632+
633+
def get_data_famafrench(name):
    """
    Get data for the given name from the Fama/French data library.

    For annual and monthly data, index is a pandas.PeriodIndex, otherwise
    it's a pandas.DatetimeIndex.

    Returns
    -------
    df : a dictionary of pandas.DataFrame. Tables are accessed by integer keys.
         See df['DESCR'] for a description of the dataset
    """
    params = {'index_col': 0,
              'parse_dates': [0],
              'date_parser': _parse_date_famafrench}

    if name.endswith('_Breakpoints'):
        # headers in these files are not valid, so skip them and supply
        # column names explicitly
        params['skiprows'] = 1
        if name == 'Prior_2-12_Breakpoints':
            params['skiprows'] = 3

        params['names'] = ['Date']
        if name.find('-') > -1:
            params['names'] += ['<=0', '>0']
        else:
            params['names'] += ['Count']
        r = range(0, 105, 5)
        params['names'] += list(zip(r, r[1:]))

    data = _download_data_famafrench(name)

    # chunks are split by blank lines
    # chunks with more than 800 characters are considered data
    # those with less characters are considered documentation
    doc, almost_csv = [], []
    for chunk in data.split(2 * '\r\n'):
        if len(chunk) < 800:
            doc.append(chunk.replace('\r\n', ' ').strip())
        else:
            almost_csv.append(chunk)
    desc = ' '.join(doc).replace(2 * ' ', ' ')

    datasets, table_desc = {}, []
    for i, src in enumerate(almost_csv):
        # the table starts at the first line beginning with a comma (the
        # unnamed index column); raw string so '\s' is a regex escape
        # rather than an invalid string escape
        match = re.search(r'^\s*,', src, re.M)
        start = 0 if not match else match.start()

        df = read_csv(StringIO('Date' + src[start:]), **params)
        try:
            # annual/monthly data gets a PeriodIndex; daily/weekly indexes
            # have no single inferable frequency and raise here, keeping
            # their DatetimeIndex (narrowed from the original bare except)
            df = df.to_period()
        except (ValueError, TypeError):
            pass
        datasets[i] = df

        short_desc = ' '.join([src[:start].replace('\r\n', ' ').strip(),
                               '({0} rows x {1} cols)'.format(*df.shape)])
        table_desc.append(short_desc)

    table_str = map(lambda x: '{} : {}'.format(*x), enumerate(table_desc))

    TEMPLATE = '{0}\n{1}\n\n{2}\n\n{3}'
    datasets['DESCR'] = TEMPLATE.format(name.replace('_', ' '), len(name) * '-',
                                        desc, '\n'.join(table_str))

    return datasets
621698

0 commit comments

Comments
 (0)