8 | 8 | import datetime as dt
9 | 9 | import time
10 | 10 | import csv
| 11 | +import re
11 | 12 |
12 | 13 | from collections import defaultdict
13 | 14 |
@@ -581,42 +582,105 @@ def fetch_data(url, name):
581 | 582 |     return df
582 | 583 |
583 | 584 |
584 | | -_FAMAFRENCH_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp'
| 585 | +_FAMAFRENCH_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/'
| 586 | +_FF_PREFIX = 'ftp/'
| 587 | +_FF_SUFFIX = '_CSV.zip'
585 | 588 |
586 | 589 |
587 | | -def get_data_famafrench(name):
588 | | -    # path of zip files
589 | | -    zip_file_path = '{0}/{1}_TXT.zip'.format(_FAMAFRENCH_URL, name)
| 590 | +def get_datasets_famafrench():
| 591 | +    """
| 592 | +    Get the list of datasets available from the Fama/French data library.
| 593 | +
| 594 | +    Returns
| 595 | +    -------
| 596 | +    A list of valid inputs for get_data_famafrench.
| 597 | +    """
| 598 | +    from bs4 import BeautifulSoup
| 599 | +
| 600 | +    with urlopen(_FAMAFRENCH_URL + 'data_library.html') as socket:
| 601 | +        root = BeautifulSoup(socket.read(), 'html.parser')
| 602 | +
| 603 | +    links = filter(lambda x: x.startswith(_FF_PREFIX) and x.endswith(_FF_SUFFIX),
| 604 | +                   [e.attrs['href'] for e in root.findAll('a') if 'href' in e.attrs])
590 | 605 |
591 | | -    with urlopen(zip_file_path) as url:
592 | | -        raw = url.read()
| 606 | +    return list(map(lambda x: x[len(_FF_PREFIX):-len(_FF_SUFFIX)], links))
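A quick sanity check on the name round-trip this helper relies on: each qualifying href is stripped of the `ftp/` prefix and `_CSV.zip` suffix, and the remainder is exactly what `get_data_famafrench` expects. A minimal sketch, assuming `F-F_Research_Data_Factors` is among the scraped hrefs (it is a well-known dataset in the library):

```python
# Illustrative only: the slicing mirrors get_datasets_famafrench above.
href = 'ftp/F-F_Research_Data_Factors_CSV.zip'
name = href[len('ftp/'):-len('_CSV.zip')]
assert name == 'F-F_Research_Data_Factors'
```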
| 607 | +
| 608 | +
| 609 | +def _download_data_famafrench(name):
| 610 | +    url = ''.join([_FAMAFRENCH_URL, _FF_PREFIX, name, _FF_SUFFIX])
| 611 | +    with urlopen(url) as socket:
| 612 | +        raw = socket.read()
593 | 613 |
594 | 614 |     with tempfile.TemporaryFile() as tmpf:
595 | 615 |         tmpf.write(raw)
596 | 616 |
597 | 617 |         with ZipFile(tmpf, 'r') as zf:
598 | | -            data = zf.open(zf.namelist()[0]).readlines()
599 | | -
600 | | -    line_lengths = np.array(lmap(len, data))
601 | | -    file_edges = np.where(line_lengths == 2)[0]
602 | | -
603 | | -    datasets = {}
604 | | -    edges = zip(file_edges + 1, file_edges[1:])
605 | | -    for i, (left_edge, right_edge) in enumerate(edges):
606 | | -        dataset = [d.split() for d in data[left_edge:right_edge]]
607 | | -        if len(dataset) > 10:
608 | | -            ncol_raw = np.array(lmap(len, dataset))
609 | | -            ncol = np.median(ncol_raw)
610 | | -            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
611 | | -            header = dataset[header_index]
612 | | -            ds_header = dataset[header_index + 1:]
613 | | -            # to ensure the header is unique
614 | | -            header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header,
615 | | -                                                                     start=1)]
616 | | -            index = np.array([d[0] for d in ds_header], dtype=int)
617 | | -            dataset = np.array([d[1:] for d in ds_header], dtype=float)
618 | | -            datasets[i] = DataFrame(dataset, index, columns=header)
| 618 | +            data = zf.open(zf.namelist()[0]).read().decode()
| 619 | +
| 620 | +    return data
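Note the assumptions baked into this helper: each archive contains exactly one member (`zf.namelist()[0]`) and the member decodes with the default codec. A hedged usage sketch, assuming network access and that the dataset name below is still published:

```python
# Sketch only: fetches the raw CRLF-delimited CSV text for one dataset.
# 'F-F_Research_Data_Factors' is assumed to still be a valid name.
raw = _download_data_famafrench('F-F_Research_Data_Factors')
print(raw[:200])  # a short textual preamble, then the first table
```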
| 621 | +
| 622 | +
| 623 | +def _parse_date_famafrench(x):
| 624 | +    # try the annual and monthly formats first, then fall back to pandas
| 625 | +    x = x.strip()
| 626 | +    try: return dt.datetime.strptime(x, '%Y')
| 627 | +    except ValueError: pass
| 628 | +    try: return dt.datetime.strptime(x, '%Y%m')
| 629 | +    except ValueError: pass
| 630 | +    return to_datetime(x)
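Spot-checks of the three branches, for reference (the inputs are illustrative; annual and monthly stamps are handled by `strptime`, anything else falls through to `to_datetime`):

```python
import datetime as dt

assert _parse_date_famafrench(' 1926 ') == dt.datetime(1926, 1, 1)  # annual
assert _parse_date_famafrench('192607') == dt.datetime(1926, 7, 1)  # monthly
_parse_date_famafrench('1926-07-01')  # falls back to pandas.to_datetime
```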
| 631 | +
| 632 | +
| 633 | +def get_data_famafrench(name):
| 634 | +    """
| 635 | +    Get data for the given name from the Fama/French data library.
| 636 | +
| 637 | +    For annual and monthly data, the index is a pandas.PeriodIndex; otherwise
| 638 | +    it is a pandas.DatetimeIndex.
| 639 | +
| 640 | +    Returns
| 641 | +    -------
| 642 | +    df : a dictionary of pandas.DataFrame objects. Tables are accessed
| 643 | +        by integer keys; see df['DESCR'] for a description of the dataset.
| 644 | +    """
| 645 | +    params = {'index_col': 0,
| 646 | +              'parse_dates': [0],
| 647 | +              'date_parser': _parse_date_famafrench}
| 648 | +
| 649 | +    # the headers in these files are not valid column names
| 650 | +    if name.endswith('_Breakpoints'):
| 651 | +        c = ['<=0', '>0'] if '-' in name else ['Count']
| 652 | +        r = range(0, 105, 5)
| 653 | +        params['names'] = ['Date'] + c + list(zip(r, r[1:]))
| 654 | +        params['skiprows'] = 1 if name != 'Prior_2-12_Breakpoints' else 3
| 655 | +
| 656 | +    doc_chunks, tables = [], []
| 657 | +    data = _download_data_famafrench(name)
| 658 | +    for chunk in data.split(2 * '\r\n'):
| 659 | +        if len(chunk) < 800:
| 660 | +            doc_chunks.append(chunk.replace('\r\n', ' ').strip())
| 661 | +        else:
| 662 | +            tables.append(chunk)
| 663 | +
| 664 | +    datasets, table_desc = {}, []
| 665 | +    for i, src in enumerate(tables):
| 666 | +        match = re.search(r'^\s*,', src, re.M)  # the table starts there
| 667 | +        start = 0 if not match else match.start()
| 668 | +
| 669 | +        df = read_csv(StringIO('Date' + src[start:]), **params)
| 670 | +        try: df = df.to_period()  # daily/weekly indexes have no fixed freq
| 671 | +        except ValueError: pass
| 672 | +        datasets[i] = df
| 673 | +
| 674 | +        title = src[:start].replace('\r\n', ' ').strip()
| 675 | +        shape = '({0} rows x {1} cols)'.format(*df.shape)
| 676 | +        table_desc.append('{0} {1}'.format(title, shape).strip())
| 677 | +
| 678 | +    descr = '{0}\n{1}\n\n'.format(name.replace('_', ' '), len(name) * '-')
| 679 | +    if doc_chunks: descr += ' '.join(doc_chunks).replace(2 * ' ', ' ') + '\n\n'
| 680 | +
| 681 | +    table_descr = map(lambda x: '{0:3} : {1}'.format(*x), enumerate(table_desc))
619 | 682 |
| 683 | +    datasets['DESCR'] = descr + '\n'.join(table_descr)
620 | 684 |     return datasets
621 | 685 |
622 | 686 |
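An end-to-end usage sketch of the new API, assuming network access; the dataset name is one known library entry and everything else follows from the code above:

```python
# Sketch: download and inspect one dataset.
ds = get_data_famafrench('F-F_Research_Data_Factors')
print(ds['DESCR'])     # title, description text, and an indexed table listing
monthly = ds[0]        # tables are keyed by integer, in file order
print(monthly.head())  # monthly tables carry a PeriodIndex
```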