diff --git a/.gitignore b/.gitignore
index 57ac2c3..2bab4d9 100644
Binary files a/.gitignore and b/.gitignore differ
diff --git a/CanDI/candi/data.py b/CanDI/candi/data.py
index 701eff4..5a2921c 100644
--- a/CanDI/candi/data.py
+++ b/CanDI/candi/data.py
@@ -26,7 +26,7 @@ def __init__(self):
         #self._verify_install()
         self._init_sources()
         self._init_depmap_paths()
-        self._init_index_tables()
+        # self._init_index_tables()  # NOTE(review): no longer called on init -- confirm this is intended
 
     def _verify_install(self): #ensures data being loaded is present
         try:
diff --git a/CanDI/setup/data/config.ini b/CanDI/setup/data/config.draft.ini
similarity index 100%
rename from CanDI/setup/data/config.ini
rename to CanDI/setup/data/config.draft.ini
diff --git a/CanDI/setup/dataverse.py b/CanDI/setup/dataverse.py
new file mode 100644
index 0000000..1dac195
--- /dev/null
+++ b/CanDI/setup/dataverse.py
@@ -0,0 +1,148 @@
+"""Metadata and scripts to collect datasets for CanDI
+https://doi.org/10.7910/DVN/JIAT0H
+"""
+import os
+import requests
+from tqdm import tqdm
+import sys
+
+
+CANDI_DATAVERSE_DOI = 'doi:10.7910/DVN/JIAT0H'
+
+depmap_dataset_names = [
+    'CCLE_expression',
+    'CCLE_fusions',
+    'CCLE_gene_cn',
+    'CCLE_mutations',
+    'CCLE_RNAseq_reads',
+    'CRISPR_gene_dependency',
+    'CRISPR_gene_effect',
+    'sample_info',
+    'README',
+]
+
+name2type = {
+    'CCLE_expression': 'csv',
+    'CCLE_fusions': 'csv',
+    'CCLE_gene_cn': 'csv',
+    'CCLE_mutations': 'csv',
+    'CCLE_RNAseq_reads': 'csv',
+    'CRISPR_gene_dependency': 'csv',
+    'CRISPR_gene_effect': 'csv',
+    'sample_info': 'csv',
+    'README': 'txt',
+}
+
+name2id = {
+    'CCLE_expression': 8076862,
+    'CCLE_fusions': 10085763,
+    'CCLE_gene_cn': 8076861,
+    'CCLE_mutations': 8076857,
+    'CCLE_RNAseq_reads': 8076859,
+    'CRISPR_gene_dependency': 8076863,
+    'CRISPR_gene_effect': 8076860,
+    'sample_info': 10085764,
+    'README': 8151459,
+}
+
+
+def print_sys(s):
+    """system print
+
+    Args:
+        s (str): the string to print
+    """
+    print(s, flush = True, file = sys.stderr)
+
+
+def dataverse_download(url, path, name, types):
+    """dataverse download helper with progress bar
+
+    The data is streamed to a temporary ".part" file that is renamed to its
+    final name only after the transfer finished, so an interrupted download is
+    never mistaken for a complete local copy.
+
+    Args:
+        url (str): the url of the dataset
+        path (str): the path to save the dataset
+        name (str): the dataset name
+        types (dict): a dictionary mapping from the dataset name to the file format
+
+    Raises:
+        requests.HTTPError: if the server answers with an error status
+    """
+    save_path = os.path.join(path, f"{name}.{types[name]}")
+    partial_path = save_path + ".part"
+    block_size = 1024
+    with requests.get(url, stream=True) as response:
+        # fail loudly rather than saving an error page as the dataset
+        response.raise_for_status()
+        total_size_in_bytes = int(response.headers.get("content-length", 0))
+        with tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) as progress_bar:
+            with open(partial_path, "wb") as file:
+                for data in response.iter_content(block_size):
+                    progress_bar.update(len(data))
+                    file.write(data)
+    os.replace(partial_path, save_path)
+
+
+def download_wrapper(name, path, return_type=None):
+    """download one dataset by name unless a local copy already exists
+
+    Args:
+        name (str): the dataset name, one of the keys of name2id
+        path (str): the directory to save the dataset in, created when missing
+        return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
+
+    Returns:
+        str or tuple: the download url, the local file name, or both,
+        depending on return_type; None for any other return_type
+    """
+    server_path = "https://dataverse.harvard.edu/api/access/datafile/"
+
+    url = server_path + str(name2id[name])
+
+    # makedirs also creates missing parent directories and tolerates an existing one
+    os.makedirs(path, exist_ok=True)
+
+    file_name = f"{name}.{name2type[name]}"
+
+    if os.path.exists(os.path.join(path, file_name)):
+        print_sys("Found local copy...")
+    else:
+        print_sys("Downloading...")
+        dataverse_download(url, path, name, name2type)
+
+    if return_type == "url":
+        return url
+    elif return_type == "name":
+        return file_name
+    elif return_type == ["url", "name"]:
+        return url, file_name
+
+
+def depmap_dataverse_download(path, return_type=None):
+    """download all datasets to the path
+
+    Args:
+        path (str): the path to save the datasets
+        return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
+
+    Returns:
+        list or tuple: the urls, the file names, or both lists (in the order of
+        depmap_dataset_names), depending on return_type; None for any other return_type
+    """
+    url_list = []
+    file_names = []
+
+    for name in depmap_dataset_names:
+        url, file_name = download_wrapper(name, path, return_type=["url", "name"])
+        url_list.append(url)
+        file_names.append(file_name)
+
+    if return_type == "url":
+        return url_list
+    elif return_type == "name":
+        return file_names
+    elif return_type == ["url", "name"]:
+        return url_list, file_names
diff --git a/CanDI/setup/install.py b/CanDI/setup/install.py
index 257d303..43518f3 100644
--- a/CanDI/setup/install.py
+++ b/CanDI/setup/install.py
@@ -1,11 +1,26 @@
+import argparse
 from manager import Manager
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--source", help="Specify the download source", choices=["dataverse", "depmap"], default="dataverse")
+    args = parser.parse_args()
 
-    m = Manager()
-    m.get_depmap_info()
-    m.write_config(m.cfig_path, m.parser)
-    m.download_defaults()
-    m.write_config(m.cfig_path, m.parser)
-    m.depmap_autoformat()
-    m.write_config(m.cfig_path, m.parser)
+    if args.source == 'dataverse':
+        print("Downloading data from Dataverse")
+        m = Manager(download_source=args.source)
+        m.download_reformatted_data()
+        m.write_config(m.cfig_path, m.parser)
+
+    elif args.source == 'depmap':
+        print("Downloading data from DepMap")
+        m = Manager(download_source=args.source)
+        m.get_depmap_info()
+        m.write_config(m.cfig_path, m.parser)
+        m.download_defaults()
+        m.write_config(m.cfig_path, m.parser)
+        m.depmap_autoformat()
+        m.write_config(m.cfig_path, m.parser)
+
+    else:
+        raise ValueError("Invalid source. Please specify either 'dataverse' or 'depmap'")
diff --git a/CanDI/setup/manager.py b/CanDI/setup/manager.py
index 44aef56..3b50364 100644
--- a/CanDI/setup/manager.py
+++ b/CanDI/setup/manager.py
@@ -1,32 +1,32 @@
 import os
-import sys
 import configparser
 import json
 import time
+import requests
+import shutil
+import pandas as pd
 from time import sleep
 from pathlib import Path
-import contextlib
 from concurrent.futures import ThreadPoolExecutor
-import pandas as pd
-import requests
+from dataverse import depmap_dataverse_download
 
 class Manager(object):
     """The Manager class handles interations with the datasources
     and the config file. It is used to setup of the config file upon installation.
     All data downloading is done by Manager
     """
-    def __init__(self):
+    def __init__(self, download_source=None):
 
         manager_path = os.path.dirname(os.path.realpath(__file__))
         cfig_path = manager_path + "/data/config.ini"
         parser = configparser.ConfigParser()
-        parser.read(cfig_path)
+        parser.read(manager_path + "/data/config.draft.ini")
 
         self.manager_path = manager_path
         self.cfig_path = Path(cfig_path)
         self.parser = parser
-
-
+        self.download_source = download_source
+
     def sanger_download():
         pass
 
@@ -217,6 +217,50 @@ def format_depmap_data(self, df, path):
 
         formatted[path.split("/")[-1]] = path
 
+    def download_reformatted_data(self, depmap_release=''):
+        """Download the reformatted DepMap files from Dataverse and register them in the config.
+
+        Fills the "depmap_urls", "depmap_files" and "formatted" sections of
+        self.parser; writing the config to disk is left to the caller.
+
+        Args:
+            depmap_release (str, optional): currently unused.
+
+        Raises:
+            RuntimeError: if download_source is not "dataverse".
+        """
+        if self.download_source != "dataverse":
+            raise RuntimeError("Set download source to 'dataverse' before running download_reformatted_data")
+
+        urls, file_names = depmap_dataverse_download(
+            self.manager_path + '/data/depmap/',
+            return_type=["url", "name"]
+        )
+
+        depmap_urls = {
+            file: url for url, file in zip(urls, file_names)
+        }
+
+        # config keys are the file stems with 'CCLE_' and 'CRISPR_' stripped out
+        depmap_files = {}
+        for file in file_names:
+            f_key = file.split('.')[0]
+            f_key = f_key.replace('CCLE_','')
+            f_key = f_key.replace('CRISPR_','')
+            depmap_files[f_key] = file
+
+        # NOTE(review): this maps path -> file name, while format_depmap_data
+        # appears to store file name -> path; confirm which orientation the
+        # readers of the "formatted" section expect.
+        formatted = {
+            f'{self.manager_path}/data/depmap/{file}': file for file in file_names
+            if 'readme' not in file.lower()
+        }
+
+        self.parser["depmap_urls"] = depmap_urls
+        self.parser["depmap_files"] = depmap_files
+        self.parser["formatted"] = formatted
+
     @staticmethod
     def write_config(cfig_path, parser):
 
@@ -224,9 +268,3 @@ def write_config(cfig_path, parser):
         with open(cfig_path, "w") as f:
             parser.write(f)
             f.close()
-
-if __name__ == "__main__":
-    m = Manager()
-    #m.depmap_download("fusions")
-    m.depmap_autoformat()
-    m.write_config(m.cfig_path, m.parser)