diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index af02bed..e5e6c9a 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9"] # ["3.8", "3.9", "3.10"] + python-version: ["3.11"] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 4a1e620..9913050 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -15,7 +15,7 @@ jobs: fail-fast: false matrix: os-version: ["ubuntu-latest"] - python-version: ["3.9"] # ["3.8", "3.9", "3.10"] + python-version: ["3.11"] steps: - uses: actions/checkout@v3 diff --git a/.gitignore b/.gitignore index 2bab4d9..cdbe076 100644 Binary files a/.gitignore and b/.gitignore differ diff --git a/CanDI/__version__.py b/CanDI/__version__.py index cab7576..1fee926 100644 --- a/CanDI/__version__.py +++ b/CanDI/__version__.py @@ -1 +1 @@ -version = "0.1.1" \ No newline at end of file +version = "0.2.0" \ No newline at end of file diff --git a/CanDI/candi/__init__.py b/CanDI/candi/__init__.py index 7fc3fbd..a2245f0 100644 --- a/CanDI/candi/__init__.py +++ b/CanDI/candi/__init__.py @@ -1,4 +1,6 @@ +from . import load from . import data + data = data.Data() #Global object data instantiated on import required for access by GeneQuery Objects -from . 
import (Gene, CellLine, Organelle, Cancer, CellLineCluster, GeneCluster) +from .candi import (Gene, CellLine, Organelle, Cancer, CellLineCluster, GeneCluster) diff --git a/CanDI/candi/candi.py b/CanDI/candi/candi.py index c422fd8..eab6e45 100644 --- a/CanDI/candi/candi.py +++ b/CanDI/candi/candi.py @@ -1,11 +1,11 @@ # Classes for handling data aggregations import operator -from collections import OrderedDict, MutableSequence +from collections.abc import MutableSequence import itertools as it import pandas as pd import numpy as np from . import data, grabber -from . import entity +from ..structures import entity class SubsetHandler(object): diff --git a/CanDI/candi/data.py b/CanDI/candi/data.py index 5a2921c..24fafcb 100644 --- a/CanDI/candi/data.py +++ b/CanDI/candi/data.py @@ -14,21 +14,27 @@ class Data(object): can be tuned to load specific datasets upon import by editing config.ini can call Data.load() to load any specific dataset """ - def __init__(self): + def __init__(self, config_path='auto', verbose=False): - self._file_path = Path(os.path.dirname(os.path.realpath(__file__))).parent.absolute() / 'setup' - config_path = self._file_path / 'data/config.ini' + if config_path == 'auto': + self._file_path = Path(os.path.dirname(os.path.realpath(__file__))).parent.absolute() / 'setup' + config_path = self._file_path / 'data/config.ini' + elif os.path.exists(config_path) == False: + raise FileNotFoundError("Config file not found at {}".format(config_path)) + elif os.path.exists(config_path) == True: + if verbose: print("Using config file at {}".format(config_path)) parser = configparser.ConfigParser() #parses config for data sources parser.read(config_path) self._parser = parser - #self._verify_install() + self._verify_install() self._init_sources() self._init_depmap_paths() - # self._init_index_tables() + self._init_index_tables() def _verify_install(self): #ensures data being loaded is present + #TODO: add more checks for different data sources try: assert 
"depmap_urls" in self._parser.sections() except AssertionError: @@ -91,6 +97,7 @@ def _handle_autoload(method, path): df = pd.read_csv(path, memory_map=True, low_memory=False, + sep='\t', index_col="DepMap_ID") elif method == "locations": diff --git a/CanDI/pipelines/__init__.py b/CanDI/pipelines/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/CanDI/pipelines/coessentiality/__init__.py b/CanDI/pipelines/coessentiality/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/CanDI/pipelines/diffexp.py b/CanDI/pipelines/diffexp.py new file mode 100644 index 0000000..ea4a581 --- /dev/null +++ b/CanDI/pipelines/diffexp.py @@ -0,0 +1,52 @@ +import numpy as np +import pandas as pd +import anndata as ad + +from pydeseq2.dds import DeseqDataSet +from pydeseq2.default_inference import DefaultInference +from pydeseq2.ds import DeseqStats +from adpbulk import ADPBulk + + +def pseudobulk_by_group(adt, groups, method="mean"): + # initialize the object + adpb = ADPBulk(adt, groupby=groups, method=method) + + # perform the pseudobulking + pseudobulk_matrix = adpb.fit_transform() + + # retrieve the sample metadata (useful for easy incorporation with edgeR) + sample_meta = adpb.get_meta() + + out = ad.AnnData( + X=pseudobulk_matrix, + obs=sample_meta.set_index('SampleName') + ) + + return out + + +def run_deseq(adata, design, tested_level, ref_level, n_cpus=8): + + inference = DefaultInference(n_cpus=n_cpus) + + dds = DeseqDataSet( + counts=adata.to_df().astype(int), + metadata=adata.obs, + design_factors=design, # compare samples based on the "condition" + refit_cooks=True, + inference=inference, + ) + + dds.deseq2() + + stat_res = DeseqStats( + dds, + contrast=[design, tested_level, ref_level], + inference=inference + ) + stat_res.summary() + + df = stat_res.results_df + + return df \ No newline at end of file diff --git a/CanDI/setup/dataverse.py b/CanDI/setup/dataverse.py index 1dac195..ea0aee6 100644 --- a/CanDI/setup/dataverse.py +++ 
b/CanDI/setup/dataverse.py @@ -9,6 +9,18 @@ CANDI_DATAVERSE_DOI = 'doi:10.7910/DVN/JIAT0H' + +### Datasets Metadata ### + +coessentiality_dataset_names = [ + 'genes', + # 10273535 + 'GLS_p', + # 10273534 + 'GLS_sign', + # 10273533 +] + depmap_dataset_names = [ 'CCLE_expression', 'CCLE_fusions', @@ -22,6 +34,11 @@ ] name2type = { + # Coessentiality datasets + 'genes': 'txt', + 'GLS_p': 'npy', + 'GLS_sign': 'npy', + # DepMap datasets 'CCLE_expression': 'csv', 'CCLE_fusions': 'csv', 'CCLE_gene_cn': 'csv', @@ -34,6 +51,11 @@ } name2id = { + # Coessentiality datasets + 'genes': 10273535, + 'GLS_p': 10273534, + 'GLS_sign': 10273533, + # DepMap datasets 'CCLE_expression': 8076862, 'CCLE_fusions': 10085763, 'CCLE_gene_cn': 8076861, @@ -46,6 +68,7 @@ } +### Utility functions ### def print_sys(s): """system print @@ -55,80 +78,102 @@ def print_sys(s): print(s, flush = True, file = sys.stderr) -def dataverse_download(url, path, name, types): - """dataverse download helper with progress bar - - Args: - url (str): the url of the dataset - path (str): the path to save the dataset - name (str): the dataset name - types (dict): a dictionary mapping from the dataset name to the file format - """ - save_path = os.path.join(path, f"{name}.{types[name]}") - response = requests.get(url, stream=True) - total_size_in_bytes = int(response.headers.get("content-length", 0)) - block_size = 1024 - progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) - with open(save_path, "wb") as file: - for data in response.iter_content(block_size): - progress_bar.update(len(data)) - file.write(data) - progress_bar.close() - - -def download_wrapper(name, path, return_type=None): - """wrapper for downloading a dataset given the name and path, for csv,pkl,tsv or similar files - - Args: - name (str): the rough dataset query name - path (str): the path to save the dataset - return_type (str, optional): the return type. Defaults to None. 
Can be "url", "name", or ["url", "name"] +### Downloading scripts ### + +class Downloader: + def __init__(self): + pass + + def _dataverse_download(self, url, path, name, types): + """dataverse download helper with progress bar + + Args: + url (str): the url of the dataset + path (str): the path to save the dataset + name (str): the dataset name + types (dict): a dictionary mapping from the dataset name to the file format + """ + save_path = os.path.join(path, f"{name}.{types[name]}") + response = requests.get(url, stream=True) + total_size_in_bytes = int(response.headers.get("content-length", 0)) + block_size = 1024 + progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + with open(save_path, "wb") as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + progress_bar.close() + + + def _download_wrapper(self, name, path, return_type=None): + """wrapper for downloading a dataset given the name and path, for csv,pkl,tsv or similar files + + Args: + name (str): the rough dataset query name + path (str): the path to save the dataset + return_type (str, optional): the return type. Defaults to None. 
Can be "url", "name", or ["url", "name"] + + Returns: + str: the exact dataset query name + """ + server_path = "https://dataverse.harvard.edu/api/access/datafile/" + + url = server_path + str(name2id[name]) + + if not os.path.exists(path): + os.mkdir(path) + + file_name = f"{name}.{name2type[name]}" + + if os.path.exists(os.path.join(path, file_name)): + print_sys("Found local copy...") + os.path.join(path, file_name) + else: + print_sys("Downloading...") + self._dataverse_download(url, path, name, name2type) + + if return_type == "url": + return url + elif return_type == "name": + return file_name + elif return_type == ["url", "name"]: + return url, file_name - Returns: - str: the exact dataset query name - """ - server_path = "https://dataverse.harvard.edu/api/access/datafile/" - - url = server_path + str(name2id[name]) - - if not os.path.exists(path): - os.mkdir(path) - - file_name = f"{name}.{name2type[name]}" - - if os.path.exists(os.path.join(path, file_name)): - print_sys("Found local copy...") - os.path.join(path, file_name) - else: - print_sys("Downloading...") - dataverse_download(url, path, name, name2type) - if return_type == "url": - return url - elif return_type == "name": - return file_name - elif return_type == ["url", "name"]: - return url, file_name - - -def depmap_dataverse_download(path, return_type=None): - """download all datasets to the path + def run(self, path, datasets, return_type=None): + """download all datasets to the path + + Args: + path (str): the path to save the datasets + return_type (str, optional): the return type. Defaults to None. 
Can be "url", "name", or ["url", "name"] + """ + url_list = [] + file_names = [] + + for name in datasets: + url, file_name = self._download_wrapper(name, path, return_type=["url", "name"]) + url_list.append(url) + file_names.append(file_name) + + if return_type == "url": + return url_list + elif return_type == "name": + return file_names + elif return_type == ["url", "name"]: + return url_list, file_names + + +class DepMapDownloader(Downloader): + def __init__(self): + super().__init__() + + def download(self, path, return_type=None): + return self.run(path, depmap_dataset_names, return_type) - Args: - path (str): the path to save the datasets - return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"] - """ - url_list = [] - file_names = [] - for name in depmap_dataset_names: - url, file_name = download_wrapper(name, path, return_type=["url", "name"]) - url_list.append(url) - file_names.append(file_name) +class CoessentialityDownloader(Downloader): + def __init__(self): + super().__init__() - if return_type == "url": - return url_list - elif return_type == "name": - return file_names - elif return_type == ["url", "name"]: - return url_list, file_names + def download(self, path, return_type=None): + return self.run(path, coessentiality_dataset_names, return_type) \ No newline at end of file diff --git a/CanDI/setup/install.py b/CanDI/setup/install.py index 0042e94..29eed77 100644 --- a/CanDI/setup/install.py +++ b/CanDI/setup/install.py @@ -1,30 +1,45 @@ import argparse -from .manager import Manager +from . 
import manager + def main(): parser = argparse.ArgumentParser() + parser.add_argument("--database", help="Specify the database to download", default="depmap") parser.add_argument("--source", help="Specify the download source", default="dataverse") - parser.add_argument("--data_dir", help="Specify the data directory", default=None) + parser.add_argument("--directory", help="Specify the parent data directory", default='auto') args = parser.parse_args() - if args.source == 'dataverse': - print("Downloading data from Dataverse") - m = Manager(download_source=args.source, data_dir=args.data_dir) - m.download_reformatted_data() - m.write_config(m.cfig_path, m.parser) - - elif args.source == 'depmap': - print("Downloading data from DepMap") - m = Manager(download_source=args.source, data_dir=args.data_dir) - m.get_depmap_info() - m.write_config(m.cfig_path, m.parser) - m.download_defaults() - m.write_config(m.cfig_path, m.parser) - m.depmap_autoformat() - m.write_config(m.cfig_path, m.parser) + if args.database == 'depmap': + if args.source == 'dataverse': + print("Downloading data from Dataverse") + m = manager.DataverseDepMap(manager_path=args.directory, verbose=True) + m.download_reformatted_data() + m.write_config(m.cfig_path, m.parser) + + elif args.source == 'depmap': + print("Downloading data from DepMap") + m = manager.BroadDepMap(manager_path=args.directory, verbose=True) + m.get_depmap_info() + m.write_config(m.cfig_path, m.parser) + m.download_defaults() + m.write_config(m.cfig_path, m.parser) + m.depmap_autoformat() + m.write_config(m.cfig_path, m.parser) - else: - raise ValueError("Invalid source. Please specify either 'dataverse' or 'depmap'") + else: + raise ValueError("Invalid source. 
Please specify either 'dataverse' or 'depmap'") + if args.database == 'coessentiality': + if args.source == 'dataverse': + print("Downloading data from Dataverse") + m = manager.DataverseCoessentiality(manager_path=args.directory, verbose=True) + m.download_raw_files() + m.coessentiality_autoformat() + m.write_config(m.cfig_path, m.parser) + + else: + raise ValueError("Invalid source. Coessentiality data is only available on `dataverse`!") + + if __name__ == "__main__": main() \ No newline at end of file diff --git a/CanDI/setup/manager.py b/CanDI/setup/manager.py index 5d2c3c4..8efacf0 100644 --- a/CanDI/setup/manager.py +++ b/CanDI/setup/manager.py @@ -1,39 +1,114 @@ +""" +The manager module handles interactions with the data sources +and the config file. It is used to set up the config file upon installation. +All data downloading is done by the Manager class and its subclasses. +""" + import os import configparser import json import time import requests -import shutil +import numpy as np +import polars as pl import pandas as pd from time import sleep from pathlib import Path from concurrent.futures import ThreadPoolExecutor -from .dataverse import depmap_dataverse_download +from . import dataverse + class Manager(object): - """The Manager class handles interations with the datasources - and the config file. It is used to setup of the config file upon installation. - All data downloading is done by Manager - """ - def __init__(self, download_source=None, data_dir=None): - - if data_dir: - manager_path = data_dir - else: + def __init__(self, manager_path='auto', cfig_path='auto', verbose=False): + """Initializes the Manager class + + Args: + manager_path (str, optional): The path to the manager directory. This is where the data will be stored. + cfig_path (str, optional): The path to the config file. 
+ """ + if manager_path == 'auto': manager_path = os.path.dirname(os.path.realpath(__file__)) - - cfig_path = manager_path + "/data/config.ini" + else: + # make sure the path is a directory and exists or create it + if not os.path.exists(manager_path): + os.makedirs(manager_path) + + if cfig_path == 'auto': + cfig_path = manager_path + "/data/config.ini" + + if verbose: + print(f"Manager Path: {manager_path}") + print(f"Config Path: {cfig_path}") + parser = configparser.ConfigParser() parser.read(cfig_path.replace(".ini", ".draft.ini")) self.manager_path = manager_path self.cfig_path = Path(cfig_path) self.parser = parser - self.download_source = download_source + + @staticmethod + def write_config(cfig_path, parser): + + print("Writing config file") + with open(cfig_path, "w") as f: + parser.write(f) + f.close() + + +class DataverseDepMap(Manager): + def __init__(self, manager_path='auto', cfig_path='auto', verbose=False): + super().__init__(manager_path, cfig_path, verbose) + self.release = '21Q4' # default release uploaded to CanDI dataverse + self.download_source = 'dataverse, ' + dataverse.CANDI_DATAVERSE_DOI - def sanger_download(): - pass + def download_reformatted_data(self): + if not os.path.exists(self.manager_path + '/data/'): + os.makedirs(self.manager_path + '/data/') + + if not os.path.exists(self.manager_path + '/data/depmap/'): + os.makedirs(self.manager_path + '/data/depmap/') + + session = dataverse.DepMapDownloader() + urls, file_names = session.download( + self.manager_path + '/data/depmap/', + return_type= ["url", "name"] + ) + depmap_urls = { + file: url for url, file in zip(urls, file_names) + } + + depmap_files = {} + for file in file_names: + f_key = file.split('.')[0] + f_key = f_key.replace('CCLE_','') + f_key = f_key.replace('CRISPR_','') + depmap_files[f_key] = file + + formatted = { + f'{self.manager_path}/data/depmap/{file}': file for file in file_names + if 'readme' not in file.lower() + } + + data_paths = { + 'depmap': 
'data/depmap/', + 'genes': 'data/genes/', + 'corum': 'data/complexes/', + 'location': 'data/location/' + } + + self.parser["depmap_urls"] = depmap_urls + self.parser["depmap_files"] = depmap_files + self.parser["formatted"] = formatted + self.parser["data_paths"] = data_paths + + +class BroadDepMap(Manager): + def __init__(self, manager_path='auto', cfig_path='auto', verbose=False): + super().__init__(manager_path, cfig_path, verbose) + self.download_source = 'Broad DepMap, https://depmap.org/' + def get_depmap_info(self, release="latest"): depmap = self.parser["download_urls"]["depmap"] @@ -48,7 +123,6 @@ def get_depmap_info(self, release="latest"): self.parser["depmap_urls"] = self.download_info self.parser["depmap_files"] = self.depmap_files - def parse_release(self): download_urls = {} @@ -74,16 +148,21 @@ def get_release(self, release): return release_info["releaseName"] - def format_filename(self, filename): + def format_filename(self, filename, release): + # set candi_name to the filename without the extension candi_name = filename.split(".")[0] - if "CRISPR_" in candi_name: - candi_name = candi_name[len("CRISPR_"):] - elif "CCLE_" in candi_name: - candi_name = candi_name[len("CCLE_"):] - if 'v2' in candi_name: - candi_name = candi_name[:-len("_v2")] + if release == "21Q4": + if "CRISPR_" in candi_name: + candi_name = candi_name[len("CRISPR_"):] + elif "CCLE_" in candi_name: + candi_name = candi_name[len("CCLE_"):] + if 'v2' in candi_name: + candi_name = candi_name[:-len("_v2")] + else: + #TODO: add more cases for different releases, e.g. 
24Q4 new file formats + pass return candi_name @@ -118,14 +197,12 @@ def fetch_url(self, entry): downloads[filename] = str(path) - def parallel_fetch(self, entries): print("Starting Pool") with ThreadPoolExecutor(max_workers=4) as executor: for i in entries: executor.submit(self.fetch_url, i) - def download_defaults(self): default_sources = json.loads(self.parser.get("defaults","downloads")) @@ -134,7 +211,6 @@ def download_defaults(self): entries = [self.manage_request(i, "depmap") for i in to_download] self.parallel_fetch(entries) - def manage_request(self, name, path, filename=False): if filename: @@ -173,44 +249,48 @@ def depmap_autoformat(self): df = pd.read_csv(v, low_memory=False, memory_map=True) self.format_depmap_data(df, v) - def format_depmap_data(self, df, path): + def format_depmap_data(self, df, path, release): - if ("AAAS (8086)" in df.columns) or ("AAAS (ENSG00000094914)" in df.columns): + if release == "21Q4": + if ("AAAS (8086)" in df.columns) or ("AAAS (ENSG00000094914)" in df.columns): - df.rename(columns = lambda s: s.split(" ")[0], inplace=True) + df.rename(columns = lambda s: s.split(" ")[0], inplace=True) - if "Unnamed:" in df.columns: - df.rename(columns={"Unnamed:":"DepMap_ID"}, inplace=True) + if "Unnamed:" in df.columns: + df.rename(columns={"Unnamed:":"DepMap_ID"}, inplace=True) - df = df.set_index("DepMap_ID").T - df.reset_index(inplace=True) - df.rename(columns={"index":"gene"}, inplace=True) - df.set_index("gene", inplace=True) - df.to_csv(path) + df = df.set_index("DepMap_ID").T + df.reset_index(inplace=True) + df.rename(columns={"index":"gene"}, inplace=True) + df.set_index("gene", inplace=True) + df.to_csv(path) - if "Protein_Change" in df.columns: + if "Protein_Change" in df.columns: - try: - df.drop("Unnamed: 0", axis=1, inplace=True) - df.to_csv(path, index=False) - except KeyError: - pass + try: + df.drop("Unnamed: 0", axis=1, inplace=True) + df.to_csv(path, index=False) + except KeyError: + pass - if "Hugo_Symbol" in 
df.columns: - try: - df.rename(columns={"Hugo_Symbol": "gene"}, inplace=True) - df.to_csv(path, index=False) - except KeyError: - pass + if "Hugo_Symbol" in df.columns: + try: + df.rename(columns={"Hugo_Symbol": "gene"}, inplace=True) + df.to_csv(path, index=False) + except KeyError: + pass - if "LeftGene" in df.columns: - for col in df.columns: - if "Gene" in col: - split_cols = df[col].str.split(" ", expand=True) - df[col] = split_cols[0] - df[col[:-4] + "EnsemblID"] = split_cols[1].str.replace("(", "").str.replace(")", "") + if "LeftGene" in df.columns: + for col in df.columns: + if "Gene" in col: + split_cols = df[col].str.split(" ", expand=True) + df[col] = split_cols[0] + df[col[:-4] + "EnsemblID"] = split_cols[1].str.replace("(", "").str.replace(")", "") - df.to_csv(path, index=False) + df.to_csv(path, index=False) + else: + #TODO: add more cases for different releases, e.g. 24Q4 new file formats + pass try: formatted = self.parser["formatted"] @@ -221,46 +301,90 @@ def format_depmap_data(self, df, path): formatted[path.split("/")[-1]] = path - def download_reformatted_data(self, depmap_release=''): - if not os.path.exists(self.manager_path + '/data/'): - os.makedirs(self.manager_path + '/data/') +class SangerDepMap(Manager): + def __init__(self, cfig_path='auto'): + super().__init__(cfig_path) - if not os.path.exists(self.manager_path + '/data/depmap/'): - os.makedirs(self.manager_path + '/data/depmap/') + def sanger_download(): + pass - if self.download_source == "dataverse": - urls, file_names = depmap_dataverse_download( - self.manager_path + '/data/depmap/', - return_type= ["url", "name"] - ) - depmap_urls = { - file: url for url, file in zip(urls, file_names) - } +class DataverseCoessentiality(Manager): + def __init__(self, manager_path='auto', cfig_path='auto', verbose=False): + super().__init__(manager_path, cfig_path, verbose) + self.download_source = 'Dataverse' + self.reference = 'https://github.com/kundajelab/coessentiality' + self.verbose = 
verbose + + def download_raw_files(self): + if not os.path.exists(self.manager_path + '/data/'): + os.makedirs(self.manager_path + '/data/') - depmap_files = {} - for file in file_names: - f_key = file.split('.')[0] - f_key = f_key.replace('CCLE_','') - f_key = f_key.replace('CRISPR_','') - depmap_files[f_key] = file + if not os.path.exists(self.manager_path + '/data/coessentiality/'): + os.makedirs(self.manager_path + '/data/coessentiality/') + + session = dataverse.CoessentialityDownloader() + urls, file_names = session.download( + self.manager_path + '/data/coessentiality/', + return_type= ["url", "name"] + ) - formatted = { - f'{self.manager_path}/data/depmap/{file}': file for file in file_names - if 'readme' not in file.lower() - } + self.urls = urls + self.file_names = file_names - self.parser["depmap_urls"] = depmap_urls - self.parser["depmap_files"] = depmap_files - self.parser["formatted"] = formatted + def _load_coessentiality_matrix(self): + data_dir = f'{self.manager_path}/data/coessentiality' - else: - raise RuntimeError("Set download source to 'dataverse' before running download_formated_data") + gene_names = pd.read_csv( + f'{data_dir}/genes.txt',header=None,names=['gene_name'] + )['gene_name'] + + GLS_sign = np.load(f'{data_dir}/GLS_sign.npy') + GLS_p = np.load(f'{data_dir}/GLS_p.npy') + + self.matrix = pl.from_dataframe( + pd.DataFrame((-1*np.log10(GLS_p)) * GLS_sign, columns = gene_names, index = gene_names).reset_index() + ) + + def _get_coessentiality_df(self, pvalue_threshold = 10**-3): + df = self.matrix.melt('gene_name') + df.columns = ['gene_1','gene_2','coessentiality'] + df = df.filter(~(pl.col('gene_1') == pl.col('gene_2'))) + df = df.filter(pl.col('coessentiality') > -np.log10(pvalue_threshold)) + + self.df = df + self.pvalue_threshold = pvalue_threshold - @staticmethod - def write_config(cfig_path, parser): + def coessentiality_autoformat(self): - print("Writing config file") - with open(cfig_path, "w") as f: - parser.write(f) - 
f.close() + if self.verbose: print("Building Coessentiality Matrix ...", end=' ') + self._load_coessentiality_matrix() + self.matrix.to_pandas().to_csv( + f'{self.manager_path}/data/coessentiality/coessentiality_matrix.csv' + ) + if self.verbose: print("Done!") + + if self.verbose: print("Building Coessentiality DataFrame ...", end=' ') + self._get_coessentiality_df() + self.df.to_pandas().to_csv( + f'{self.manager_path}/data/coessentiality/coessentiality_df.csv' + ) + if self.verbose: print("Done!") + + self.parser['data_paths'] = { + 'coessentiality': 'data/coessentiality/' + } + + self.parser['formatted'] = { + 'coessentiality_matrix.csv': f'{self.manager_path}/data/coessentiality/coessentiality_matrix.csv', + 'coessentiality_df.csv': f'{self.manager_path}/data/coessentiality/coessentiality_df.csv' + } + + self.parser['depmap_files'] = { + 'coessentiality': f'{self.manager_path}/data/coessentiality/coessentiality_df.csv', + 'coessentiality_matrix': f'{self.manager_path}/data/coessentiality/coessentiality_matrix.csv', + # 'coessentiality_signs': f'{self.manager_path}/data/coessentiality/GLS_sign.npy', + # 'coessentiality_pvalues': f'{self.manager_path}/data/coessentiality/GLS_p.npy', + # 'gene_names': f'{self.manager_path}/data/coessentiality/genes.txt', + # 'pvalue_threshold': self.pvalue_threshold, + } \ No newline at end of file diff --git a/CanDI/setup/reset_config.py b/CanDI/setup/reset_config.py index 39a3a9d..4325b8b 100644 --- a/CanDI/setup/reset_config.py +++ b/CanDI/setup/reset_config.py @@ -4,9 +4,24 @@ from .manager import Manager -def main(): +def write_cfig(cfig_path, parser): + + write_file = Manager.write_config + write_file(cfig_path, parser) + - cfig_path = os.path.dirname(os.path.realpath(__file__)) + "/data/config.ini" +def main(cfig_path='auto'): + """ + This function will reset the config file to only contain the default sections. + This is useful if you want to reset the config file to its original state. 
+ """ + if cfig_path == 'auto': + cfig_path = os.path.dirname(os.path.realpath(__file__)) + "/data/config.ini" + elif os.path.exists(cfig_path) == False: + raise FileNotFoundError("Config file not found") + elif os.path.exists(cfig_path) == True: + print("Using config file at: " + cfig_path) + parser = configparser.ConfigParser() parser.read(cfig_path) @@ -21,10 +36,5 @@ def main(): write_cfig(cfig_path, parser) -def write_cfig(cfig_path, parser): - - write_file = Manager.write_config - write_file(cfig_path, parser) - if __name__ == "__main__": main() diff --git a/CanDI/setup/uninstall.py b/CanDI/setup/uninstall.py new file mode 100644 index 0000000..0037a56 --- /dev/null +++ b/CanDI/setup/uninstall.py @@ -0,0 +1,35 @@ +import os +import sys +import shutil +import argparse +from .manager import Manager + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--database", help="Specify the database to uninstall", default="depmap") + parser.add_argument("--directory", help="Specify the data parent directory", default='auto') + args = parser.parse_args() + + if args.database == 'depmap': + print("Uninstalling CanDI: removing DepMap data") + + m = Manager() + + if args.directory == 'auto': + depmap_path = m.manager_path + "/data/depmap/" + elif os.path.exists(args.directory): + depmap_path = args.directory + "/data/depmap/" + else: + sys.exit("Exit: Invalid directory path!") + + if not os.path.exists(depmap_path): + sys.exit("Exit: Directory does not contain DepMap data") + else: + os.listdir(depmap_path) + shutil.rmtree(depmap_path) + else: + raise ValueError("Invalid database. 
Currently only 'depmap' is supported") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/CanDI/structures/handlers.py b/CanDI/structures/handlers.py index 4a6b9d5..bbfed65 100644 --- a/CanDI/structures/handlers.py +++ b/CanDI/structures/handlers.py @@ -1,9 +1,10 @@ import operator import pandas as pd import numpy as np -import collections +from collections.abc import Iterable import six + class BinaryFilter: """BinaryFilter class filters datasets based on a specific threshold. It's often useful to filter essentiality, expression, copy number etc. @@ -146,7 +147,7 @@ def _get_variant(mut_dat, variant, item, all_except=False): assert item in mut_dat[variant].unique(), "{0} not found, options are: {1}".format(item, mut_dat[variant].unique()) - if isinstance(item, collections.Iterable) and not isinstance(item, six.string_types): + if isinstance(item, Iterable) and not isinstance(item, six.string_types): method = lambda x,y: mut_dat.loc[mut_dat[x].isin(y)] else: diff --git a/README.rst b/README.rst index 8c26095..c1c8222 100644 --- a/README.rst +++ b/README.rst @@ -1,36 +1,21 @@ CanDI - A global cancer data integrator ======================================= -|Documentation Status| -|DOI| -|Dataverse| +|PyPI| |Downloads| |Documentation Status| |DOI| |Dataverse| -Package Installation --------------------- +Installation +------------ -CanDI is now available on `PyPI `_ and can be installed with pip: +CanDI is now available on `PyPI `_ and can be installed with pip. +Then, a command from CanDI will automatically download stable datasets from `Dataverse `_. .. code:: bash + # Package Installation pip install PyCanDI -___ -For the latest version (development version) install from GitHub: - -.. code:: bash - - pip install git+https://github.com/GilbertLabUCSF/CanDI.git - - -Prepare Datasets -~~~~~~~~~~~~~~~~ - -The python command from CanDI will automatically download and modify -datasets. - -.. 
code:: bash - - python CanDI/CanDI/setup/install.py + # Prepare Datasets + candi-install Downloaded and formatted datasets would organize this way: @@ -52,7 +37,13 @@ Downloaded and formatted datasets would organize this way: └── locations └── merged_locations.csv -Package Usage + +**Note**: + *Currently, DepMap API is not available for public use. Therefore, we are providing the preprocessed datasets for the users + based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets.* + + +Usage ------------- Import CanDI into python @@ -79,9 +70,28 @@ CanDI Objects - ``GeneCluster`` : Provides cross dataset indexing for a group of user defined genes. +Citation +-------- + +If you use CanDI in your research, please cite the following paper: + +.. code:: bibtex + + Yogodzinski C, Arab A, Pritchard JR, Goodarzi H, Gilbert LA. + A global cancer data integrator reveals principles of synthetic lethality, sex disparity and immunotherapy. + Genome Med. 2021;13(1):167. Published 2021 Oct 18. doi:10.1186/s13073-021-00987-8 + + + +.. |PyPI| image:: https://img.shields.io/pypi/v/PyCanDI + :target: https://pypi.org/project/PyCanDI/ + .. |Documentation Status| image:: https://readthedocs.org/projects/candi/badge/?version=latest :target: https://candi.readthedocs.io/en/latest/?badge=latest +.. |Downloads| image:: https://static.pepy.tech/badge/pycandi + :target: https://pepy.tech/project/pycandi + .. 
|DOI| image:: https://zenodo.org/badge/DOI/10.1186/s13073-021-00987-8.svg :target: https://doi.org/10.1186/s13073-021-00987-8 diff --git a/environment.yml b/environment.yml index 3c3a84b..6570f0e 100644 --- a/environment.yml +++ b/environment.yml @@ -1,7 +1,13 @@ name: candi dependencies: - - python==3.9 + - python>=3.11,<4.0 - pandas + - numpy + - polars - configparser - requests - tqdm + - pip + - pip: + - pydeseq2 + - adpbulk \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 8504299..8c1dd7a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,9 @@ pandas +numpy +polars +anndata configparser requests tqdm - +adpbulk +pydeseq2 \ No newline at end of file diff --git a/scripts/run_deseq.r b/scripts/run_deseq.r deleted file mode 100644 index f4f1088..0000000 --- a/scripts/run_deseq.r +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env Rscript -library(DESeq2) - -args = commandArgs(trailingOnly=TRUE) - -#read data -counts.mat <- read.csv(args[1]) -coldata <- read.csv(args[2]) - -#name metadata columns -colnames(coldata) <- c("sample.id", "condition") -#convert datatype to factor -coldata[,-1] <- as.factor(coldata[, -1]) -#Match sample ids to sample columns in counts matrix -coldata$sample.id <- sub("-", ".", coldata$sample.id) - -#init dds object -dds <- DESeqDataSetFromMatrix(countData=counts.mat, - colData=coldata, - design= ~condition, - tidy = TRUE) - -dds <- estimateSizeFactors(dds) -#Filter lowly expressed genes -idx <- rowSums(counts(dds, normalized=TRUE) >= 5) >= 3 -dds <- dds[idx,] - -dds <- DESeq(dds) #run deseq -res <- results(dds) #get results -#Show results -print(summary(res)) -print(head(res)) - -write.csv(res, args[3]) #save results - diff --git a/setup.py b/setup.py index c15190e..b5a85ff 100644 --- a/setup.py +++ b/setup.py @@ -10,10 +10,13 @@ name='PyCanDI', description='A cancer data integration package', version=version, - packages=find_packages(), + + packages=find_packages(exclude=['tests', 'test_*']), + 
long_description=long_description, long_description_content_type='text/x-rst', - python_requires='>=3.9', + + python_requires='>=3.11,<4.0', install_requires=[ "pandas", "configparser", @@ -21,14 +24,21 @@ "tqdm", ], url = 'https://github.com/GilbertLabUCSF/CanDI', + entry_points={ 'console_scripts': [ 'candi-install = CanDI.setup.install:main', + 'candi-uninstall = CanDI.setup.uninstall:main', ], }, + classifiers=[ 'Intended Audience :: Science/Research', 'License :: OSI Approved :: MIT License', 'Programming Language :: Python :: 3', ], + + include_package_data=True, + setup_requires=['setuptools_scm'], + ) diff --git a/tests/test_candi.py b/tests/test_candi.py index dd64b59..74f1a08 100644 --- a/tests/test_candi.py +++ b/tests/test_candi.py @@ -2,6 +2,7 @@ import pandas as pd import numpy as np from CanDI.structures.entity import Entity +from CanDI.setup.manager import Manager class testEntity(unittest.TestCase): @@ -92,5 +93,6 @@ def test_canc_filters(self): self.assertIsInstance(over, pd.core.frame.DataFrame) self.assertIsInstance(under, pd.core.frame.DataFrame) - - +class testManager(unittest.TestCase): + #TODO: Implement tests for Manager class + pass \ No newline at end of file