diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index af02bed..e5e6c9a 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: ["3.9"] # ["3.8", "3.9", "3.10"]
+ python-version: ["3.11"]
steps:
- uses: actions/checkout@v3
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 4a1e620..9913050 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -15,7 +15,7 @@ jobs:
fail-fast: false
matrix:
os-version: ["ubuntu-latest"]
- python-version: ["3.9"] # ["3.8", "3.9", "3.10"]
+ python-version: ["3.11"]
steps:
- uses: actions/checkout@v3
diff --git a/.gitignore b/.gitignore
index 2bab4d9..cdbe076 100644
Binary files a/.gitignore and b/.gitignore differ
diff --git a/CanDI/__version__.py b/CanDI/__version__.py
index cab7576..1fee926 100644
--- a/CanDI/__version__.py
+++ b/CanDI/__version__.py
@@ -1 +1 @@
-version = "0.1.1"
\ No newline at end of file
+version = "0.2.0"
\ No newline at end of file
diff --git a/CanDI/candi/__init__.py b/CanDI/candi/__init__.py
index 7fc3fbd..a2245f0 100644
--- a/CanDI/candi/__init__.py
+++ b/CanDI/candi/__init__.py
@@ -1,4 +1,6 @@
+from . import load
from . import data
+
data = data.Data() #Global object data instantiated on import required for access by GeneQuery Objects
-from . import (Gene, CellLine, Organelle, Cancer, CellLineCluster, GeneCluster)
+from .candi import (Gene, CellLine, Organelle, Cancer, CellLineCluster, GeneCluster)
diff --git a/CanDI/candi/candi.py b/CanDI/candi/candi.py
index c422fd8..eab6e45 100644
--- a/CanDI/candi/candi.py
+++ b/CanDI/candi/candi.py
@@ -1,11 +1,11 @@
# Classes for handling data aggregations
import operator
-from collections import OrderedDict, MutableSequence
+from collections.abc import MutableSequence
import itertools as it
import pandas as pd
import numpy as np
from . import data, grabber
-from . import entity
+from ..structures import entity
class SubsetHandler(object):
diff --git a/CanDI/candi/data.py b/CanDI/candi/data.py
index 5a2921c..24fafcb 100644
--- a/CanDI/candi/data.py
+++ b/CanDI/candi/data.py
@@ -14,21 +14,27 @@ class Data(object):
can be tuned to load specific datasets upon import by editing config.ini
can call Data.load() to load any specific dataset
"""
-    def __init__(self):
+    def __init__(self, config_path='auto', verbose=False):
+        """Parse the config file, verify the install and initialise sources, paths and index tables.
+
+        Args:
+            config_path (str or Path, optional): config file to read. 'auto' (default)
+                uses 'data/config.ini' inside the package's 'setup' directory.
+            verbose (bool, optional): print the config path when a custom one is used.
+
+        Raises:
+            FileNotFoundError: if a custom ``config_path`` does not exist.
+        """
-        self._file_path = Path(os.path.dirname(os.path.realpath(__file__))).parent.absolute() / 'setup'
-        config_path = self._file_path / 'data/config.ini'
+        # Set _file_path unconditionally, as the code did before config_path became a
+        # parameter, so the attribute also exists when a custom config file is supplied.
+        self._file_path = Path(os.path.dirname(os.path.realpath(__file__))).parent.absolute() / 'setup'
+        if config_path == 'auto':
+            config_path = self._file_path / 'data/config.ini'
+        elif not os.path.exists(config_path):
+            raise FileNotFoundError("Config file not found at {}".format(config_path))
+        elif verbose:
+            print("Using config file at {}".format(config_path))
parser = configparser.ConfigParser() #parses config for data sources
parser.read(config_path)
self._parser = parser
-        #self._verify_install()
+        self._verify_install()
self._init_sources()
self._init_depmap_paths()
-        # self._init_index_tables()
+        self._init_index_tables()
def _verify_install(self): #ensures data being loaded is present
+ #TODO: add more checks for different data sources
try:
assert "depmap_urls" in self._parser.sections()
except AssertionError:
@@ -91,6 +97,7 @@ def _handle_autoload(method, path):
df = pd.read_csv(path,
memory_map=True,
low_memory=False,
+ sep='\t',
index_col="DepMap_ID")
elif method == "locations":
diff --git a/CanDI/pipelines/__init__.py b/CanDI/pipelines/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/CanDI/pipelines/coessentiality/__init__.py b/CanDI/pipelines/coessentiality/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/CanDI/pipelines/diffexp.py b/CanDI/pipelines/diffexp.py
new file mode 100644
index 0000000..ea4a581
--- /dev/null
+++ b/CanDI/pipelines/diffexp.py
@@ -0,0 +1,52 @@
+import numpy as np
+import pandas as pd
+import anndata as ad
+
+from pydeseq2.dds import DeseqDataSet
+from pydeseq2.default_inference import DefaultInference
+from pydeseq2.ds import DeseqStats
+from adpbulk import ADPBulk
+
+
+def pseudobulk_by_group(adt, groups, method="mean"):
+    """Pseudobulk ``adt`` by ``groups`` with ADPBulk and wrap the result in an AnnData.
+
+    Args:
+        adt: data object handed to ``ADPBulk`` (presumably an AnnData — TODO confirm).
+        groups: forwarded to ``ADPBulk`` as ``groupby``.
+        method (str, optional): aggregation method forwarded to ``ADPBulk``. Defaults to "mean".
+
+    Returns:
+        anndata.AnnData: ``X`` is the pseudobulk matrix and ``obs`` is the ADPBulk
+        sample metadata indexed by its 'SampleName' column.
+    """
+    # initialize the object
+    adpb = ADPBulk(adt, groupby=groups, method=method)
+
+    # perform the pseudobulking
+    pseudobulk_matrix = adpb.fit_transform()
+
+    # retrieve the sample metadata (becomes `obs` of the returned AnnData)
+    sample_meta = adpb.get_meta()
+
+    out = ad.AnnData(
+        X=pseudobulk_matrix,
+        obs=sample_meta.set_index('SampleName')
+    )
+
+    return out
+
+
+def run_deseq(adata, design, tested_level, ref_level, n_cpus=8):
+    """Fit PyDESeq2 on ``adata`` and return the results table for one contrast.
+
+    Args:
+        adata: object providing ``to_df()`` (used as counts) and ``obs`` (used as metadata).
+        design: passed as ``design_factors`` and as the first element of the contrast
+            (presumably a single metadata column name — TODO confirm).
+        tested_level: second element of the contrast.
+        ref_level: third element of the contrast.
+        n_cpus (int, optional): CPUs given to ``DefaultInference``. Defaults to 8.
+
+    Returns:
+        The ``results_df`` attribute of the ``DeseqStats`` object after ``summary()``.
+    """
+
+    inference = DefaultInference(n_cpus=n_cpus)
+
+    dds = DeseqDataSet(
+        # NOTE(review): astype(int) truncates non-integer values (e.g. "mean" pseudobulk) — confirm intended
+        counts=adata.to_df().astype(int),
+        metadata=adata.obs,
+        design_factors=design, # factor(s) taken from `metadata`
+        refit_cooks=True,
+        inference=inference,
+    )
+
+    dds.deseq2()
+
+    stat_res = DeseqStats(
+        dds,
+        contrast=[design, tested_level, ref_level],
+        inference=inference
+    )
+    # summary() is called before results_df is read below
+    stat_res.summary()
+
+    df = stat_res.results_df
+
+    return df
\ No newline at end of file
diff --git a/CanDI/setup/dataverse.py b/CanDI/setup/dataverse.py
index 1dac195..ea0aee6 100644
--- a/CanDI/setup/dataverse.py
+++ b/CanDI/setup/dataverse.py
@@ -9,6 +9,18 @@
CANDI_DATAVERSE_DOI = 'doi:10.7910/DVN/JIAT0H'
+
+### Datasets Metadata ###
+
+coessentiality_dataset_names = [
+ 'genes',
+ # 10273535
+ 'GLS_p',
+ # 10273534
+ 'GLS_sign',
+ # 10273533
+]
+
depmap_dataset_names = [
'CCLE_expression',
'CCLE_fusions',
@@ -22,6 +34,11 @@
]
name2type = {
+ # Coessentiality datasets
+ 'genes': 'txt',
+ 'GLS_p': 'npy',
+ 'GLS_sign': 'npy',
+ # DepMap datasets
'CCLE_expression': 'csv',
'CCLE_fusions': 'csv',
'CCLE_gene_cn': 'csv',
@@ -34,6 +51,11 @@
}
name2id = {
+ # Coessentiality datasets
+ 'genes': 10273535,
+ 'GLS_p': 10273534,
+ 'GLS_sign': 10273533,
+ # DepMap datasets
'CCLE_expression': 8076862,
'CCLE_fusions': 10085763,
'CCLE_gene_cn': 8076861,
@@ -46,6 +68,7 @@
}
+### Utility functions ###
def print_sys(s):
"""system print
@@ -55,80 +78,102 @@ def print_sys(s):
print(s, flush = True, file = sys.stderr)
-def dataverse_download(url, path, name, types):
- """dataverse download helper with progress bar
-
- Args:
- url (str): the url of the dataset
- path (str): the path to save the dataset
- name (str): the dataset name
- types (dict): a dictionary mapping from the dataset name to the file format
- """
- save_path = os.path.join(path, f"{name}.{types[name]}")
- response = requests.get(url, stream=True)
- total_size_in_bytes = int(response.headers.get("content-length", 0))
- block_size = 1024
- progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
- with open(save_path, "wb") as file:
- for data in response.iter_content(block_size):
- progress_bar.update(len(data))
- file.write(data)
- progress_bar.close()
-
-
-def download_wrapper(name, path, return_type=None):
- """wrapper for downloading a dataset given the name and path, for csv,pkl,tsv or similar files
-
- Args:
- name (str): the rough dataset query name
- path (str): the path to save the dataset
- return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
+### Downloading scripts ###
+
+class Downloader:
+ def __init__(self):
+ pass
+
+    def _dataverse_download(self, url, path, name, types):
+        """dataverse download helper with progress bar
+
+        Args:
+            url (str): the url of the dataset
+            path (str): the path to save the dataset
+            name (str): the dataset name
+            types (dict): a dictionary mapping from the dataset name to the file format
+
+        Raises:
+            requests.HTTPError: if the server answers with an error status
+        """
+        save_path = os.path.join(path, f"{name}.{types[name]}")
+        response = requests.get(url, stream=True)
+        # Fail before save_path is created: otherwise an HTTP error page would be stored
+        # under the dataset's name and later be reused as a valid "local copy".
+        response.raise_for_status()
+        total_size_in_bytes = int(response.headers.get("content-length", 0))
+        block_size = 1024
+        progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
+        try:
+            with open(save_path, "wb") as file:
+                for data in response.iter_content(block_size):
+                    progress_bar.update(len(data))
+                    file.write(data)
+        finally:
+            # close the bar and release the connection even if the transfer fails
+            progress_bar.close()
+            response.close()
+
+
+    def _download_wrapper(self, name, path, return_type=None):
+        """Download one dataset into `path` unless a local copy already exists.
+
+        Args:
+            name (str): the dataset name; must be a key of `name2id` and `name2type`
+            path (str): the directory to save the dataset in (created if missing)
+            return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
+
+        Returns:
+            The download url, the local file name, or a (url, file_name) tuple,
+            depending on `return_type`; None for any other value.
+        """
+        server_path = "https://dataverse.harvard.edu/api/access/datafile/"
+
+        url = server_path + str(name2id[name])
+
+        # makedirs creates missing parents too and tolerates the directory appearing
+        # between the check and the creation
+        os.makedirs(path, exist_ok=True)
+
+        file_name = f"{name}.{name2type[name]}"
+
+        if os.path.exists(os.path.join(path, file_name)):
+            print_sys("Found local copy...")
+        else:
+            print_sys("Downloading...")
+            self._dataverse_download(url, path, name, name2type)
+
+        if return_type == "url":
+            return url
+        elif return_type == "name":
+            return file_name
+        elif return_type == ["url", "name"]:
+            return url, file_name
- Returns:
- str: the exact dataset query name
- """
- server_path = "https://dataverse.harvard.edu/api/access/datafile/"
-
- url = server_path + str(name2id[name])
-
- if not os.path.exists(path):
- os.mkdir(path)
-
- file_name = f"{name}.{name2type[name]}"
-
- if os.path.exists(os.path.join(path, file_name)):
- print_sys("Found local copy...")
- os.path.join(path, file_name)
- else:
- print_sys("Downloading...")
- dataverse_download(url, path, name, name2type)
- if return_type == "url":
- return url
- elif return_type == "name":
- return file_name
- elif return_type == ["url", "name"]:
- return url, file_name
-
-
-def depmap_dataverse_download(path, return_type=None):
- """download all datasets to the path
+    def run(self, path, datasets, return_type=None):
+        """download all datasets to the path
+
+        Args:
+            path (str): the path to save the datasets
+            datasets (iterable of str): dataset names, each handed to `_download_wrapper`
+            return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
+
+        Returns:
+            The list of urls, the list of file names, or a (urls, file_names) tuple,
+            depending on `return_type`; None for any other value (downloads still run).
+        """
+        url_list = []
+        file_names = []
+
+        for name in datasets:
+            # always ask the wrapper for both values; `return_type` only selects what is returned below
+            url, file_name = self._download_wrapper(name, path, return_type=["url", "name"])
+            url_list.append(url)
+            file_names.append(file_name)
+
+        if return_type == "url":
+            return url_list
+        elif return_type == "name":
+            return file_names
+        elif return_type == ["url", "name"]:
+            return url_list, file_names
+
+
+class DepMapDownloader(Downloader):
+    """Downloader bound to the DepMap dataset list (`depmap_dataset_names`)."""
+
+    def __init__(self):
+        super().__init__()
+
+    def download(self, path, return_type=None):
+        """Run the download for every name in `depmap_dataset_names` into `path`."""
+        return self.run(path, depmap_dataset_names, return_type)
- Args:
- path (str): the path to save the datasets
- return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
- """
- url_list = []
- file_names = []
- for name in depmap_dataset_names:
- url, file_name = download_wrapper(name, path, return_type=["url", "name"])
- url_list.append(url)
- file_names.append(file_name)
+class CoessentialityDownloader(Downloader):
+ def __init__(self):
+ super().__init__()
- if return_type == "url":
- return url_list
- elif return_type == "name":
- return file_names
- elif return_type == ["url", "name"]:
- return url_list, file_names
+ def download(self, path, return_type=None):
+ return self.run(path, coessentiality_dataset_names, return_type)
\ No newline at end of file
diff --git a/CanDI/setup/install.py b/CanDI/setup/install.py
index 0042e94..29eed77 100644
--- a/CanDI/setup/install.py
+++ b/CanDI/setup/install.py
@@ -1,30 +1,45 @@
import argparse
-from .manager import Manager
+from . import manager
+
def main():
+    """Command-line entry point: download one database and write the config file.
+
+    Options: --database ('depmap' or 'coessentiality'), --source ('dataverse' or
+    'depmap') and --directory (parent data directory, 'auto' by default).
+
+    Raises:
+        ValueError: if the database is unknown or the source is not valid for it.
+    """
parser = argparse.ArgumentParser()
+    parser.add_argument("--database", help="Specify the database to download", default="depmap")
parser.add_argument("--source", help="Specify the download source", default="dataverse")
-    parser.add_argument("--data_dir", help="Specify the data directory", default=None)
+    parser.add_argument("--directory", help="Specify the parent data directory", default='auto')
args = parser.parse_args()
-    if args.source == 'dataverse':
-        print("Downloading data from Dataverse")
-        m = Manager(download_source=args.source, data_dir=args.data_dir)
-        m.download_reformatted_data()
-        m.write_config(m.cfig_path, m.parser)
-
-    elif args.source == 'depmap':
-        print("Downloading data from DepMap")
-        m = Manager(download_source=args.source, data_dir=args.data_dir)
-        m.get_depmap_info()
-        m.write_config(m.cfig_path, m.parser)
-        m.download_defaults()
-        m.write_config(m.cfig_path, m.parser)
-        m.depmap_autoformat()
-        m.write_config(m.cfig_path, m.parser)
+    if args.database == 'depmap':
+        if args.source == 'dataverse':
+            print("Downloading data from Dataverse")
+            m = manager.DataverseDepMap(manager_path=args.directory, verbose=True)
+            m.download_reformatted_data()
+            m.write_config(m.cfig_path, m.parser)
+
+        elif args.source == 'depmap':
+            print("Downloading data from DepMap")
+            m = manager.BroadDepMap(manager_path=args.directory, verbose=True)
+            # the config is rewritten after every step so partial progress is recorded
+            m.get_depmap_info()
+            m.write_config(m.cfig_path, m.parser)
+            m.download_defaults()
+            m.write_config(m.cfig_path, m.parser)
+            m.depmap_autoformat()
+            m.write_config(m.cfig_path, m.parser)
-    else:
-        raise ValueError("Invalid source. Please specify either 'dataverse' or 'depmap'")
+        else:
+            raise ValueError("Invalid source. Please specify either 'dataverse' or 'depmap'")
+    elif args.database == 'coessentiality':
+        if args.source == 'dataverse':
+            print("Downloading data from Dataverse")
+            m = manager.DataverseCoessentiality(manager_path=args.directory, verbose=True)
+            m.download_raw_files()
+            m.coessentiality_autoformat()
+            m.write_config(m.cfig_path, m.parser)
+
+        else:
+            raise ValueError("Invalid source. Coessentiality data is only available on `dataverse`!")
+    else:
+        # an unrecognised --database used to fall through silently and do nothing
+        raise ValueError("Invalid database. Please specify either 'depmap' or 'coessentiality'")
+
+
if __name__ == "__main__":
main()
\ No newline at end of file
diff --git a/CanDI/setup/manager.py b/CanDI/setup/manager.py
index 5d2c3c4..8efacf0 100644
--- a/CanDI/setup/manager.py
+++ b/CanDI/setup/manager.py
@@ -1,39 +1,114 @@
+"""
+The manager module handles interactions with the data sources
+and the config file. It is used to set up the config file upon installation.
+All data downloading is done by the Manager class and its subclasses.
+"""
+
import os
import configparser
import json
import time
import requests
-import shutil
+import numpy as np
+import polars as pl
import pandas as pd
from time import sleep
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
-from .dataverse import depmap_dataverse_download
+from . import dataverse
+
class Manager(object):
- """The Manager class handles interations with the datasources
- and the config file. It is used to setup of the config file upon installation.
- All data downloading is done by Manager
- """
- def __init__(self, download_source=None, data_dir=None):
-
- if data_dir:
- manager_path = data_dir
- else:
+ def __init__(self, manager_path='auto', cfig_path='auto', verbose=False):
+ """Initializes the Manager class
+
+ Args:
+ manager_path (str, optional): The path to the manager directory. This is where the data will be stored.
+ cfig_path (str, optional): The path to the config file.
+ """
+ if manager_path == 'auto':
manager_path = os.path.dirname(os.path.realpath(__file__))
-
- cfig_path = manager_path + "/data/config.ini"
+ else:
+ # make sure the path is a directory and exists or create it
+ if not os.path.exists(manager_path):
+ os.makedirs(manager_path)
+
+ if cfig_path == 'auto':
+ cfig_path = manager_path + "/data/config.ini"
+
+ if verbose:
+ print(f"Manager Path: {manager_path}")
+ print(f"Config Path: {cfig_path}")
+
parser = configparser.ConfigParser()
parser.read(cfig_path.replace(".ini", ".draft.ini"))
self.manager_path = manager_path
self.cfig_path = Path(cfig_path)
self.parser = parser
- self.download_source = download_source
+
+    @staticmethod
+    def write_config(cfig_path, parser):
+        """Write `parser` to `cfig_path`, replacing any existing file.
+
+        Args:
+            cfig_path (str or Path): destination of the config file.
+            parser (configparser.ConfigParser): the configuration to serialise.
+        """
+        print("Writing config file")
+        # the with-block closes the file on exit, so no explicit close() is needed
+        with open(cfig_path, "w") as f:
+            parser.write(f)
+
+
+class DataverseDepMap(Manager):
+    def __init__(self, manager_path='auto', cfig_path='auto', verbose=False):
+        """Initialise the base Manager, then record the DepMap release and the download source."""
+        super().__init__(manager_path, cfig_path, verbose)
+        self.release = '21Q4' # default release uploaded to CanDI dataverse
+        self.download_source = 'dataverse, ' + dataverse.CANDI_DATAVERSE_DOI
- def sanger_download():
- pass
+    def download_reformatted_data(self):
+        """Download the DepMap files from Dataverse and fill the config parser sections.
+
+        Creates `<manager_path>/data/depmap/` if needed, downloads through
+        `dataverse.DepMapDownloader`, then sets the parser sections "depmap_urls",
+        "depmap_files", "formatted" and "data_paths". Nothing is written to disk
+        here besides the downloaded files; returns nothing.
+        """
+        if not os.path.exists(self.manager_path + '/data/'):
+            os.makedirs(self.manager_path + '/data/')
+
+        if not os.path.exists(self.manager_path + '/data/depmap/'):
+            os.makedirs(self.manager_path + '/data/depmap/')
+
+        session = dataverse.DepMapDownloader()
+        urls, file_names = session.download(
+            self.manager_path + '/data/depmap/',
+            return_type= ["url", "name"]
+        )
+
+        # file name -> download url
+        depmap_urls = {
+            file: url for url, file in zip(urls, file_names)
+        }
+
+        # short key (text before the first '.', with 'CCLE_' / 'CRISPR_' removed) -> file name
+        depmap_files = {}
+        for file in file_names:
+            f_key = file.split('.')[0]
+            f_key = f_key.replace('CCLE_','')
+            f_key = f_key.replace('CRISPR_','')
+            depmap_files[f_key] = file
+
+        # full path -> file name, skipping any readme
+        # NOTE(review): DataverseCoessentiality.coessentiality_autoformat writes this section
+        # as file name -> path; confirm which orientation the readers of "formatted" expect.
+        formatted = {
+            f'{self.manager_path}/data/depmap/{file}': file for file in file_names
+            if 'readme' not in file.lower()
+        }
+
+        data_paths = {
+            'depmap': 'data/depmap/',
+            'genes': 'data/genes/',
+            'corum': 'data/complexes/',
+            'location': 'data/location/'
+        }
+
+        self.parser["depmap_urls"] = depmap_urls
+        self.parser["depmap_files"] = depmap_files
+        self.parser["formatted"] = formatted
+        self.parser["data_paths"] = data_paths
+
+
+class BroadDepMap(Manager):
+ def __init__(self, manager_path='auto', cfig_path='auto', verbose=False):
+ super().__init__(manager_path, cfig_path, verbose)
+ self.download_source = 'Broad DepMap, https://depmap.org/'
+
def get_depmap_info(self, release="latest"):
depmap = self.parser["download_urls"]["depmap"]
@@ -48,7 +123,6 @@ def get_depmap_info(self, release="latest"):
self.parser["depmap_urls"] = self.download_info
self.parser["depmap_files"] = self.depmap_files
-
def parse_release(self):
download_urls = {}
@@ -74,16 +148,21 @@ def get_release(self, release):
return release_info["releaseName"]
-    def format_filename(self, filename):
+    def format_filename(self, filename, release="21Q4"):
+        """Return the CanDI name for a DepMap file name.
+
+        Args:
+            filename (str): the DepMap file name.
+            release (str, optional): DepMap release whose naming rules apply. Defaults to
+                "21Q4" so callers that pass only `filename` keep the previous behaviour.
+
+        Returns:
+            str: the text before the first '.'; for "21Q4" a leading "CRISPR_" or "CCLE_"
+            prefix and a trailing "_v2" are also removed. Other releases are returned
+            without further changes.
+        """
+        # set candi_name to the filename without the extension
candi_name = filename.split(".")[0]
-        if "CRISPR_" in candi_name:
-            candi_name = candi_name[len("CRISPR_"):]
-        elif "CCLE_" in candi_name:
-            candi_name = candi_name[len("CCLE_"):]
-        if 'v2' in candi_name:
-            candi_name = candi_name[:-len("_v2")]
+        if release == "21Q4":
+            if "CRISPR_" in candi_name:
+                candi_name = candi_name[len("CRISPR_"):]
+            elif "CCLE_" in candi_name:
+                candi_name = candi_name[len("CCLE_"):]
+            if 'v2' in candi_name:
+                candi_name = candi_name[:-len("_v2")]
+        else:
+            #TODO: add more cases for different releases, e.g. 24Q4 new file formats
+            pass
return candi_name
@@ -118,14 +197,12 @@ def fetch_url(self, entry):
downloads[filename] = str(path)
-
def parallel_fetch(self, entries):
print("Starting Pool")
with ThreadPoolExecutor(max_workers=4) as executor:
for i in entries:
executor.submit(self.fetch_url, i)
-
def download_defaults(self):
default_sources = json.loads(self.parser.get("defaults","downloads"))
@@ -134,7 +211,6 @@ def download_defaults(self):
entries = [self.manage_request(i, "depmap") for i in to_download]
self.parallel_fetch(entries)
-
def manage_request(self, name, path, filename=False):
if filename:
@@ -173,44 +249,48 @@ def depmap_autoformat(self):
df = pd.read_csv(v, low_memory=False, memory_map=True)
self.format_depmap_data(df, v)
-    def format_depmap_data(self, df, path):
+    def format_depmap_data(self, df, path, release="21Q4"):
+        # `release` defaults to "21Q4" so existing two-argument callers such as
+        # depmap_autoformat keep working and still get the 21Q4 formatting rules.
- if ("AAAS (8086)" in df.columns) or ("AAAS (ENSG00000094914)" in df.columns):
+ if release == "21Q4":
+ if ("AAAS (8086)" in df.columns) or ("AAAS (ENSG00000094914)" in df.columns):
- df.rename(columns = lambda s: s.split(" ")[0], inplace=True)
+ df.rename(columns = lambda s: s.split(" ")[0], inplace=True)
- if "Unnamed:" in df.columns:
- df.rename(columns={"Unnamed:":"DepMap_ID"}, inplace=True)
+ if "Unnamed:" in df.columns:
+ df.rename(columns={"Unnamed:":"DepMap_ID"}, inplace=True)
- df = df.set_index("DepMap_ID").T
- df.reset_index(inplace=True)
- df.rename(columns={"index":"gene"}, inplace=True)
- df.set_index("gene", inplace=True)
- df.to_csv(path)
+ df = df.set_index("DepMap_ID").T
+ df.reset_index(inplace=True)
+ df.rename(columns={"index":"gene"}, inplace=True)
+ df.set_index("gene", inplace=True)
+ df.to_csv(path)
- if "Protein_Change" in df.columns:
+ if "Protein_Change" in df.columns:
- try:
- df.drop("Unnamed: 0", axis=1, inplace=True)
- df.to_csv(path, index=False)
- except KeyError:
- pass
+ try:
+ df.drop("Unnamed: 0", axis=1, inplace=True)
+ df.to_csv(path, index=False)
+ except KeyError:
+ pass
- if "Hugo_Symbol" in df.columns:
- try:
- df.rename(columns={"Hugo_Symbol": "gene"}, inplace=True)
- df.to_csv(path, index=False)
- except KeyError:
- pass
+ if "Hugo_Symbol" in df.columns:
+ try:
+ df.rename(columns={"Hugo_Symbol": "gene"}, inplace=True)
+ df.to_csv(path, index=False)
+ except KeyError:
+ pass
- if "LeftGene" in df.columns:
- for col in df.columns:
- if "Gene" in col:
- split_cols = df[col].str.split(" ", expand=True)
- df[col] = split_cols[0]
- df[col[:-4] + "EnsemblID"] = split_cols[1].str.replace("(", "").str.replace(")", "")
+ if "LeftGene" in df.columns:
+ for col in df.columns:
+ if "Gene" in col:
+ split_cols = df[col].str.split(" ", expand=True)
+ df[col] = split_cols[0]
+ df[col[:-4] + "EnsemblID"] = split_cols[1].str.replace("(", "").str.replace(")", "")
- df.to_csv(path, index=False)
+ df.to_csv(path, index=False)
+ else:
+ #TODO: add more cases for different releases, e.g. 24Q4 new file formats
+ pass
try:
formatted = self.parser["formatted"]
@@ -221,46 +301,90 @@ def format_depmap_data(self, df, path):
formatted[path.split("/")[-1]] = path
- def download_reformatted_data(self, depmap_release=''):
- if not os.path.exists(self.manager_path + '/data/'):
- os.makedirs(self.manager_path + '/data/')
+class SangerDepMap(Manager):
+    """Placeholder manager for Sanger DepMap data; downloading is not implemented yet."""
+
+    def __init__(self, cfig_path='auto'):
+        # Pass by keyword: Manager's first positional parameter is manager_path, so a
+        # positional cfig_path would be taken as the data directory instead.
+        super().__init__(cfig_path=cfig_path)
-        if not os.path.exists(self.manager_path + '/data/depmap/'):
-            os.makedirs(self.manager_path + '/data/depmap/')
+    @staticmethod
+    def sanger_download():
+        """Stub; does nothing. Static so it can be called on the class or on an instance."""
+        pass
- if self.download_source == "dataverse":
- urls, file_names = depmap_dataverse_download(
- self.manager_path + '/data/depmap/',
- return_type= ["url", "name"]
- )
- depmap_urls = {
- file: url for url, file in zip(urls, file_names)
- }
+class DataverseCoessentiality(Manager):
+ def __init__(self, manager_path='auto', cfig_path='auto', verbose=False):
+ super().__init__(manager_path, cfig_path, verbose)
+ self.download_source = 'Dataverse'
+ self.reference = 'https://github.com/kundajelab/coessentiality'
+ self.verbose = verbose
+
+ def download_raw_files(self):
+ if not os.path.exists(self.manager_path + '/data/'):
+ os.makedirs(self.manager_path + '/data/')
- depmap_files = {}
- for file in file_names:
- f_key = file.split('.')[0]
- f_key = f_key.replace('CCLE_','')
- f_key = f_key.replace('CRISPR_','')
- depmap_files[f_key] = file
+ if not os.path.exists(self.manager_path + '/data/coessentiality/'):
+ os.makedirs(self.manager_path + '/data/coessentiality/')
+
+ session = dataverse.CoessentialityDownloader()
+ urls, file_names = session.download(
+ self.manager_path + '/data/coessentiality/',
+ return_type= ["url", "name"]
+ )
- formatted = {
- f'{self.manager_path}/data/depmap/{file}': file for file in file_names
- if 'readme' not in file.lower()
- }
+ self.urls = urls
+ self.file_names = file_names
- self.parser["depmap_urls"] = depmap_urls
- self.parser["depmap_files"] = depmap_files
- self.parser["formatted"] = formatted
+ def _load_coessentiality_matrix(self):
+ data_dir = f'{self.manager_path}/data/coessentiality'
- else:
- raise RuntimeError("Set download source to 'dataverse' before running download_formated_data")
+ gene_names = pd.read_csv(
+ f'{data_dir}/genes.txt',header=None,names=['gene_name']
+ )['gene_name']
+
+ GLS_sign = np.load(f'{data_dir}/GLS_sign.npy')
+ GLS_p = np.load(f'{data_dir}/GLS_p.npy')
+
+ self.matrix = pl.from_dataframe(
+ pd.DataFrame((-1*np.log10(GLS_p)) * GLS_sign, columns = gene_names, index = gene_names).reset_index()
+ )
+
+    def _get_coessentiality_df(self, pvalue_threshold = 10**-3):
+        """Build the long-format table of gene pairs that pass the score threshold.
+
+        Melts `self.matrix` on 'gene_name', drops rows pairing a gene with itself and
+        keeps rows whose score is greater than -log10(pvalue_threshold). The result
+        is stored in `self.df` and the threshold in `self.pvalue_threshold`; nothing
+        is returned. `self.matrix` must already be set.
+
+        Args:
+            pvalue_threshold (float, optional): p-value cut-off. Defaults to 10**-3.
+        """
+        df = self.matrix.melt('gene_name')
+        df.columns = ['gene_1','gene_2','coessentiality']
+        df = df.filter(~(pl.col('gene_1') == pl.col('gene_2')))
+        # one-sided filter: only scores above the positive threshold are kept
+        df = df.filter(pl.col('coessentiality') > -np.log10(pvalue_threshold))
+
+        self.df = df
+        self.pvalue_threshold = pvalue_threshold
- @staticmethod
- def write_config(cfig_path, parser):
+ def coessentiality_autoformat(self):
- print("Writing config file")
- with open(cfig_path, "w") as f:
- parser.write(f)
- f.close()
+ if self.verbose: print("Building Coessentiality Matrix ...", end=' ')
+ self._load_coessentiality_matrix()
+ self.matrix.to_pandas().to_csv(
+ f'{self.manager_path}/data/coessentiality/coessentiality_matrix.csv'
+ )
+ if self.verbose: print("Done!")
+
+ if self.verbose: print("Building Coessentiality DataFrame ...", end=' ')
+ self._get_coessentiality_df()
+ self.df.to_pandas().to_csv(
+ f'{self.manager_path}/data/coessentiality/coessentiality_df.csv'
+ )
+ if self.verbose: print("Done!")
+
+ self.parser['data_paths'] = {
+ 'coessentiality': 'data/coessentiality/'
+ }
+
+ self.parser['formatted'] = {
+ 'coessentiality_matrix.csv': f'{self.manager_path}/data/coessentiality/coessentiality_matrix.csv',
+ 'coessentiality_df.csv': f'{self.manager_path}/data/coessentiality/coessentiality_df.csv'
+ }
+
+ self.parser['depmap_files'] = {
+ 'coessentiality': f'{self.manager_path}/data/coessentiality/coessentiality_df.csv',
+ 'coessentiality_matrix': f'{self.manager_path}/data/coessentiality/coessentiality_matrix.csv',
+ # 'coessentiality_signs': f'{self.manager_path}/data/coessentiality/GLS_sign.npy',
+ # 'coessentiality_pvalues': f'{self.manager_path}/data/coessentiality/GLS_p.npy',
+ # 'gene_names': f'{self.manager_path}/data/coessentiality/genes.txt',
+ # 'pvalue_threshold': self.pvalue_threshold,
+ }
\ No newline at end of file
diff --git a/CanDI/setup/reset_config.py b/CanDI/setup/reset_config.py
index 39a3a9d..4325b8b 100644
--- a/CanDI/setup/reset_config.py
+++ b/CanDI/setup/reset_config.py
@@ -4,9 +4,24 @@
from .manager import Manager
-def main():
+def write_cfig(cfig_path, parser):
+    """Write `parser` to `cfig_path` by delegating to `Manager.write_config`."""
+
+    write_file = Manager.write_config
+    write_file(cfig_path, parser)
+
- cfig_path = os.path.dirname(os.path.realpath(__file__)) + "/data/config.ini"
+def main(cfig_path='auto'):
+ """
+ This function will reset the config file to only contain the default sections.
+ This is useful if you want to reset the config file to its original state.
+ """
+ if cfig_path == 'auto':
+ cfig_path = os.path.dirname(os.path.realpath(__file__)) + "/data/config.ini"
+ elif os.path.exists(cfig_path) == False:
+ raise FileNotFoundError("Config file not found")
+ elif os.path.exists(cfig_path) == True:
+ print("Using config file at: " + cfig_path)
+
parser = configparser.ConfigParser()
parser.read(cfig_path)
@@ -21,10 +36,5 @@ def main():
write_cfig(cfig_path, parser)
-def write_cfig(cfig_path, parser):
-
- write_file = Manager.write_config
- write_file(cfig_path, parser)
-
if __name__ == "__main__":
main()
diff --git a/CanDI/setup/uninstall.py b/CanDI/setup/uninstall.py
new file mode 100644
index 0000000..0037a56
--- /dev/null
+++ b/CanDI/setup/uninstall.py
@@ -0,0 +1,35 @@
+import os
+import sys
+import shutil
+import argparse
+from .manager import Manager
+
+
+def main():
+    """Command-line entry point: delete the downloaded DepMap data directory.
+
+    Options: --database (only 'depmap' is supported) and --directory (the data
+    parent directory; 'auto' uses the Manager's default path).
+
+    Raises:
+        ValueError: if --database is not 'depmap'.
+        SystemExit: if the directory is invalid or holds no DepMap data.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--database", help="Specify the database to uninstall", default="depmap")
+    parser.add_argument("--directory", help="Specify the data parent directory", default='auto')
+    args = parser.parse_args()
+
+    if args.database != 'depmap':
+        raise ValueError("Invalid database. Currently only 'depmap' is supported")
+
+    print("Uninstalling CanDI: removing DepMap data")
+
+    m = Manager()
+
+    if args.directory == 'auto':
+        depmap_path = m.manager_path + "/data/depmap/"
+    elif os.path.exists(args.directory):
+        depmap_path = args.directory + "/data/depmap/"
+    else:
+        sys.exit("Exit: Invalid directory path!")
+
+    if not os.path.exists(depmap_path):
+        sys.exit("Exit: Directory does not contain DepMap data")
+
+    # the earlier os.listdir() call discarded its result and has been dropped
+    shutil.rmtree(depmap_path)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/CanDI/structures/handlers.py b/CanDI/structures/handlers.py
index 4a6b9d5..bbfed65 100644
--- a/CanDI/structures/handlers.py
+++ b/CanDI/structures/handlers.py
@@ -1,9 +1,10 @@
import operator
import pandas as pd
import numpy as np
-import collections
+from collections.abc import Iterable
import six
+
class BinaryFilter:
"""BinaryFilter class filters datasets based on a specific threshold.
It's often useful to filter essentiality, expression, copy number etc.
@@ -146,7 +147,7 @@ def _get_variant(mut_dat, variant, item, all_except=False):
assert item in mut_dat[variant].unique(), "{0} not found, options are: {1}".format(item, mut_dat[variant].unique())
- if isinstance(item, collections.Iterable) and not isinstance(item, six.string_types):
+ if isinstance(item, Iterable) and not isinstance(item, six.string_types):
method = lambda x,y: mut_dat.loc[mut_dat[x].isin(y)]
else:
diff --git a/README.rst b/README.rst
index 8c26095..c1c8222 100644
--- a/README.rst
+++ b/README.rst
@@ -1,36 +1,21 @@
CanDI - A global cancer data integrator
=======================================
-|Documentation Status|
-|DOI|
-|Dataverse|
+|PyPI| |Downloads| |Documentation Status| |DOI| |Dataverse|
-Package Installation
---------------------
+Installation
+------------
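-CanDI is now available on `PyPI <https://pypi.org/project/PyCanDI/>`_ and can be installed with pip:
+CanDI is now available on `PyPI <https://pypi.org/project/PyCanDI/>`_ and can be installed with pip.
+Then the ``candi-install`` command automatically downloads the stable datasets from `Dataverse <https://doi.org/10.7910/DVN/JIAT0H>`_.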
.. code:: bash
+ # Package Installation
pip install PyCanDI
-___
-For the latest version (development version) install from GitHub:
-
-.. code:: bash
-
- pip install git+https://github.com/GilbertLabUCSF/CanDI.git
-
-
-Prepare Datasets
-~~~~~~~~~~~~~~~~
-
-The python command from CanDI will automatically download and modify
-datasets.
-
-.. code:: bash
-
- python CanDI/CanDI/setup/install.py
+ # Prepare Datasets
+ candi-install
Downloaded and formatted datasets would organize this way:
@@ -52,7 +37,13 @@ Downloaded and formatted datasets would organize this way:
└── locations
└── merged_locations.csv
-Package Usage
+
+**Note**:
+   *Currently, the DepMap API is not available for public use, so we provide preprocessed datasets
+   based on the DepMap 21Q4 release. Downloading the latest datasets will be possible once the DepMap API becomes available.*
+
+
+Usage
-------------
Import CanDI into python
@@ -79,9 +70,28 @@ CanDI Objects
- ``GeneCluster`` : Provides cross dataset indexing for a group of user
defined genes.
+Citation
+--------
+
+If you use CanDI in your research, please cite the following paper:
+
+.. code:: bibtex
+
+ Yogodzinski C, Arab A, Pritchard JR, Goodarzi H, Gilbert LA.
+ A global cancer data integrator reveals principles of synthetic lethality, sex disparity and immunotherapy.
+ Genome Med. 2021;13(1):167. Published 2021 Oct 18. doi:10.1186/s13073-021-00987-8
+
+
+
+.. |PyPI| image:: https://img.shields.io/pypi/v/PyCanDI
+ :target: https://pypi.org/project/PyCanDI/
+
.. |Documentation Status| image:: https://readthedocs.org/projects/candi/badge/?version=latest
:target: https://candi.readthedocs.io/en/latest/?badge=latest
+.. |Downloads| image:: https://static.pepy.tech/badge/pycandi
+ :target: https://pepy.tech/project/pycandi
+
.. |DOI| image:: https://zenodo.org/badge/DOI/10.1186/s13073-021-00987-8.svg
:target: https://doi.org/10.1186/s13073-021-00987-8
diff --git a/environment.yml b/environment.yml
index 3c3a84b..6570f0e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,7 +1,13 @@
name: candi
dependencies:
- - python==3.9
+ - python>=3.11,<4.0
- pandas
+ - numpy
+ - polars
- configparser
- requests
- tqdm
+ - pip
+ - pip:
+ - pydeseq2
+ - adpbulk
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 8504299..8c1dd7a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,9 @@
pandas
+numpy
+polars
+anndata
configparser
requests
tqdm
-
+adpbulk
+pydeseq2
\ No newline at end of file
diff --git a/scripts/run_deseq.r b/scripts/run_deseq.r
deleted file mode 100644
index f4f1088..0000000
--- a/scripts/run_deseq.r
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/env Rscript
-library(DESeq2)
-
-args = commandArgs(trailingOnly=TRUE)
-
-#read data
-counts.mat <- read.csv(args[1])
-coldata <- read.csv(args[2])
-
-#name metadata columns
-colnames(coldata) <- c("sample.id", "condition")
-#convert datatype to factor
-coldata[,-1] <- as.factor(coldata[, -1])
-#Match sample ids to sample columns in counts matrix
-coldata$sample.id <- sub("-", ".", coldata$sample.id)
-
-#init dds object
-dds <- DESeqDataSetFromMatrix(countData=counts.mat,
- colData=coldata,
- design= ~condition,
- tidy = TRUE)
-
-dds <- estimateSizeFactors(dds)
-#Filter lowly expressed genes
-idx <- rowSums(counts(dds, normalized=TRUE) >= 5) >= 3
-dds <- dds[idx,]
-
-dds <- DESeq(dds) #run deseq
-res <- results(dds) #get results
-#Show results
-print(summary(res))
-print(head(res))
-
-write.csv(res, args[3]) #save results
-
diff --git a/setup.py b/setup.py
index c15190e..b5a85ff 100644
--- a/setup.py
+++ b/setup.py
@@ -10,10 +10,13 @@
name='PyCanDI',
description='A cancer data integration package',
version=version,
- packages=find_packages(),
+
+ packages=find_packages(exclude=['tests', 'test_*']),
+
long_description=long_description,
long_description_content_type='text/x-rst',
- python_requires='>=3.9',
+
+ python_requires='>=3.11,<4.0',
install_requires=[
"pandas",
"configparser",
@@ -21,14 +24,21 @@
"tqdm",
],
url = 'https://github.com/GilbertLabUCSF/CanDI',
+
entry_points={
'console_scripts': [
'candi-install = CanDI.setup.install:main',
+ 'candi-uninstall = CanDI.setup.uninstall:main',
],
},
+
classifiers=[
'Intended Audience :: Science/Research',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3',
],
+
+ include_package_data=True,
+ setup_requires=['setuptools_scm'],
+
)
diff --git a/tests/test_candi.py b/tests/test_candi.py
index dd64b59..74f1a08 100644
--- a/tests/test_candi.py
+++ b/tests/test_candi.py
@@ -2,6 +2,7 @@
import pandas as pd
import numpy as np
from CanDI.structures.entity import Entity
+from CanDI.setup.manager import Manager
class testEntity(unittest.TestCase):
@@ -92,5 +93,6 @@ def test_canc_filters(self):
self.assertIsInstance(over, pd.core.frame.DataFrame)
self.assertIsInstance(under, pd.core.frame.DataFrame)
-
-
+class testManager(unittest.TestCase):
+ #TODO: Implement tests for Manager class
+ pass
\ No newline at end of file