6 changes: 4 additions & 2 deletions .github/workflows/run_pytest.yml
@@ -1,4 +1,4 @@
# Run cellbender's tests
# Run cellbender test suite

name: 'pytest'

@@ -7,11 +7,13 @@ on: pull_request
jobs:
build:

runs-on: 'ubuntu-latest'
strategy:
matrix:
os: ['ubuntu-latest', 'windows-latest']
python-version: ['3.7']

runs-on: ${{ matrix.os }}

steps:
- name: 'Checkout repo'
uses: actions/checkout@v3
5 changes: 4 additions & 1 deletion MANIFEST.in
@@ -1,4 +1,7 @@
include README.rst
include LICENSE
include requirements.txt
include requirements-rtd.txt
include requirements-rtd.txt
include requirements-dev.txt
include cellbender/VERSION.txt
include cellbender/remove_background/report.ipynb
2 changes: 1 addition & 1 deletion build_docker_release.sh
@@ -1,6 +1,6 @@
#!/bin/bash

tag=$(cat cellbender/__init__.py | sed -e 's?__version__ = ??' | sed "s/^'\(.*\)'$/\1/")
tag=$(cat cellbender/VERSION.txt)
release=v${tag}

docker build \
1 change: 1 addition & 0 deletions cellbender/VERSION.txt
@@ -0,0 +1 @@
0.3.0
4 changes: 3 additions & 1 deletion cellbender/__init__.py
@@ -1 +1,3 @@
__version__ = '0.3.0'
from .base_cli import get_version

__version__ = get_version()
7 changes: 1 addition & 6 deletions cellbender/base_cli.py
@@ -23,12 +23,7 @@ def read(rel_path):


def get_version() -> str:
for line in read('__init__.py').splitlines():
if line.startswith('__version__'):
delim = '"' if '"' in line else "'"
return line.split(delim)[1]
else:
raise RuntimeError("Unable to find version string.")
return read('VERSION.txt').splitlines()[0]


class AbstractCLI(ABC):
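
Note: with this change the version string lives only in cellbender/VERSION.txt; base_cli.get_version() returns its first line, and cellbender/__init__.py imports get_version() instead of hard-coding __version__. A minimal sketch of this single-source-of-version pattern is below; the read() helper shown here is an illustrative assumption, not necessarily CellBender's actual implementation:

    import os

    def read(rel_path: str) -> str:
        # Resolve rel_path against this module's directory, so the lookup
        # works regardless of the current working directory (assumed behavior).
        here = os.path.abspath(os.path.dirname(__file__))
        with open(os.path.join(here, rel_path)) as f:
            return f.read()

    def get_version() -> str:
        # The first line of VERSION.txt is the version string, e.g. "0.3.0".
        return read('VERSION.txt').splitlines()[0]
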
2 changes: 1 addition & 1 deletion cellbender/remove_background/checkpoint.py
@@ -297,7 +297,7 @@ def make_tarball(files: List[str], tarball_name: str) -> bool:
for file in files:
# without arcname, unpacking results in unpredictable file locations!
tar.add(file, arcname=os.path.basename(file))
os.rename(tarball_name + '.tmp', tarball_name)
os.replace(tarball_name + '.tmp', tarball_name)
return True


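
Note: the switch from os.rename to os.replace matters for Windows support. os.rename raises FileExistsError on Windows when the destination already exists, while os.replace overwrites the destination atomically on both POSIX and Windows. A small sketch of the write-to-temp-then-replace pattern used here; the function name and default file name are hypothetical:

    import os
    import tarfile

    def save_tarball_atomically(files, tarball_name='checkpoint.tar.gz'):
        # Write to a temporary file first, so a crash mid-write never leaves
        # a truncated tarball at the final path.
        tmp_name = tarball_name + '.tmp'
        with tarfile.open(tmp_name, 'w:gz') as tar:
            for file in files:
                tar.add(file, arcname=os.path.basename(file))
        # os.replace overwrites an existing destination on all platforms;
        # os.rename would fail here on Windows if tarball_name already exists.
        os.replace(tmp_name, tarball_name)
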
11 changes: 0 additions & 11 deletions cellbender/remove_background/cli.py
@@ -207,17 +207,6 @@ def setup_and_logging(args):
+ ' '.join(['cellbender', 'remove-background'] + sys.argv[2:]))
logger.info("CellBender " + get_version())

# Set up checkpointing by creating a unique workflow hash.
hashcode = create_workflow_hashcode(
module_path=os.path.dirname(cellbender.__file__),
args_to_remove=(['output_file', 'fpr', 'input_checkpoint_tarball', 'debug',
'posterior_batch_size', 'checkpoint_min', 'truth_file',
'posterior_regularization', 'cdf_threshold_q', 'prq_alpha',
'estimator', 'use_multiprocessing_estimation', 'cpu_threads']
+ (['epochs'] if args.constant_learning_rate else [])),
args=args)[:10]
args.checkpoint_filename = hashcode # store this in args
logger.info(f'(Workflow hash {hashcode})')
return args, file_handler


55 changes: 15 additions & 40 deletions cellbender/remove_background/estimation.py
@@ -218,7 +218,7 @@ def _estimation_array_to_csr(index_converter,
data: np.ndarray,
m: np.ndarray,
noise_offsets: Optional[Dict[int, int]],
dtype=np.int64) -> sp.csr_matrix:
dtype=np.int) -> sp.csr_matrix:
"""Say you have point estimates for each count matrix element (data) and
you have the 'm'-indices for each value (m). This returns a CSR matrix
that has the shape of the count matrix, where duplicate entries have
@@ -229,7 +229,7 @@ def _estimation_array_to_csr(index_converter,
a flat format, indexed by 'm'.
m: Array of the same length as data, where each entry is an m-index.
noise_offsets: Noise count offset values keyed by 'm'.
dtype: Data type for sparse matrix. Int32 is too small for 'm' indices.
dtype: Data type for values of sparse matrix

Results:
noise_csr: Noise point estimate, as a CSR sparse matrix.
@@ -238,7 +238,7 @@ def _estimation_array_to_csr(index_converter,
row, col = index_converter.get_ng_indices(m_inds=m)
if noise_offsets is not None:
data = data + np.array([noise_offsets.get(i, 0) for i in m])
coo = sp.coo_matrix((data.astype(dtype), (row.astype(dtype), col.astype(dtype))),
coo = sp.coo_matrix((data.astype(dtype), (row.astype(np.uint64), col.astype(np.uint8))),
shape=index_converter.matrix_shape, dtype=dtype)
coo.sum_duplicates()
return coo.tocsr()
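
Note: for readers unfamiliar with the flat 'm' indexing used throughout estimation.py, each m value encodes one (cell, gene) position, the same position can appear more than once, and sum_duplicates() adds repeated entries before conversion to CSR. A toy example of that mechanism, independent of CellBender's IndexConverter (values are made up):

    import numpy as np
    import scipy.sparse as sp

    n_genes = 3                           # toy 2-cell x 3-gene matrix
    m = np.array([0, 4, 4, 5])            # m = cell * n_genes + gene
    data = np.array([1, 2, 3, 1])
    row, col = np.divmod(m, n_genes)      # recover (cell, gene) indices
    coo = sp.coo_matrix((data, (row, col)), shape=(2, n_genes))
    coo.sum_duplicates()                  # duplicate (1, 1) entries: 2 + 3 -> 5
    print(coo.tocsr().toarray())          # [[1 0 0]
                                          #  [0 5 1]]
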
@@ -463,11 +463,9 @@ def estimate_noise(self,
if use_multiple_processes:

logger.info('Dividing dataset into chunks of genes')
chunk_logic_list = list(
self._gene_chunk_iterator(
noise_log_prob_coo=noise_log_prob_coo,
n_chunks=n_chunks,
)
chunk_logic_list = self._gene_chunk_iterator(
noise_log_prob_coo=noise_log_prob_coo,
n_chunks=n_chunks,
)

logger.info('Computing the output in asynchronous chunks in parallel...')
@@ -538,10 +536,9 @@ def estimate_noise(self,
def _gene_chunk_iterator(self,
noise_log_prob_coo: sp.coo_matrix,
n_chunks: int) \
-> Generator[np.ndarray, None, None]:
"""Yields chunks of the posterior that can be treated as independent,
from the standpoint of MCKP count estimation. That is, they contain all
matrix entries for any genes they include.
-> List[np.ndarray]:
"""Return a list of logical (size m) arrays used to select gene chunks
on which to compute the MCKP estimate. These chunks are independent.

Args:
noise_log_prob_coo: Full noise log prob posterior COO
@@ -551,36 +548,14 @@ def _gene_chunk_iterator(self,
Logical array which indexes elements of coo posterior for the chunk
"""

# TODO this generator is way too slow

# approximate number of entries in a chunk
# approx_chunk_entries = (noise_log_prob_coo.data.size - 1) // n_chunks

# get gene annotations
_, genes = self.index_converter.get_ng_indices(m_inds=noise_log_prob_coo.row)
genes_series = pd.Series(genes)

# things we need to keep track of for each chunk
# current_chunk_genes = []
# entry_logic = np.zeros(noise_log_prob_coo.data.size, dtype=bool)

# TODO eliminate for loop to speed this up
# take the list of genes from the coo, sort it, and divide it evenly
# somehow break ties for genes overlapping boundaries of divisions
sorted_genes = np.sort(genes)
gene_arrays = np.array_split(sorted_genes, n_chunks)
last_gene_set = {}
for gene_array in gene_arrays:
gene_set = set(gene_array)
gene_set = gene_set.difference(last_gene_set) # only the new stuff
# if there is a second chunk, make sure there is a gene unique to it
if (n_chunks > 1) and (len(gene_set) == len(set(genes))): # all genes in first set
# this mainly exists for tests
gene_set = gene_set - {gene_arrays[-1][-1]}
last_gene_set = gene_set
entry_logic = genes_series.isin(gene_set).values
if sum(entry_logic) > 0:
yield entry_logic
gene_chunk_arrays = np.array_split(np.arange(self.index_converter.total_n_genes), n_chunks)

gene_logic_arrays = [genes_series.isin(x).values for x in gene_chunk_arrays]
return gene_logic_arrays

def _chunk_estimate_noise(self,
noise_log_prob_coo: sp.coo_matrix,
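
Note: the rewritten _gene_chunk_iterator drops the slow per-gene loop and simply splits the full gene index range into n_chunks near-equal pieces, then builds one boolean mask per piece over the posterior entries. A toy illustration of that logic with made-up sizes:

    import numpy as np
    import pandas as pd

    total_n_genes = 10
    n_chunks = 3
    genes = pd.Series([0, 0, 3, 4, 4, 7, 9])   # gene index of each posterior entry

    # Split the gene index range evenly, then select entries per chunk.
    gene_chunk_arrays = np.array_split(np.arange(total_n_genes), n_chunks)
    gene_logic_arrays = [genes.isin(chunk).values for chunk in gene_chunk_arrays]
    print([mask.sum() for mask in gene_logic_arrays])   # [3, 2, 2]
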
@@ -810,7 +785,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix,
"""
array_length = len(np.unique(noise_log_prob_coo.row))

m = np.zeros(array_length)
m = np.zeros(array_length, dtype=np.uint64)
out = np.zeros(array_length)
a = 0

@@ -829,7 +804,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix,
out[a:(a + len_s)] = s.detach().cpu().numpy()
a = a + len_s

return {'m': m.astype(int), 'result': out}
return {'m': m, 'result': out}


def pandas_grouped_apply(coo: sp.coo_matrix,
74 changes: 41 additions & 33 deletions cellbender/remove_background/posterior.py
@@ -451,7 +451,7 @@ def _get_cell_noise_count_posterior_coo(
f'accurate for your dataset.')
raise RuntimeError('Zero cells found!')

dataloader_index_to_analyzed_bc_index = np.where(cell_logic)[0]
dataloader_index_to_analyzed_bc_index = torch.where(torch.tensor(cell_logic))[0]
cell_data_loader = DataLoader(
count_matrix[cell_logic],
empty_drop_dataset=None,
@@ -468,6 +468,12 @@ def _get_cell_noise_count_posterior_coo(
log_probs = []
ind = 0
n_minibatches = len(cell_data_loader)
analyzed_gene_inds = torch.tensor(self.analyzed_gene_inds.copy())
if analyzed_bcs_only:
barcode_inds = torch.tensor(self.dataset_obj.analyzed_barcode_inds.copy())
else:
barcode_inds = torch.tensor(self.barcode_inds.copy())
nonzero_noise_offset_dict = {}

logger.info('Computing posterior noise count probabilities in mini-batches.')

@@ -505,57 +511,52 @@ def _get_cell_noise_count_posterior_coo(
)

# Get the original gene index from gene index in the trimmed dataset.
genes_i = self.analyzed_gene_inds[genes_i_analyzed]
genes_i = analyzed_gene_inds[genes_i_analyzed.cpu()]

# Barcode index in the dataloader.
bcs_i = bcs_i_chunk + ind
bcs_i = (bcs_i_chunk + ind).cpu()

# Obtain the real barcode index since we only use cells.
bcs_i = dataloader_index_to_analyzed_bc_index[bcs_i]

# Translate chunk barcode inds to overall inds.
if analyzed_bcs_only:
bcs_i = self.dataset_obj.analyzed_barcode_inds[bcs_i]
else:
bcs_i = self.barcode_inds[bcs_i]
bcs_i = barcode_inds[bcs_i]

# Add sparse matrix values to lists.
try:
bcs.extend(bcs_i.tolist())
genes.extend(genes_i.tolist())
c.extend(c_i.tolist())
log_probs.extend(log_prob_i.tolist())
c_offset.extend(noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed]
.detach().cpu().numpy())
except TypeError as e:
# edge case of a single value
bcs.append(bcs_i)
genes.append(genes_i)
c.append(c_i)
log_probs.append(log_prob_i)
c_offset.append(noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed]
.detach().cpu().numpy())
bcs.append(bcs_i.detach())
genes.append(genes_i.detach())
c.append(c_i.detach().cpu())
log_probs.append(log_prob_i.detach().cpu())

# Update offset dict with any nonzeros.
nonzero_offset_inds, nonzero_noise_count_offsets = dense_to_sparse_op_torch(
noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed].detach().flatten(),
)
m_i = self.index_converter.get_m_indices(cell_inds=bcs_i, gene_inds=genes_i)

nonzero_noise_offset_dict.update(
dict(zip(m_i[nonzero_offset_inds.detach().cpu()].tolist(),
nonzero_noise_count_offsets.detach().cpu().tolist()))
)
c_offset.append(noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed].detach().cpu())

# Increment barcode index counter.
ind += data.shape[0] # Same as data_loader.batch_size

# Convert the lists to numpy arrays.
log_probs = np.array(log_probs, dtype=float)
c = np.array(c, dtype=np.uint32)
barcodes = np.array(bcs, dtype=np.uint64) # uint32 is too small!
genes = np.array(genes, dtype=np.uint64) # use same as above for IndexConverter
noise_count_offsets = np.array(c_offset, dtype=np.uint32)
# Concatenate lists.
log_probs = torch.cat(log_probs)
c = torch.cat(c)
barcodes = torch.cat(bcs)
genes = torch.cat(genes)

# Translate (barcode, gene) inds to 'm' format index.
m = self.index_converter.get_m_indices(cell_inds=barcodes, gene_inds=genes)

# Put the counts into a sparse csr_matrix.
self._noise_count_posterior_coo = sp.coo_matrix(
(log_probs, (m, c)),
shape=[np.prod(self.count_matrix_shape), n_counts_max],
shape=[np.prod(self.count_matrix_shape, dtype=np.uint64), n_counts_max],
)
noise_offset_dict = dict(zip(m, noise_count_offsets))
nonzero_noise_offset_dict = {k: v for k, v in noise_offset_dict.items() if (v > 0)}
self._noise_count_posterior_coo_offsets = nonzero_noise_offset_dict
return self._noise_count_posterior_coo
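
Note: the minibatch loop above now accumulates detached torch tensors and concatenates them once at the end, rather than extending Python lists with .tolist() on every batch, presumably to cut per-batch conversion overhead and to avoid the single-value edge case the old try/except handled. A minimal sketch of that accumulation pattern (names and shapes are illustrative):

    import torch

    def collect_batches(batches):
        # Keep per-batch results as CPU tensors; concatenate once at the end.
        pieces = [batch.detach().cpu() for batch in batches]
        return torch.cat(pieces)

    out = collect_batches([torch.arange(3), torch.arange(3, 6)])
    print(out)   # tensor([0, 1, 2, 3, 4, 5])
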

Expand Down Expand Up @@ -1517,7 +1518,9 @@ def __repr__(self):
f'\n\ttotal_n_genes: {self.total_n_genes}'
f'\n\tmatrix_shape: {self.matrix_shape}')

def get_m_indices(self, cell_inds: np.ndarray, gene_inds: np.ndarray) -> np.ndarray:
def get_m_indices(self,
cell_inds: Union[np.ndarray, torch.Tensor],
gene_inds: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
"""Given arrays of cell indices and gene indices, suitable for a sparse matrix,
convert them to 'm' index values.
"""
Expand All @@ -1527,7 +1530,12 @@ def get_m_indices(self, cell_inds: np.ndarray, gene_inds: np.ndarray) -> np.ndar
if not ((gene_inds >= 0) & (gene_inds < self.total_n_genes)).all():
raise ValueError(f'Requested gene_inds out of range: '
f'{gene_inds[(gene_inds < 0) | (gene_inds >= self.total_n_genes)]}')
return cell_inds * self.total_n_genes + gene_inds
if type(cell_inds) == np.ndarray:
return cell_inds.astype(np.uint64) * self.total_n_genes + gene_inds.astype(np.uint64)
elif type(cell_inds) == torch.Tensor:
return cell_inds.type(torch.int64) * self.total_n_genes + gene_inds.type(torch.int64)
else:
raise ValueError('IndexConverter.get_m_indices received cell_inds of unkown object type')

def get_ng_indices(self, m_inds: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""Given a list of 'm' index values, return two arrays: cell index values
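
Note: get_m_indices flattens a (cell, gene) pair into a single index m = cell * total_n_genes + gene, so with hundreds of thousands of cells and tens of thousands of genes the product exceeds the 32-bit integer range; that is why the change promotes inputs to uint64 (numpy) or int64 (torch) before multiplying. A small numpy-only check of the formula and its inverse, assuming get_ng_indices inverts it by integer division (toy values):

    import numpy as np

    total_n_genes = 30_000
    cell_inds = np.array([0, 150_000], dtype=np.uint64)
    gene_inds = np.array([5, 29_999], dtype=np.uint64)

    # Flatten to 'm' indices; uint64 keeps the product from overflowing.
    m = cell_inds * np.uint64(total_n_genes) + gene_inds   # 4500029999 > 2**32 - 1

    # Invert by integer division with the number of genes.
    cells_back, genes_back = np.divmod(m, np.uint64(total_n_genes))
    assert (cells_back == cell_inds).all() and (genes_back == gene_inds).all()
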
25 changes: 15 additions & 10 deletions cellbender/remove_background/report.py
@@ -16,6 +16,7 @@
import subprocess
import datetime
import os
import shutil
import logging
from typing import Dict, Optional

@@ -42,26 +43,30 @@


def _run_notebook(file):
subprocess.run(f'cp {file} tmp.report.ipynb', shell=True)
shutil.copy(file, 'tmp.report.ipynb')
subprocess.run(run_notebook_str(file='tmp.report.ipynb'), shell=True)
subprocess.run(f'rm tmp.report.ipynb', shell=True)
os.remove('tmp.report.ipynb')
return 'tmp.report.nbconvert.ipynb'


def _to_html(file, output) -> str:
subprocess.run(to_html_str(file=file, output=output), shell=True)
subprocess.run(f'mv {file.replace(".ipynb", ".html")} {output}', shell=True)
subprocess.run(f'rm {file}', shell=True)
os.replace(file.replace(".ipynb", ".html"), output)
os.remove(file)
return output


def _postprocess_html(file: str, title: str):
with open(file, mode='r') as f:
html = f.read()
html = html.replace('<title>tmp.report.nbconvert</title>',
f'<title>{title}</title>')
with open(file, mode='w') as f:
f.write(html)
try:
with open(file, mode='r', encoding="utf8", errors="surrogateescape") as f:
html = f.read()
html = html.replace('<title>tmp.report.nbconvert</title>',
f'<title>{title}</title>')
with open(file, mode='w', encoding="utf8", errors="surrogateescape") as f:
f.write(html)
except:
logger.warning('Failed to overwrite default HTML report title. '
'This is purely aesthetic and does not affect output.')


def run_notebook_make_html(file, output) -> str:
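
Note: the report post-processing now uses shutil and os calls instead of shelling out to cp/mv/rm, which keeps it working on Windows, and it reads the HTML with errors='surrogateescape' so a stray non-UTF-8 byte in the notebook output cannot crash the title rewrite. A minimal sketch of that read-modify-write step; the function name is hypothetical:

    import logging

    logger = logging.getLogger(__name__)

    def retitle_html(path: str, title: str) -> None:
        try:
            # surrogateescape round-trips bytes that are not valid UTF-8,
            # so reading and re-writing never rejects or corrupts the file.
            with open(path, mode='r', encoding='utf8', errors='surrogateescape') as f:
                html = f.read()
            html = html.replace('<title>tmp.report.nbconvert</title>',
                                f'<title>{title}</title>')
            with open(path, mode='w', encoding='utf8', errors='surrogateescape') as f:
                f.write(html)
        except Exception:
            logger.warning('Failed to overwrite default HTML report title.')
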