
Commit 3a4dc8d

sjfleming, kshakir, and alecw authored
v0.3.1 (#303)
* Add WDL input to set number of retries. (#247)
* Move hash computation so that it is recomputed on retry, and a now-invalid checkpoint is not loaded. (#258)
* Bug fix for WDL using MTX input (#246)
* Memory-efficient posterior generation (#263)
* Fix posterior and estimator integer overflow bugs on Windows (#259)
* Move from setup.py to pyproject.toml (#240)
* Fix bugs with report generation across platforms (#302)

Co-authored-by: kshakir <[email protected]>
Co-authored-by: alecw <[email protected]>
1 parent 4990df7 commit 3a4dc8d

23 files changed: +254 −207 lines

.github/workflows/run_pytest.yml

Lines changed: 4 additions & 2 deletions
@@ -1,4 +1,4 @@
-# Run cellbender's tests
+# Run cellbender test suite
 
 name: 'pytest'
 
@@ -7,11 +7,13 @@ on: pull_request
 jobs:
   build:
 
-    runs-on: 'ubuntu-latest'
     strategy:
       matrix:
+        os: ['ubuntu-latest', 'windows-latest']
         python-version: ['3.7']
 
+    runs-on: ${{ matrix.os }}
+
    steps:
      - name: 'Checkout repo'
        uses: actions/checkout@v3

MANIFEST.in

Lines changed: 4 additions & 1 deletion
@@ -1,4 +1,7 @@
 include README.rst
 include LICENSE
 include requirements.txt
-include requirements-rtd.txt
+include requirements-rtd.txt
+include requirements-dev.txt
+include cellbender/VERSION.txt
+include cellbender/remove_background/report.ipynb

build_docker_release.sh

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-tag=$(cat cellbender/__init__.py | sed -e 's?__version__ = ??' | sed "s/^'\(.*\)'$/\1/")
+tag=$(cat cellbender/VERSION.txt)
 release=v${tag}
 
 docker build \

cellbender/VERSION.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+0.3.0

cellbender/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -1 +1,3 @@
-__version__ = '0.3.0'
+from .base_cli import get_version
+
+__version__ = get_version()

cellbender/base_cli.py

Lines changed: 1 addition & 6 deletions
@@ -23,12 +23,7 @@ def read(rel_path):
 
 
 def get_version() -> str:
-    for line in read('__init__.py').splitlines():
-        if line.startswith('__version__'):
-            delim = '"' if '"' in line else "'"
-            return line.split(delim)[1]
-    else:
-        raise RuntimeError("Unable to find version string.")
+    return read('VERSION.txt').splitlines()[0]
 
 
 class AbstractCLI(ABC):
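
Note: with this change the version string lives only in cellbender/VERSION.txt (also shipped via MANIFEST.in), and get_version() simply reads the first line of that file. A minimal sketch of the pattern, assuming a read() helper that resolves paths relative to the package directory (the helper's body is not shown in this diff, so its details below are an assumption):

    import os

    # Assumed helper: resolve rel_path against the directory containing this module.
    _PKG_DIR = os.path.dirname(os.path.abspath(__file__))

    def read(rel_path: str) -> str:
        """Read a file located relative to the package directory."""
        with open(os.path.join(_PKG_DIR, rel_path)) as f:
            return f.read()

    def get_version() -> str:
        """Return the version string stored on the first line of VERSION.txt."""
        return read('VERSION.txt').splitlines()[0]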

cellbender/remove_background/checkpoint.py

Lines changed: 1 addition & 1 deletion
@@ -297,7 +297,7 @@ def make_tarball(files: List[str], tarball_name: str) -> bool:
         for file in files:
             # without arcname, unpacking results in unpredictable file locations!
             tar.add(file, arcname=os.path.basename(file))
-    os.rename(tarball_name + '.tmp', tarball_name)
+    os.replace(tarball_name + '.tmp', tarball_name)
     return True
 
 
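Note: os.replace is used here because, unlike os.rename, it overwrites an existing destination file on Windows instead of raising FileExistsError; on POSIX both calls overwrite. A small sketch of the write-then-swap pattern this enables (file names and payload are illustrative):

    import os

    def write_then_swap(path: str, payload: bytes) -> None:
        """Write to a temporary file, then swap it into place atomically."""
        tmp = path + '.tmp'
        with open(tmp, 'wb') as f:
            f.write(payload)
        # os.rename(tmp, path) fails on Windows if path already exists;
        # os.replace overwrites the destination on every platform.
        os.replace(tmp, path)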
cellbender/remove_background/cli.py

Lines changed: 0 additions & 11 deletions
@@ -207,17 +207,6 @@ def setup_and_logging(args):
                 + ' '.join(['cellbender', 'remove-background'] + sys.argv[2:]))
     logger.info("CellBender " + get_version())
 
-    # Set up checkpointing by creating a unique workflow hash.
-    hashcode = create_workflow_hashcode(
-        module_path=os.path.dirname(cellbender.__file__),
-        args_to_remove=(['output_file', 'fpr', 'input_checkpoint_tarball', 'debug',
-                         'posterior_batch_size', 'checkpoint_min', 'truth_file',
-                         'posterior_regularization', 'cdf_threshold_q', 'prq_alpha',
-                         'estimator', 'use_multiprocessing_estimation', 'cpu_threads']
-                        + (['epochs'] if args.constant_learning_rate else [])),
-        args=args)[:10]
-    args.checkpoint_filename = hashcode  # store this in args
-    logger.info(f'(Workflow hash {hashcode})')
     return args, file_handler
 
 
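Note: the workflow hash is no longer computed in setup_and_logging(); per the changelog (#258) it is recomputed on retry so that a now-invalid checkpoint is not loaded. For orientation only, a hedged sketch of how a short, stable code can be derived from the effective CLI arguments; create_workflow_hashcode's real implementation is not shown in this diff, and the helper name and details below are illustrative assumptions, not CellBender's actual code:

    import argparse
    import hashlib

    def short_args_hash(args: argparse.Namespace,
                        args_to_remove=(),
                        length: int = 10) -> str:
        """Illustrative stand-in: hash the remaining CLI arguments to a short hex code."""
        kept = {k: v for k, v in sorted(vars(args).items())
                if k not in set(args_to_remove)}
        digest = hashlib.sha256(repr(kept).encode()).hexdigest()
        return digest[:length]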
cellbender/remove_background/estimation.py

Lines changed: 15 additions & 40 deletions
@@ -218,7 +218,7 @@ def _estimation_array_to_csr(index_converter,
                              data: np.ndarray,
                              m: np.ndarray,
                              noise_offsets: Optional[Dict[int, int]],
-                             dtype=np.int64) -> sp.csr_matrix:
+                             dtype=np.int) -> sp.csr_matrix:
     """Say you have point estimates for each count matrix element (data) and
     you have the 'm'-indices for each value (m). This returns a CSR matrix
     that has the shape of the count matrix, where duplicate entries have
@@ -229,7 +229,7 @@ def _estimation_array_to_csr(index_converter,
             a flat format, indexed by 'm'.
         m: Array of the same length as data, where each entry is an m-index.
         noise_offsets: Noise count offset values keyed by 'm'.
-        dtype: Data type for sparse matrix. Int32 is too small for 'm' indices.
+        dtype: Data type for values of sparse matrix
 
     Results:
         noise_csr: Noise point estimate, as a CSR sparse matrix.
@@ -238,7 +238,7 @@ def _estimation_array_to_csr(index_converter,
     row, col = index_converter.get_ng_indices(m_inds=m)
     if noise_offsets is not None:
         data = data + np.array([noise_offsets.get(i, 0) for i in m])
-    coo = sp.coo_matrix((data.astype(dtype), (row.astype(dtype), col.astype(dtype))),
+    coo = sp.coo_matrix((data.astype(dtype), (row.astype(np.uint64), col.astype(np.uint8))),
                         shape=index_converter.matrix_shape, dtype=dtype)
     coo.sum_duplicates()
     return coo.tocsr()
@@ -463,11 +463,9 @@ def estimate_noise(self,
         if use_multiple_processes:
 
             logger.info('Dividing dataset into chunks of genes')
-            chunk_logic_list = list(
-                self._gene_chunk_iterator(
-                    noise_log_prob_coo=noise_log_prob_coo,
-                    n_chunks=n_chunks,
-                )
+            chunk_logic_list = self._gene_chunk_iterator(
+                noise_log_prob_coo=noise_log_prob_coo,
+                n_chunks=n_chunks,
             )
 
             logger.info('Computing the output in asynchronous chunks in parallel...')
@@ -538,10 +536,9 @@ def estimate_noise(self,
     def _gene_chunk_iterator(self,
                              noise_log_prob_coo: sp.coo_matrix,
                              n_chunks: int) \
-            -> Generator[np.ndarray, None, None]:
-        """Yields chunks of the posterior that can be treated as independent,
-        from the standpoint of MCKP count estimation. That is, they contain all
-        matrix entries for any genes they include.
+            -> List[np.ndarray]:
+        """Return a list of logical (size m) arrays used to select gene chunks
+        on which to compute the MCKP estimate. These chunks are independent.
 
         Args:
             noise_log_prob_coo: Full noise log prob posterior COO
@@ -551,36 +548,14 @@ def _gene_chunk_iterator(self,
             Logical array which indexes elements of coo posterior for the chunk
         """
 
-        # TODO this generator is way too slow
-
-        # approximate number of entries in a chunk
-        # approx_chunk_entries = (noise_log_prob_coo.data.size - 1) // n_chunks
-
         # get gene annotations
         _, genes = self.index_converter.get_ng_indices(m_inds=noise_log_prob_coo.row)
         genes_series = pd.Series(genes)
 
-        # things we need to keep track of for each chunk
-        # current_chunk_genes = []
-        # entry_logic = np.zeros(noise_log_prob_coo.data.size, dtype=bool)
-
-        # TODO eliminate for loop to speed this up
-        # take the list of genes from the coo, sort it, and divide it evenly
-        # somehow break ties for genes overlapping boundaries of divisions
-        sorted_genes = np.sort(genes)
-        gene_arrays = np.array_split(sorted_genes, n_chunks)
-        last_gene_set = {}
-        for gene_array in gene_arrays:
-            gene_set = set(gene_array)
-            gene_set = gene_set.difference(last_gene_set)  # only the new stuff
-            # if there is a second chunk, make sure there is a gene unique to it
-            if (n_chunks > 1) and (len(gene_set) == len(set(genes))):  # all genes in first set
-                # this mainly exists for tests
-                gene_set = gene_set - {gene_arrays[-1][-1]}
-            last_gene_set = gene_set
-            entry_logic = genes_series.isin(gene_set).values
-            if sum(entry_logic) > 0:
-                yield entry_logic
+        gene_chunk_arrays = np.array_split(np.arange(self.index_converter.total_n_genes), n_chunks)
+
+        gene_logic_arrays = [genes_series.isin(x).values for x in gene_chunk_arrays]
+        return gene_logic_arrays
 
     def _chunk_estimate_noise(self,
                               noise_log_prob_coo: sp.coo_matrix,
@@ -810,7 +785,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix,
     """
    array_length = len(np.unique(noise_log_prob_coo.row))
 
-    m = np.zeros(array_length)
+    m = np.zeros(array_length, dtype=np.uint64)
    out = np.zeros(array_length)
    a = 0
 
@@ -829,7 +804,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix,
         out[a:(a + len_s)] = s.detach().cpu().numpy()
         a = a + len_s
 
-    return {'m': m.astype(int), 'result': out}
+    return {'m': m, 'result': out}
 
 
 def pandas_grouped_apply(coo: sp.coo_matrix,
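
Note: the rewritten _gene_chunk_iterator drops the slow sort-and-set bookkeeping and simply splits the gene index range into n_chunks pieces, then marks which posterior entries fall in each piece, so entries for a given gene land in exactly one chunk. A standalone sketch of that logic, with made-up sizes (function name and data are illustrative):

    import numpy as np
    import pandas as pd

    def gene_chunk_masks(genes: np.ndarray, total_n_genes: int, n_chunks: int):
        """Return one boolean mask per chunk, selecting posterior entries whose
        gene index falls in that chunk."""
        genes_series = pd.Series(genes)
        gene_chunk_arrays = np.array_split(np.arange(total_n_genes), n_chunks)
        return [genes_series.isin(chunk).values for chunk in gene_chunk_arrays]

    # Example: 10 genes, 5 posterior entries, 3 chunks.
    masks = gene_chunk_masks(genes=np.array([0, 3, 3, 7, 9]), total_n_genes=10, n_chunks=3)
    # masks[0] selects entries with gene index 0-3, masks[1] 4-6, masks[2] 7-9.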

cellbender/remove_background/posterior.py

Lines changed: 41 additions & 33 deletions
@@ -451,7 +451,7 @@ def _get_cell_noise_count_posterior_coo(
                          f'accurate for your dataset.')
             raise RuntimeError('Zero cells found!')
 
-        dataloader_index_to_analyzed_bc_index = np.where(cell_logic)[0]
+        dataloader_index_to_analyzed_bc_index = torch.where(torch.tensor(cell_logic))[0]
         cell_data_loader = DataLoader(
             count_matrix[cell_logic],
             empty_drop_dataset=None,
@@ -468,6 +468,12 @@
         log_probs = []
         ind = 0
         n_minibatches = len(cell_data_loader)
+        analyzed_gene_inds = torch.tensor(self.analyzed_gene_inds.copy())
+        if analyzed_bcs_only:
+            barcode_inds = torch.tensor(self.dataset_obj.analyzed_barcode_inds.copy())
+        else:
+            barcode_inds = torch.tensor(self.barcode_inds.copy())
+        nonzero_noise_offset_dict = {}
 
         logger.info('Computing posterior noise count probabilities in mini-batches.')
 
@@ -505,57 +511,52 @@
                 )
 
                 # Get the original gene index from gene index in the trimmed dataset.
-                genes_i = self.analyzed_gene_inds[genes_i_analyzed]
+                genes_i = analyzed_gene_inds[genes_i_analyzed.cpu()]
 
                 # Barcode index in the dataloader.
-                bcs_i = bcs_i_chunk + ind
+                bcs_i = (bcs_i_chunk + ind).cpu()
 
                 # Obtain the real barcode index since we only use cells.
                 bcs_i = dataloader_index_to_analyzed_bc_index[bcs_i]
 
                 # Translate chunk barcode inds to overall inds.
-                if analyzed_bcs_only:
-                    bcs_i = self.dataset_obj.analyzed_barcode_inds[bcs_i]
-                else:
-                    bcs_i = self.barcode_inds[bcs_i]
+                bcs_i = barcode_inds[bcs_i]
 
                 # Add sparse matrix values to lists.
-                try:
-                    bcs.extend(bcs_i.tolist())
-                    genes.extend(genes_i.tolist())
-                    c.extend(c_i.tolist())
-                    log_probs.extend(log_prob_i.tolist())
-                    c_offset.extend(noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed]
-                                    .detach().cpu().numpy())
-                except TypeError as e:
-                    # edge case of a single value
-                    bcs.append(bcs_i)
-                    genes.append(genes_i)
-                    c.append(c_i)
-                    log_probs.append(log_prob_i)
-                    c_offset.append(noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed]
-                                    .detach().cpu().numpy())
+                bcs.append(bcs_i.detach())
+                genes.append(genes_i.detach())
+                c.append(c_i.detach().cpu())
+                log_probs.append(log_prob_i.detach().cpu())
+
+                # Update offset dict with any nonzeros.
+                nonzero_offset_inds, nonzero_noise_count_offsets = dense_to_sparse_op_torch(
+                    noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed].detach().flatten(),
+                )
+                m_i = self.index_converter.get_m_indices(cell_inds=bcs_i, gene_inds=genes_i)
+
+                nonzero_noise_offset_dict.update(
+                    dict(zip(m_i[nonzero_offset_inds.detach().cpu()].tolist(),
+                             nonzero_noise_count_offsets.detach().cpu().tolist()))
+                )
+                c_offset.append(noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed].detach().cpu())
 
             # Increment barcode index counter.
            ind += data.shape[0]  # Same as data_loader.batch_size
 
-        # Convert the lists to numpy arrays.
-        log_probs = np.array(log_probs, dtype=float)
-        c = np.array(c, dtype=np.uint32)
-        barcodes = np.array(bcs, dtype=np.uint64)  # uint32 is too small!
-        genes = np.array(genes, dtype=np.uint64)  # use same as above for IndexConverter
-        noise_count_offsets = np.array(c_offset, dtype=np.uint32)
+        # Concatenate lists.
+        log_probs = torch.cat(log_probs)
+        c = torch.cat(c)
+        barcodes = torch.cat(bcs)
+        genes = torch.cat(genes)
 
         # Translate (barcode, gene) inds to 'm' format index.
         m = self.index_converter.get_m_indices(cell_inds=barcodes, gene_inds=genes)
 
         # Put the counts into a sparse csr_matrix.
         self._noise_count_posterior_coo = sp.coo_matrix(
             (log_probs, (m, c)),
-            shape=[np.prod(self.count_matrix_shape), n_counts_max],
+            shape=[np.prod(self.count_matrix_shape, dtype=np.uint64), n_counts_max],
         )
-        noise_offset_dict = dict(zip(m, noise_count_offsets))
-        nonzero_noise_offset_dict = {k: v for k, v in noise_offset_dict.items() if (v > 0)}
         self._noise_count_posterior_coo_offsets = nonzero_noise_offset_dict
         return self._noise_count_posterior_coo
 
@@ -1517,7 +1518,9 @@ def __repr__(self):
                 f'\n\ttotal_n_genes: {self.total_n_genes}'
                 f'\n\tmatrix_shape: {self.matrix_shape}')
 
-    def get_m_indices(self, cell_inds: np.ndarray, gene_inds: np.ndarray) -> np.ndarray:
+    def get_m_indices(self,
+                      cell_inds: Union[np.ndarray, torch.Tensor],
+                      gene_inds: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
         """Given arrays of cell indices and gene indices, suitable for a sparse matrix,
         convert them to 'm' index values.
         """
@@ -1527,7 +1530,12 @@ def get_m_indices(self, cell_inds: np.ndarray, gene_inds: np.ndarray) -> np.ndar
         if not ((gene_inds >= 0) & (gene_inds < self.total_n_genes)).all():
             raise ValueError(f'Requested gene_inds out of range: '
                              f'{gene_inds[(gene_inds < 0) | (gene_inds >= self.total_n_genes)]}')
-        return cell_inds * self.total_n_genes + gene_inds
+        if type(cell_inds) == np.ndarray:
+            return cell_inds.astype(np.uint64) * self.total_n_genes + gene_inds.astype(np.uint64)
+        elif type(cell_inds) == torch.Tensor:
+            return cell_inds.type(torch.int64) * self.total_n_genes + gene_inds.type(torch.int64)
+        else:
+            raise ValueError('IndexConverter.get_m_indices received cell_inds of unkown object type')
 
     def get_ng_indices(self, m_inds: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
         """Given a list of 'm' index values, return two arrays: cell index values
