Source code for bullkpy.pp.qc

from __future__ import annotations

from typing import Sequence
import numpy as np
import pandas as pd
import scipy.sparse as sp
import anndata as ad

from ..logging import info, warn

[docs] def _is_integerish(X, max_check: int = 50000, tol: float = 1e-8) -> bool: """Heuristic: does X look like raw counts (non-negative integers)?""" if sp.issparse(X): data = X.data if data.size == 0: return True # all zeros if data.size > max_check: data = data[:max_check] if np.min(data) < 0: return False frac = np.abs(data - np.round(data)) return bool(np.max(frac) < tol) else: A = np.asarray(X) if A.size == 0: return True flat = A.ravel() if flat.size > max_check: flat = flat[:max_check] if np.min(flat) < 0: return False frac = np.abs(flat - np.round(flat)) return bool(np.max(frac) < tol)
[docs] def qc_metrics( adata: ad.AnnData, *, layer: str | None = "counts", mt_prefix: str = "MT-", mt_var_key: str | None = None, detection_threshold: float = 0.0, compute_total_counts: bool = True, compute_n_genes: bool = True, compute_pct_mt: bool = True, ) -> None: """ Compute bulk QC metrics in adata.obs. Smart behavior: - If `layer` looks like raw counts: compute total_counts, n_genes_detected, pct_counts_mt - If `layer` looks non-integer (log/normalized): compute only n_genes_detected using `detection_threshold` """ X = adata.layers[layer] if (layer is not None and layer in adata.layers) else adata.X is_counts = _is_integerish(X) if not is_counts: # Logged/normalized: only detection QC makes sense if compute_total_counts or compute_pct_mt: warn( f"qc_metrics: layer='{layer}' does not look like raw integer counts. " "Skipping total_counts and pct_counts_mt; computing only n_genes_detected " f"using detection_threshold>{detection_threshold}." ) compute_total_counts = False compute_pct_mt = False # --- total_counts --- if compute_total_counts: if sp.issparse(X): adata.obs["total_counts"] = np.asarray(X.sum(axis=1)).ravel() else: adata.obs["total_counts"] = np.sum(np.asarray(X), axis=1) # --- n_genes_detected --- if compute_n_genes: thr = float(detection_threshold) if not is_counts else 0.0 if sp.issparse(X): adata.obs["n_genes_detected"] = np.asarray((X > thr).sum(axis=1)).ravel() else: adata.obs["n_genes_detected"] = (np.asarray(X) > thr).sum(axis=1) # --- pct_counts_mt --- if mt_var_key is not None and mt_var_key in adata.var.columns: mt_mask = adata.var[mt_var_key].astype(bool).to_numpy() else: gene_names = pd.Series(adata.var_names.astype(str)) mt_mask = gene_names.str.upper().str.startswith(mt_prefix.upper()).to_numpy() if mt_mask.sum() == 0: warn("qc_metrics: no mitochondrial genes detected with given mt_prefix/mt_var_key; pct_counts_mt not computed.") return Xmt = X[:, mt_mask] if sp.issparse(X): mt_counts = np.asarray(Xmt.sum(axis=1)).ravel() tot = np.asarray(X.sum(axis=1)).ravel() else: mt_counts = np.sum(np.asarray(Xmt), axis=1) tot = np.sum(np.asarray(X), axis=1) tot_safe = np.where(tot == 0, np.nan, tot) adata.obs["pct_counts_mt"] = 100.0 * (mt_counts / tot_safe) info("QC metrics added to adata.obs (total_counts, n_genes_detected, pct_counts_mt where applicable).")