Source code for bullkpy.io

from __future__ import annotations

from pathlib import Path
from typing import Literal

import pandas as pd
import numpy as np
import anndata as ad

from .logging import info, warn


[docs] def read_counts( filename: str | Path, *, sep: str = "\t", orientation: Literal["genes_by_samples", "samples_by_genes"] = "genes_by_samples", dtype: str | None = "int64", ) -> ad.AnnData: """ Read a bulk RNA-seq count matrix and return an AnnData object. Parameters ---------- filename Path to counts file (tsv/csv). Rows or columns must be gene IDs. sep Column separator (default: tab). orientation - "genes_by_samples": genes in rows, samples in columns (common output of HTSeq, featureCounts) - "samples_by_genes": samples in rows, genes in columns dtype Cast count matrix to dtype (default: int64). Set to None to disable casting. Returns ------- AnnData AnnData object with samples in `.obs` and genes in `.var`. """ filename = Path(filename) info(f"Reading count matrix from {filename}") df = pd.read_csv(filename, sep=sep, index_col=0) info(f"Raw matrix shape: {df.shape}") if orientation == "genes_by_samples": info("Interpreting rows as genes and columns as samples") df = df.T elif orientation == "samples_by_genes": info("Interpreting rows as samples and columns as genes") else: raise ValueError( "orientation must be 'genes_by_samples' or 'samples_by_genes'" ) # Basic sanity checks if df.index.has_duplicates: warn("Sample names are duplicated") if df.columns.has_duplicates: warn("Gene names are duplicated") if dtype is not None: try: df = df.astype(dtype) except Exception as e: warn(f"Could not cast counts to {dtype}: {e}") # Warn if data does not look like counts if np.any(df.values < 0): warn("Count matrix contains negative values") adata = ad.AnnData(X=df) adata.obs_names = df.index.astype(str) adata.var_names = df.columns.astype(str) info( f"Created AnnData object with " f"{adata.n_obs} samples ? {adata.n_vars} genes" ) return adata
[docs] def add_metadata( adata: ad.AnnData, metadata_file: str | Path, *, index_col: str, sep: str = "\t", low_memory = False, how: Literal["left", "inner"] = "left", ) -> ad.AnnData: """ Add sample metadata to an AnnData object. Parameters ---------- adata AnnData object with samples in `.obs`. metadata_file Path to metadata file (tsv, csv, or xlsx). index_col Column in metadata that matches `adata.obs_names`. sep Column separator for tsv/csv files. low_memory Pandas low_memory parameter how Merge strategy: - "left": keep all samples in adata (default) - "inner": keep only samples present in metadata Returns ------- AnnData The same AnnData object with updated `.obs`. """ metadata_file = Path(metadata_file) info(f"Adding metadata from {metadata_file}") # Load metadata if metadata_file.suffix in {".xls", ".xlsx"}: meta = pd.read_excel(metadata_file, low_memory=low_memory) else: meta = pd.read_csv(metadata_file, sep=sep) if index_col not in meta.columns: raise ValueError( f"index_col '{index_col}' not found in metadata columns" ) meta = meta.set_index(index_col) # Sanity checks if meta.index.has_duplicates: warn("Metadata index contains duplicated sample IDs") missing = adata.obs_names.difference(meta.index) if len(missing) > 0: warn( f"{len(missing)} samples in AnnData are missing metadata " f"(showing up to 5): {list(missing[:5])}" ) overlap = adata.obs_names.intersection(meta.index) info(f"Found metadata for {len(overlap)} / {adata.n_obs} samples") # Merge adata.obs = adata.obs.merge( meta, left_index=True, right_index=True, how=how, ) return adata