Config

`helical.models.uce.UCEConfig`

Configuration class to use the Universal Cell-Embedding Model.

Parameters:

Name	Type	Description	Default
`model_name`	`Literal['33l_8ep_1024t_1280', '4layer_model']`	The model name	`"4layer_model"`
`batch_size`	`int`	The batch size	`24`
`species`	`Literal['human', 'mouse', 'frog', 'zebrafish', 'mouse_lemur', 'pig', 'macaca_fascicularis', 'macaca_mulatta']`	The species of the data.	`"human"`
`gene_embedding_model`	`Literal['ESM2']`	The gene embedding model to use. For now, we only support ESM2.	`'ESM2'`
`pad_length`	`int`	The padding length	`1536`
`pad_token_idx`	`int`	The padding token index	`0`
`chrom_token_left_idx`	`int`	The left chrom token index	`1`
`chrom_token_right_idx`	`int`	The right chrom token index	`2`
`cls_token_idx`	`int`	The cls token index	`3`
`CHROM_TOKEN_OFFSET`	`int`	The chrom token offset	`143574`
`sample_size`	`int`	The sample size	`1024`
`CXG`	`bool`	Whether to use CXG or not	`True`
`output_dim`	`int`	The output dimension	`1280`
`d_hid`	`int`	The hidden dimension	`5120`
`token_dim`	`int`	The token dimension	`5120`
`multi_gpu`	`bool`	Whether to use multiple GPUs or not	`False`
`device`	`Literal['cpu', 'cuda']`	The device to use. Either use "cuda" or "cpu".	`"cpu"`
`accelerator`	`Optional[bool]`	The accelerator configuration. By default same device as model.	`False`

Returns:

Type	Description
`UCEConfig`	The UCE configuration object

Source code in helical/models/uce/uce_config.py

class UCEConfig:
    """Settings container for the Universal Cell-Embedding (UCE) model.

    All constructor arguments are collected, together with a few derived
    entries (checkpoint path, token file path, list of files to download and
    number of layers), into the ``config`` dictionary.

    Parameters
    ----------
    model_name : Literal["33l_8ep_1024t_1280", "4layer_model"], optional, default="4layer_model"
        Name of the checkpoint. It selects the number of layers (33 or 4) and
        the name of the ``.torch`` weights file.
    batch_size : int, optional, default=24
        The batch size.
    species : Literal['human', 'mouse', 'frog', 'zebrafish', "mouse_lemur", "pig", "macaca_fascicularis", "macaca_mulatta"], optional, default="human"
        The species of the data. Together with ``gene_embedding_model`` it
        selects the protein-embedding file added to the download list.
    gene_embedding_model : Literal['ESM2'], optional, default="ESM2"
        The gene embedding model to use. For now, only ESM2 is supported.
    pad_length : int, optional, default=1536
        The padding length.
    pad_token_idx : int, optional, default=0
        The padding token index.
    chrom_token_left_idx : int, optional, default=1
        The left chrom token index.
    chrom_token_right_idx : int, optional, default=2
        The right chrom token index.
    cls_token_idx : int, optional, default=3
        The cls token index.
    CHROM_TOKEN_OFFSET : int, optional, default=143574
        The chrom token offset.
    sample_size : int, optional, default=1024
        The sample size.
    CXG : bool, optional, default=True
        Whether to use CXG or not.
    output_dim : int, optional, default=1280
        The output dimension. The same value is also stored under the
        ``"embsize"`` key of ``config``.
    d_hid : int, optional, default=5120
        The hidden dimension.
    token_dim : int, optional, default=5120
        The token dimension.
    multi_gpu : bool, optional, default=False
        Whether to use multiple GPUs or not.
    device : Literal["cpu", "cuda"], optional, default="cpu"
        The device to use. Either use "cuda" or "cpu".
    accelerator : Optional[bool], optional, default=False
        The accelerator configuration, stored unchanged in ``config``.

    Attributes
    ----------
    model_map : dict
        Maps every supported model name to its model-specific parameters
        (currently only ``"n_layers"``).
    config : dict
        The assembled configuration dictionary.

    Raises
    ------
    ValueError
        If ``model_name`` is not a key of ``model_map``.
    """

    def __init__(
        self,
        model_name: Literal["33l_8ep_1024t_1280", "4layer_model"] = "4layer_model",
        batch_size: int = 24,
        species: Literal[
            "human",
            "mouse",
            "frog",
            "zebrafish",
            "mouse_lemur",
            "pig",
            "macaca_fascicularis",
            "macaca_mulatta",
        ] = "human",
        gene_embedding_model: Literal["ESM2"] = "ESM2",
        pad_length: int = 1536,
        pad_token_idx: int = 0,
        chrom_token_left_idx: int = 1,
        chrom_token_right_idx: int = 2,
        cls_token_idx: int = 3,
        CHROM_TOKEN_OFFSET: int = 143574,
        sample_size: int = 1024,
        CXG: bool = True,
        output_dim: int = 1280,
        d_hid: int = 5120,
        token_dim: int = 5120,
        multi_gpu: bool = False,
        device: Literal["cpu", "cuda"] = "cpu",
        accelerator: Optional[bool] = False,
    ):
        # Per-checkpoint parameters, keyed by model name.
        self.model_map = {
            "33l_8ep_1024t_1280": {"n_layers": 33},
            "4layer_model": {"n_layers": 4},
        }

        # Reject unknown checkpoints before anything else is derived from the name.
        is_known_model = model_name in self.model_map
        if not is_known_model:
            raise ValueError(
                f"Model name {model_name} not found in available models: {self.model_map.keys()}."
            )
        n_layers = self.model_map[model_name]["n_layers"]

        # Protein-embedding file matching the chosen embedding model and species.
        protein_embedding_file = SPECIES_GENE_EMBEDDINGS[gene_embedding_model][species]
        required_files = [
            "uce/all_tokens.torch",
            f"uce/{model_name}.torch",
            "uce/species_chrom.csv",
            "uce/species_offsets.pkl",
            f"uce/protein_embeddings/{protein_embedding_file}",
        ]

        # The weights and the token file live side by side in the UCE cache folder.
        uce_cache_dir = Path(CACHE_DIR_HELICAL, "uce")
        weights_path = uce_cache_dir / f"{model_name}.torch"
        tokens_path = uce_cache_dir / "all_tokens.torch"

        self.config = {
            # checkpoint identity and files
            "model_name": model_name,
            "model_path": weights_path,
            "list_of_files_to_download": required_files,
            # data handling
            "batch_size": batch_size,
            "species": species,
            "gene_embedding_model": gene_embedding_model,
            # tokenisation
            "pad_length": pad_length,
            "pad_token_idx": pad_token_idx,
            "chrom_token_left_idx": chrom_token_left_idx,
            "chrom_token_right_idx": chrom_token_right_idx,
            "cls_token_idx": cls_token_idx,
            "CHROM_TOKEN_OFFSET": CHROM_TOKEN_OFFSET,
            "sample_size": sample_size,
            "CXG": CXG,
            # architecture
            "n_layers": n_layers,
            "output_dim": output_dim,
            "d_hid": d_hid,
            "token_file_path": tokens_path,
            "token_dim": token_dim,
            # runtime
            "multi_gpu": multi_gpu,
            "device": device,
            "accelerator": accelerator,
            # mirrors output_dim under a second key
            "embsize": output_dim,
        }