
Config

helical.models.evo_2.Evo2Config

Configuration class for Evo 2 models.

Parameters:

    model_name : str, optional
        The name of the model to use. Options are "evo2-7b", "evo2-7b-base",
        "evo2-1b-base", "evo2-40b-base", and "evo2-40b". Default: "evo2-7b".
    batch_size : int, optional
        The batch size to use. Default: 1.
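A minimal usage sketch (assuming helical is installed; the import path follows the module path shown above):

    from helical.models.evo_2 import Evo2Config

    # Select the 1B base model and a batch size of 2.
    config = Evo2Config(model_name="evo2-1b-base", batch_size=2)

    # The resolved per-model settings live under config.config["model_map"].
    print(config.config["model_map"]["max_seqlen"])  # 8192
    print(config.config["batch_size"])               # 2
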
Source code in helical/models/evo_2/evo_2_config.py
from typing import Literal


class Evo2Config:
    """Configuration class for Evo 2 models.

    Parameters
    ----------
    model_name : str, optional
        The name of the model to use. The default is "evo2-7b". Options are
        "evo2-7b", "evo2-7b-base", "evo2-1b-base", "evo2-40b-base", and "evo2-40b".
    batch_size : int, optional
        The batch size to use. The default is 1.
    """

    def __init__(
        self,
        model_name: Literal[
            "evo2-7b", "evo2-7b-base", "evo2-1b-base", "evo2-40b-base", "evo2-40b"
        ] = "evo2-7b",
        batch_size: int = 1,
    ):
        # model specific parameters
        self.model_map = {
            "evo2-7b": {
                "model_name": "evo2_7b",
                "model_hf_name": "arcinstitute/evo2_7b",
                "default_embedding_layer": "blocks.31.mlp.l3",
                "vocab_size": 512,
                "hidden_size": 4096,
                # Number of long convolution filters in each hyena block. Can be smaller than `hidden_size`
                "num_filters": 4096,
                "hcl_layer_idxs": [2, 6, 9, 13, 16, 20, 23, 27, 30],
                "hcm_layer_idxs": [1, 5, 8, 12, 15, 19, 22, 26, 29],
                "hcs_layer_idxs": [0, 4, 7, 11, 14, 18, 21, 25, 28],
                "attn_layer_idxs": [3, 10, 17, 24, 31],
                "hcm_filter_length": 128,
                "hcl_filter_groups": 4096,
                "hcm_filter_groups": 256,
                "hcs_filter_groups": 256,
                "hcs_filter_length": 7,
                "num_layers": 32,
                # Length of the short, depthwise FIR applied to input projections
                "short_filter_length": 3,
                "num_attention_heads": 32,
                "short_filter_bias": False,  # add bias to FIR
                "mlp_init_method": "torch.nn.init.zeros_",
                "mlp_output_init_method": "torch.nn.init.zeros_",
                "eps": 0.000001,
                "state_size": 16,
                "rotary_emb_base": 100000000000,
                "rotary_emb_scaling_factor": 128,
                "use_interpolated_rotary_pos_emb": True,
                "make_vocab_size_divisible_by": 8,
                "inner_size_multiple_of": 16,  # force GLU inner_size to be a multiple of
                "inner_mlp_size": 11264,
                "log_intermediate_values": False,
                # Number of groups in GQA
                "proj_groups": 1,
                # Number of groups in the grouped Hyena filters
                "hyena_filter_groups": 1,
                # Split strategy for channels
                "column_split_hyena": False,
                "column_split": True,
                "interleave": True,
                # Use nn.Identity activation in layers > 0 (Evo 2 style)
                "evo2_style_activations": True,
                # Legacy options for MP / PP inference
                "model_parallel_size": 1,
                "pipe_parallel_size": 1,
                "tie_embeddings": True,
                "mha_out_proj_bias": True,
                "hyena_out_proj_bias": True,
                "hyena_flip_x1x2": False,
                "qkv_proj_bias": False,
                "use_fp8_input_projections": True,
                "max_seqlen": 1048576,
                "final_norm": True,
                "use_flash_attn": True,
                "use_flash_rmsnorm": False,
                "use_flash_depthwise": False,
                "use_flashfft": False,
                "use_laughing_hyena": False,
                "inference_mode": True,
                "prefill_style": "fft",
                "mlp_activation": "gelu",
                "print_activations": False,
            },
            "evo2-7b-base": {
                "model_name": "evo2_7b_base",
                "model_hf_name": "arcinstitute/evo2_7b_base",
                "default_embedding_layer": "blocks.31.mlp.l3",
                "vocab_size": 512,
                "hidden_size": 4096,
                "num_filters": 4096,
                "hcl_layer_idxs": [2, 6, 9, 13, 16, 20, 23, 27, 30],
                "hcm_layer_idxs": [1, 5, 8, 12, 15, 19, 22, 26, 29],
                "hcs_layer_idxs": [0, 4, 7, 11, 14, 18, 21, 25, 28],
                "attn_layer_idxs": [3, 10, 17, 24, 31],
                "hcm_filter_length": 128,
                "hcl_filter_groups": 4096,
                "hcm_filter_groups": 256,
                "hcs_filter_groups": 256,
                "hcs_filter_length": 7,
                "num_layers": 32,
                "short_filter_length": 3,
                "num_attention_heads": 32,
                "short_filter_bias": False,
                "mlp_init_method": "torch.nn.init.zeros_",
                "mlp_output_init_method": "torch.nn.init.zeros_",
                "eps": 0.000001,
                "state_size": 16,
                "rotary_emb_base": 10000,
                "make_vocab_size_divisible_by": 8,
                "inner_size_multiple_of": 16,
                "inner_mlp_size": 11008,
                "log_intermediate_values": False,
                "proj_groups": 1,
                "hyena_filter_groups": 1,
                "column_split_hyena": False,
                "column_split": True,
                "interleave": True,
                "evo2_style_activations": True,
                "model_parallel_size": 1,
                "pipe_parallel_size": 1,
                "tie_embeddings": True,
                "mha_out_proj_bias": True,
                "hyena_out_proj_bias": True,
                "hyena_flip_x1x2": False,
                "qkv_proj_bias": False,
                "use_fp8_input_projections": False,
                "max_seqlen": 32768,
                "max_batch_size": 1,
                "final_norm": True,
                "use_flash_attn": True,
                "use_flash_rmsnorm": False,
                "use_flash_depthwise": False,
                "use_flashfft": False,
                "use_laughing_hyena": False,
                "inference_mode": True,
                "tokenizer_type": "CharLevelTokenizer",
                "prefill_style": "fft",
                "mlp_activation": "gelu",
                "print_activations": False,
                "use_interpolated_rotary_pos_emb": True,
            },
            "evo2-1b-base": {
                "model_name": "evo2_1b_base",
                "model_hf_name": "arcinstitute/evo2_1b_base",
                "default_embedding_layer": "blocks.24.mlp.l3",
                "vocab_size": 512,
                "hidden_size": 1920,
                # Number of independent filters in Hyena-LI
                "num_filters": 1920,
                "attn_layer_idxs": [3, 10, 17, 24],
                "hcl_layer_idxs": [2, 6, 9, 13, 16, 20, 23],
                "hcm_layer_idxs": [1, 5, 8, 12, 15, 19, 22],
                "hcs_layer_idxs": [0, 4, 7, 11, 14, 18, 21],
                "hcm_filter_length": 128,
                "hcl_filter_groups": 1920,
                "hcm_filter_groups": 128,
                "hcs_filter_groups": 128,
                "hcs_filter_length": 7,
                "num_layers": 25,
                # Length of the short, depthwise FIR applied to input projections
                "short_filter_length": 3,
                "num_attention_heads": 15,
                "short_filter_bias": False,  # add bias to FIR
                "mlp_init_method": "torch.nn.init.zeros_",
                "mlp_output_init_method": "torch.nn.init.zeros_",
                "eps": 0.000001,
                "state_size": 16,
                "rotary_emb_base": 10000,
                "make_vocab_size_divisible_by": 8,
                "inner_size_multiple_of": 16,  # force GLU inner_size to be a multiple of
                "inner_mlp_size": 5120,
                "log_intermediate_values": False,
                # Number of groups in GQA
                "proj_groups": 1,
                # Number of groups in the grouped Hyena filters
                "hyena_filter_groups": 1,
                # Split strategy for channels
                "column_split_hyena": False,
                "column_split": True,
                "interleave": True,
                # Use nn.Identity activation in layers > 0 (Evo 2 style)
                "evo2_style_activations": True,
                # Legacy options for MP / PP inference
                "model_parallel_size": 1,
                "pipe_parallel_size": 1,
                "tie_embeddings": True,
                "mha_out_proj_bias": True,
                "hyena_out_proj_bias": True,
                "hyena_flip_x1x2": False,
                "qkv_proj_bias": False,
                "use_fp8_input_projections": True,
                "max_seqlen": 8192,
                "final_norm": True,
                "use_flash_attn": True,
                "use_flash_rmsnorm": False,
                "use_flash_depthwise": False,
                "use_flashfft": False,
                "use_laughing_hyena": False,
                "inference_mode": True,
                "prefill_style": "fft",
                "mlp_activation": "gelu",
                "print_activations": False,
            },
            "evo2-40b-base": {
                "model_name": "evo2_40b_base",
                "model_hf_name": "arcinstitute/evo2_40b_base",
                "default_embedding_layer": "blocks.49.mlp.l3",
                "vocab_size": 512,
                "hidden_size": 8192,
                "num_filters": 8192,
                "hcl_layer_idxs": [2, 6, 9, 13, 16, 20, 23, 27, 30, 34, 38, 41, 45, 48],
                "hcm_layer_idxs": [1, 5, 8, 12, 15, 19, 22, 26, 29, 33, 37, 40, 44, 47],
                "hcs_layer_idxs": [0, 4, 7, 11, 14, 18, 21, 25, 28, 32, 36, 39, 43, 46],
                "attn_layer_idxs": [3, 10, 17, 24, 31, 35, 42, 49],
                "hcm_filter_length": 128,
                "hcl_filter_groups": 8192,
                "hcm_filter_groups": 512,
                "hcs_filter_groups": 512,
                "hcs_filter_length": 7,
                "num_layers": 50,
                "short_filter_length": 3,
                "num_attention_heads": 64,
                "short_filter_bias": False,
                "mlp_init_method": "torch.nn.init.zeros_",
                "mlp_output_init_method": "torch.nn.init.zeros_",
                "eps": 0.000001,
                "state_size": 16,
                "rotary_emb_base": 1000000,
                "make_vocab_size_divisible_by": 8,
                "inner_size_multiple_of": 128,
                "inner_mlp_size": 21888,
                "log_intermediate_values": False,
                "proj_groups": 1,
                "hyena_filter_groups": 1,
                "column_split_hyena": False,
                "column_split": True,
                "interleave": True,
                "evo2_style_activations": True,
                "use_fp8_input_projections": True,
                "model_parallel_size": 1,
                "pipe_parallel_size": 1,
                "tie_embeddings": True,
                "mha_out_proj_bias": True,
                "hyena_out_proj_bias": True,
                "hyena_flip_x1x2": False,
                "qkv_proj_bias": False,
                "max_seqlen": 8192,
                "max_batch_size": 1,
                "final_norm": True,
                "use_flash_attn": True,
                "use_flash_rmsnorm": False,
                "use_flash_depthwise": False,
                "use_flashfft": False,
                "use_laughing_hyena": False,
                "inference_mode": True,
                "prefill_style": "fft",
                "mlp_activation": "gelu",
                "print_activations": False,
            },
            "evo2-40b": {
                "model_name": "evo2_40b",
                "model_hf_name": "arcinstitute/evo2_40b",
                "default_embedding_layer": "blocks.49.mlp.l3",
                "vocab_size": 512,
                "hidden_size": 8192,
                "num_filters": 8192,
                "hcl_layer_idxs": [2, 6, 9, 13, 16, 20, 23, 27, 30, 34, 38, 41, 45, 48],
                "hcm_layer_idxs": [1, 5, 8, 12, 15, 19, 22, 26, 29, 33, 37, 40, 44, 47],
                "hcs_layer_idxs": [0, 4, 7, 11, 14, 18, 21, 25, 28, 32, 36, 39, 43, 46],
                "attn_layer_idxs": [3, 10, 17, 24, 31, 35, 42, 49],
                "hcm_filter_length": 128,
                "hcl_filter_groups": 8192,
                "hcm_filter_groups": 512,
                "hcs_filter_groups": 512,
                "hcs_filter_length": 7,
                "num_layers": 50,
                "short_filter_length": 3,
                "num_attention_heads": 64,
                "short_filter_bias": False,
                "mlp_init_method": "torch.nn.init.zeros_",
                "mlp_output_init_method": "torch.nn.init.zeros_",
                "eps": 0.000001,
                "state_size": 16,
                "rotary_emb_base": 100000000000,
                "rotary_emb_scaling_factor": 128,
                "use_interpolated_rotary_pos_emb": True,
                "make_vocab_size_divisible_by": 8,
                "inner_size_multiple_of": 128,  # force GLU inner_size to be a multiple of
                "inner_mlp_size": 22528,
                "log_intermediate_values": False,
                "proj_groups": 1,
                "hyena_filter_groups": 1,
                "column_split_hyena": False,
                "column_split": True,
                "interleave": True,
                "evo2_style_activations": True,
                "use_fp8_input_projections": True,
                "model_parallel_size": 1,
                "pipe_parallel_size": 1,
                "tie_embeddings": True,
                "mha_out_proj_bias": True,
                "hyena_out_proj_bias": True,
                "hyena_flip_x1x2": False,
                "qkv_proj_bias": False,
                "max_seqlen": 1048576,
                "max_batch_size": 1,
                "final_norm": True,
                "use_flash_attn": True,
                "use_flash_rmsnorm": False,
                "use_flash_depthwise": False,
                "use_flashfft": False,
                "use_laughing_hyena": False,
                "inference_mode": True,
                "prefill_style": "fft",
                "mlp_activation": "gelu",
                "print_activations": False,
            },
        }
        if model_name not in self.model_map:
            raise ValueError(
                f"Model name '{model_name}' not found in available models: {list(self.model_map.keys())}"
            )

        self.config = {
            "model_map": self.model_map[model_name],
            "batch_size": batch_size,
            "device": "cuda",
        }
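
Any key of model_map is accepted at runtime; the Literal annotation is only a typing hint. A short sketch of the validation behaviour ("evo2-13b" is a deliberately invalid name, used here for illustration only):

    Evo2Config(model_name="evo2-40b-base")  # accepted: present in model_map
    Evo2Config(model_name="evo2-13b")       # raises ValueError: not an available model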