BLIP-2 x STA

2024-03-08

本文目的

BLIP-2的思想：对于多模态大模型，同时训练Image Encoder和LLM的计算成本实在太大。该文提出了一种新的思路，使用Q-Former这个模块来弥合Image Encoder和LLM之间的模态差距（gap）。
STA的思想：尽管现有的ANN2SNN转换方法适用于卷积网络，但新兴的Transformer模型引入了自注意力和测试时归一化等独特机制，带来了当前SNN难以实现的非因果、非线性交互。文章提出了一种无需训练的ANN到SNN的转换方法，该方法通过时空近似（STA）将ANN激活转换为时间脉冲序列，几乎保留了源模型的所有功能。
本文思想：BLIP-2的多模态对齐非常好用，但它是实现在ANN上的；我们现在要使用STA的思想，将BLIP-2的Image Encoder和LLM部分由ANN表示转换为SNN表示。

BLIP-2

.
├── Qformer.py
├── __init__.py
├── __pycache__
├── blip2.py
├── blip2_image_text_matching.py
├── blip2_opt.py
├── blip2_qformer.py
├── blip2_t5.py
├── blip2_t5_instruct.py
├── blip2_vicuna_instruct.py
├── modeling_llama.py
├── modeling_opt.py
└── modeling_t5.py

由于我只需要改LLM部分，那么涉及到的应该是opt和t5

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 2560, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 2560)
      (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (1): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (2): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (3): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (4): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (5): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (6): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (7): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (8): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (9): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (10): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (11): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (12): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (13): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (14): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (15): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (16): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (17): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (18): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (19): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (20): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (21): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (22): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (23): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (24): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (25): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (26): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (27): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (28): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (29): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (30): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (31): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
  )
  (lm_head): Linear(in_features=2560, out_features=50272, bias=False)
)

VisionTransformer(
  (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
  (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (transformer): Transformer(
    (resblocks): Sequential(
      (0): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (1): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (2): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (3): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (4): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (5): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (6): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (7): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (8): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (9): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (10): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (11): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

OPTDecoder:

(model): OPTModel(
  (decoder): OPTDecoder(
    (embed_tokens): Embedding(50272, 2560, padding_idx=1)
    (embed_positions): OPTLearnedPositionalEmbedding(2050, 2560)
    (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
    (layers): ModuleList(
      (0): OPTDecoderLayer(
        (self_attn): OPTAttention(
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
        )
        (activation_fn): ReLU()
        (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=2560, out_features=10240, bias=True)
        (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
      )

linear
ReLU

vit:

(transformer): Transformer(
  (resblocks): Sequential(
    (0): ResidualAttentionBlock(
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (c_fc): Linear(in_features=768, out_features=3072, bias=True)
        (gelu): QuickGELU()
        (c_proj): Linear(in_features=3072, out_features=768, bias=True)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )

Linear
LN
GELU

gnn:

(transformer): Transformer(
  (resblocks): Sequential(
    (0): ResidualAttentionBlock(
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (c_fc): Linear(in_features=768, out_features=3072, bias=True)
        (gelu): Distilled_GELU(
          (approximator): Sequential(
            (0): Linear(in_features=1, out_features=64, bias=True)
            (1): ReLU()
            (2): Linear(in_features=64, out_features=1, bias=True)
          )
        )
        (c_proj): Linear(in_features=3072, out_features=768, bias=True)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )

snn:

(2): ResidualAttentionBlock(
  (attn): SpikeAttention(
    (product): SpikeProduct()
    (spike_x2x): X2X(
      (approximator): Sequential(
        (0): SpikeLinear_ReLU(
          (relu): ReLU()
        )
        (1): StraightThrough()
        (2): SpikeLinear_ReLU(
          (relu): StraightThrough()
        )
      )
    )
    (spike_x2x_pos): X2X_POS(
      (approximator): Sequential(
        (0): SpikeLinear_ReLU(
          (relu): ReLU()
        )
        (1): StraightThrough()
      )
    )
  )
  (ln_1): SpikeLN(
    (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (spike_sqrtinv): Distilled_SQRTINV(
      (approximator): Sequential(
        (0): SpikeLinear_ReLU(
          (relu): ReLU()
        )
        (1): StraightThrough()
        (2): SpikeLinear_ReLU(
          (relu): StraightThrough()
        )
      )
    )
    (spike_x2x): X2X(
      (approximator): Sequential(
        (0): SpikeLinear_ReLU(
          (relu): ReLU()
        )
        (1): StraightThrough()
        (2): SpikeLinear_ReLU(
          (relu): StraightThrough()
        )
      )
    )
  )
  (mlp): Sequential(
    (c_fc): SpikeLinear_ReLU(
      (relu): StraightThrough()
    )
    (gelu): Distilled_GELU(
      (approximator): Sequential(
        (0): SpikeLinear_ReLU(
          (relu): ReLU()
        )
        (1): StraightThrough()
        (2): SpikeLinear_ReLU(
          (relu): StraightThrough()
        )
      )
    )
    (c_proj): SpikeLinear_ReLU(
      (relu): StraightThrough()
    )
  )
  (ln_2): SpikeLN(
    (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (spike_sqrtinv): Distilled_SQRTINV(
      (approximator): Sequential(
        (0): SpikeLinear_ReLU(
          (relu): ReLU()
        )
        (1): StraightThrough()
        (2): SpikeLinear_ReLU(
          (relu): StraightThrough()
        )
      )
    )
    (spike_x2x): X2X(
      (approximator): Sequential(
        (0): SpikeLinear_ReLU(
          (relu): ReLU()
        )
        (1): StraightThrough()
        (2): SpikeLinear_ReLU(
          (relu): StraightThrough()
        )
      )
    )
  )
)

下面是经过如下操作后的代码：

mse = False if args.method =='normal' else True
    get_maximum_activation(train_loader, model=snn, momentum=0.9, iters=args.iters, mse=mse, percentile=args.percentile, T=args.T, neuron_wise=args.neuron_wise)

    torch.set_num_threads(10)


    
    snn.set_spike_state(use_spike=True)

(2): ResidualAttentionBlock(
          (attn): SpikeAttention(
            (product): SpikeProduct()
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x_pos): X2X_POS(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
              )
            )
          )
          (ln_1): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
          (mlp): Sequential(
            (c_fc): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
            (gelu): Distilled_GELU(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (c_proj): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
          )
          (ln_2): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
        )

实验部分

BLIP-2 在各种零样本视觉语言任务上的结果概述。与以前最先进的模型相比。 BLIP-2 实现了最高的零样本性能，同时在视觉语言预训练期间需要最少数量的可训练参数。

零样本视觉问答与最先进方法的比较。

零样本 VQA。我们对零样本视觉问答任务进行定量评估。对于 OPT 模型，我们使用提示“Question: {} Answer:”。对于 FlanT5 模型，我们使用提示“Question: {} Short answer:”。在生成过程中，我们使用波束宽度为 5 的波束搜索。我们还将长度惩罚设置为 -1，这鼓励更短的答案，更好地与人工注释保持一致。

如表 2 所示。BLIP-2 在 VQAv2 (Goyal et al., 2017) 和 GQA (Hudson & Manning, 2019) 数据集上取得了最先进的结果。 尽管可训练参数少了 54 倍，但它在 VQAv2 上的性能比 Flamingo80B 高出 8.7%。在 OK-VQA（Marino 等人，2019）数据集上，BLIP-2 仅次于 Flamingo80B。我们假设这是因为 OK-VQA 更注重开放世界知识而不是视觉理解，而来自 Flamingo80B 的 70B Chinchilla (Hoffmann et al., 2022) 语言模型比 11B FlanT5XXL 拥有更多的知识。

我们从表 2 中得出了一个有希望的观察结果：更强的图像编码器或更强的 LLM 都会带来更好的性能。这一观察结果得到了几个事实的支持：(1) ViT-g 在 OPT 和 FlanT5 方面均优于 ViT-L。 (2) 在同一LLM系列中，较大的模型优于较小的模型。 (3) FlanT5，一种指令调整的 LLM，在 VQA 上优于无监督训练的 OPT。这一观察结果验证了 BLIP-2 作为一种通用的视觉语言预训练方法，可以有效地收获视觉和自然语言社区的快速进步。

表 3：NoCaps 和 COCO Caption 上最先进的图像字幕方法的比较。所有方法都优化微调过程中的交叉熵损失。 C: CIDEr, S: SPICE, B@4: BLEU@4

我们针对图像字幕任务对 BLIP-2 模型进行了微调，该任务要求模型为图像的视觉内容生成文本描述。我们使用提示“a photo of”作为 LLM 的初始输入，并训练模型生成具有语言建模损失的标题。我们在微调期间保持 LLM 冻结，并与图像编码器一起更新 Q-Former 的参数。我们用 ViT-g 和各种 LLMs 进行实验。详细的超参数可以在附录中找到。我们对 COCO 进行微调，并对 COCO 测试集和零样本转移到 NoCaps（Agrawal 等人，2019）验证集进行评估。

结果如表 3 所示。BLIP-2 实现了最先进的性能，与现有方法相比，在 NoCaps 上有了显著改进，展示了对域外图像的强大泛化能力。

表 4：与针对视觉问答进行微调的最先进模型的比较。

给定带注释的 VQA 数据，我们微调 Q-Former 和图像编码器的参数，同时保持 LLM 冻结。我们使用开放式答案生成损失进行微调，其中 LLM 接收 Q-Former 的输出和问题作为输入，并被要求生成答案。为了提取与问题更相关的图像特征，我们还让 Q-Former 以问题为条件。具体来说，问题标记作为 Q-Former 的输入给出，并通过自注意力层与查询进行交互，这可以引导 Q-Former 的交叉注意力层关注信息更丰富的图像区域。

遵循 BLIP，我们的 VQA 数据包括来自 VQAv2 的训练和验证分割，以及来自 Visual Genome 的训练样本。表 4 显示 BLIP-2 在开放式生成模型中取得了最先进的结果。

表 5：与最先进的图像文本检索方法的比较，在 COCO 上进行微调，并零样本传输到 Flickr30K。

由于图像文本检索不涉及语言生成，因此我们直接对第一阶段预训练模型进行微调，无需LLM。具体来说，我们使用与预训练相同的目标（即 ITC、ITM 和 ITG）在 COCO 上与 Q-Former 一起微调图像编码器。然后，我们在 COCO 和 Flickr30K 数据集上评估图像到文本检索和文本到图像检索的模型。在推理过程中，我们遵循 Li 等人 (2021, 2022) 的做法，首先根据图像文本特征相似性选择 k=128 个候选者，然后根据成对的 ITM 分数重新排名。我们尝试使用 ViT-L 和 ViT-g 作为图像编码器。详细的超参数可以在附录中找到。

表 6：基于图像的文本生成 (ITG) 损失通过强制查询提取与语言相关的视觉特征来提高图像文本检索性能。

ITC 和 ITM 损失对于图像文本检索至关重要，因为它们直接学习图像文本相似性。在表 6 中，我们表明 ITG（基于图像的文本生成）损失也有利于图像文本检索。这一结果支持了我们设计表示学习目标的直觉：ITG 损失强制查询提取与文本最相关的视觉特征，从而改善视觉语言对齐。

transformers导入与modeling_opt.py性能差别

整体来看，其实差不多

transformers导入：

2024-03-27 16:49:11,260 [INFO] Start training epoch 0, 2 iters per inner epoch.
/cyb/LAVIS/lavis/processors/randaugment.py:40: RuntimeWarning: overflow encountered in scalar negative
  offset = -low * scale
Train: data epoch: [0]  [0/2]  eta: 0:00:11  lr: 0.000001  loss: 4.7803  time: 5.9693  data: 0.0000  max mem: 8321
2024-03-27 16:49:17,694 [INFO] Reducer buckets have been rebuilt in this iteration.
Train: data epoch: [0]  [1/2]  eta: 0:00:03  lr: 0.000001  loss: 5.7962  time: 3.3131  data: 0.0000  max mem: 8714
Train: data epoch: [0] Total time: 0:00:06 (3.3137 s / it)
2024-03-27 16:49:17,888 [INFO] Averaged stats: lr: 0.0000  loss: 5.2883
2024-03-27 16:49:17,891 [INFO] No validation splits found.
2024-03-27 16:49:17,921 [INFO] Saving checkpoint at epoch 0 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327164/checkpoint_0.pth.
2024-03-27 16:49:18,958 [INFO] Start training
2024-03-27 16:49:19,008 [INFO] Start training epoch 1, 2 iters per inner epoch.
Train: data epoch: [1]  [0/2]  eta: 0:00:06  lr: 0.000098  loss: 4.2522  time: 3.2473  data: 0.0000  max mem: 8731
Train: data epoch: [1]  [1/2]  eta: 0:00:01  lr: 0.000098  loss: 6.3714  time: 1.8830  data: 0.0000  max mem: 8736
Train: data epoch: [1] Total time: 0:00:03 (1.8835 s / it)
2024-03-27 16:49:22,776 [INFO] Averaged stats: lr: 0.0001  loss: 5.3118
2024-03-27 16:49:22,778 [INFO] No validation splits found.
2024-03-27 16:49:22,808 [INFO] Saving checkpoint at epoch 1 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327164/checkpoint_1.pth.
2024-03-27 16:49:23,789 [INFO] Start training
2024-03-27 16:49:23,824 [INFO] Start training epoch 2, 2 iters per inner epoch.
Train: data epoch: [2]  [0/2]  eta: 0:00:06  lr: 0.000091  loss: 5.5883  time: 3.3091  data: 0.0000  max mem: 9036
Train: data epoch: [2]  [1/2]  eta: 0:00:01  lr: 0.000091  loss: 6.0959  time: 1.9160  data: 0.0000  max mem: 9533
Train: data epoch: [2] Total time: 0:00:03 (1.9168 s / it)
2024-03-27 16:49:27,658 [INFO] Averaged stats: lr: 0.0001  loss: 5.8421
2024-03-27 16:49:27,661 [INFO] No validation splits found.
2024-03-27 16:49:27,694 [INFO] Saving checkpoint at epoch 2 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327164/checkpoint_2.pth.
2024-03-27 16:49:29,868 [INFO] Start training
2024-03-27 16:49:29,904 [INFO] Start training epoch 3, 2 iters per inner epoch.
/cyb/LAVIS/lavis/processors/randaugment.py:40: RuntimeWarning: overflow encountered in scalar negative
  offset = -low * scale
Train: data epoch: [3]  [0/2]  eta: 0:00:06  lr: 0.000081  loss: 5.6558  time: 3.4074  data: 0.0000  max mem: 9534
Train: data epoch: [3]  [1/2]  eta: 0:00:02  lr: 0.000081  loss: 6.4650  time: 2.0847  data: 0.0000  max mem: 9555
Train: data epoch: [3] Total time: 0:00:04 (2.0856 s / it)
2024-03-27 16:49:34,076 [INFO] Averaged stats: lr: 0.0001  loss: 6.0604
2024-03-27 16:49:34,079 [INFO] No validation splits found.
2024-03-27 16:49:34,111 [INFO] Saving checkpoint at epoch 3 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327164/checkpoint_3.pth.
2024-03-27 16:49:36,305 [INFO] Start training
2024-03-27 16:49:36,340 [INFO] Start training epoch 4, 2 iters per inner epoch.
Train: data epoch: [4]  [0/2]  eta: 0:00:07  lr: 0.000069  loss: 4.9748  time: 3.5716  data: 0.0000  max mem: 9555
Train: data epoch: [4]  [1/2]  eta: 0:00:02  lr: 0.000069  loss: 6.2966  time: 2.1039  data: 0.0000  max mem: 9555
Train: data epoch: [4] Total time: 0:00:04 (2.1049 s / it)
2024-03-27 16:49:40,552 [INFO] Averaged stats: lr: 0.0001  loss: 5.6357
2024-03-27 16:49:40,562 [INFO] No validation splits found.
2024-03-27 16:49:40,647 [INFO] Saving checkpoint at epoch 4 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327164/checkpoint_4.pth.
2024-03-27 16:49:43,149 [INFO] Start training
2024-03-27 16:49:43,184 [INFO] Start training epoch 5, 2 iters per inner epoch.
/cyb/LAVIS/lavis/processors/randaugment.py:40: RuntimeWarning: overflow encountered in scalar negative
  offset = -low * scale
Train: data epoch: [5]  [0/2]  eta: 0:00:06  lr: 0.000055  loss: 4.6983  time: 3.4429  data: 0.0000  max mem: 9555
Train: data epoch: [5]  [1/2]  eta: 0:00:02  lr: 0.000055  loss: 7.9794  time: 2.1261  data: 0.0000  max mem: 9555
Train: data epoch: [5] Total time: 0:00:04 (2.1273 s / it)
2024-03-27 16:49:47,440 [INFO] Averaged stats: lr: 0.0001  loss: 6.3388
2024-03-27 16:49:47,443 [INFO] No validation splits found.
2024-03-27 16:49:47,475 [INFO] Saving checkpoint at epoch 5 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327164/checkpoint_5.pth.
2024-03-27 16:49:49,912 [INFO] Start training
2024-03-27 16:49:49,948 [INFO] Start training epoch 6, 2 iters per inner epoch.
Train: data epoch: [6]  [0/2]  eta: 0:00:07  lr: 0.000041  loss: 4.7854  time: 3.5252  data: 0.0000  max mem: 9555
Train: data epoch: [6]  [1/2]  eta: 0:00:02  lr: 0.000041  loss: 5.9835  time: 2.1675  data: 0.0000  max mem: 9555
Train: data epoch: [6] Total time: 0:00:04 (2.1683 s / it)
2024-03-27 16:49:54,286 [INFO] Averaged stats: lr: 0.0000  loss: 5.3845
2024-03-27 16:49:54,289 [INFO] No validation splits found.
2024-03-27 16:49:54,332 [INFO] Saving checkpoint at epoch 6 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327164/checkpoint_6.pth.
2024-03-27 16:49:56,880 [INFO] Start training
2024-03-27 16:49:56,914 [INFO] Start training epoch 7, 2 iters per inner epoch.
Train: data epoch: [7]  [0/2]  eta: 0:00:06  lr: 0.000029  loss: 4.6165  time: 3.4250  data: 0.0000  max mem: 9555
Train: data epoch: [7]  [1/2]  eta: 0:00:01  lr: 0.000029  loss: 6.2248  time: 1.9936  data: 0.0000  max mem: 9555
Train: data epoch: [7] Total time: 0:00:03 (1.9946 s / it)
2024-03-27 16:50:00,904 [INFO] Averaged stats: lr: 0.0000  loss: 5.4207
2024-03-27 16:50:00,908 [INFO] No validation splits found.
2024-03-27 16:50:00,967 [INFO] Saving checkpoint at epoch 7 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327164/checkpoint_7.pth.
2024-03-27 16:50:03,337 [INFO] Start training
2024-03-27 16:50:03,373 [INFO] Start training epoch 8, 2 iters per inner epoch.
/cyb/LAVIS/lavis/processors/randaugment.py:40: RuntimeWarning: overflow encountered in scalar negative
  offset = -low * scale
Train: data epoch: [8]  [0/2]  eta: 0:00:06  lr: 0.000019  loss: 5.0990  time: 3.4835  data: 0.0000  max mem: 9555
Train: data epoch: [8]  [1/2]  eta: 0:00:02  lr: 0.000019  loss: 6.2343  time: 2.1475  data: 0.0000  max mem: 9555
Train: data epoch: [8] Total time: 0:00:04 (2.1485 s / it)
2024-03-27 16:50:07,670 [INFO] Averaged stats: lr: 0.0000  loss: 5.6667
2024-03-27 16:50:07,674 [INFO] No validation splits found.
2024-03-27 16:50:07,705 [INFO] Saving checkpoint at epoch 8 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327164/checkpoint_8.pth.
2024-03-27 16:50:10,070 [INFO] Start training
2024-03-27 16:50:10,104 [INFO] Start training epoch 9, 2 iters per inner epoch.
Train: data epoch: [9]  [0/2]  eta: 0:00:06  lr: 0.000012  loss: 5.9895  time: 3.4054  data: 0.0000  max mem: 9556
Train: data epoch: [9]  [1/2]  eta: 0:00:02  lr: 0.000012  loss: 5.6200  time: 2.0422  data: 0.0000  max mem: 9556
Train: data epoch: [9] Total time: 0:00:04 (2.0434 s / it)
2024-03-27 16:50:14,192 [INFO] Averaged stats: lr: 0.0000  loss: 5.8048
2024-03-27 16:50:14,197 [INFO] No validation splits found.
2024-03-27 16:50:14,233 [INFO] Saving checkpoint at epoch 9 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327164/checkpoint_9.pth.
2024-03-27 16:50:16,610 [INFO] No validation splits found.
2024-03-27 16:50:16,610 [INFO] Training time 0:01:10

modeling_opt.py文件：

2024-03-27 16:58:03,450 [INFO] Start training epoch 0, 2 iters per inner epoch.
/cyb/LAVIS/lavis/processors/randaugment.py:40: RuntimeWarning: overflow encountered in scalar negative
  offset = -low * scale
Train: data epoch: [0]  [0/2]  eta: 0:00:12  lr: 0.000001  loss: 4.7803  time: 6.4565  data: 0.0000  max mem: 8321
2024-03-27 16:58:10,549 [INFO] Reducer buckets have been rebuilt in this iteration.
Train: data epoch: [0]  [1/2]  eta: 0:00:03  lr: 0.000001  loss: 5.7962  time: 3.6563  data: 0.0000  max mem: 8714
Train: data epoch: [0] Total time: 0:00:07 (3.6570 s / it)
2024-03-27 16:58:10,765 [INFO] Averaged stats: lr: 0.0000  loss: 5.2883
2024-03-27 16:58:10,770 [INFO] No validation splits found.
2024-03-27 16:58:10,815 [INFO] Saving checkpoint at epoch 0 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327165/checkpoint_0.pth.
2024-03-27 16:58:11,871 [INFO] Start training
2024-03-27 16:58:11,907 [INFO] Start training epoch 1, 2 iters per inner epoch.
Train: data epoch: [1]  [0/2]  eta: 0:00:06  lr: 0.000098  loss: 3.7817  time: 3.4140  data: 0.0000  max mem: 9036
Train: data epoch: [1]  [1/2]  eta: 0:00:02  lr: 0.000098  loss: 5.3590  time: 2.0641  data: 0.0000  max mem: 9555
Train: data epoch: [1] Total time: 0:00:04 (2.0650 s / it)
2024-03-27 16:58:16,037 [INFO] Averaged stats: lr: 0.0001  loss: 4.5704
2024-03-27 16:58:16,041 [INFO] No validation splits found.
2024-03-27 16:58:16,071 [INFO] Saving checkpoint at epoch 1 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327165/checkpoint_1.pth.
2024-03-27 16:58:18,357 [INFO] Start training
2024-03-27 16:58:18,399 [INFO] Start training epoch 2, 2 iters per inner epoch.
Train: data epoch: [2]  [0/2]  eta: 0:00:07  lr: 0.000091  loss: 4.9213  time: 3.6945  data: 0.0000  max mem: 9556
Train: data epoch: [2]  [1/2]  eta: 0:00:02  lr: 0.000091  loss: 6.5263  time: 2.1551  data: 0.0000  max mem: 9556
Train: data epoch: [2] Total time: 0:00:04 (2.1560 s / it)
2024-03-27 16:58:22,713 [INFO] Averaged stats: lr: 0.0001  loss: 5.7238
2024-03-27 16:58:22,717 [INFO] No validation splits found.
2024-03-27 16:58:22,765 [INFO] Saving checkpoint at epoch 2 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327165/checkpoint_2.pth.
2024-03-27 16:58:25,206 [INFO] Start training
2024-03-27 16:58:25,241 [INFO] Start training epoch 3, 2 iters per inner epoch.
Train: data epoch: [3]  [0/2]  eta: 0:00:06  lr: 0.000081  loss: 4.6143  time: 3.4231  data: 0.0000  max mem: 9556
Train: data epoch: [3]  [1/2]  eta: 0:00:01  lr: 0.000081  loss: 5.9560  time: 1.9856  data: 0.0000  max mem: 9556
Train: data epoch: [3] Total time: 0:00:03 (1.9866 s / it)
2024-03-27 16:58:29,215 [INFO] Averaged stats: lr: 0.0001  loss: 5.2851
2024-03-27 16:58:29,220 [INFO] No validation splits found.
2024-03-27 16:58:29,270 [INFO] Saving checkpoint at epoch 3 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327165/checkpoint_3.pth.
2024-03-27 16:58:31,729 [INFO] Start training
2024-03-27 16:58:31,772 [INFO] Start training epoch 4, 2 iters per inner epoch.
Train: data epoch: [4]  [0/2]  eta: 0:00:06  lr: 0.000069  loss: 4.4050  time: 3.3401  data: 0.0000  max mem: 9556
Train: data epoch: [4]  [1/2]  eta: 0:00:02  lr: 0.000069  loss: 5.5067  time: 2.1267  data: 0.0000  max mem: 9556
Train: data epoch: [4] Total time: 0:00:04 (2.1275 s / it)
2024-03-27 16:58:36,028 [INFO] Averaged stats: lr: 0.0001  loss: 4.9559
2024-03-27 16:58:36,031 [INFO] No validation splits found.
2024-03-27 16:58:36,064 [INFO] Saving checkpoint at epoch 4 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327165/checkpoint_4.pth.
2024-03-27 16:58:38,607 [INFO] Start training
2024-03-27 16:58:38,643 [INFO] Start training epoch 5, 2 iters per inner epoch.
Train: data epoch: [5]  [0/2]  eta: 0:00:07  lr: 0.000055  loss: 4.0671  time: 3.5544  data: 0.0000  max mem: 9556
Train: data epoch: [5]  [1/2]  eta: 0:00:02  lr: 0.000055  loss: 5.3782  time: 2.2167  data: 0.0000  max mem: 9556
Train: data epoch: [5] Total time: 0:00:04 (2.2181 s / it)
2024-03-27 16:58:43,080 [INFO] Averaged stats: lr: 0.0001  loss: 4.7227
2024-03-27 16:58:43,083 [INFO] No validation splits found.
2024-03-27 16:58:43,117 [INFO] Saving checkpoint at epoch 5 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327165/checkpoint_5.pth.
2024-03-27 16:58:45,519 [INFO] Start training
2024-03-27 16:58:45,555 [INFO] Start training epoch 6, 2 iters per inner epoch.
Train: data epoch: [6]  [0/2]  eta: 0:00:07  lr: 0.000041  loss: 4.1874  time: 3.5827  data: 0.0000  max mem: 9556
Train: data epoch: [6]  [1/2]  eta: 0:00:02  lr: 0.000041  loss: 3.9796  time: 2.1472  data: 0.0000  max mem: 9556
Train: data epoch: [6] Total time: 0:00:04 (2.1486 s / it)
2024-03-27 16:58:49,853 [INFO] Averaged stats: lr: 0.0000  loss: 4.0835
2024-03-27 16:58:49,856 [INFO] No validation splits found.
2024-03-27 16:58:49,889 [INFO] Saving checkpoint at epoch 6 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327165/checkpoint_6.pth.
2024-03-27 16:58:52,262 [INFO] Start training
2024-03-27 16:58:52,298 [INFO] Start training epoch 7, 2 iters per inner epoch.
Train: data epoch: [7]  [0/2]  eta: 0:00:06  lr: 0.000029  loss: 4.0438  time: 3.3334  data: 0.0000  max mem: 9556
Train: data epoch: [7]  [1/2]  eta: 0:00:02  lr: 0.000029  loss: 4.3911  time: 2.2152  data: 0.0000  max mem: 9556
Train: data epoch: [7] Total time: 0:00:04 (2.2175 s / it)
2024-03-27 16:58:56,737 [INFO] Averaged stats: lr: 0.0000  loss: 4.2175
2024-03-27 16:58:56,761 [INFO] No validation splits found.
2024-03-27 16:58:56,876 [INFO] Saving checkpoint at epoch 7 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327165/checkpoint_7.pth.
2024-03-27 16:58:59,494 [INFO] Start training
2024-03-27 16:58:59,534 [INFO] Start training epoch 8, 2 iters per inner epoch.
Train: data epoch: [8]  [0/2]  eta: 0:00:06  lr: 0.000019  loss: 3.9183  time: 3.3688  data: 0.0001  max mem: 9556
Train: data epoch: [8]  [1/2]  eta: 0:00:02  lr: 0.000019  loss: 4.3744  time: 2.1282  data: 0.0000  max mem: 9556
Train: data epoch: [8] Total time: 0:00:04 (2.1292 s / it)
2024-03-27 16:59:03,794 [INFO] Averaged stats: lr: 0.0000  loss: 4.1464
2024-03-27 16:59:03,797 [INFO] No validation splits found.
2024-03-27 16:59:03,830 [INFO] Saving checkpoint at epoch 8 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327165/checkpoint_8.pth.
2024-03-27 16:59:06,318 [INFO] Start training
2024-03-27 16:59:06,368 [INFO] Start training epoch 9, 2 iters per inner epoch.
Train: data epoch: [9]  [0/2]  eta: 0:00:06  lr: 0.000012  loss: 4.1729  time: 3.4050  data: 0.0000  max mem: 9556
Train: data epoch: [9]  [1/2]  eta: 0:00:02  lr: 0.000012  loss: 3.5403  time: 2.1497  data: 0.0000  max mem: 9556
Train: data epoch: [9] Total time: 0:00:04 (2.1513 s / it)
2024-03-27 16:59:10,672 [INFO] Averaged stats: lr: 0.0000  loss: 3.8566
2024-03-27 16:59:10,677 [INFO] No validation splits found.
2024-03-27 16:59:10,716 [INFO] Saving checkpoint at epoch 9 to /cyb/LAVIS/output/BLIP2/Pretrain_stage2/20240327165/checkpoint_9.pth.
2024-03-27 16:59:13,151 [INFO] No validation splits found.
2024-03-27 16:59:13,151 [INFO] Training time 0:01:13

dataloader

vg_caption

class ImageTextPairDataset(BaseDataset, __DisplMixin):
    """Dataset yielding one processed image together with its processed caption."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_processor: callable applied to the loaded RGB image.
        text_processor: callable applied to the raw caption.
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_paths: annotation location(s), forwarded unchanged to BaseDataset
            (exact type is defined by BaseDataset, not visible here).
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def __getitem__(self, index):
        """
        Load the sample at ``index``.

        Returns a dict with keys ``"image"`` (output of ``vis_processor``) and
        ``"text_input"`` (output of ``text_processor``).
        """
        # TODO this assumes image input, not general enough
        # NOTE(review): self.annotation / self.vis_root are presumably set by
        # BaseDataset.__init__ -- confirm against the base class.
        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"])
        # Use a context manager so the underlying file handle is released
        # right after decoding; convert() returns an independent image.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        image = self.vis_processor(image)
        caption = self.text_processor(ann["caption"])

        return {"image": image, "text_input": caption}

coco_caption

class CaptionDataset(BaseDataset, __DisplMixin):
    """Caption dataset that also indexes the distinct image ids it contains."""

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_processor: callable applied to the loaded RGB image.
        text_processor: callable applied to the raw caption.
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_paths: annotation location(s), forwarded unchanged to BaseDataset
            (exact type is defined by BaseDataset, not visible here).
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

        # Map each distinct annotation "image_id" to a dense index
        # (0, 1, 2, ...) in order of first appearance.
        self.img_ids = {}
        for ann in self.annotation:
            img_id = ann["image_id"]
            if img_id not in self.img_ids:
                # The next free index always equals the number of ids seen so far.
                self.img_ids[img_id] = len(self.img_ids)

    def __getitem__(self, index):
        """
        Load the sample at ``index``.

        Returns a dict with keys ``"image"`` (output of ``vis_processor``) and
        ``"text_input"`` (output of ``text_processor``).
        """
        # TODO this assumes image input, not general enough
        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"])
        # Use a context manager so the underlying file handle is released
        # right after decoding; convert() returns an independent image.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        image = self.vis_processor(image)
        caption = self.text_processor(ann["caption"])

        return {
            "image": image,
            "text_input": caption,
            # "image_id": self.img_ids[ann["image_id"]],
        }

model

SpikeModel(
  (model): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): SpikeAttention(
            (product): SpikeProduct()
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x_pos): X2X_POS(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
              )
            )
          )
          (ln_1): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
          (mlp): Sequential(
            (c_fc): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
            (gelu): Distilled_GELU(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (c_proj): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
          )
          (ln_2): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
        )
        (1): ResidualAttentionBlock(
          (attn): SpikeAttention(
            (product): SpikeProduct()
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x_pos): X2X_POS(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
              )
            )
          )
          (ln_1): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
          (mlp): Sequential(
            (c_fc): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
            (gelu): Distilled_GELU(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (c_proj): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
          )
          (ln_2): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
        )
        (2): ResidualAttentionBlock(
          (attn): SpikeAttention(
            (product): SpikeProduct()
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x_pos): X2X_POS(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
              )
            )
          )
          (ln_1): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
          (mlp): Sequential(
            (c_fc): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
            (gelu): Distilled_GELU(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (c_proj): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
          )
          (ln_2): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
        )
        (3): ResidualAttentionBlock(
          (attn): SpikeAttention(
            (product): SpikeProduct()
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x_pos): X2X_POS(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
              )
            )
          )
          (ln_1): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
          (mlp): Sequential(
            (c_fc): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
            (gelu): Distilled_GELU(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (c_proj): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
          )
          (ln_2): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
        )
        (4): ResidualAttentionBlock(
          (attn): SpikeAttention(
            (product): SpikeProduct()
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x_pos): X2X_POS(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
              )
            )
          )
          (ln_1): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
          (mlp): Sequential(
            (c_fc): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
            (gelu): Distilled_GELU(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (c_proj): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
          )
          (ln_2): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
        )
        (5): ResidualAttentionBlock(
          (attn): SpikeAttention(
            (product): SpikeProduct()
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x_pos): X2X_POS(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
              )
            )
          )
          (ln_1): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
          (mlp): Sequential(
            (c_fc): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
            (gelu): Distilled_GELU(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (c_proj): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
          )
          (ln_2): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
        )
        (6): ResidualAttentionBlock(
          (attn): SpikeAttention(
            (product): SpikeProduct()
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x_pos): X2X_POS(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
              )
            )
          )
          (ln_1): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
          (mlp): Sequential(
            (c_fc): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
            (gelu): Distilled_GELU(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (c_proj): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
          )
          (ln_2): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
        )
        (7): ResidualAttentionBlock(
          (attn): SpikeAttention(
            (product): SpikeProduct()
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x_pos): X2X_POS(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
              )
            )
          )
          (ln_1): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
          (mlp): Sequential(
            (c_fc): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
            (gelu): Distilled_GELU(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (c_proj): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
          )
          (ln_2): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
        )
        (8): ResidualAttentionBlock(
          (attn): SpikeAttention(
            (product): SpikeProduct()
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x_pos): X2X_POS(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
              )
            )
          )
          (ln_1): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
          (mlp): Sequential(
            (c_fc): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
            (gelu): Distilled_GELU(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (c_proj): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
          )
          (ln_2): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
        )
        (9): ResidualAttentionBlock(
          (attn): SpikeAttention(
            (product): SpikeProduct()
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x_pos): X2X_POS(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
              )
            )
          )
          (ln_1): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
          (mlp): Sequential(
            (c_fc): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
            (gelu): Distilled_GELU(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (c_proj): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
          )
          (ln_2): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
        )
        (10): ResidualAttentionBlock(
          (attn): SpikeAttention(
            (product): SpikeProduct()
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x_pos): X2X_POS(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
              )
            )
          )
          (ln_1): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
          (mlp): Sequential(
            (c_fc): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
            (gelu): Distilled_GELU(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (c_proj): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
          )
          (ln_2): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
        )
        (11): ResidualAttentionBlock(
          (attn): SpikeAttention(
            (product): SpikeProduct()
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x_pos): X2X_POS(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
              )
            )
          )
          (ln_1): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
          (mlp): Sequential(
            (c_fc): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
            (gelu): Distilled_GELU(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (c_proj): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
          )
          (ln_2): SpikeLN(
            (module): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (spike_sqrtinv): Distilled_SQRTINV(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
            (spike_x2x): X2X(
              (approximator): Sequential(
                (0): SpikeLinear_ReLU(
                  (relu): ReLU()
                )
                (1): StraightThrough()
                (2): SpikeLinear_ReLU(
                  (relu): StraightThrough()
                )
              )
            )
          )
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
)

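model2（SpikeModel 包装的 OPTForCausalLM，即转换后的 LLM 部分）

注意（待确认）：下面的输出中 out_proj 内部是 ReLU()，而 activation_fn 变成了 StraightThrough()，fc1 内部仍是 StraightThrough()。原始 OPT 中 ReLU 作用在 fc1 之后，因此这里的 ReLU 很可能被错误地融合进了 out_proj，需要检查转换代码中激活函数的融合逻辑。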

SpikeModel(
  (model): OPTForCausalLM(
    (model): OPTModel(
      (decoder): OPTDecoder(
        (embed_tokens): Embedding(50272, 2560, padding_idx=1)
        (embed_positions): OPTLearnedPositionalEmbedding(2050, 2560)
        (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (layers): ModuleList(
          (0-31): 32 x OPTDecoderLayer(
            (self_attn): OPTAttention(
              (k_proj): SpikeLinear_ReLU(
                (relu): StraightThrough()
              )
              (v_proj): SpikeLinear_ReLU(
                (relu): StraightThrough()
              )
              (q_proj): SpikeLinear_ReLU(
                (relu): StraightThrough()
              )
              (out_proj): SpikeLinear_ReLU(
                (relu): ReLU()
              )
            )
            (activation_fn): StraightThrough()
            (self_attn_layer_norm): SpikeLN(
              (module): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
              (spike_sqrtinv): Distilled_SQRTINV(
                (approximator): Sequential(
                  (0): SpikeLinear_ReLU(
                    (relu): ReLU()
                  )
                  (1): StraightThrough()
                  (2): SpikeLinear_ReLU(
                    (relu): StraightThrough()
                  )
                )
              )
              (spike_x2x): X2X(
                (approximator): Sequential(
                  (0): SpikeLinear_ReLU(
                    (relu): ReLU()
                  )
                  (1): StraightThrough()
                  (2): SpikeLinear_ReLU(
                    (relu): StraightThrough()
                  )
                )
              )
            )
            (fc1): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
            (fc2): SpikeLinear_ReLU(
              (relu): StraightThrough()
            )
            (final_layer_norm): SpikeLN(
              (module): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
              (spike_sqrtinv): Distilled_SQRTINV(
                (approximator): Sequential(
                  (0): SpikeLinear_ReLU(
                    (relu): ReLU()
                  )
                  (1): StraightThrough()
                  (2): SpikeLinear_ReLU(
                    (relu): StraightThrough()
                  )
                )
              )
              (spike_x2x): X2X(
                (approximator): Sequential(
                  (0): SpikeLinear_ReLU(
                    (relu): ReLU()
                  )
                  (1): StraightThrough()
                  (2): SpikeLinear_ReLU(
                    (relu): StraightThrough()
                  )
                )
              )
            )
          )
        )
      )
    )
    (lm_head): Linear(in_features=2560, out_features=50272, bias=False)
  )
)