# if decoder, the last output is tuple of self-attn cache if self.is_decoder: outputs = self_attention_outputs[1:-1] present_key_value = self_attention_outputs[-1] else: outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
cross_attn_present_key_value = None if self.is_decoder and encoder_hidden_states isnotNone: ifnothasattr(self, "crossattention"): raise ValueError( f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" " by setting `config.add_cross_attention=True`" )
# cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple cross_attn_past_key_value = past_key_value[-2:] if past_key_value isnotNoneelseNone cross_attention_outputs = self.crossattention( attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, cross_attn_past_key_value, output_attentions, ) attention_output = cross_attention_outputs[0] outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
# add cross-attn cache to positions 3,4 of present_key_value tuple cross_attn_present_key_value = cross_attention_outputs[-1] present_key_value = present_key_value + cross_attn_present_key_value
# if self.gradient_checkpointing and self.training: # if use_cache: # logger.warning_once( # "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." # ) # use_cache = False
next_decoder_cache = () if use_cache elseNone
###################### hidden_states = hidden_states.repeat(tuple([4] + torch.ones(len(hidden_states.size()), dtype=int).tolist())) # T B L D # hidden_states = hidden_states.transpose(0, 1) # B T L D ###################### for i, layer_module inenumerate(self.layer): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask isnotNoneelseNone past_key_value = past_key_values[i] if past_key_values isnotNoneelseNone
# if self.gradient_checkpointing and self.training:
ifnot return_dict: returntuple( v for v in [ hidden_states, next_decoder_cache, all_hidden_states, all_self_attentions, None, ] if v isnotNone ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, ########################## all_hidden_states attentions=all_self_attentions, cross_attentions=None, )
class PSN(nn.Module, base.MultiStepModule):
    # NOTE(review): only the constructor is visible in this excerpt; the rest
    # of the class (e.g. its forward pass) is not shown here.
    def __init__(self, T: int, surrogate_function: surrogate.SurrogateFunctionBase = surrogate.ATan()):
        """
        :param T: the number of time-steps
        :type T: int
        :param surrogate_function: the function for calculating surrogate gradients of the heaviside step function in backward
        :type surrogate_function: Callable

        .. admonition:: Note
            :class: note

            The PSN only supports the multi-step mode.
        """
        super().__init__()
        self.T = T
        self.surrogate_function = surrogate_function
        # Zero-initialised [T, T] and [T, 1] tensors.
        # NOTE(review): neither local is used in the visible excerpt; presumably
        # they are turned into learnable parameters in code not shown here --
        # confirm against the full file before relying on this.
        weight = torch.zeros([T, T])
        bias = torch.zeros([T, 1])
def __init__(self, k: int, T: int, lambda_init: float = 0., surrogate_function: surrogate.SurrogateFunctionBase = surrogate.ATan(), step_mode: str = 's'):
    """
    :param k: the order of the Masked PSN
    :type k: int
    :param T: the number of time-steps
    :type T: int
    :param lambda_init: the initial value of :math:`\\lambda` to adjust the progressive masking process
    :type lambda_init: float
    :param surrogate_function: the function for calculating surrogate gradients of the heaviside step function in backward
    :type surrogate_function: Callable
    :param step_mode: the step mode, which can be `s` (single-step) or `m` (multi-step)
    :type step_mode: str

    .. admonition:: Note
        :class: note

        The masked PSN supports both single-step and multi-step mode. But using the multi-step mode is much faster than the single-step mode.
    """
    super().__init__()
    # Per-instance state registered through the base class: a step counter
    # starting at 0 and an initially empty queue.
    self.register_memory('time_step', 0)
    self.register_memory('queue', [])
    self.step_mode = step_mode
    self.k = k
    self.T = T
    self.surrogate_function = surrogate_function
    # Zero-initialised [T, T] and [T, 1] tensors.
    # NOTE(review): neither local is used in the visible excerpt; presumably
    # they become parameters in code not shown here -- confirm.
    weight = torch.zeros([T, T])
    bias = torch.zeros([T, 1])
    # lambda is stored as a buffer (saved with the module state, not trained).
    self.register_buffer('_lambda_', torch.as_tensor(lambda_init))
def gen_gemm_weight(self, T: int):
    """Expand the length-``k`` kernel ``self.weight`` into a ``[T, T]`` matrix.

    Row ``i`` holds the last ``min(i + 1, k)`` entries of ``self.weight`` in
    columns ``max(0, i + 1 - k) .. i``, so the final kernel entry always lands
    on the diagonal; every other entry of the row is zero. The result is
    therefore lower-triangular and banded with bandwidth ``k``.

    :param T: the number of rows/columns of the generated matrix
    :type T: int
    :return: a new ``[T, T]`` tensor on the same device as ``self.weight``
    :rtype: torch.Tensor
    """
    weight = torch.zeros([T, T], device=self.weight.device)
    for i in range(T):
        end = i + 1
        start = max(0, i + 1 - self.k)
        # Rows near the top have fewer than k columns available, so only the
        # trailing `length` kernel entries are copied in.
        length = min(end - start, self.k)
        weight[i, start:end] = self.weight[self.k - length: self.k]

    return weight
def __init__(self, k: int, exp_init: bool = True, surrogate_function: surrogate.SurrogateFunctionBase = surrogate.ATan(), step_mode: str = 's', backend: str = 'gemm'):
    """
    :param k: the order of the Sliding PSN
    :type k: int
    :param exp_init: if ``True``, the weight will be initialized as ``(..., 1/4, 1/2, 1)``. If ``False``, the weight will be initialized by the kaiming uniform
    :type exp_init: bool
    :param surrogate_function: the function for calculating surrogate gradients of the heaviside step function in backward
    :type surrogate_function: Callable
    :param step_mode: the step mode, which can be `s` (single-step) or `m` (multi-step)
    :type step_mode: str
    :param backend: backend for this neuron layer, which can be "gemm" or "conv". This option only works for the multi-step mode
    :type backend: str

    .. admonition:: Note
        :class: note

        The Sliding PSN supports both single-step and multi-step mode. But using the multi-step mode is much faster than the single-step mode.
    """
    # NOTE(review): the constructor body is not part of this excerpt; only the
    # signature and docstring are visible here.
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.50 GiB (GPU 6; 23.70 GiB total capacity; 22.28 GiB already allocated; 231.69 MiB free; 22.41 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.50 GiB (GPU 4; 23.70 GiB total capacity; 20.87 GiB already allocated; 1.39 GiB free; 21.00 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.50 GiB (GPU 0; 23.70 GiB total capacity; 20.98 GiB already allocated; 1.27 GiB free; 21.12 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.50 GiB (GPU 3; 23.70 GiB total capacity; 20.82 GiB already allocated; 1.43 GiB free; 20.96 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.50 GiB (GPU 7; 23.70 GiB total capacity; 22.27 GiB already allocated; 235.69 MiB free; 22.41 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.50 GiB (GPU 3; 23.70 GiB total capacity; 20.82 GiB already allocated; 1.43 GiB free; 20.96 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.50 GiB (GPU 4; 23.70 GiB total capacity; 22.27 GiB already allocated; 235.69 MiB free; 22.41 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.00 GiB (GPU 3; 23.70 GiB total capacity; 19.34 GiB already allocated; 2.94 GiB free; 19.45 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF