InternLM · blankde · Nov 26, 2024 · Dec 5, 2024 · Dec 13, 2024 · Dec 18, 2024
diff --git a/internlm/model/modeling_moe.py b/internlm/model/modeling_moe.py
@@ -3,6 +3,7 @@
 
 import math
 from typing import Optional
+from einops import rearrange
 
 import torch
 from torch import nn
@@ -26,6 +27,8 @@
 from internlm.solver.activation_checkpoint import activation_checkpoint
 from internlm.utils.logger import get_logger
 
+from internlm.model.moe.ampipe.ampipe import AttMoEPipe, bias_dropout_add_fused_train
+
 logger = get_logger(__file__)
 
 
@@ -217,30 +220,48 @@ def _dropout_and_norm_attn(_hidden_states):
             residual = residual.to(torch.float32)
 
         mixer_kwargs = convert_attn_args_to_kwargs(args, kwargs)
-        hidden_states = self.mixer(hidden_states, **mixer_kwargs)
-
-        def _dropout_and_norm_ffn(_residual, _hidden_states):
-            _dropped = self.dropout2(_hidden_states)
-            _residual = (_dropped + _residual) if _residual is not None else _dropped
-            _hidden_states = self.norm2(_residual.float())
-            return _residual, _hidden_states
+        if gpc.config.model.ampipe_degree < 1:
+            hidden_states = self.mixer(hidden_states, **mixer_kwargs)
+
+            def _dropout_and_norm_ffn(_residual, _hidden_states):
+                _dropped = self.dropout2(_hidden_states)
+                _residual = (_dropped + _residual) if _residual is not None else _dropped
+                _hidden_states = self.norm2(_residual.float())
+                return _residual, _hidden_states
+
+            if self.dropout_selective_checkpoint:
+                residual, hidden_states = activation_checkpoint(_dropout_and_norm_ffn, False, residual, hidden_states)
+            else:
+                residual, hidden_states = _dropout_and_norm_ffn(residual, hidden_states)
+
+            if self.residual_in_fp32:
+                residual = residual.to(torch.float32)
+
+            # MLP.
+            if self.num_experts <= 1:  # dense mlp output
+                hidden_states = self.mlp(hidden_states)
+                moe_loss = torch.tensor(0.0, device=hidden_states.device, dtype=hidden_states.dtype)
+            else:  # MoE output
+                hidden_states, moe_loss, _ = self.mlp(hidden_states)
 
-        if self.dropout_selective_checkpoint:
-            residual, hidden_states = activation_checkpoint(_dropout_and_norm_ffn, False, residual, hidden_states)
         else:
-            residual, hidden_states = _dropout_and_norm_ffn(residual, hidden_states)
-
-        if self.residual_in_fp32:
-            residual = residual.to(torch.float32)
+            mixer_kwargs["skip_score"] = True
+            q, k, v = self.mixer(hidden_states, **mixer_kwargs)
 
-        # MLP.
-        if self.num_experts <= 1:  # dense mlp output
-            hidden_states = self.mlp(hidden_states)
-            moe_loss = torch.tensor(0.0, device=hidden_states.device, dtype=hidden_states.dtype)
-        else:  # MoE output
-            hidden_states, moe_loss, _ = self.mlp(hidden_states)
+            flash = self.mixer.inner_attn
+            dense_layer = self.mixer.out_proj
+            ln = self.norm2
+            k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [k, v]]
+            # torch.cuda.synchronize()
+            # breakpoint()
+            # print(q, flush=True)
+            hidden_states, residual = AttMoEPipe.apply(q, k, v, hidden_states,
+                                                        ln.weight, None, dense_layer.bias,
+                                                        [flash, dense_layer,
+                                                        gpc.config.model.ampipe_degree, ln, self.dropout2.p, \
+                                                        self.mlp.moe_layer])
 
-        return hidden_states + residual, moe_loss
+        return hidden_states + residual, None
 
 
 class Internlm1MoE(BaseModel):
@@ -315,6 +336,7 @@ def __init__(
         top_k: int = 1,
         num_shared_experts: int = 0,
         moe_layer_kwargs: dict = None,
+        ampipe_degree: str = None,  # pylint: disable=W0613
     ):
         super().__init__()
 

diff --git a/internlm/model/modules/linear.py b/internlm/model/modules/linear.py
@@ -555,6 +555,86 @@ def fused_dense_func(
             )
 
 
+def explicit_fused_dense_forward(
+    ctx,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    communicator: Union[TPCommunicator, WPCommunicator],
+    module: Optional[nn.Module] = None,
+    bias: Optional[torch.Tensor] = None,
+    return_residual: bool = False,
+    use_grouped_linear: bool = False,
+    **kwargs,
+):
+    """
+    Invoke the ``forward`` of the matching fused dense function directly on a caller-supplied ``ctx``.
+
+    The implementation is selected from two switches: whether
+    ``communicator.communication_mode()`` equals ``"wp"`` and whether
+    ``use_grouped_linear`` is set.
+
+    Args:
+        ctx: context object handed straight to the selected ``forward``.
+        x: input tensor.
+        weight: weight tensor.
+        communicator: communicator whose ``communication_mode()`` selects the implementation;
+            it is also forwarded to every implementation except the grouped non-"wp" one.
+        module: forwarded only on the "wp" paths.
+        bias: forwarded only on the non-grouped paths.
+        return_residual: forwarded only on the non-grouped paths.
+        use_grouped_linear: choose the grouped-gemm implementations.
+        **kwargs: grouped-linear options; ``batch_sizes`` (default ``None``) and ``backend``
+            (default ``"gmm"``) are read on both grouped paths, ``full_weight_shape``
+            (default ``None``) only on the "wp" grouped path.
+
+    Returns:
+        Whatever the selected ``forward`` returns.
+    """
+    is_weight_parallel = communicator.communication_mode() == "wp"
+
+    if use_grouped_linear:
+        group_sizes = kwargs.pop("batch_sizes", None)
+        gemm_backend = kwargs.pop("backend", "gmm")
+        if is_weight_parallel:
+            weight_shape = kwargs.pop("full_weight_shape", None)
+            return GroupedGemmWPFusedDenseFunc.forward(
+                ctx, x, weight, module, communicator, group_sizes, gemm_backend, weight_shape
+            )
+        # TODO: support grouped linear for mtp, msp, and fsp
+        return GroupedGemmSPFusedDenseFunc.forward(ctx, x, weight, group_sizes, gemm_backend)
+
+    if is_weight_parallel:
+        return WPFusedDenseFunc.forward(ctx, x, weight, bias, module, communicator, return_residual)
+
+    # mtp, msp, and fsp
+    return SPFusedDenseFunc.forward(ctx, x, weight, bias, communicator, return_residual)
+
+
+def explicit_fused_dense_backward(
+    ctx,
+    grad_output: torch.Tensor,
+    communicator: Optional[Union[TPCommunicator, WPCommunicator]] = None,
+    use_grouped_linear: Optional[bool] = None,
+):
+    """
+    Invoke the ``backward`` of the matching fused dense function directly on a caller-supplied ``ctx``.
+
+    This function previously read ``communicator`` and ``use_grouped_linear`` without
+    receiving or defining them, so every call failed with ``NameError``. Both are now
+    optional keyword parameters; the two-argument call form keeps working.
+
+    Args:
+        ctx: context object that was passed to the corresponding explicit forward.
+        grad_output: gradient with respect to the forward output.
+        communicator: communicator used in the forward. When omitted, ``ctx.communicator``
+            is used if present.
+        use_grouped_linear: whether the forward used the grouped-gemm implementation. When
+            omitted, ``ctx.use_grouped_linear`` is used if present, otherwise ``False``.
+            Callers running grouped linears should pass this explicitly.
+
+    Returns:
+        Tuple ``(grad_input, grad_weight, grad_bias)``; ``grad_bias`` is ``None`` on the
+        grouped paths.
+
+    Raises:
+        RuntimeError: if no communicator is given and none can be found on ``ctx``.
+    """
+    if communicator is None:
+        # NOTE(review): assumes the fused dense forward stored the communicator on ctx — confirm.
+        communicator = getattr(ctx, "communicator", None)
+    if communicator is None:
+        raise RuntimeError(
+            "explicit_fused_dense_backward needs a communicator: pass `communicator=` "
+            "or make sure the forward stored it on `ctx.communicator`."
+        )
+    if use_grouped_linear is None:
+        # NOTE(review): nothing visible here sets this attribute on ctx; it defaults to False.
+        use_grouped_linear = getattr(ctx, "use_grouped_linear", False)
+
+    if communicator.communication_mode() == "wp":
+        if not use_grouped_linear:
+            grad_input, grad_weight, grad_bias, *_ = WPFusedDenseFunc.backward(ctx, grad_output)
+        else:
+            # `*_` tolerates trailing placeholder gradients, matching the non-grouped branches;
+            # presumably backward returns one entry per forward input — verify.
+            grad_input, grad_weight, *_ = GroupedGemmWPFusedDenseFunc.backward(ctx, grad_output)
+            grad_bias = None
+    else:  # mtp, msp, and fsp
+        if not use_grouped_linear:
+            grad_input, grad_weight, grad_bias, *_ = SPFusedDenseFunc.backward(ctx, grad_output)
+        else:
+            # TODO: support grouped linear for mtp, msp, and fsp
+            grad_input, grad_weight, *_ = GroupedGemmSPFusedDenseFunc.backward(ctx, grad_output)
+            grad_bias = None
+
+    return grad_input, grad_weight, grad_bias
+
+
 class ParallelLinearWithCommExt(nn.Linear):
     """
     Parallel linear with communication extension.
@@ -637,6 +717,33 @@ def forward(self, input: torch.Tensor, batch_sizes: torch.Tensor = None) -> torc
             **mixer_kwargs,
         )
 
+    def explicit_fwd(self, ctx, input: torch.Tensor, batch_sizes: torch.Tensor = None) -> torch.Tensor:  # pylint: disable=W0622
+        """
+        Run this layer's fused dense forward on a caller-supplied ``ctx``.
+
+        Args:
+            ctx: context object forwarded to ``explicit_fused_dense_forward``.
+            input: input tensor.
+            batch_sizes: forwarded only when the layer has a truthy ``is_grouped_linear`` attribute.
+
+        Returns:
+            The result of ``explicit_fused_dense_forward``.
+        """
+        assert (
+            self._communicator is not None
+        ), f"{self.__class__.__name__} should register with a communicator first."
+
+        grouped = getattr(self, "is_grouped_linear", False)
+
+        # Grouped-linear options are only passed along for grouped layers.
+        grouped_options = {}
+        if grouped:
+            grouped_options["batch_sizes"] = batch_sizes
+            grouped_options["backend"] = self.backend
+            grouped_options["full_weight_shape"] = getattr(self, "full_weight_shape", None)
+
+        return explicit_fused_dense_forward(
+            ctx,
+            input,
+            self.weight,
+            communicator=self._communicator,
+            module=self,
+            bias=self.bias,
+            use_grouped_linear=grouped,
+            **grouped_options,
+        )
+
+    def explicit_bwd(self, ctx, grad_output: torch.Tensor):
+        """
+        Delegate to ``explicit_fused_dense_backward`` with the given ``ctx`` and ``grad_output``.
+
+        Returns:
+            Whatever ``explicit_fused_dense_backward`` returns.
+        """
+        grads = explicit_fused_dense_backward(ctx, grad_output)
+        return grads
+
 
 class ColumnParallelLinear(ParallelLinearWithCommExt):
     """

diff --git a/internlm/model/modules/mha.py b/internlm/model/modules/mha.py
@@ -206,9 +206,13 @@ def _training(self, x, **kwargs):
         # rotary embedding
         indexes = kwargs.pop("indexes", 0)
         max_seqlen = kwargs.get("max_seqlen", None)
+        skip_score = kwargs.get("skip_score", False)
         q = self.rotary_emb(q, offsets=indexes, cache_type="query", interleaved=self.interleaved, max_seqlen=max_seqlen)
         k = self.rotary_emb(k, offsets=indexes, cache_type="key", interleaved=self.interleaved, max_seqlen=max_seqlen)
 
+        if skip_score:
+            return q, k, v
+
         # self attention
         kwargs = _convert_cu_seqlens_for_qksplited(kwargs)
         if gpc.config.data.use_packed_dataset is False or self.training is False: