From ae35ea8a513443501093c1e714ae7df7e6839ff8 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 25 Apr 2026 23:55:14 +0800
Subject: [PATCH 01/34] refactor(dpmodel): plumb comm_dict and extract
 _exchange_ghosts hook

Lifts the per-layer node_ebd_ext gather inside DescrptBlockRepflows.call
and DescrptBlockRepformers.call into a new _exchange_ghosts(...) method
so subclasses can override it. Default behaviour is byte-identical to
before for non-parallel inference (comm_dict is None).

Threads an optional comm_dict kwarg through:
  - make_model.call_common_lower / forward_common_atomic
  - {base,dp,linear,pairtab}_atomic_model
  - dpa1/dpa2/dpa3/hybrid/se_* descriptors
  - repflows/repformers blocks

Non-GNN descriptors accept and ignore comm_dict; the pairtab atomic
model discards it explicitly.
DPA2 routes around its pre-block gather when comm_dict is supplied so
the repformers' per-layer override drives ghost exchange instead.

This is the dpmodel-side groundwork for pt_expt multi-rank LAMMPS
support; default behaviour unchanged.
---
 .../dpmodel/atomic_model/base_atomic_model.py |  5 +++
 .../dpmodel/atomic_model/dp_atomic_model.py   |  5 +++
 .../atomic_model/linear_atomic_model.py       |  6 +++
 .../atomic_model/pairtab_atomic_model.py      |  2 +
 deepmd/dpmodel/descriptor/dpa1.py             |  1 +
 deepmd/dpmodel/descriptor/dpa2.py             | 22 +++++++--
 deepmd/dpmodel/descriptor/dpa3.py             |  5 +++
 deepmd/dpmodel/descriptor/hybrid.py           |  5 ++-
 deepmd/dpmodel/descriptor/repflows.py         | 45 ++++++++++++++++---
 deepmd/dpmodel/descriptor/repformers.py       | 42 +++++++++++++++--
 deepmd/dpmodel/descriptor/se_e2_a.py          |  1 +
 deepmd/dpmodel/descriptor/se_r.py             |  1 +
 deepmd/dpmodel/descriptor/se_t.py             |  1 +
 deepmd/dpmodel/descriptor/se_t_tebd.py        |  1 +
 deepmd/dpmodel/model/make_model.py            |  9 ++++
 15 files changed, 137 insertions(+), 14 deletions(-)

diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py
index 1120078bb2..debddba6e7 100644
--- a/deepmd/dpmodel/atomic_model/base_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -231,6 +231,7 @@ def forward_common_atomic(
         mapping: Array | None = None,
         fparam: Array | None = None,
         aparam: Array | None = None,
+        comm_dict: dict | None = None,
     ) -> dict[str, Array]:
         """Common interface for atomic inference.
 
@@ -252,6 +253,9 @@ def forward_common_atomic(
             frame parameters, shape: nf x dim_fparam
         aparam
             atomic parameter, shape: nf x nloc x dim_aparam
+        comm_dict
+            MPI communication metadata for parallel inference. ``None`` for
+            non-parallel inference (default).
 
         Returns
         -------
@@ -279,6 +283,7 @@ def forward_common_atomic(
             mapping=mapping,
             fparam=fparam,
             aparam=aparam,
+            comm_dict=comm_dict,
         )
         ret_dict = self.apply_out_stat(ret_dict, atype)
 
diff --git a/deepmd/dpmodel/atomic_model/dp_atomic_model.py b/deepmd/dpmodel/atomic_model/dp_atomic_model.py
index 466e3ddd95..0505f63d83 100644
--- a/deepmd/dpmodel/atomic_model/dp_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/dp_atomic_model.py
@@ -157,6 +157,7 @@ def forward_atomic(
         mapping: Array | None = None,
         fparam: Array | None = None,
         aparam: Array | None = None,
+        comm_dict: dict | None = None,
     ) -> dict[str, Array]:
         """Models' atomic predictions.
 
@@ -174,6 +175,9 @@ def forward_atomic(
             frame parameter. nf x ndf
         aparam
             atomic parameter. nf x nloc x nda
+        comm_dict
+            MPI communication metadata for parallel inference. ``None`` for
+            non-parallel inference (default). Forwarded to the descriptor.
 
         Returns
         -------
@@ -215,6 +219,7 @@ def forward_atomic(
             nlist,
             mapping=mapping,
             fparam=fparam_input_for_des if self.add_chg_spin_ebd else None,
+            comm_dict=comm_dict,
         )
         ret = self.fitting_net(
             descriptor,
diff --git a/deepmd/dpmodel/atomic_model/linear_atomic_model.py b/deepmd/dpmodel/atomic_model/linear_atomic_model.py
index 3ed9077df7..05ff8499f8 100644
--- a/deepmd/dpmodel/atomic_model/linear_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/linear_atomic_model.py
@@ -224,6 +224,7 @@ def forward_atomic(
         mapping: Array | None = None,
         fparam: Array | None = None,
         aparam: Array | None = None,
+        comm_dict: dict | None = None,
     ) -> dict[str, Array]:
         """Return atomic prediction.
 
@@ -241,6 +242,10 @@ def forward_atomic(
             frame parameter. (nframes, ndf)
         aparam
             atomic parameter. (nframes, nloc, nda)
+        comm_dict
+            MPI communication metadata. Forwarded to each sub-model so GNN
+            sub-descriptors can perform parallel ghost exchange. ``None`` for
+            non-parallel inference (default).
 
         Returns
         -------
@@ -280,6 +285,7 @@ def forward_atomic(
                     mapping,
                     fparam,
                     aparam,
+                    comm_dict,
                 )["energy"]
             )
         weights = self._compute_weight(extended_coord, extended_atype, nlists_)
diff --git a/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py b/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py
index 51c370eca0..c1ec9d2a00 100644
--- a/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py
@@ -253,7 +253,9 @@ def forward_atomic(
         mapping: Array | None = None,
         fparam: Array | None = None,
         aparam: Array | None = None,
+        comm_dict: dict | None = None,
     ) -> dict[str, Array]:
+        del comm_dict  # pairtab is local; no MPI ghost exchange needed.
         xp = array_api_compat.array_namespace(extended_coord, extended_atype, nlist)
         nframes, nloc, nnei = nlist.shape
         extended_coord = xp.reshape(extended_coord, (nframes, -1, 3))
diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index bc2a04a836..9d138f422a 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -500,6 +500,7 @@ def call(
         nlist: Array,
         mapping: Array | None = None,
         fparam: Array | None = None,
+        comm_dict: dict | None = None,
     ) -> Array:
         """Compute the descriptor.
 
diff --git a/deepmd/dpmodel/descriptor/dpa2.py b/deepmd/dpmodel/descriptor/dpa2.py
index 2fa765f04b..851422cce0 100644
--- a/deepmd/dpmodel/descriptor/dpa2.py
+++ b/deepmd/dpmodel/descriptor/dpa2.py
@@ -831,6 +831,7 @@ def call(
         nlist: Array,
         mapping: Array | None = None,
         fparam: Array | None = None,
+        comm_dict: dict | None = None,
     ) -> tuple[Array, Array, Array, Array, Array]:
         """Compute the descriptor.
 
@@ -844,6 +845,11 @@ def call(
             The neighbor list. shape: nf x nloc x nnei
         mapping
             The index mapping, maps extended region index to local region.
+        comm_dict
+            MPI communication metadata for parallel inference. Forwarded to
+            the repformer block (the message-passing part). The repinit
+            sub-block does no message passing and does not receive it.
+            ``None`` for non-parallel inference (default).
 
         Returns
         -------
@@ -912,9 +918,18 @@ def call(
             assert self.tebd_transform is not None
             g1 = g1 + self.tebd_transform(g1_inp)
         # mapping g1
-        assert mapping is not None
-        mapping_ext = xp.tile(xp.expand_dims(mapping, axis=-1), (1, 1, g1.shape[-1]))
-        g1_ext = xp_take_along_axis(g1, mapping_ext, axis=1)
+        if comm_dict is None:
+            # non-parallel: gather g1 -> g1_ext via mapping, hand the
+            # nall-sized embedding to the repformer block.
+            assert mapping is not None
+            mapping_ext = xp.tile(
+                xp.expand_dims(mapping, axis=-1), (1, 1, g1.shape[-1])
+            )
+            g1_ext = xp_take_along_axis(g1, mapping_ext, axis=1)
+        else:
+            # parallel mode: hand the local-only g1 to the repformer block;
+            # its per-layer override fills ghosts via the MPI exchange.
+            g1_ext = g1
         # repformer
         g1, g2, h2, rot_mat, sw = self.repformers(
             nlist_dict[
@@ -926,6 +941,7 @@ def call(
             atype_ext,
             g1_ext,
             mapping,
+            comm_dict=comm_dict,
         )
         if self.concat_output_tebd:
             g1 = xp.concat([g1, g1_inp], axis=-1)
diff --git a/deepmd/dpmodel/descriptor/dpa3.py b/deepmd/dpmodel/descriptor/dpa3.py
index 5f5aea50e5..07d5481a91 100644
--- a/deepmd/dpmodel/descriptor/dpa3.py
+++ b/deepmd/dpmodel/descriptor/dpa3.py
@@ -616,6 +616,7 @@ def call(
         nlist: Array,
         mapping: Array | None = None,
         fparam: Array | None = None,
+        comm_dict: dict | None = None,
     ) -> tuple[Array, Array, Array, Array, Array]:
         """Compute the descriptor.
 
@@ -629,6 +630,9 @@ def call(
             The neighbor list. shape: nf x nloc x nnei
         mapping
             The index mapping, mapps extended region index to local region.
+        comm_dict
+            MPI communication metadata for parallel inference. Forwarded to
+            the repflows block. ``None`` for non-parallel inference (default).
 
         Returns
         -------
@@ -695,6 +699,7 @@ def call(
             atype_ext,
             node_ebd_ext,
             mapping,
+            comm_dict=comm_dict,
         )
         if self.concat_output_tebd:
             node_ebd = xp.concat([node_ebd, node_ebd_inp], axis=-1)
diff --git a/deepmd/dpmodel/descriptor/hybrid.py b/deepmd/dpmodel/descriptor/hybrid.py
index b15fbc15d2..512a753d25 100644
--- a/deepmd/dpmodel/descriptor/hybrid.py
+++ b/deepmd/dpmodel/descriptor/hybrid.py
@@ -276,6 +276,7 @@ def call(
         nlist: Array,
         mapping: Array | None = None,
         fparam: Array | None = None,
+        comm_dict: dict | None = None,
     ) -> tuple[
         Array,
         Array | None,
@@ -332,7 +333,9 @@ def call(
                 # mixed_types is True, but descrpt.mixed_types is False
                 assert nl_distinguish_types is not None
                 nl = nl_distinguish_types[:, :, nci]
-            odescriptor, gr, g2, h2, sw = descrpt(coord_ext, atype_ext, nl, mapping)
+            odescriptor, gr, g2, h2, sw = descrpt(
+                coord_ext, atype_ext, nl, mapping, comm_dict=comm_dict
+            )
             out_descriptor.append(odescriptor)
             if gr is not None:
                 out_gr.append(gr)
diff --git a/deepmd/dpmodel/descriptor/repflows.py b/deepmd/dpmodel/descriptor/repflows.py
index 30637dc75a..2dd64448b2 100644
--- a/deepmd/dpmodel/descriptor/repflows.py
+++ b/deepmd/dpmodel/descriptor/repflows.py
@@ -506,6 +506,27 @@ def reinit_exclude(
         self.exclude_types = exclude_types
         self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types)
 
+    def _exchange_ghosts(
+        self,
+        node_ebd: Array,
+        mapping_tiled: Array | None,
+        comm_dict: dict | None,
+        nall: int,
+        nloc: int,
+    ) -> Array:
+        """Build node_ebd_ext (the ghost-aware embedding) for the per-layer loop.
+
+        Default: array-api gather via the pre-tiled ``mapping_tiled``, or pass
+        the local-only ``node_ebd`` through when ``self.use_loc_mapping`` is set.
+        ``comm_dict``, ``nall``, ``nloc`` are unused in this default impl; they
+        exist so the pt_expt subclass can perform the per-layer MPI ghost
+        exchange (``deepmd_export::border_op``) when ``comm_dict is not None``.
+        """
+        del comm_dict, nall, nloc
+        if self.use_loc_mapping:
+            return node_ebd
+        return xp_take_along_axis(node_ebd, mapping_tiled, axis=1)
+
     def call(
         self,
         nlist: Array,
@@ -514,6 +535,7 @@ def call(
         atype_embd_ext: Array | None = None,
         mapping: Array | None = None,
         type_embedding: Array | None = None,
+        comm_dict: dict | None = None,
     ) -> tuple[Array, Array, Array, Array, Array]:
         xp = array_api_compat.array_namespace(nlist, coord_ext, atype_ext)
         nframes, nloc, nnei = nlist.shape
@@ -641,15 +663,24 @@ def call(
         # nf x nloc x a_nnei x a_nnei x a_dim [OR] n_angle x a_dim
         angle_ebd = self.angle_embd(angle_input)
 
-        # nb x nall x n_dim
-        mapping = xp.tile(xp.expand_dims(mapping, axis=-1), (1, 1, self.n_dim))
+        # nb x nall x n_dim (pre-tiled mapping reused across layers when not
+        # using comm_dict). Skip the tile when mapping is None — pt_expt's
+        # parallel-mode override consults comm_dict instead.
+        mapping_tiled = (
+            xp.tile(xp.expand_dims(mapping, axis=-1), (1, 1, self.n_dim))
+            if mapping is not None
+            else None
+        )
         for idx, ll in enumerate(self.layers):
             # node_ebd:     nb x nloc x n_dim
-            # node_ebd_ext: nb x nall x n_dim
-            node_ebd_ext = (
-                node_ebd
-                if self.use_loc_mapping
-                else xp_take_along_axis(node_ebd, mapping, axis=1)
+            # node_ebd_ext: nb x nall x n_dim (or nb x nloc x n_dim when
+            #               use_loc_mapping=True)
+            node_ebd_ext = self._exchange_ghosts(
+                node_ebd,
+                mapping_tiled,
+                comm_dict,
+                nall,
+                nloc,
             )
             node_ebd, edge_ebd, angle_ebd = ll.call(
                 node_ebd_ext,
diff --git a/deepmd/dpmodel/descriptor/repformers.py b/deepmd/dpmodel/descriptor/repformers.py
index 5881b3a0b3..3891c57c7d 100644
--- a/deepmd/dpmodel/descriptor/repformers.py
+++ b/deepmd/dpmodel/descriptor/repformers.py
@@ -480,6 +480,26 @@ def reinit_exclude(
         self.exclude_types = exclude_types
         self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types)
 
+    def _exchange_ghosts(
+        self,
+        g1: Array,
+        mapping_tiled: Array | None,
+        comm_dict: dict | None,
+        nall: int,
+        nloc: int,
+    ) -> Array:
+        """Build g1_ext (the ghost-aware single-atom embedding) for the
+        per-layer loop.
+
+        Default: array-api gather via the pre-tiled ``mapping_tiled``.
+        ``comm_dict``, ``nall``, ``nloc`` are unused in this default impl;
+        they exist so the pt_expt subclass can perform the per-layer MPI
+        ghost exchange (``deepmd_export::border_op``) when ``comm_dict is
+        not None``.
+        """
+        del comm_dict, nall, nloc
+        return xp_take_along_axis(g1, mapping_tiled, axis=1)
+
     def call(
         self,
         nlist: Array,
@@ -488,6 +508,7 @@ def call(
         atype_embd_ext: Array | None = None,
         mapping: Array | None = None,
         type_embedding: Array | None = None,
+        comm_dict: dict | None = None,
     ) -> Array:
         xp = array_api_compat.array_namespace(nlist, coord_ext, atype_ext)
         exclude_mask = self.emask.build_type_exclude_mask(nlist, atype_ext)
@@ -524,12 +545,27 @@ def call(
         # set all padding positions to index of 0
         # if a neighbor is real or not is indicated by nlist_mask
         nlist = xp.where(nlist == -1, xp.zeros_like(nlist), nlist)
-        # nf x nall x ng1
-        mapping = xp.tile(xp.expand_dims(mapping, axis=-1), (1, 1, self.g1_dim))
+        # nall computed for the pt_expt parallel-mode override (uses nall to
+        # size the pad before MPI ghost exchange). dpmodel default ignores it.
+        nall = xp.reshape(coord_ext, (nf, -1)).shape[1] // 3
+        # nf x nall x ng1 (pre-tiled mapping reused across layers when not
+        # using comm_dict). Skip the tile when mapping is None — pt_expt's
+        # parallel-mode override consults comm_dict instead.
+        mapping_tiled = (
+            xp.tile(xp.expand_dims(mapping, axis=-1), (1, 1, self.g1_dim))
+            if mapping is not None
+            else None
+        )
         for idx, ll in enumerate(self.layers):
             # g1:     nf x nloc x ng1
             # g1_ext: nf x nall x ng1
-            g1_ext = xp_take_along_axis(g1, mapping, axis=1)
+            g1_ext = self._exchange_ghosts(
+                g1,
+                mapping_tiled,
+                comm_dict,
+                nall,
+                nloc,
+            )
             g1, g2, h2 = ll.call(
                 g1_ext,
                 g2,
diff --git a/deepmd/dpmodel/descriptor/se_e2_a.py b/deepmd/dpmodel/descriptor/se_e2_a.py
index 8997412325..6c20699c23 100644
--- a/deepmd/dpmodel/descriptor/se_e2_a.py
+++ b/deepmd/dpmodel/descriptor/se_e2_a.py
@@ -399,6 +399,7 @@ def call(
         nlist: Array,
         mapping: Array | None = None,
         fparam: Array | None = None,
+        comm_dict: dict | None = None,
     ) -> Array:
         """Compute the descriptor.
 
diff --git a/deepmd/dpmodel/descriptor/se_r.py b/deepmd/dpmodel/descriptor/se_r.py
index b5ba7a282f..55a774bb71 100644
--- a/deepmd/dpmodel/descriptor/se_r.py
+++ b/deepmd/dpmodel/descriptor/se_r.py
@@ -371,6 +371,7 @@ def call(
         nlist: Array,
         mapping: Array | None = None,
         fparam: Array | None = None,
+        comm_dict: dict | None = None,
     ) -> Array:
         """Compute the descriptor.
 
diff --git a/deepmd/dpmodel/descriptor/se_t.py b/deepmd/dpmodel/descriptor/se_t.py
index e599669068..38eb7cc16c 100644
--- a/deepmd/dpmodel/descriptor/se_t.py
+++ b/deepmd/dpmodel/descriptor/se_t.py
@@ -346,6 +346,7 @@ def call(
         nlist: Array,
         mapping: Array | None = None,
         fparam: Array | None = None,
+        comm_dict: dict | None = None,
     ) -> tuple[Array, Array]:
         """Compute the descriptor.
 
diff --git a/deepmd/dpmodel/descriptor/se_t_tebd.py b/deepmd/dpmodel/descriptor/se_t_tebd.py
index 2d36994d61..445260b861 100644
--- a/deepmd/dpmodel/descriptor/se_t_tebd.py
+++ b/deepmd/dpmodel/descriptor/se_t_tebd.py
@@ -354,6 +354,7 @@ def call(
         nlist: Array,
         mapping: Array | None = None,
         fparam: Array | None = None,
+        comm_dict: dict | None = None,
     ) -> tuple[Array, Array]:
         """Compute the descriptor.
 
diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index 597f8ea006..d57b0b3790 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -326,6 +326,7 @@ def call_common_lower(
             aparam: Array | None = None,
             do_atomic_virial: bool = False,
             extended_coord_corr: Array | None = None,
+            comm_dict: dict | None = None,
         ) -> dict[str, Array]:
             """Return model prediction. Lower interface that takes
             extended atomic coordinates and types, nlist, and mapping
@@ -351,6 +352,11 @@ def call_common_lower(
             extended_coord_corr
                 coordinates correction for virial in extended region.
                 nf x (nall x 3)
+            comm_dict
+                MPI communication metadata for parallel inference (e.g.
+                LAMMPS multi-rank). Carries send/recv lists, processor IDs,
+                the MPI communicator handle, and per-rank nlocal/nghost.
+                ``None`` for non-parallel inference (default).
 
             Returns
             -------
@@ -379,6 +385,7 @@ def call_common_lower(
                 aparam=ap,
                 do_atomic_virial=do_atomic_virial,
                 extended_coord_corr=extended_coord_corr,
+                comm_dict=comm_dict,
             )
             model_predict = self._output_type_cast(model_predict, input_prec)
             return model_predict
@@ -393,6 +400,7 @@ def forward_common_atomic(
             aparam: Array | None = None,
             do_atomic_virial: bool = False,
             extended_coord_corr: Array | None = None,
+            comm_dict: dict | None = None,
         ) -> dict[str, Array]:
             atomic_ret = self.atomic_model.forward_common_atomic(
                 extended_coord,
@@ -401,6 +409,7 @@ def forward_common_atomic(
                 mapping=mapping,
                 fparam=fparam,
                 aparam=aparam,
+                comm_dict=comm_dict,
             )
             return fit_output_to_model_output(
                 atomic_ret,

From bfe650f70e821ee4f2b38ca5f9c0c8501d42b33e Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 26 Apr 2026 00:03:39 +0800
Subject: [PATCH 02/34] feat(op): expose deepmd::border_op_backward as a
 standalone op

Refactors the body of Border::backward into a context-free static
function that takes the comm tensors and grad_g1 positionally and
returns grad_in, and registers a free-function wrapper around it as
``torch.ops.deepmd.border_op_backward``.  The autograd Function's
backward delegates to the new symbol so existing pt-backend behaviour is
unchanged; the new symbol is what pt_expt's opaque op wrapper
(``deepmd_export::border_op``) dispatches to from its
``register_autograd`` callback.

The standalone op is needed because the ``custom_op`` API requires the
backward to be expressible as a registered op (it cannot reference the
autograd Function directly), and AOTInductor must serialise the call
into the compiled .pt2.
---
 source/op/pt/comm.cc | 128 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 108 insertions(+), 20 deletions(-)

diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc
index 97466a4833..9dd9b50c3b 100644
--- a/source/op/pt/comm.cc
+++ b/source/op/pt/comm.cc
@@ -174,17 +174,6 @@ class Border : public torch::autograd::Function<Border> {
   static torch::autograd::variable_list backward(
       torch::autograd::AutogradContext* ctx,
       torch::autograd::variable_list grad_output) {
-    bool type_flag = (grad_output[0].dtype() == torch::kDouble) ? true : false;
-    if (type_flag) {
-      return backward_t<double>(ctx, grad_output);
-    } else {
-      return backward_t<float>(ctx, grad_output);
-    }
-  }
-  template <typename FPTYPE>
-  static torch::autograd::variable_list backward_t(
-      torch::autograd::AutogradContext* ctx,
-      torch::autograd::variable_list grad_output) {
     torch::autograd::variable_list saved_variables = ctx->get_saved_variables();
     torch::Tensor sendlist_tensor = saved_variables[0];
     torch::Tensor sendproc_tensor = saved_variables[1];
@@ -194,8 +183,41 @@ class Border : public torch::autograd::Function<Border> {
     torch::Tensor communicator_tensor = saved_variables[5];
     torch::Tensor nlocal_tensor = saved_variables[6];
     torch::Tensor nghost_tensor = saved_variables[7];
+    torch::Tensor d_in = border_op_backward_dispatch(
+        sendlist_tensor, sendproc_tensor, recvproc_tensor, sendnum_tensor,
+        recvnum_tensor, grad_output[0], communicator_tensor, nlocal_tensor,
+        nghost_tensor);
+    return {torch::Tensor(), torch::Tensor(), torch::Tensor(),
+            torch::Tensor(), torch::Tensor(), d_in,
+            torch::Tensor(), torch::Tensor(), torch::Tensor(),
+            torch::Tensor()};
+  }
 
-    torch::Tensor d_local_g1_tensor = grad_output[0].contiguous();
+  // Declaration only; this static member is defined out-of-line below and
+  // wrapped by the free function border_op_backward, which is registered as
+  // the separate torch op deepmd::border_op_backward for pt_expt's wrapper.
+  static torch::Tensor border_op_backward_dispatch(
+      const torch::Tensor& sendlist_tensor,
+      const torch::Tensor& sendproc_tensor,
+      const torch::Tensor& recvproc_tensor,
+      const torch::Tensor& sendnum_tensor,
+      const torch::Tensor& recvnum_tensor,
+      const torch::Tensor& grad_g1,
+      const torch::Tensor& communicator_tensor,
+      const torch::Tensor& nlocal_tensor,
+      const torch::Tensor& nghost_tensor);
+
+  template <typename FPTYPE>
+  static torch::Tensor backward_t(const torch::Tensor& sendlist_tensor,
+                                  const torch::Tensor& sendproc_tensor,
+                                  const torch::Tensor& recvproc_tensor,
+                                  const torch::Tensor& sendnum_tensor,
+                                  const torch::Tensor& recvnum_tensor,
+                                  const torch::Tensor& grad_g1,
+                                  const torch::Tensor& communicator_tensor,
+                                  const torch::Tensor& nlocal_tensor,
+                                  const torch::Tensor& nghost_tensor) {
+    torch::Tensor d_local_g1_tensor = grad_g1.contiguous();
 #ifdef USE_MPI
     int mpi_init = 0;
     MPI_Initialized(&mpi_init);
@@ -216,8 +238,8 @@ class Border : public torch::autograd::Function<Border> {
       cuda_aware = MPIX_Query_cuda_support();
 #endif
       if (cuda_aware == 0) {
-        d_local_g1_tensor = torch::empty_like(grad_output[0]).to(torch::kCPU);
-        d_local_g1_tensor.copy_(grad_output[0]);
+        d_local_g1_tensor = torch::empty_like(grad_g1).to(torch::kCPU);
+        d_local_g1_tensor.copy_(grad_g1);
       }
     }
 #endif
@@ -312,15 +334,15 @@ class Border : public torch::autograd::Function<Border> {
 #ifdef USE_MPI
 #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
     if (cuda_aware == 0) {
-      grad_output[0].copy_(d_local_g1_tensor);
+      // Move result back to the device of the input grad. This replaces
+      // the original in-place copy_ into grad_output[0].
+      d_local_g1_tensor = d_local_g1_tensor.to(grad_g1.device());
     }
 #endif
 #endif
-
-    return {torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor(),
-            torch::Tensor(), grad_output[0],  torch::Tensor(), torch::Tensor(),
-            torch::Tensor(), torch::Tensor()};
+    return d_local_g1_tensor;
   }
+
 #ifdef USE_MPI
   static void unpack_communicator(const torch::Tensor& communicator_tensor,
                                   MPI_Comm& mpi_comm) {
@@ -363,4 +385,70 @@ std::vector<torch::Tensor> border_op(const torch::Tensor& sendlist_tensor,
                        communicator_tensor, nlocal_tensor, nghost_tensor);
 }
 
-TORCH_LIBRARY_FRAGMENT(deepmd, m) { m.def("border_op", border_op); }
+// Define Border::border_op_backward_dispatch out-of-line so the type-flag
+// dispatch can refer to the templated backward_t members declared in the
+// class.
+torch::Tensor Border::border_op_backward_dispatch(
+    const torch::Tensor& sendlist_tensor,
+    const torch::Tensor& sendproc_tensor,
+    const torch::Tensor& recvproc_tensor,
+    const torch::Tensor& sendnum_tensor,
+    const torch::Tensor& recvnum_tensor,
+    const torch::Tensor& grad_g1,
+    const torch::Tensor& communicator_tensor,
+    const torch::Tensor& nlocal_tensor,
+    const torch::Tensor& nghost_tensor) {
+  bool type_flag = (grad_g1.dtype() == torch::kDouble);
+  if (type_flag) {
+    return backward_t<double>(sendlist_tensor, sendproc_tensor, recvproc_tensor,
+                              sendnum_tensor, recvnum_tensor, grad_g1,
+                              communicator_tensor, nlocal_tensor,
+                              nghost_tensor);
+  } else {
+    return backward_t<float>(sendlist_tensor, sendproc_tensor, recvproc_tensor,
+                             sendnum_tensor, recvnum_tensor, grad_g1,
+                             communicator_tensor, nlocal_tensor, nghost_tensor);
+  }
+}
+
+/**
+ * @brief Standalone backward of border_op for use by pt_expt's opaque-op
+ * autograd wrapper. Performs the symmetric MPI exchange that the autograd
+ * Border::backward applies, but without an autograd context — comm tensors
+ * are passed directly so the op can be registered as a torch op and
+ * embedded in an AOTInductor graph.
+ *
+ * The comm topology is symmetric: the same sendlist/sendnum/recvnum buffers
+ * encode the forward exchange; backward simply swaps send <-> recv and
+ * accumulates gradients into the local atom slots.
+ *
+ * @param[in]  sendlist_tensor  send-list pointer-array (forward direction)
+ * @param[in]  sendproc_tensor  send-proc IDs (forward direction)
+ * @param[in]  recvproc_tensor  recv-proc IDs (forward direction)
+ * @param[in]  sendnum_tensor   atoms sent per swap (forward direction)
+ * @param[in]  recvnum_tensor   atoms received per swap (forward direction)
+ * @param[in]  grad_g1          upstream gradient w.r.t. g1 of forward
+ * @param[in]  communicator_tensor MPI communicator handle as int64
+ * @param[in]  nlocal_tensor    number of local atoms (per rank)
+ * @param[in]  nghost_tensor    number of ghost atoms (per rank)
+ * @return d_in (gradient w.r.t. forward g1 input), same shape as grad_g1.
+ */
+torch::Tensor border_op_backward(const torch::Tensor& sendlist_tensor,
+                                 const torch::Tensor& sendproc_tensor,
+                                 const torch::Tensor& recvproc_tensor,
+                                 const torch::Tensor& sendnum_tensor,
+                                 const torch::Tensor& recvnum_tensor,
+                                 const torch::Tensor& grad_g1,
+                                 const torch::Tensor& communicator_tensor,
+                                 const torch::Tensor& nlocal_tensor,
+                                 const torch::Tensor& nghost_tensor) {
+  return Border::border_op_backward_dispatch(
+      sendlist_tensor, sendproc_tensor, recvproc_tensor, sendnum_tensor,
+      recvnum_tensor, grad_g1, communicator_tensor, nlocal_tensor,
+      nghost_tensor);
+}
+
+TORCH_LIBRARY_FRAGMENT(deepmd, m) {
+  m.def("border_op", border_op);
+  m.def("border_op_backward", border_op_backward);
+}

From 3af514aedb50d7710839841f3787e8460b74a9ef Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 26 Apr 2026 00:18:58 +0800
Subject: [PATCH 03/34] feat(pt_expt): add deepmd_export::border_op opaque
 wrapper + block overrides
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three changes that together let GNN models drive MPI ghost-atom
exchange through the pt_expt forward pass:

1. ``deepmd/pt_expt/utils/comm.py`` registers a NEW torch op
   ``deepmd_export::border_op`` via ``torch.library.custom_op``. The
   wrapper:
     - Forwards to the existing ``torch.ops.deepmd.border_op`` (clones
       the in-place output to satisfy custom_op aliasing rules).
     - Has a ``register_fake`` impl returning ``empty_like(g1)`` so
       ``torch.export`` / ``make_fx`` can trace through it.
     - Has a ``register_autograd`` callback that dispatches to
       ``torch.ops.deepmd.border_op_backward`` (the standalone op
       added in the previous commit).

   The existing ``deepmd::border_op`` is registered as
   ``CompositeImplicitAutograd`` and therefore tries to decompose
   into primitive aten ops during export — which fails because the
   C++ kernel calls ``data_ptr()`` on FakeTensors. The new opaque
   wrapper sidesteps this by being registered as an opaque op that
   ``torch.export`` records as a single black-box call.

2. ``deepmd/pt_expt/descriptor/{repflows,repformers}.py`` add pt_expt
   subclasses of ``DescrptBlockRepflows`` / ``DescrptBlockRepformers``
   that override ``_exchange_ghosts``. When ``comm_dict is None`` the
   override defers to the dpmodel default; otherwise it pads
   ``node_ebd`` to nall and calls the opaque wrapper. Includes the
   spin-aware ``has_spin`` path (split real/virtual +
   ``concat_switch_virtual``) ported from pt's repflows.

3. ``forward_common_lower_exportable_with_comm`` is added on the
   pt_expt CM (and SpinModel) classes. Same as the existing
   ``forward_common_lower_exportable`` but accepts the 8 comm tensors
   as additional positional inputs and reconstructs ``comm_dict``
   inside the traced function (spin variant injects ``has_spin`` so
   the override takes the spin branch). This becomes the new traced
   entry point for the with-comm AOTI artifact (next commit).

Existing pt_expt descriptor wrappers (dpa1, dpa2, se_*) and the
``CM.forward_common_atomic`` override get an extra ``comm_dict`` kwarg
that is plumbed straight through to the dpmodel call — no behavioural
change for ``comm_dict is None``.

Phase 0 de-risk experiment (scratch/derisk_border_op.py) verified that
the opaque wrapper survives ``torch.export.export`` +
``aoti_compile_and_package`` + ``aoti_load_package`` round-trips for
both forward and backward.
---
 deepmd/pt_expt/descriptor/__init__.py   |   6 +-
 deepmd/pt_expt/descriptor/dpa1.py       |   1 +
 deepmd/pt_expt/descriptor/dpa2.py       |  10 +-
 deepmd/pt_expt/descriptor/repflows.py   | 103 ++++++++++++++++
 deepmd/pt_expt/descriptor/repformers.py |  88 ++++++++++++++
 deepmd/pt_expt/descriptor/se_e2_a.py    |   1 +
 deepmd/pt_expt/descriptor/se_r.py       |   1 +
 deepmd/pt_expt/descriptor/se_t.py       |   1 +
 deepmd/pt_expt/descriptor/se_t_tebd.py  |   1 +
 deepmd/pt_expt/model/make_model.py      |  93 +++++++++++++++
 deepmd/pt_expt/model/spin_model.py      |  94 +++++++++++++++
 deepmd/pt_expt/utils/__init__.py        |   3 +
 deepmd/pt_expt/utils/comm.py            | 149 ++++++++++++++++++++++++
 13 files changed, 549 insertions(+), 2 deletions(-)
 create mode 100644 deepmd/pt_expt/descriptor/repflows.py
 create mode 100644 deepmd/pt_expt/descriptor/repformers.py
 create mode 100644 deepmd/pt_expt/utils/comm.py

diff --git a/deepmd/pt_expt/descriptor/__init__.py b/deepmd/pt_expt/descriptor/__init__.py
index 1667182d84..8253ed6338 100644
--- a/deepmd/pt_expt/descriptor/__init__.py
+++ b/deepmd/pt_expt/descriptor/__init__.py
@@ -1,6 +1,10 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 # Import to register converters
-from . import se_t_tebd_block  # noqa: F401
+from . import (  # noqa: F401
+    repflows,
+    repformers,
+    se_t_tebd_block,
+)
 from .base_descriptor import (
     BaseDescriptor,
 )
diff --git a/deepmd/pt_expt/descriptor/dpa1.py b/deepmd/pt_expt/descriptor/dpa1.py
index 01df91abd6..c43b07f9c2 100644
--- a/deepmd/pt_expt/descriptor/dpa1.py
+++ b/deepmd/pt_expt/descriptor/dpa1.py
@@ -183,6 +183,7 @@ def call(
         nlist: torch.Tensor,
         mapping: torch.Tensor | None = None,
         fparam: torch.Tensor | None = None,
+        comm_dict: dict | None = None,
     ) -> Any:
         if not self.compress:
             return DescrptDPA1DP.call.__wrapped__(
diff --git a/deepmd/pt_expt/descriptor/dpa2.py b/deepmd/pt_expt/descriptor/dpa2.py
index 1723df5a30..21c392cd3c 100644
--- a/deepmd/pt_expt/descriptor/dpa2.py
+++ b/deepmd/pt_expt/descriptor/dpa2.py
@@ -233,11 +233,19 @@ def call(
         nlist: torch.Tensor,
         mapping: torch.Tensor | None = None,
         fparam: torch.Tensor | None = None,
+        comm_dict: dict | None = None,
     ) -> Any:
         if not self.compress:
             return DescrptDPA2DP.call.__wrapped__(
-                self, coord_ext, atype_ext, nlist, mapping
+                self,
+                coord_ext,
+                atype_ext,
+                nlist,
+                mapping,
+                fparam,
+                comm_dict=comm_dict,
             )
+        # Compressed path is local-only (no message passing during compress).
         return self._call_compressed(coord_ext, atype_ext, nlist, mapping)
 
     def _call_compressed(
diff --git a/deepmd/pt_expt/descriptor/repflows.py b/deepmd/pt_expt/descriptor/repflows.py
new file mode 100644
index 0000000000..2f680703bf
--- /dev/null
+++ b/deepmd/pt_expt/descriptor/repflows.py
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""pt_expt wrapper around dpmodel ``DescrptBlockRepflows``.
+
+The wrapper overrides ``_exchange_ghosts`` so that, when running under
+LAMMPS multi-rank with a non-None ``comm_dict``, each layer of the
+RepFlow message-passing block exchanges ghost-atom embeddings via the
+opaque ``deepmd_export::border_op`` wrapper (registered in
+``deepmd/pt_expt/utils/comm.py``). This survives ``torch.export`` and
+AOTInductor packaging.
+
+When ``comm_dict is None`` (single-rank inference / training), the
+default array-api ``_exchange_ghosts`` from the dpmodel block is used —
+zero behavioural change.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import torch
+
+from deepmd.dpmodel.descriptor.repflows import (
+    DescrptBlockRepflows as DescrptBlockRepflowsDP,
+)
+from deepmd.pt.utils.spin import (
+    concat_switch_virtual,
+)
+from deepmd.pt_expt.common import (
+    register_dpmodel_mapping,
+    torch_module,
+)
+
+
+@torch_module
+class DescrptBlockRepflows(DescrptBlockRepflowsDP):
+    """pt_expt wrapper for the RepFlow descriptor block."""
+
+    def _exchange_ghosts(
+        self,
+        node_ebd: torch.Tensor,
+        mapping_tiled: torch.Tensor | None,
+        comm_dict: dict | None,
+        nall: int,
+        nloc: int,
+    ) -> torch.Tensor:
+        if comm_dict is None:
+            return super()._exchange_ghosts(
+                node_ebd,
+                mapping_tiled,
+                comm_dict,
+                nall,
+                nloc,
+            )
+
+        has_spin = "has_spin" in comm_dict
+        if has_spin:
+            real_nloc, real_nall = nloc // 2, nall // 2
+            real_pad = real_nall - real_nloc
+            node_real, node_virt = torch.split(
+                node_ebd,
+                [real_nloc, real_nloc],
+                dim=1,
+            )
+            # combine real + virtual along feature dim, then pad to nall.
+            mix = torch.cat([node_real, node_virt], dim=2)
+            padded = torch.nn.functional.pad(
+                mix.squeeze(0),
+                (0, 0, 0, real_pad),
+                value=0.0,
+            )
+        else:
+            padded = torch.nn.functional.pad(
+                node_ebd.squeeze(0),
+                (0, 0, 0, nall - nloc),
+                value=0.0,
+            )
+
+        exchanged = torch.ops.deepmd_export.border_op(
+            comm_dict["send_list"],
+            comm_dict["send_proc"],
+            comm_dict["recv_proc"],
+            comm_dict["send_num"],
+            comm_dict["recv_num"],
+            padded,
+            comm_dict["communicator"],
+            comm_dict["nlocal"],
+            comm_dict["nghost"],
+        ).unsqueeze(0)
+
+        if has_spin:
+            n_dim = node_ebd.shape[-1]
+            real_ext, virt_ext = torch.split(exchanged, [n_dim, n_dim], dim=2)
+            return concat_switch_virtual(real_ext, virt_ext, real_nloc)
+        return exchanged
+
+
+# Register the converter so dpmodel's auto-wrap path picks up our pt_expt
+# subclass instead of the generic _auto_wrap_native_op fallback. Without
+# this, the override above would never fire.
+register_dpmodel_mapping(
+    DescrptBlockRepflowsDP,
+    lambda v: DescrptBlockRepflows.deserialize(v.serialize()),
+)
diff --git a/deepmd/pt_expt/descriptor/repformers.py b/deepmd/pt_expt/descriptor/repformers.py
new file mode 100644
index 0000000000..f106a7a240
--- /dev/null
+++ b/deepmd/pt_expt/descriptor/repformers.py
@@ -0,0 +1,88 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""pt_expt wrapper around dpmodel ``DescrptBlockRepformers``.
+
+Mirrors ``deepmd/pt_expt/descriptor/repflows.py``: overrides
+``_exchange_ghosts`` so the per-layer ghost exchange uses the opaque
+``deepmd_export::border_op`` when a ``comm_dict`` is provided.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import torch
+
+from deepmd.dpmodel.descriptor.repformers import (
+    DescrptBlockRepformers as DescrptBlockRepformersDP,
+)
+from deepmd.pt.utils.spin import (
+    concat_switch_virtual,
+)
+from deepmd.pt_expt.common import (
+    register_dpmodel_mapping,
+    torch_module,
+)
+
+
+@torch_module
+class DescrptBlockRepformers(DescrptBlockRepformersDP):
+    """pt_expt wrapper for the Repformers descriptor block."""
+
+    def _exchange_ghosts(
+        self,
+        g1: torch.Tensor,
+        mapping_tiled: torch.Tensor | None,
+        comm_dict: dict | None,
+        nall: int,
+        nloc: int,
+    ) -> torch.Tensor:
+        if comm_dict is None:
+            return super()._exchange_ghosts(
+                g1,
+                mapping_tiled,
+                comm_dict,
+                nall,
+                nloc,
+            )
+
+        has_spin = "has_spin" in comm_dict
+        if has_spin:
+            real_nloc, real_nall = nloc // 2, nall // 2
+            real_pad = real_nall - real_nloc
+            g1_real, g1_virt = torch.split(g1, [real_nloc, real_nloc], dim=1)
+            mix = torch.cat([g1_real, g1_virt], dim=2)
+            padded = torch.nn.functional.pad(
+                mix.squeeze(0),
+                (0, 0, 0, real_pad),
+                value=0.0,
+            )
+        else:
+            padded = torch.nn.functional.pad(
+                g1.squeeze(0),
+                (0, 0, 0, nall - nloc),
+                value=0.0,
+            )
+
+        exchanged = torch.ops.deepmd_export.border_op(
+            comm_dict["send_list"],
+            comm_dict["send_proc"],
+            comm_dict["recv_proc"],
+            comm_dict["send_num"],
+            comm_dict["recv_num"],
+            padded,
+            comm_dict["communicator"],
+            comm_dict["nlocal"],
+            comm_dict["nghost"],
+        ).unsqueeze(0)
+
+        if has_spin:
+            ng1 = g1.shape[-1]
+            real_ext, virt_ext = torch.split(exchanged, [ng1, ng1], dim=2)
+            return concat_switch_virtual(real_ext, virt_ext, real_nloc)
+        return exchanged
+
+
+register_dpmodel_mapping(
+    DescrptBlockRepformersDP,
+    lambda v: DescrptBlockRepformers.deserialize(v.serialize()),
+)
diff --git a/deepmd/pt_expt/descriptor/se_e2_a.py b/deepmd/pt_expt/descriptor/se_e2_a.py
index 61d611036e..45120c6d5d 100644
--- a/deepmd/pt_expt/descriptor/se_e2_a.py
+++ b/deepmd/pt_expt/descriptor/se_e2_a.py
@@ -139,6 +139,7 @@ def call(
         nlist: torch.Tensor,
         mapping: torch.Tensor | None = None,
         fparam: torch.Tensor | None = None,
+        comm_dict: dict | None = None,
     ) -> Any:
         if not self.compress:
             return DescrptSeADP.call.__wrapped__(
diff --git a/deepmd/pt_expt/descriptor/se_r.py b/deepmd/pt_expt/descriptor/se_r.py
index 22302f54e6..ab32be1131 100644
--- a/deepmd/pt_expt/descriptor/se_r.py
+++ b/deepmd/pt_expt/descriptor/se_r.py
@@ -128,6 +128,7 @@ def call(
         nlist: torch.Tensor,
         mapping: torch.Tensor | None = None,
         fparam: torch.Tensor | None = None,
+        comm_dict: dict | None = None,
     ) -> Any:
         if not self.compress:
             return DescrptSeRDP.call.__wrapped__(
diff --git a/deepmd/pt_expt/descriptor/se_t.py b/deepmd/pt_expt/descriptor/se_t.py
index 061306f281..69d6183642 100644
--- a/deepmd/pt_expt/descriptor/se_t.py
+++ b/deepmd/pt_expt/descriptor/se_t.py
@@ -139,6 +139,7 @@ def call(
         nlist: torch.Tensor,
         mapping: torch.Tensor | None = None,
         fparam: torch.Tensor | None = None,
+        comm_dict: dict | None = None,
     ) -> Any:
         if not self.compress:
             return DescrptSeTDP.call.__wrapped__(
diff --git a/deepmd/pt_expt/descriptor/se_t_tebd.py b/deepmd/pt_expt/descriptor/se_t_tebd.py
index c0ae308971..cbcaf3822c 100644
--- a/deepmd/pt_expt/descriptor/se_t_tebd.py
+++ b/deepmd/pt_expt/descriptor/se_t_tebd.py
@@ -166,6 +166,7 @@ def call(
         nlist: torch.Tensor,
         mapping: torch.Tensor | None = None,
         fparam: torch.Tensor | None = None,
+        comm_dict: dict | None = None,
     ) -> Any:
         if not self.compress:
             return DescrptSeTTebdDP.call.__wrapped__(
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index 4bd9792420..45a8cb10ea 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -257,6 +257,7 @@ def forward_common_atomic(
             aparam: torch.Tensor | None = None,
             do_atomic_virial: bool = False,
             extended_coord_corr: torch.Tensor | None = None,
+            comm_dict: dict | None = None,
         ) -> dict[str, torch.Tensor]:
             atomic_ret = self.atomic_model.forward_common_atomic(
                 extended_coord,
@@ -265,6 +266,7 @@ def forward_common_atomic(
                 mapping=mapping,
                 fparam=fparam,
                 aparam=aparam,
+                comm_dict=comm_dict,
             )
             model_ret = fit_output_to_model_output(
                 atomic_ret,
@@ -365,4 +367,95 @@ def fn(
                 aparam,
             )
 
+        def forward_common_lower_exportable_with_comm(
+            self,
+            extended_coord: torch.Tensor,
+            extended_atype: torch.Tensor,
+            nlist: torch.Tensor,
+            mapping: torch.Tensor | None,
+            fparam: torch.Tensor | None,
+            aparam: torch.Tensor | None,
+            send_list: torch.Tensor,
+            send_proc: torch.Tensor,
+            recv_proc: torch.Tensor,
+            send_num: torch.Tensor,
+            recv_num: torch.Tensor,
+            communicator: torch.Tensor,
+            nlocal: torch.Tensor,
+            nghost: torch.Tensor,
+            do_atomic_virial: bool = False,
+            **make_fx_kwargs: Any,
+        ) -> torch.nn.Module:
+            """Trace forward_common_lower with comm_dict tensors as positional inputs.
+
+            Used to compile a parallel-inference variant of the model
+            (.pt2 with-comm artifact) that drives MPI ghost-atom exchange
+            for GNN descriptors via the opaque
+            ``deepmd_export::border_op`` wrapper. The comm tensors enter
+            the exported program as 8 additional positional inputs after
+            the usual (coord, atype, nlist, mapping, fparam, aparam) —
+            this fixes the C++ ABI for ``DeepPotPTExpt`` (Phase 4).
+
+            Tracing requires ``nswap >= 1`` (Phase 0 finding); with
+            ``nswap == 0`` the dim specializes and the artifact would
+            only run for that exact value. The C++ caller must always
+            provide ``nswap >= 1``.
+            """
+            model = self
+
+            def fn(
+                extended_coord: torch.Tensor,
+                extended_atype: torch.Tensor,
+                nlist: torch.Tensor,
+                mapping: torch.Tensor | None,
+                fparam: torch.Tensor | None,
+                aparam: torch.Tensor | None,
+                send_list: torch.Tensor,
+                send_proc: torch.Tensor,
+                recv_proc: torch.Tensor,
+                send_num: torch.Tensor,
+                recv_num: torch.Tensor,
+                communicator: torch.Tensor,
+                nlocal: torch.Tensor,
+                nghost: torch.Tensor,
+            ) -> dict[str, torch.Tensor]:
+                extended_coord = extended_coord.detach().requires_grad_(True)
+                comm_dict = {
+                    "send_list": send_list,
+                    "send_proc": send_proc,
+                    "recv_proc": recv_proc,
+                    "send_num": send_num,
+                    "recv_num": recv_num,
+                    "communicator": communicator,
+                    "nlocal": nlocal,
+                    "nghost": nghost,
+                }
+                return model.forward_common_lower(
+                    extended_coord,
+                    extended_atype,
+                    nlist,
+                    mapping,
+                    fparam=fparam,
+                    aparam=aparam,
+                    do_atomic_virial=do_atomic_virial,
+                    comm_dict=comm_dict,
+                )
+
+            return make_fx(fn, **make_fx_kwargs)(
+                extended_coord,
+                extended_atype,
+                nlist,
+                mapping,
+                fparam,
+                aparam,
+                send_list,
+                send_proc,
+                recv_proc,
+                send_num,
+                recv_num,
+                communicator,
+                nlocal,
+                nghost,
+            )
+
     return CM
diff --git a/deepmd/pt_expt/model/spin_model.py b/deepmd/pt_expt/model/spin_model.py
index 70f41f0701..e361999b17 100644
--- a/deepmd/pt_expt/model/spin_model.py
+++ b/deepmd/pt_expt/model/spin_model.py
@@ -117,6 +117,100 @@ def fn(
             aparam,
         )
 
+    def forward_common_lower_exportable_with_comm(
+        self,
+        extended_coord: torch.Tensor,
+        extended_atype: torch.Tensor,
+        extended_spin: torch.Tensor,
+        nlist: torch.Tensor,
+        mapping: torch.Tensor | None,
+        fparam: torch.Tensor | None,
+        aparam: torch.Tensor | None,
+        send_list: torch.Tensor,
+        send_proc: torch.Tensor,
+        recv_proc: torch.Tensor,
+        send_num: torch.Tensor,
+        recv_num: torch.Tensor,
+        communicator: torch.Tensor,
+        nlocal: torch.Tensor,
+        nghost: torch.Tensor,
+        do_atomic_virial: bool = False,
+        **make_fx_kwargs: Any,
+    ) -> torch.nn.Module:
+        """Spin variant of ``forward_common_lower_exportable_with_comm``.
+
+        Mirrors the non-spin version (see ``make_model.py``) but threads
+        ``extended_spin`` through and injects ``has_spin`` into
+        ``comm_dict`` so the pt_expt Repflow/Repformer override takes
+        the spin branch (split real/virtual + concat_switch_virtual).
+        """
+        model = self
+
+        def fn(
+            extended_coord: torch.Tensor,
+            extended_atype: torch.Tensor,
+            extended_spin: torch.Tensor,
+            nlist: torch.Tensor,
+            mapping: torch.Tensor | None,
+            fparam: torch.Tensor | None,
+            aparam: torch.Tensor | None,
+            send_list: torch.Tensor,
+            send_proc: torch.Tensor,
+            recv_proc: torch.Tensor,
+            send_num: torch.Tensor,
+            recv_num: torch.Tensor,
+            communicator: torch.Tensor,
+            nlocal: torch.Tensor,
+            nghost: torch.Tensor,
+        ) -> dict[str, torch.Tensor]:
+            extended_coord = extended_coord.detach().requires_grad_(True)
+            comm_dict = {
+                "send_list": send_list,
+                "send_proc": send_proc,
+                "recv_proc": recv_proc,
+                "send_num": send_num,
+                "recv_num": recv_num,
+                "communicator": communicator,
+                "nlocal": nlocal,
+                "nghost": nghost,
+                # Trace-time marker so the override takes the spin path.
+                # Value is irrelevant — only key presence matters.
+                "has_spin": torch.tensor(
+                    [1],
+                    dtype=torch.int32,
+                    device=extended_coord.device,
+                ),
+            }
+            return model.forward_common_lower(
+                extended_coord,
+                extended_atype,
+                extended_spin,
+                nlist,
+                mapping,
+                fparam=fparam,
+                aparam=aparam,
+                do_atomic_virial=do_atomic_virial,
+                comm_dict=comm_dict,
+            )
+
+        return make_fx(fn, **make_fx_kwargs)(
+            extended_coord,
+            extended_atype,
+            extended_spin,
+            nlist,
+            mapping,
+            fparam,
+            aparam,
+            send_list,
+            send_proc,
+            recv_proc,
+            send_num,
+            recv_num,
+            communicator,
+            nlocal,
+            nghost,
+        )
+
     def forward_common_lower(
         self, *args: Any, **kwargs: Any
     ) -> dict[str, torch.Tensor]:
diff --git a/deepmd/pt_expt/utils/__init__.py b/deepmd/pt_expt/utils/__init__.py
index efb026f7f1..99da68fe4f 100644
--- a/deepmd/pt_expt/utils/__init__.py
+++ b/deepmd/pt_expt/utils/__init__.py
@@ -22,7 +22,10 @@
 # as it's a stateless utility class
 register_dpmodel_mapping(EnvMat, lambda v: v)
 
+# Register opaque deepmd_export::border_op wrapper (used by GNN MPI
+# parallel inference; see comm.py module docstring).
 # Register fake tensor implementations for custom tabulate ops
+from deepmd.pt_expt.utils import comm  # noqa: F401
 from deepmd.pt_expt.utils import tabulate_ops  # noqa: F401
 
 __all__ = [
diff --git a/deepmd/pt_expt/utils/comm.py b/deepmd/pt_expt/utils/comm.py
new file mode 100644
index 0000000000..cfa92bcf6c
--- /dev/null
+++ b/deepmd/pt_expt/utils/comm.py
@@ -0,0 +1,149 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Opaque torch.export wrapper around the deepmd MPI border_op.
+
+The existing ``torch.ops.deepmd.border_op`` (registered by
+``libdeepmd_op_pt.so``) is a ``CompositeImplicitAutograd`` op that wraps
+``Border::apply`` for the torch.jit (pt) backend. ``torch.export`` /
+AOTInductor try to *decompose* such ops into primitive aten ops, which
+fails because the C++ kernel calls ``data_ptr()`` on inputs — illegal
+during tracing on FakeTensors.
+
+This module defines a NEW op ``deepmd_export::border_op`` via
+``torch.library.custom_op``, marked opaque so ``torch.export`` records it
+as a single black-box call. At runtime the loaded ``.pt2`` dispatches
+back into ``torch.ops.deepmd.border_op`` (forward) or
+``torch.ops.deepmd.border_op_backward`` (backward), preserving the MPI
+exchange semantics.
+
+Constraints discovered during de-risking (scratch/derisk_border_op.py):
+    1. ``custom_op`` forbids returning a tensor that aliases an input —
+       the underlying C++ op returns ``g1`` itself, so we ``.clone()``.
+    2. The fake (meta) impl honours ``g1.dtype`` (no float64 hardcoding).
+    3. ``register_autograd`` makes the op differentiable; the backward
+       dispatches to ``deepmd::border_op_backward`` which performs the
+       symmetric MPI exchange.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import torch
+
+
+@torch.library.custom_op("deepmd_export::border_op", mutates_args=())
+def border_op_export(
+    sendlist: torch.Tensor,
+    sendproc: torch.Tensor,
+    recvproc: torch.Tensor,
+    sendnum: torch.Tensor,
+    recvnum: torch.Tensor,
+    g1: torch.Tensor,
+    communicator: torch.Tensor,
+    nlocal: torch.Tensor,
+    nghost: torch.Tensor,
+) -> torch.Tensor:
+    """Opaque wrapper around ``torch.ops.deepmd.border_op``.
+
+    Performs MPI ghost-atom exchange of the embedding tensor ``g1`` so
+    GNN message-passing layers can run under multi-rank LAMMPS. Inputs
+    and outputs match the underlying op exactly except for the aliasing
+    fix (see module docstring).
+    """
+    out = torch.ops.deepmd.border_op(
+        sendlist,
+        sendproc,
+        recvproc,
+        sendnum,
+        recvnum,
+        g1,
+        communicator,
+        nlocal,
+        nghost,
+    )
+    if isinstance(out, (list, tuple)):
+        out = out[0]
+    # custom_op forbids output aliasing inputs; underlying op returns g1.
+    return out.clone()
+
+
+@border_op_export.register_fake
+def _border_op_export_fake(
+    sendlist: torch.Tensor,
+    sendproc: torch.Tensor,
+    recvproc: torch.Tensor,
+    sendnum: torch.Tensor,
+    recvnum: torch.Tensor,
+    g1: torch.Tensor,
+    communicator: torch.Tensor,
+    nlocal: torch.Tensor,
+    nghost: torch.Tensor,
+) -> torch.Tensor:
+    return torch.empty_like(g1)
+
+
+def _border_op_setup_context(
+    ctx: torch.autograd.function.FunctionCtx,
+    inputs: tuple,
+    output: torch.Tensor,
+) -> None:
+    (
+        sendlist,
+        sendproc,
+        recvproc,
+        sendnum,
+        recvnum,
+        _g1,
+        communicator,
+        nlocal,
+        nghost,
+    ) = inputs
+    ctx.save_for_backward(
+        sendlist,
+        sendproc,
+        recvproc,
+        sendnum,
+        recvnum,
+        communicator,
+        nlocal,
+        nghost,
+    )
+
+
+def _border_op_backward(
+    ctx: torch.autograd.function.FunctionCtx,
+    grad_output: torch.Tensor,
+) -> tuple:
+    (sendlist, sendproc, recvproc, sendnum, recvnum, communicator, nlocal, nghost) = (
+        ctx.saved_tensors
+    )
+    grad_in = torch.ops.deepmd.border_op_backward(
+        sendlist,
+        sendproc,
+        recvproc,
+        sendnum,
+        recvnum,
+        grad_output,
+        communicator,
+        nlocal,
+        nghost,
+    )
+    # Same aliasing concern as forward: the C++ backward returns the same
+    # tensor object it modified; clone before handing back to autograd.
+    return (
+        None,
+        None,
+        None,
+        None,
+        None,  # sendlist..recvnum
+        grad_in.clone(),  # g1
+        None,
+        None,
+        None,  # communicator, nlocal, nghost
+    )
+
+
+border_op_export.register_autograd(
+    _border_op_backward,
+    setup_context=_border_op_setup_context,
+)

From 2936bd4459cc393bbec4164a23c6586f8c7ea29b Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 26 Apr 2026 00:43:05 +0800
Subject: [PATCH 04/34] fix(pt_expt): plumb comm_dict through SpinModel +
 guards

Three small follow-ups uncovered by the spin export-with-comm test:

1. ``dpmodel/model/spin_model.py::call_common_lower`` was missing
   the ``comm_dict`` kwarg added by the Phase 1 plumbing. Added it
   and forward to ``backbone_model.call_common_lower`` so spin GNN
   models can drive parallel inference.

2. ``pt_expt/descriptor/repflows.py`` raises a clear ``RuntimeError``
   when ``use_loc_mapping=True`` is combined with a non-None
   ``comm_dict``. The local-mapping codepath skips per-layer ghost
   exchange entirely so combining it with ``comm_dict`` would
   silently drop the parallel behaviour.

3. ``pt_expt/utils/comm.py``: ``_check_underlying_ops_loaded`` is
   called each time the wrapper body runs (not at import time) and
   surfaces a clearer error when libdeepmd_op_pt.so is not loaded
   ("rebuild the pt custom-op library") rather than the cryptic
   "torch.ops.deepmd has no attribute 'border_op'" from torch's
   dispatcher.
---
 deepmd/dpmodel/model/spin_model.py    |  2 ++
 deepmd/pt_expt/descriptor/repflows.py | 13 ++++++++++++
 deepmd/pt_expt/utils/comm.py          | 29 +++++++++++++++++++++++++++
 3 files changed, 44 insertions(+)

diff --git a/deepmd/dpmodel/model/spin_model.py b/deepmd/dpmodel/model/spin_model.py
index be6566e303..2de41945f3 100644
--- a/deepmd/dpmodel/model/spin_model.py
+++ b/deepmd/dpmodel/model/spin_model.py
@@ -748,6 +748,7 @@ def call_common_lower(
         fparam: Array | None = None,
         aparam: Array | None = None,
         do_atomic_virial: bool = False,
+        comm_dict: dict | None = None,
     ) -> dict[str, Array]:
         """Return model prediction with raw internal keys. Lower interface that takes
         extended atomic coordinates, types and spins, nlist, and mapping
@@ -800,6 +801,7 @@ def call_common_lower(
             aparam=aparam,
             do_atomic_virial=do_atomic_virial,
             extended_coord_corr=extended_coord_corr,
+            comm_dict=comm_dict,
         )
         model_output_type = self.backbone_model.model_output_type()
         if "mask" in model_output_type:
diff --git a/deepmd/pt_expt/descriptor/repflows.py b/deepmd/pt_expt/descriptor/repflows.py
index 2f680703bf..efd7cba7ba 100644
--- a/deepmd/pt_expt/descriptor/repflows.py
+++ b/deepmd/pt_expt/descriptor/repflows.py
@@ -51,6 +51,19 @@ def _exchange_ghosts(
                 nall,
                 nloc,
             )
+        # Pt's parallel branch (repflows.py:580-587) requires the
+        # extended-region pathway (use_loc_mapping=False).  The
+        # local-mapping codepath skips the per-layer ghost exchange
+        # entirely, so combining it with comm_dict is contradictory.
+        # Surface this as a clear error rather than producing silently
+        # wrong results.
+        if getattr(self, "use_loc_mapping", False):
+            raise RuntimeError(
+                "DescrptBlockRepflows._exchange_ghosts: comm_dict is "
+                "set but use_loc_mapping=True. Multi-rank parallel "
+                "inference requires use_loc_mapping=False so per-layer "
+                "ghost exchange is meaningful."
+            )
 
         has_spin = "has_spin" in comm_dict
         if has_spin:
diff --git a/deepmd/pt_expt/utils/comm.py b/deepmd/pt_expt/utils/comm.py
index cfa92bcf6c..442a232a6f 100644
--- a/deepmd/pt_expt/utils/comm.py
+++ b/deepmd/pt_expt/utils/comm.py
@@ -31,6 +31,34 @@
 import torch
 
 
+def _check_underlying_ops_loaded() -> None:
+    """Surface a clearer error when libdeepmd_op_pt.so isn't loaded.
+
+    pt_expt depends on libdeepmd_op_pt.so for the underlying
+    ``deepmd::border_op`` and ``deepmd::border_op_backward`` C++ ops.
+    Without them, callers get cryptic
+    ``AttributeError: '_OpNamespace' object has no attribute 'border_op'``
+    errors. We translate that into actionable advice.
+
+    Called once on first wrapper invocation (not at import time, since
+    pt_expt may legitimately be imported on systems where the .so is
+    not built — e.g. eager-only smoke tests of dpmodel-side code).
+    """
+    if not (
+        hasattr(torch.ops, "deepmd")
+        and hasattr(torch.ops.deepmd, "border_op")
+        and hasattr(torch.ops.deepmd, "border_op_backward")
+    ):
+        raise RuntimeError(
+            "deepmd_export::border_op wrapper requires "
+            "torch.ops.deepmd.border_op and "
+            "torch.ops.deepmd.border_op_backward (from "
+            "libdeepmd_op_pt.so) to be loaded. Build the pt custom-op "
+            "library and ensure deepmd.pt is imported before the "
+            "first call to this wrapper."
+        )
+
+
 @torch.library.custom_op("deepmd_export::border_op", mutates_args=())
 def border_op_export(
     sendlist: torch.Tensor,
@@ -50,6 +78,7 @@ def border_op_export(
     and outputs match the underlying op exactly except for the aliasing
     fix (see module docstring).
     """
+    _check_underlying_ops_loaded()
     out = torch.ops.deepmd.border_op(
         sendlist,
         sendproc,

From b22feb792de3990f791c5cadb08cdcb8679dbfdd Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 26 Apr 2026 00:54:49 +0800
Subject: [PATCH 05/34] feat(pt_expt): two-mode AOTInductor export with
 comm_dict
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a ``with_comm_dict: bool`` flag to ``_trace_and_export`` and
``_make_sample_inputs``/``_build_dynamic_shapes``.  When True, the
trace runs through ``forward_common_lower_exportable_with_comm``
(which threads 8 comm tensors as positional inputs and reconstructs
``comm_dict`` inside the traced function), and the resulting export
accepts comm tensors as additional positional inputs.

Constraints enforced for the with-comm trace:
  * ``nframes=1`` static (the pt-parity override uses
    squeeze(0)/unsqueeze(0) which only works for nb=1; LAMMPS always
    drives one frame anyway).  Avoids the regular variant's
    ``nframes=2`` collision-avoidance bumping (irrelevant when
    nframes is static — duck-sizing only unifies dynamic dims).
  * ``nswap`` static at the trace value.  ``nswap`` is fixed once at
    LAMMPS init (depends on the processor grid which doesn't change
    at runtime), so the dim doesn't need to be dynamic.

For GNN models, ``_deserialize_to_file_pt2`` now compiles BOTH the
regular and with-comm artifacts and packs the latter inside the .pt2
ZIP at ``extra/forward_lower_with_comm.pt2``.  Metadata gains:
  * ``has_message_passing`` (true if the descriptor has GNN block).
  * ``has_comm_artifact`` (true iff a with-comm artifact was packed).
Old .pt2 files lack these keys; the C++ loader (Phase 4) must default
to False when the field is missing.

The non-GNN path is unchanged: a single regular artifact + the
existing metadata layout, so existing .pt2 readers keep working.
---
 deepmd/pt_expt/utils/serialization.py | 349 ++++++++++++++++++++++----
 1 file changed, 301 insertions(+), 48 deletions(-)

diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index f59c397525..74fbe67111 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+import ctypes
 import json
 
 import numpy as np
@@ -75,6 +76,87 @@ def _json_to_numpy(model_obj: dict) -> dict:
     )
 
 
+def _has_message_passing(model: torch.nn.Module) -> bool:
+    """Detect whether a model's descriptor uses GNN-style message passing.
+
+    GNN descriptors (DPA2 with repformers, DPA3 with repflows) require
+    a per-layer ghost-atom MPI exchange when running multi-rank LAMMPS,
+    which means a separate ``with-comm`` AOTInductor artifact must be
+    compiled.  Non-GNN descriptors (se_e2_a, se_r, se_t, se_t_tebd,
+    DPA1, hybrid-of-non-GNN) need only the regular artifact.
+
+    Returns False if the descriptor's ``has_message_passing()`` query
+    cannot be answered (e.g. linear/zbl/frozen models without a single
+    descriptor) — those are assumed local.
+    """
+    try:
+        descriptor = model.atomic_model.descriptor
+    except AttributeError:
+        return False
+    if hasattr(descriptor, "has_message_passing"):
+        try:
+            return bool(descriptor.has_message_passing())
+        except (AttributeError, NotImplementedError):
+            return False
+    return False
+
+
+# Module-level cache for the trace-time sendlist buffer. The pointer
+# value embedded in ``send_list_tensor`` references this numpy array's
+# data; the array must outlive the trace + export call.  Caching here
+# (rather than per-call) is fine because the contents are never read by
+# the exported graph at runtime — only by the eager call inside
+# ``make_fx`` when extracting output keys, and by ``torch.export`` when
+# materializing example inputs.
+_TRACE_SENDLIST_KEEPALIVE: list[np.ndarray] = []
+
+
+def _make_comm_sample_inputs(
+    nloc: int,
+    nghost: int,
+    device: torch.device,
+) -> tuple[torch.Tensor, ...]:
+    """Build trivial-but-valid comm tensors for tracing the with-comm variant.
+
+    Phase 0 finding: tracing with ``nswap == 0`` causes the dim to
+    specialize, so we must use ``nswap >= 1``.  We use ``nswap == 1``
+    with a single self-send swap whose sendlist points to ``nghost``
+    local atoms (the actual indices don't matter for the trace — only
+    the validity of the pointer matters; ``border_op`` is opaque to
+    ``torch.export`` via the ``deepmd_export::border_op`` wrapper).
+
+    Returns ``(send_list, send_proc, recv_proc, send_num, recv_num,
+    communicator, nlocal_ts, nghost_ts)`` — 8 tensors, matching the
+    canonical positional order of
+    ``forward_common_lower_exportable_with_comm``.
+    """
+    nswap = 1
+    send_count = max(1, nghost)
+    # The trace-time sendlist must be a real ``int**``: a tensor of
+    # int64 values, each value the address of a contiguous int32 array.
+    indices = np.zeros(send_count, dtype=np.int32)
+    _TRACE_SENDLIST_KEEPALIVE.append(indices)
+    addr = indices.ctypes.data_as(ctypes.c_void_p).value
+    send_list = torch.tensor([addr], dtype=torch.int64, device=device)
+    send_proc = torch.zeros(nswap, dtype=torch.int32, device=device)
+    recv_proc = torch.zeros(nswap, dtype=torch.int32, device=device)
+    send_num = torch.tensor([send_count], dtype=torch.int32, device=device)
+    recv_num = torch.tensor([send_count], dtype=torch.int32, device=device)
+    communicator = torch.zeros(1, dtype=torch.int64, device=device)
+    nlocal_ts = torch.tensor(nloc, dtype=torch.int32, device=device)
+    nghost_ts = torch.tensor(nghost, dtype=torch.int32, device=device)
+    return (
+        send_list,
+        send_proc,
+        recv_proc,
+        send_num,
+        recv_num,
+        communicator,
+        nlocal_ts,
+        nghost_ts,
+    )
+
+
 def _make_sample_inputs(
     model: torch.nn.Module,
     nframes: int = 1,
@@ -178,22 +260,42 @@ def _make_sample_inputs(
 def _build_dynamic_shapes(
     *sample_inputs: torch.Tensor | None,
     has_spin: bool = False,
+    with_comm_dict: bool = False,
 ) -> tuple:
     """Build dynamic shape specifications for torch.export.
 
     Marks nframes, nloc and nall as dynamic dimensions so the exported
     program handles arbitrary frame and atom counts.
 
+    When ``with_comm_dict`` is True, 8 additional entries (one per comm
+    tensor) are appended to the returned tuple — matching the positional
+    order of ``forward_common_lower_exportable_with_comm``.  All 8 are
+    ``None`` (fully static); ``nswap`` is baked in at the trace value.
+
     Parameters
     ----------
     *sample_inputs : torch.Tensor | None
-        Sample inputs: either 6 tensors (non-spin) or 7 tensors (spin).
+        Sample inputs: 6 tensors (non-spin) or 7 (spin), optionally
+        followed by 8 comm tensors when ``with_comm_dict``.
     has_spin : bool
         Whether the inputs include an extended_spin tensor.
+    with_comm_dict : bool
+        Whether the inputs include the 8 comm tensors.
     Returns a tuple (not dict) to match positional args of the make_fx
     traced module, whose arg names may have suffixes like ``_1``.
     """
-    nframes_dim = torch.export.Dim("nframes", min=1)
+    # When tracing the with-comm variant, nframes is static at 1.
+    # Rationale: pt_expt's Repflow/Repformer parallel-mode override
+    # mirrors pt's repflows.py:593 ``node_ebd.squeeze(0)`` /
+    # ``…unsqueeze(0)`` pattern, which only works for nb=1. LAMMPS
+    # always drives inference with one frame so this matches reality.
+    # Marking nframes static (not dynamic) means it does not
+    # participate in duck-sizing — so the nframes==2 collision-avoidance
+    # chosen for the regular variant is *not* needed here, and the
+    # static value (1) is safe regardless of other tensors' sizes.
+    nframes_dim: torch.export.Dim | int = (
+        1 if with_comm_dict else torch.export.Dim("nframes", min=1)
+    )
     nall_dim = torch.export.Dim("nall", min=1)
     nloc_dim = torch.export.Dim("nloc", min=1)
 
@@ -201,7 +303,7 @@ def _build_dynamic_shapes(
         # (ext_coord, ext_atype, ext_spin, nlist, mapping, fparam, aparam)
         fparam = sample_inputs[5]
         aparam = sample_inputs[6]
-        return (
+        base = (
             {0: nframes_dim, 1: nall_dim},  # extended_coord: (nframes, nall, 3)
             {0: nframes_dim, 1: nall_dim},  # extended_atype: (nframes, nall)
             {0: nframes_dim, 1: nall_dim},  # extended_spin: (nframes, nall, 3)
@@ -214,7 +316,7 @@ def _build_dynamic_shapes(
         # (ext_coord, ext_atype, nlist, mapping, fparam, aparam)
         fparam = sample_inputs[4]
         aparam = sample_inputs[5]
-        return (
+        base = (
             {0: nframes_dim, 1: nall_dim},  # extended_coord: (nframes, nall, 3)
             {0: nframes_dim, 1: nall_dim},  # extended_atype: (nframes, nall)
             {0: nframes_dim, 1: nloc_dim},  # nlist: (nframes, nloc, nnei)
@@ -223,6 +325,21 @@ def _build_dynamic_shapes(
             {0: nframes_dim, 1: nloc_dim} if aparam is not None else None,  # aparam
         )
 
+    if not with_comm_dict:
+        return base
+
+    # All 8 comm tensors have static shapes:
+    #   send_list, send_proc, recv_proc, send_num, recv_num: (nswap,)
+    #   communicator: (1,)
+    #   nlocal, nghost: scalar
+    # nswap is fixed once at LAMMPS init (it depends on the processor
+    # grid which doesn't change at runtime), so it's safe to bake it
+    # in as static at the trace value.  Marking nswap dynamic instead
+    # raises a Constraints-violated error because the trace specialises
+    # it to the sample value (1) downstream of border_op anyway —
+    # there is no graph variation across nswap values.
+    return base + (None, None, None, None, None, None, None, None)
+
 
 def _collect_metadata(model: torch.nn.Module, is_spin: bool = False) -> dict:
     """Collect metadata from the model for C++ inference.
@@ -268,6 +385,11 @@ def _collect_metadata(model: torch.nn.Module, is_spin: bool = False) -> dict:
     if is_spin:
         meta["ntypes_spin"] = model.spin.get_ntypes_spin()
         meta["use_spin"] = [bool(v) for v in model.spin.use_spin]
+    # Record whether the model uses GNN-style message passing.  When
+    # True, .pt2 deserialization compiles a second ``with-comm`` artifact
+    # so multi-rank LAMMPS can drive ghost-atom MPI exchange through
+    # the model.  C++ DeepPotPTExpt branches on this flag at load time.
+    meta["has_message_passing"] = _has_message_passing(model)
     return meta
 
 
@@ -366,9 +488,27 @@ def deserialize_to_file(
 def _trace_and_export(
     data: dict,
     model_json_override: dict | None = None,
+    with_comm_dict: bool = False,
 ) -> tuple:
     """Common logic: build model, trace, export.
 
+    Parameters
+    ----------
+    data
+        Serialized model dict (with "model" and optionally
+        "model_def_script" keys).
+    model_json_override
+        Optional alternate dict to embed as model.json (used by
+        ``dp compress`` to store the compressed model dict while
+        tracing the uncompressed one).
+    with_comm_dict
+        If True, trace ``forward_common_lower_exportable_with_comm``
+        instead of the regular variant. The resulting exported program
+        accepts 8 additional positional comm tensors (``send_list``,
+        ``send_proc``, ``recv_proc``, ``send_num``, ``recv_num``,
+        ``communicator``, ``nlocal``, ``nghost``) used by the pt_expt
+        Repflow/Repformer override to drive MPI ghost-atom exchange.
+        Only valid for GNN models (see ``_has_message_passing``).
     Returns (exported, metadata, data_for_json, output_keys).
     """
     from copy import (
@@ -412,19 +552,37 @@ def _trace_and_export(
     _orig_device = _env.DEVICE
     _env.DEVICE = torch.device("cpu")
     try:
-        nframes = 2
-        sample_inputs = _make_sample_inputs(model, nframes=nframes, has_spin=is_spin)
-        # Collect all dimension sizes except dim-0 (nframes) from every tensor
-        other_dims: set[int] = set()
-        for t in sample_inputs:
-            if t is not None:
-                other_dims.update(t.shape[1:])
-        while nframes in other_dims:
-            nframes += 1
-        if nframes != 2:
+        if with_comm_dict:
+            # The pt_expt parallel-mode override (in pt's repflows.py
+            # line 593 too) uses ``squeeze(0)`` / ``unsqueeze(0)`` on
+            # ``node_ebd`` and so requires ``nframes == 1``.  LAMMPS
+            # always drives inference with one frame, so this is the
+            # only realistic shape — and we mark dim 0 static in
+            # ``_build_dynamic_shapes`` to match.
+            nframes = 1
+            sample_inputs = _make_sample_inputs(
+                model,
+                nframes=nframes,
+                has_spin=is_spin,
+            )
+        else:
+            nframes = 2
             sample_inputs = _make_sample_inputs(
-                model, nframes=nframes, has_spin=is_spin
+                model,
+                nframes=nframes,
+                has_spin=is_spin,
             )
+            # Collect all dimension sizes except dim-0 (nframes) from every tensor
+            other_dims: set[int] = set()
+            for t in sample_inputs:
+                if t is not None:
+                    other_dims.update(t.shape[1:])
+            while nframes in other_dims:
+                nframes += 1
+            if nframes != 2:
+                sample_inputs = _make_sample_inputs(
+                    model, nframes=nframes, has_spin=is_spin
+                )
     finally:
         _env.DEVICE = _orig_device
 
@@ -435,40 +593,87 @@ def _trace_and_export(
     else:
         ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam = sample_inputs
 
+    # 3b. Build comm-tensor sample inputs when tracing the with-comm
+    # variant (only valid for GNN models). The actual values don't
+    # matter for tracing — only that they're valid tensors of the right
+    # shape and dtype.  See ``_make_comm_sample_inputs``.
+    if with_comm_dict:
+        if not metadata.get("has_message_passing"):
+            raise ValueError(
+                "with_comm_dict=True requested but model has no GNN "
+                "message-passing descriptor — there's nothing to compile."
+            )
+        nloc_sample = nlist_t.shape[1]
+        nall_sample = ext_atype.shape[1]
+        nghost_sample = nall_sample - nloc_sample
+        comm_inputs = _make_comm_sample_inputs(
+            nloc=nloc_sample,
+            nghost=nghost_sample,
+            device=torch.device("cpu"),
+        )
+        sample_inputs = sample_inputs + comm_inputs
+
     # 4. Trace via make_fx on CPU.
     # This decomposes torch.autograd.grad into aten ops so the resulting
     # GraphModule no longer contains autograd calls.
     if is_spin:
-        traced = model.forward_common_lower_exportable(
-            ext_coord,
-            ext_atype,
-            ext_spin,
-            nlist_t,
-            mapping_t,
-            fparam=fparam,
-            aparam=aparam,
-            do_atomic_virial=True,
-            tracing_mode="symbolic",
-            _allow_non_fake_inputs=True,
-        )
+        if with_comm_dict:
+            traced = model.forward_common_lower_exportable_with_comm(
+                ext_coord,
+                ext_atype,
+                ext_spin,
+                nlist_t,
+                mapping_t,
+                fparam,
+                aparam,
+                *comm_inputs,
+                do_atomic_virial=True,
+                tracing_mode="symbolic",
+                _allow_non_fake_inputs=True,
+            )
+        else:
+            traced = model.forward_common_lower_exportable(
+                ext_coord,
+                ext_atype,
+                ext_spin,
+                nlist_t,
+                mapping_t,
+                fparam=fparam,
+                aparam=aparam,
+                do_atomic_virial=True,
+                tracing_mode="symbolic",
+                _allow_non_fake_inputs=True,
+            )
         # 5. Extract output keys from the CPU-traced module.
-        sample_out = traced(
-            ext_coord, ext_atype, ext_spin, nlist_t, mapping_t, fparam, aparam
-        )
+        sample_out = traced(*sample_inputs)
     else:
-        traced = model.forward_common_lower_exportable(
-            ext_coord,
-            ext_atype,
-            nlist_t,
-            mapping_t,
-            fparam=fparam,
-            aparam=aparam,
-            do_atomic_virial=True,
-            tracing_mode="symbolic",
-            _allow_non_fake_inputs=True,
-        )
+        if with_comm_dict:
+            traced = model.forward_common_lower_exportable_with_comm(
+                ext_coord,
+                ext_atype,
+                nlist_t,
+                mapping_t,
+                fparam,
+                aparam,
+                *comm_inputs,
+                do_atomic_virial=True,
+                tracing_mode="symbolic",
+                _allow_non_fake_inputs=True,
+            )
+        else:
+            traced = model.forward_common_lower_exportable(
+                ext_coord,
+                ext_atype,
+                nlist_t,
+                mapping_t,
+                fparam=fparam,
+                aparam=aparam,
+                do_atomic_virial=True,
+                tracing_mode="symbolic",
+                _allow_non_fake_inputs=True,
+            )
         # 5. Extract output keys from the CPU-traced module.
-        sample_out = traced(ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam)
+        sample_out = traced(*sample_inputs)
 
     output_keys = list(sample_out.keys())
 
@@ -477,7 +682,11 @@ def _trace_and_export(
     # graph.  Exporting on CPU keeps devices consistent; we move the
     # ExportedProgram to the target device afterwards via the official
     # move_to_device_pass (avoids FakeTensor device-propagation errors).
-    dynamic_shapes = _build_dynamic_shapes(*sample_inputs, has_spin=is_spin)
+    dynamic_shapes = _build_dynamic_shapes(
+        *sample_inputs,
+        has_spin=is_spin,
+        with_comm_dict=with_comm_dict,
+    )
     exported = torch.export.export(
         traced,
         sample_inputs,
@@ -543,27 +752,71 @@ def _deserialize_to_file_pt2(
     Uses torch._inductor.aoti_compile_and_package to compile the exported
     program into a .pt2 package (ZIP archive with compiled shared libraries),
     then embeds metadata into the archive.
+
+    For GNN models (descriptor.has_message_passing() is True), compiles
+    a SECOND ``with-comm`` artifact and packs it alongside the regular
+    one.  The ``with-comm`` variant accepts comm-dict tensors as
+    additional positional inputs and drives MPI ghost-atom exchange via
+    ``deepmd_export::border_op``.  The C++ ``DeepPotPTExpt`` loader picks
+    the artifact based on the LAMMPS rank count at runtime.
+
+    Layout inside the .pt2 ZIP:
+        regular   →  artifact at the top of the archive (existing layout)
+        with-comm →  ``extra/forward_lower_with_comm.pt2`` (nested ZIP)
+        metadata  →  ``extra/metadata.json`` with ``has_message_passing``
+                     and ``has_comm_artifact`` flags.
+
+    Old .pt2 files (pre-this-change) lack ``has_comm_artifact`` so the
+    C++ loader must default to ``False`` when the field is missing.
     """
+    import os
+    import tempfile
     import zipfile
 
     from torch._inductor import (
         aoti_compile_and_package,
     )
 
+    # First artifact: regular (no comm). Always produced.
     exported, metadata, data_for_json, output_keys = _trace_and_export(
         data, model_json_override
     )
-
-    # Compile via AOTInductor into a .pt2 package
     aoti_compile_and_package(exported, package_path=model_file)
+    metadata["output_keys"] = output_keys
 
-    # Embed metadata into the .pt2 ZIP archive
+    # Second artifact: with-comm. Only for GNN models.
+    has_comm_artifact = bool(metadata.get("has_message_passing"))
+    metadata["has_comm_artifact"] = has_comm_artifact
+    with_comm_bytes: bytes | None = None
+    with_comm_output_keys: list[str] | None = None
+    if has_comm_artifact:
+        exported_wc, _meta_wc, _data_wc, with_comm_output_keys = _trace_and_export(
+            data,
+            model_json_override,
+            with_comm_dict=True,
+        )
+        with tempfile.TemporaryDirectory() as td:
+            wc_path = os.path.join(td, "forward_lower_with_comm.pt2")
+            aoti_compile_and_package(exported_wc, package_path=wc_path)
+            with open(wc_path, "rb") as f:
+                with_comm_bytes = f.read()
+        # The output keys are identical between the two artifacts (same
+        # forward_lower output dict); record only one set in metadata.
+        # If they ever diverge we'll surface a hard error here.
+        if with_comm_output_keys != output_keys:
+            raise RuntimeError(
+                "with-comm artifact output keys diverge from regular: "
+                f"regular={output_keys} vs with_comm={with_comm_output_keys}"
+            )
+
+    # Embed metadata + supplementary files into the .pt2 ZIP archive
     model_def_script = data.get("model_def_script") or {}
-    metadata["output_keys"] = output_keys
-    with zipfile.ZipFile(model_file, "a") as zf:
+    with zipfile.ZipFile(model_file, "a", zipfile.ZIP_STORED) as zf:
         zf.writestr("extra/metadata.json", json.dumps(metadata))
         zf.writestr("extra/model_def_script.json", json.dumps(model_def_script))
         zf.writestr(
             "extra/model.json",
             json.dumps(data_for_json, separators=(",", ":")),
         )
+        if with_comm_bytes is not None:
+            zf.writestr("extra/forward_lower_with_comm.pt2", with_comm_bytes)

From 4b707a761655114a21b48e5818402c4ccbbd7ae0 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 26 Apr 2026 10:04:51 +0800
Subject: [PATCH 06/34] test(pt_expt): add comm_dict eager parity + export
 round-trip suite

Five new test files covering the GNN MPI plumbing:

* test_repflow_parallel.py / test_repformer_parallel.py
  Eager parity for DescrptBlockRepflows / DescrptBlockRepformers
  override.  Single-rank self-exchange via ctypes pointer-array
  sendlist; verifies override output equals dpmodel default for both
  with-mapping and none-mapping variants.  Includes a structural
  test for the spin branch and a guard test that
  use_loc_mapping=True + comm_dict raises RuntimeError.

* test_border_op_backward.py
  Direct unit tests for torch.ops.deepmd.border_op_backward (float32
  + float64) and the autograd path through deepmd_export::border_op.

* test_export_with_comm.py
  Phase 3 round-trip for the dual-artifact .pt2 layout: GNN models
  produce both regular and forward_lower_with_comm artifacts; both
  load via aoti_load_package; outputs match for self-exchange.
  Plus three coverage tests for previously-untested branches:
  zero-nghost clamp in _make_comm_sample_inputs, hybrid-with-GNN
  detection in _has_message_passing, .pte with-comm trace round-trip.

* test_spin_export_with_comm.py
  Spin model trace machinery (smoke test on se_e2_a) and end-to-end
  eager value parity for spin DPA3 models running through
  SpinModel.call_common_lower with comm_dict.
---
 .../descriptor/test_repflow_parallel.py       | 411 ++++++++++++++++++
 .../descriptor/test_repformer_parallel.py     | 207 +++++++++
 .../pt_expt/model/test_export_with_comm.py    | 342 +++++++++++++++
 .../model/test_spin_export_with_comm.py       | 316 ++++++++++++++
 .../pt_expt/utils/test_border_op_backward.py  | 248 +++++++++++
 5 files changed, 1524 insertions(+)
 create mode 100644 source/tests/pt_expt/descriptor/test_repflow_parallel.py
 create mode 100644 source/tests/pt_expt/descriptor/test_repformer_parallel.py
 create mode 100644 source/tests/pt_expt/model/test_export_with_comm.py
 create mode 100644 source/tests/pt_expt/model/test_spin_export_with_comm.py
 create mode 100644 source/tests/pt_expt/utils/test_border_op_backward.py

diff --git a/source/tests/pt_expt/descriptor/test_repflow_parallel.py b/source/tests/pt_expt/descriptor/test_repflow_parallel.py
new file mode 100644
index 0000000000..61b84fe5af
--- /dev/null
+++ b/source/tests/pt_expt/descriptor/test_repflow_parallel.py
@@ -0,0 +1,411 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Eager parity test for the pt_expt RepFlow parallel-mode override.
+
+Verifies that ``DescrptBlockRepflows._exchange_ghosts`` (the pt_expt
+override) produces output identical to the dpmodel default
+``_exchange_ghosts`` when the supplied ``comm_dict`` describes a
+single-rank, self-only MPI exchange whose effect equals the per-layer
+gather that the default does via ``mapping``.
+
+This is a Phase 2.5 gate: it exercises the override code path *eagerly*
+(no torch.export, no AOTInductor) before we attempt the export round
+trip in Phase 3. End-to-end multi-rank validation is deferred to the
+Phase 5 LAMMPS test (``test_lammps_dpa3_pt2_mpi``).
+
+Implementation note: the underlying ``torch.ops.deepmd.border_op``
+treats ``sendlist_tensor`` as a packed pointer-array (``int**``). We
+build that pointer array using numpy contiguous int32 arrays and pack
+their addresses into an int64 tensor.  In single-rank mode (no MPI
+init) the C++ op enters the ``sendproc == me`` self-send branch and
+performs an in-process memcpy from the sendlist-indexed rows into the
+ghost slots — no MPI runtime needed.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import ctypes
+
+import numpy as np
+import pytest
+import torch
+
+# Trigger registration of the deepmd_export::border_op opaque wrapper.
+import deepmd.pt_expt.utils.comm  # noqa: F401
+from deepmd.dpmodel.descriptor.dpa3 import (
+    RepFlowArgs,
+)
+from deepmd.pt_expt.descriptor.dpa3 import (
+    DescrptDPA3,
+)
+from deepmd.pt_expt.utils import (
+    env,
+)
+from deepmd.pt_expt.utils.env import (
+    PRECISION_DICT,
+)
+
+from ...common.test_mixins import (
+    TestCaseSingleFrameWithNlist,
+    get_tols,
+)
+from ...seed import (
+    GLOBAL_SEED,
+)
+
+# ---------------------------------------------------------------------------
+# Helpers for building the comm_dict tensors
+
+
+def _addr_of(np_arr: np.ndarray) -> int:
+    """Return the raw int address of a numpy array's data buffer."""
+    return np_arr.ctypes.data_as(ctypes.c_void_p).value
+
+
+def _build_self_comm_dict(
+    *,
+    nloc: int,
+    nghost: int,
+    sendlist_indices: np.ndarray,
+    device: torch.device,
+    keepalive: list,
+) -> dict:
+    """Build a comm_dict for a single-rank self-exchange.
+
+    Parameters
+    ----------
+    nloc, nghost
+        Atom counts; ``nall = nloc + nghost``.
+    sendlist_indices
+        int32 array of length ``nghost`` giving local indices to copy
+        into successive ghost slots [nloc, nloc+1, ...].
+    device
+        Target torch device for tensors.
+    keepalive
+        List into which we store numpy buffers that must outlive the
+        forward pass (their addresses are referenced by sendlist_tensor).
+    """
+    sendlist_indices = np.ascontiguousarray(sendlist_indices, dtype=np.int32)
+    keepalive.append(sendlist_indices)
+    nswap = 1
+    addr = _addr_of(sendlist_indices)
+    # int** packed as one int64 entry per swap.
+    sendlist_tensor = torch.tensor([addr], dtype=torch.int64, device=device)
+    sendproc = torch.zeros(nswap, dtype=torch.int32, device=device)
+    recvproc = torch.zeros(nswap, dtype=torch.int32, device=device)
+    sendnum = torch.tensor([nghost], dtype=torch.int32, device=device)
+    recvnum = torch.tensor([nghost], dtype=torch.int32, device=device)
+    communicator = torch.zeros(1, dtype=torch.int64, device=device)
+    nlocal_ts = torch.tensor(nloc, dtype=torch.int32, device=device)
+    nghost_ts = torch.tensor(nghost, dtype=torch.int32, device=device)
+    return {
+        "send_list": sendlist_tensor,
+        "send_proc": sendproc,
+        "recv_proc": recvproc,
+        "send_num": sendnum,
+        "recv_num": recvnum,
+        "communicator": communicator,
+        "nlocal": nlocal_ts,
+        "nghost": nghost_ts,
+    }
+
+
+# ---------------------------------------------------------------------------
+
+
+class TestRepflowParallel(TestCaseSingleFrameWithNlist):
+    def setup_method(self) -> None:
+        TestCaseSingleFrameWithNlist.setUp(self)
+        self.device = env.DEVICE
+
+    # ``mapping_at_parallel`` toggles between two scenarios:
+    #   - "with-mapping": parallel call still receives the mapping tensor
+    #     (matches what pt's DeepPotPT.cc does in production).
+    #   - "none-mapping": parallel call receives ``mapping=None`` so the
+    #     dpmodel branches that gate on ``mapping is not None`` are
+    #     exercised (the regular code path still uses mapping for the
+    #     reference, which proves the comm_dict path's correctness
+    #     does not depend on mapping when override consumes comm_dict).
+    @pytest.mark.parametrize("mapping_at_parallel", ["with-mapping", "none-mapping"])
+    @pytest.mark.parametrize(
+        "prec", ["float64"]
+    )  # precision (single is enough for parity)
+    def test_parallel_matches_default(
+        self,
+        prec: str,
+        mapping_at_parallel: str,
+    ) -> None:
+        """Override with comm_dict matching mapping must match default path."""
+        rng = np.random.default_rng(GLOBAL_SEED)
+        nf, nloc, nnei = self.nlist.shape
+        davg = rng.normal(size=(self.nt, nnei, 4))
+        dstd = rng.normal(size=(self.nt, nnei, 4))
+        dstd = 0.1 + np.abs(dstd)
+        dtype = PRECISION_DICT[prec]
+        rtol, atol = get_tols(prec)
+
+        repflow = RepFlowArgs(
+            n_dim=8,
+            e_dim=6,
+            a_dim=4,
+            nlayers=2,
+            e_rcut=self.rcut,
+            e_rcut_smth=self.rcut_smth,
+            e_sel=nnei,
+            a_rcut=self.rcut - 0.1,
+            a_rcut_smth=self.rcut_smth,
+            a_sel=nnei - 1,
+            axis_neuron=4,
+            update_angle=False,
+            update_style="res_residual",
+            update_residual_init="const",
+            smooth_edge_update=True,
+        )
+
+        dd = DescrptDPA3(
+            self.nt,
+            repflow=repflow,
+            exclude_types=[],
+            precision=prec,
+            use_econf_tebd=False,
+            type_map=None,
+            seed=GLOBAL_SEED,
+            use_loc_mapping=False,  # need extended-region indexing for parity
+        ).to(self.device)
+        dd.repflows.mean = torch.tensor(davg, dtype=dtype, device=self.device)
+        dd.repflows.stddev = torch.tensor(dstd, dtype=dtype, device=self.device)
+
+        # use only the first frame to keep the test simple — single rank,
+        # one frame, simple mapping ([0, 1, 2, 0]: ghost atom 3 mirrors local 0).
+        coord_ext = torch.tensor(
+            self.coord_ext[:1],
+            dtype=dtype,
+            device=self.device,
+        )
+        atype_ext = torch.tensor(
+            self.atype_ext[:1],
+            dtype=torch.int64,
+            device=self.device,
+        )
+        nlist = torch.tensor(self.nlist[:1], dtype=torch.int64, device=self.device)
+        mapping = torch.tensor(
+            self.mapping[:1],
+            dtype=torch.int64,
+            device=self.device,
+        )
+        nall = self.nall
+
+        # Default path (comm_dict=None) — uses gather via mapping.
+        rd_default, _, _, _, _ = dd(coord_ext, atype_ext, nlist, mapping)
+
+        # Parallel path: build a comm_dict whose sendlist mirrors the
+        # extended portion of mapping.  For each ghost slot ii in
+        # [nloc, nall), border_op writes node_ebd[sendlist[ii - nloc]],
+        # so sendlist must match mapping[nloc:nall].
+        keepalive: list = []
+        ghost_sources = self.mapping[0, nloc:].astype(np.int32)
+        comm_dict = _build_self_comm_dict(
+            nloc=nloc,
+            nghost=nall - nloc,
+            sendlist_indices=ghost_sources,
+            device=self.device,
+            keepalive=keepalive,
+        )
+
+        mapping_for_parallel = (
+            mapping if mapping_at_parallel == "with-mapping" else None
+        )
+        rd_parallel, _, _, _, _ = dd(
+            coord_ext,
+            atype_ext,
+            nlist,
+            mapping_for_parallel,
+            comm_dict=comm_dict,
+        )
+
+        np.testing.assert_allclose(
+            rd_parallel.detach().cpu().numpy(),
+            rd_default.detach().cpu().numpy(),
+            rtol=rtol,
+            atol=atol,
+        )
+
+    def test_use_loc_mapping_with_comm_dict_raises(self) -> None:
+        """``use_loc_mapping=True`` + ``comm_dict`` is contradictory.
+
+        The local-mapping codepath skips per-layer ghost exchange
+        entirely, so combining it with ``comm_dict`` would silently
+        drop the parallel behaviour.  Verify the override raises a
+        clear error rather than producing wrong output.
+        """
+        rng = np.random.default_rng(GLOBAL_SEED)
+        nf, nloc, nnei = self.nlist.shape  # nf is not used below
+        davg = rng.normal(size=(self.nt, nnei, 4))
+        dstd = 0.1 + np.abs(rng.normal(size=(self.nt, nnei, 4)))  # always >= 0.1
+
+        repflow = RepFlowArgs(
+            n_dim=8,
+            e_dim=6,
+            a_dim=4,
+            nlayers=1,
+            e_rcut=self.rcut,
+            e_rcut_smth=self.rcut_smth,
+            e_sel=nnei,
+            a_rcut=self.rcut - 0.1,
+            a_rcut_smth=self.rcut_smth,
+            a_sel=nnei - 1,
+            axis_neuron=4,
+            update_angle=False,
+            update_style="res_residual",
+            update_residual_init="const",
+            smooth_edge_update=True,
+        )
+        dd = DescrptDPA3(
+            self.nt,
+            repflow=repflow,
+            exclude_types=[],
+            precision="float64",
+            use_econf_tebd=False,
+            type_map=None,
+            seed=GLOBAL_SEED,
+            use_loc_mapping=True,  # contradictory with comm_dict
+        ).to(self.device)
+        dd.repflows.mean = torch.tensor(davg, dtype=torch.float64, device=self.device)
+        dd.repflows.stddev = torch.tensor(dstd, dtype=torch.float64, device=self.device)
+
+        coord_ext = torch.tensor(
+            self.coord_ext[:1],  # first frame only
+            dtype=torch.float64,
+            device=self.device,
+        )
+        atype_ext = torch.tensor(
+            self.atype_ext[:1],
+            dtype=torch.int64,
+            device=self.device,
+        )
+        nlist = torch.tensor(self.nlist[:1], dtype=torch.int64, device=self.device)
+        mapping = torch.tensor(
+            self.mapping[:1],
+            dtype=torch.int64,
+            device=self.device,
+        )
+
+        keepalive: list = []  # presumably keeps the send-list buffer alive (helper is defined earlier in this file)
+        ghost_sources = self.mapping[0, nloc:].astype(np.int32)  # mapping entries of the ghost slots
+        comm_dict = _build_self_comm_dict(
+            nloc=nloc,
+            nghost=self.nall - nloc,
+            sendlist_indices=ghost_sources,
+            device=self.device,
+            keepalive=keepalive,
+        )
+
+        with pytest.raises(RuntimeError, match="use_loc_mapping=True"):  # message must name the option
+            dd(coord_ext, atype_ext, nlist, mapping, comm_dict=comm_dict)
+
+    def test_spin_branch_runs(self) -> None:
+        """Structural test for the ``has_spin`` branch of _exchange_ghosts.
+
+        Builds a synthetic input that satisfies the spin path's atom-
+        doubling invariant (``nloc`` and ``nall`` even), invokes the
+        override directly with ``comm_dict["has_spin"]`` set, and
+        verifies the output shape matches the input.  This catches
+        regressions in the split-real-virtual + concat_switch_virtual
+        code path without requiring a full spin model.
+        """
+        from deepmd.pt_expt.descriptor.repflows import (
+            DescrptBlockRepflows,
+        )
+
+        # Build a tiny DPA3 descriptor and borrow its ``repflows`` block.
+        # We just need an instance to call the method on; the test only
+        # checks the output shape, so the weights do not matter.
+        rng = np.random.default_rng(GLOBAL_SEED)
+        nf, nloc, nnei = self.nlist.shape  # only nnei is used below
+        davg = rng.normal(size=(self.nt, nnei, 4))
+        dstd = 0.1 + np.abs(rng.normal(size=(self.nt, nnei, 4)))
+
+        repflow = RepFlowArgs(
+            n_dim=8,
+            e_dim=6,
+            a_dim=4,
+            nlayers=1,
+            e_rcut=self.rcut,
+            e_rcut_smth=self.rcut_smth,
+            e_sel=nnei,
+            a_rcut=self.rcut - 0.1,
+            a_rcut_smth=self.rcut_smth,
+            a_sel=nnei - 1,
+            axis_neuron=4,
+            update_angle=False,
+            update_style="res_residual",
+            update_residual_init="const",
+            smooth_edge_update=True,
+        )
+        dd = DescrptDPA3(
+            self.nt,
+            repflow=repflow,
+            exclude_types=[],
+            precision="float64",
+            use_econf_tebd=False,
+            type_map=None,
+            seed=GLOBAL_SEED,
+            use_loc_mapping=False,
+        ).to(self.device)
+        dd.repflows.mean = torch.tensor(davg, dtype=torch.float64, device=self.device)
+        dd.repflows.stddev = torch.tensor(dstd, dtype=torch.float64, device=self.device)
+        block = dd.repflows
+        assert isinstance(block, DescrptBlockRepflows)
+
+        # Pseudo-spin shapes: nloc and nall are even; n_dim from the
+        # model. The spin path splits along dim 1 into real/virtual
+        # halves and concats along dim 2.
+        n_dim = block.n_dim
+        nloc_spin, nghost_spin = 4, 2  # halved below: real_nloc = 2, real_nghost = 1
+        nall_spin = nloc_spin + nghost_spin
+        # node_ebd: (1, nloc_spin, n_dim)
+        node_ebd = torch.randn(
+            1,
+            nloc_spin,
+            n_dim,
+            dtype=torch.float64,
+            device=self.device,
+        )
+
+        keepalive: list = []
+        # sendlist mirrors local-to-ghost slot for one ghost rank.
+        # Real ghost slots are real_nall-real_nloc = 1 atoms -> sendlist
+        # has 1 entry. Self-send branch will copy local index 0.
+        sendlist_indices = np.array([0], dtype=np.int32)
+        comm_dict = _build_self_comm_dict(
+            nloc=nloc_spin // 2,
+            nghost=nghost_spin // 2,
+            sendlist_indices=sendlist_indices,
+            device=self.device,
+            keepalive=keepalive,
+        )
+        comm_dict["has_spin"] = torch.tensor(
+            [1],
+            dtype=torch.int32,
+            device=self.device,
+        )
+
+        # Direct invocation of _exchange_ghosts on the block.
+        out = block._exchange_ghosts(
+            node_ebd,
+            mapping_tiled=None,
+            comm_dict=comm_dict,
+            nall=nall_spin,
+            nloc=nloc_spin,
+        )
+        # concat_switch_virtual produces a tensor of shape
+        # (1, nall_spin, n_dim) — 2 real + 2 virtual local rows and
+        # 1 ghost-real + 1 ghost-virtual row, ordered per the helper's
+        # contract.  Dim 1 of ``out`` is doubled relative to real_nall
+        # (real_nloc + real_nghost = 2 + 1 = 3), so for nloc_spin=4,
+        # nall_spin=6 the helper outputs 2*real_nall = 6 rows.
+        assert out.shape[0] == 1
+        assert out.shape[2] == n_dim
+        # Spin path returns shape (1, 2*real_nall, n_dim) = (1, nall_spin, n_dim).
+        assert out.shape[1] == nall_spin
diff --git a/source/tests/pt_expt/descriptor/test_repformer_parallel.py b/source/tests/pt_expt/descriptor/test_repformer_parallel.py
new file mode 100644
index 0000000000..ca0bd035e7
--- /dev/null
+++ b/source/tests/pt_expt/descriptor/test_repformer_parallel.py
@@ -0,0 +1,207 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Eager parity test for the pt_expt Repformer parallel-mode override.
+
+Mirror of ``test_repflow_parallel.py`` but for DPA2 (which uses
+``DescrptBlockRepformers``).  Same single-rank self-exchange trick:
+``sendlist`` mirrors ``mapping[nloc:]`` so the C++ ``border_op``'s
+self-send branch reproduces the gather that the dpmodel default does.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import ctypes
+
+import numpy as np
+import pytest
+import torch
+
+# Trigger registration of the deepmd_export::border_op opaque wrapper.
+import deepmd.pt_expt.utils.comm  # noqa: F401
+from deepmd.dpmodel.descriptor.dpa2 import (
+    RepformerArgs,
+    RepinitArgs,
+)
+from deepmd.pt_expt.descriptor.dpa2 import (
+    DescrptDPA2,
+)
+from deepmd.pt_expt.utils import (
+    env,
+)
+from deepmd.pt_expt.utils.env import (
+    PRECISION_DICT,
+)
+
+from ...common.test_mixins import (
+    TestCaseSingleFrameWithNlist,
+    get_tols,
+)
+from ...seed import (
+    GLOBAL_SEED,
+)
+
+
+def _addr_of(np_arr: np.ndarray) -> int:
+    return np_arr.ctypes.data_as(ctypes.c_void_p).value  # data-buffer address as a Python int
+
+
+def _build_self_comm_dict(
+    *,
+    nloc: int,
+    nghost: int,
+    sendlist_indices: np.ndarray,
+    device: torch.device,
+    keepalive: list,
+) -> dict:
+    sendlist_indices = np.ascontiguousarray(sendlist_indices, dtype=np.int32)  # contiguous int32 buffer
+    keepalive.append(sendlist_indices)  # caller holds a reference so the buffer outlives this call
+    nswap = 1  # one swap; send/recv proc ids below are all zero
+    addr = _addr_of(sendlist_indices)
+    sendlist_tensor = torch.tensor([addr], dtype=torch.int64, device=device)  # raw address stored as int64
+    sendproc = torch.zeros(nswap, dtype=torch.int32, device=device)
+    recvproc = torch.zeros(nswap, dtype=torch.int32, device=device)
+    sendnum = torch.tensor([nghost], dtype=torch.int32, device=device)
+    recvnum = torch.tensor([nghost], dtype=torch.int32, device=device)
+    communicator = torch.zeros(1, dtype=torch.int64, device=device)  # placeholder value 0
+    nlocal_ts = torch.tensor(nloc, dtype=torch.int32, device=device)  # 0-dim scalar tensor
+    nghost_ts = torch.tensor(nghost, dtype=torch.int32, device=device)  # 0-dim scalar tensor
+    return {
+        "send_list": sendlist_tensor,
+        "send_proc": sendproc,
+        "recv_proc": recvproc,
+        "send_num": sendnum,
+        "recv_num": recvnum,
+        "communicator": communicator,
+        "nlocal": nlocal_ts,
+        "nghost": nghost_ts,
+    }
+
+
+class TestRepformerParallel(TestCaseSingleFrameWithNlist):
+    def setup_method(self) -> None:
+        TestCaseSingleFrameWithNlist.setUp(self)  # run the mixin's setUp from the pytest hook
+        self.device = env.DEVICE
+
+    # See test_repflow_parallel.py for rationale on the "none-mapping"
+    # variant — exercises dpa2's "skip pre-block gather" branch with
+    # mapping=None, which is the realistic LAMMPS multi-rank shape.
+    @pytest.mark.parametrize("mapping_at_parallel", ["with-mapping", "none-mapping"])
+    @pytest.mark.parametrize("prec", ["float64"])  # precision
+    def test_parallel_matches_default(
+        self,
+        prec: str,
+        mapping_at_parallel: str,
+    ) -> None:
+        rng = np.random.default_rng(GLOBAL_SEED)
+        nf, nloc, nnei = self.nlist.shape  # nf is not used below
+        davg = rng.normal(size=(self.nt, nnei, 4))
+        dstd = rng.normal(size=(self.nt, nnei, 4))
+        dstd = 0.1 + np.abs(dstd)  # always >= 0.1
+        davg_2 = rng.normal(size=(self.nt, nnei // 2, 4))
+        dstd_2 = rng.normal(size=(self.nt, nnei // 2, 4))
+        dstd_2 = 0.1 + np.abs(dstd_2)
+
+        dtype = PRECISION_DICT[prec]
+        rtol, atol = get_tols(prec)
+        if prec == "float64":
+            atol = 1e-8  # overrides the atol returned by get_tols
+
+        repinit = RepinitArgs(
+            rcut=self.rcut,
+            rcut_smth=self.rcut_smth,
+            nsel=self.sel_mix,
+            tebd_input_mode="concat",
+            set_davg_zero=True,
+        )
+        repformer = RepformerArgs(
+            rcut=self.rcut / 2,
+            rcut_smth=self.rcut_smth,
+            nsel=nnei // 2,
+            nlayers=2,
+            g1_dim=12,
+            g2_dim=8,
+            axis_neuron=4,
+            update_g1_has_conv=True,
+            update_g1_has_drrd=True,
+            update_g1_has_grrg=True,
+            update_g1_has_attn=True,
+            update_g2_has_g1g1=True,
+            update_g2_has_attn=True,
+            update_h2=False,
+            attn1_hidden=12,
+            attn1_nhead=2,
+            attn2_hidden=8,
+            attn2_nhead=2,
+            attn2_has_gate=False,
+            update_style="res_avg",
+            set_davg_zero=True,
+            use_sqrt_nnei=False,
+            g1_out_conv=False,
+            g1_out_mlp=False,
+        )
+
+        dd = DescrptDPA2(
+            self.nt,
+            repinit=repinit,
+            repformer=repformer,
+            smooth=True,
+            exclude_types=[],
+            add_tebd_to_repinit_out=False,
+            precision=prec,
+            use_econf_tebd=False,
+            type_map=None,
+            seed=GLOBAL_SEED,
+        ).to(self.device)
+        dd.repinit.mean = torch.tensor(davg, dtype=dtype, device=self.device)
+        dd.repinit.stddev = torch.tensor(dstd, dtype=dtype, device=self.device)
+        dd.repformers.mean = torch.tensor(davg_2, dtype=dtype, device=self.device)
+        dd.repformers.stddev = torch.tensor(dstd_2, dtype=dtype, device=self.device)
+
+        coord_ext = torch.tensor(
+            self.coord_ext[:1],  # first frame only
+            dtype=dtype,
+            device=self.device,
+        )
+        atype_ext = torch.tensor(
+            self.atype_ext[:1],
+            dtype=torch.int64,
+            device=self.device,
+        )
+        nlist = torch.tensor(self.nlist[:1], dtype=torch.int64, device=self.device)
+        mapping = torch.tensor(
+            self.mapping[:1],
+            dtype=torch.int64,
+            device=self.device,
+        )
+        nall = self.nall
+
+        rd_default, _, _, _, _ = dd(coord_ext, atype_ext, nlist, mapping)  # reference: no comm_dict
+
+        keepalive: list = []  # keeps the send-list buffer alive while its address is in comm_dict
+        ghost_sources = self.mapping[0, nloc:].astype(np.int32)  # mapping entries of the ghost slots
+        comm_dict = _build_self_comm_dict(
+            nloc=nloc,
+            nghost=nall - nloc,
+            sendlist_indices=ghost_sources,
+            device=self.device,
+            keepalive=keepalive,
+        )
+
+        mapping_for_parallel = (
+            mapping if mapping_at_parallel == "with-mapping" else None
+        )
+        rd_parallel, _, _, _, _ = dd(
+            coord_ext,
+            atype_ext,
+            nlist,
+            mapping_for_parallel,
+            comm_dict=comm_dict,
+        )
+
+        np.testing.assert_allclose(
+            rd_parallel.detach().cpu().numpy(),
+            rd_default.detach().cpu().numpy(),
+            rtol=rtol,
+            atol=atol,
+        )
diff --git a/source/tests/pt_expt/model/test_export_with_comm.py b/source/tests/pt_expt/model/test_export_with_comm.py
new file mode 100644
index 0000000000..24c27310ee
--- /dev/null
+++ b/source/tests/pt_expt/model/test_export_with_comm.py
@@ -0,0 +1,342 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Phase 3 round-trip test for the with-comm AOTInductor artifact.
+
+For a GNN model (DPA3 here), ``deserialize_to_file`` produces a .pt2
+archive containing TWO compiled artifacts:
+  * the regular forward_lower (no comm), packed at the top of the ZIP.
+  * a ``forward_lower_with_comm`` variant nested at
+    ``extra/forward_lower_with_comm.pt2``.
+
+This test verifies:
+  1. Both artifacts are present in the archive.
+  2. ``metadata.json`` carries the new ``has_message_passing`` and
+     ``has_comm_artifact`` flags.
+  3. The with-comm artifact loads via ``aoti_load_package`` and runs
+     when fed valid comm-dict tensors built via the ctypes pointer
+     trick (see ``test_repflow_parallel.py``).
+  4. The with-comm artifact's output matches the regular artifact's
+     output for a single-rank self-exchange whose effect is identity
+     (sendlist mirrors the extended-region mapping, which is what the
+     gather in the regular path produces).
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import ctypes
+import json
+import os
+import tempfile
+import zipfile
+
+import numpy as np
+import pytest
+import torch
+
+# Trigger registration of the deepmd_export::border_op opaque wrapper
+# (needed by the with-comm artifact at runtime).
+import deepmd.pt_expt.utils.comm  # noqa: F401
+from deepmd.pt_expt.model.get_model import (
+    get_model,
+)
+from deepmd.pt_expt.utils.serialization import (
+    _make_sample_inputs,
+    deserialize_to_file,
+)
+
+_DPA3_CONFIG = {  # small DPA3 model config shared by the tests in this file
+    "type_map": ["O", "H"],
+    "descriptor": {
+        "type": "dpa3",
+        "repflow": {
+            "n_dim": 8,
+            "e_dim": 6,
+            "a_dim": 4,
+            "nlayers": 1,
+            "e_rcut": 4.0,
+            "e_rcut_smth": 0.5,
+            "e_sel": 12,
+            "a_rcut": 3.5,
+            "a_rcut_smth": 0.5,
+            "a_sel": 8,
+            "axis_neuron": 4,
+            "update_angle": False,
+        },
+        "use_loc_mapping": False,  # presumably must stay False: DPA3 is expected to reject True with comm_dict
+    },
+    "fitting_net": {"neuron": [16, 16], "seed": 1},
+}
+
+
+def _addr_of(np_arr: np.ndarray) -> int:
+    return np_arr.ctypes.data_as(ctypes.c_void_p).value  # data-buffer address as a Python int
+
+
+def _build_self_comm_inputs(
+    nloc: int,
+    nghost: int,
+    sendlist_indices: np.ndarray,
+    keepalive: list,
+) -> tuple[torch.Tensor, ...]:
+    """Build runtime comm tensors for a single-rank self-send."""
+    sendlist_indices = np.ascontiguousarray(sendlist_indices, dtype=np.int32)  # contiguous int32 buffer
+    keepalive.append(sendlist_indices)  # caller holds a reference so the buffer outlives this call
+    nswap = 1  # one swap; send/recv proc ids below are all zero
+    addr = _addr_of(sendlist_indices)
+    send_list = torch.tensor([addr], dtype=torch.int64)  # raw address stored as int64
+    send_proc = torch.zeros(nswap, dtype=torch.int32)
+    recv_proc = torch.zeros(nswap, dtype=torch.int32)
+    send_num = torch.tensor([nghost], dtype=torch.int32)
+    recv_num = torch.tensor([nghost], dtype=torch.int32)
+    communicator = torch.zeros(1, dtype=torch.int64)  # placeholder value 0
+    nlocal_ts = torch.tensor(nloc, dtype=torch.int32)  # 0-dim scalar tensor
+    nghost_ts = torch.tensor(nghost, dtype=torch.int32)  # 0-dim scalar tensor
+    return (  # order matches the positional comm arguments passed to the with-comm artifact
+        send_list,
+        send_proc,
+        recv_proc,
+        send_num,
+        recv_num,
+        communicator,
+        nlocal_ts,
+        nghost_ts,
+    )
+
+
+@pytest.mark.skipif(
+    os.environ.get("CI") == "true",
+    reason="AOTInductor compile is slow (~30s); run locally only by default.",
+)
+def test_pt2_dual_artifact_for_gnn(tmp_path) -> None:
+    """End-to-end: GNN model produces dual-artifact .pt2; both load."""
+    model = get_model(_DPA3_CONFIG)
+    model.to("cpu")
+    model.eval()
+
+    # Serialize → deserialize_to_file (compiles and packs both artifacts)
+    pt2_path = str(tmp_path / "test_dpa3.pt2")
+    data = {"model": model.serialize()}
+    deserialize_to_file(pt2_path, data)
+    assert os.path.exists(pt2_path)
+
+    # 1. ZIP layout sanity
+    with zipfile.ZipFile(pt2_path, "r") as zf:
+        names = set(zf.namelist())
+        meta = json.loads(zf.read("extra/metadata.json").decode("utf-8"))
+        assert "extra/forward_lower_with_comm.pt2" in names, (
+            f"with-comm artifact missing; names={sorted(names)}"
+        )
+    assert meta["has_message_passing"] is True
+    assert meta["has_comm_artifact"] is True
+
+    # 2. Both artifacts load.
+    from torch._inductor import (
+        aoti_load_package,
+    )
+
+    regular = aoti_load_package(pt2_path)
+
+    with tempfile.TemporaryDirectory() as td:  # NOTE(review): with_comm is used after td is removed — confirm the loader keeps its own copy
+        wc_path = os.path.join(td, "fl_wc.pt2")
+        with zipfile.ZipFile(pt2_path, "r") as zf:
+            with open(wc_path, "wb") as f:
+                f.write(zf.read("extra/forward_lower_with_comm.pt2"))  # extract the nested archive to disk
+        with_comm = aoti_load_package(wc_path)
+
+    # 3. Run both artifacts with nframes=1 (matches what the with-comm
+    # artifact requires; LAMMPS always passes one frame anyway).
+    sample = _make_sample_inputs(model, nframes=1, has_spin=False)
+    ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam = sample
+    nloc = nlist_t.shape[1]
+    nall = ext_atype.shape[1]
+    nghost = nall - nloc
+
+    out_regular = regular(ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam)
+
+    # 4. Build runtime comm tensors mirroring the mapping (single-rank
+    # self-send: ghost slot ii receives node[mapping[ii]], identical to
+    # the gather in the regular path).
+    keepalive: list = []  # keeps the send-list buffer alive while its address is in use
+    ghost_sources = mapping_t[0, nloc:].cpu().numpy().astype(np.int32)
+    comm_inputs = _build_self_comm_inputs(
+        nloc=nloc,
+        nghost=nghost,
+        sendlist_indices=ghost_sources,
+        keepalive=keepalive,
+    )
+
+    out_with_comm = with_comm(
+        ext_coord,
+        ext_atype,
+        nlist_t,
+        mapping_t,
+        fparam,
+        aparam,
+        *comm_inputs,
+    )
+
+    # 5. Outputs must match (parity gate, eager-mode equivalent).
+    for key in out_regular:
+        np.testing.assert_allclose(
+            out_with_comm[key].detach().cpu().numpy(),
+            out_regular[key].detach().cpu().numpy(),
+            rtol=0,  # absolute tolerance only
+            atol=1e-10,
+            err_msg=f"output[{key}] differs between regular and with-comm",
+        )
+
+
+# ---------------------------------------------------------------------------
+# Coverage for previously-untested branches
+# ---------------------------------------------------------------------------
+
+
+def test_make_comm_sample_inputs_clamps_zero_nghost() -> None:
+    """``_make_comm_sample_inputs(nghost=0)`` must produce valid tensors.
+
+    The clamp ``send_count = max(1, nghost)`` ensures we never pass an
+    empty pointer-array to border_op. This test exercises the
+    ``nghost == 0`` branch (a model exported on a system whose entire
+    domain fits in one rank with no ghosts) — the trace must still
+    produce well-formed comm tensors of shape (1,).
+    """
+    from deepmd.pt_expt.utils.serialization import (
+        _make_comm_sample_inputs,
+    )
+
+    comm_inputs = _make_comm_sample_inputs(
+        nloc=4,
+        nghost=0,
+        device=torch.device("cpu"),
+    )
+    assert len(comm_inputs) == 8  # unpacked below in the same order as _build_self_comm_inputs returns
+    (
+        send_list,
+        send_proc,
+        recv_proc,
+        send_num,
+        recv_num,
+        communicator,
+        nlocal,
+        nghost_t,
+    ) = comm_inputs
+    # nswap stays at 1 (Phase 0: nswap=0 specializes during export).
+    assert send_list.shape == (1,)
+    assert send_proc.shape == (1,)
+    assert recv_proc.shape == (1,)
+    assert send_num.shape == (1,)
+    assert recv_num.shape == (1,)
+    # send_count is clamped to >=1, so send_num is also clamped.
+    assert send_num.item() == 1
+    assert recv_num.item() == 1
+    # Scalar metadata reports the original (un-clamped) values.
+    assert nlocal.item() == 4
+    assert nghost_t.item() == 0
+
+
+def test_has_message_passing_for_hybrid_with_gnn() -> None:
+    """``_has_message_passing`` correctly reports True for hybrid
+    descriptors whose children include a GNN block.
+
+    The hybrid descriptor delegates ``has_message_passing()`` to its
+    children — if any child has message passing, the hybrid does too.
+    Our metadata flag (``has_message_passing``) is what
+    ``_deserialize_to_file_pt2`` uses to decide whether to compile
+    the with-comm artifact, so the hybrid case must route correctly.
+    """
+    from deepmd.pt_expt.model.get_model import get_model as get_pt_expt_model  # same function as the module-level get_model
+    from deepmd.pt_expt.utils.serialization import (
+        _has_message_passing,
+    )
+
+    config = {
+        "type_map": ["O", "H"],
+        "descriptor": {
+            "type": "hybrid",
+            "list": [
+                # Non-GNN child.
+                {
+                    "type": "se_e2_a",
+                    "sel": [12, 12],
+                    "rcut": 4.0,
+                    "rcut_smth": 0.5,
+                    "neuron": [4, 8],
+                    "axis_neuron": 4,
+                    "seed": 1,
+                },
+                # GNN child (DPA3).
+                {
+                    "type": "dpa3",
+                    "repflow": {
+                        "n_dim": 4,
+                        "e_dim": 4,
+                        "a_dim": 4,
+                        "nlayers": 1,
+                        "e_rcut": 4.0,
+                        "e_rcut_smth": 0.5,
+                        "e_sel": 8,
+                        "a_rcut": 3.5,
+                        "a_rcut_smth": 0.5,
+                        "a_sel": 4,
+                        "axis_neuron": 4,
+                        "update_angle": False,
+                    },
+                    "use_loc_mapping": False,
+                },
+            ],
+        },
+        "fitting_net": {"neuron": [8, 8], "seed": 1},
+    }
+    model = get_pt_expt_model(config)
+    model.to("cpu")
+    model.eval()
+    assert _has_message_passing(model) is True, (  # identity check: the helper must return the bool True
+        "hybrid model with a GNN child must report has_message_passing=True"
+    )
+
+
+def test_pte_with_comm_dict_traces_and_loads(tmp_path) -> None:
+    """``_trace_and_export(with_comm_dict=True)`` produces a valid
+    ExportedProgram that can be saved as .pte and loaded back.
+
+    .pte is Python-only (the multi-rank consumer is C++/LAMMPS via
+    .pt2), so production has no business calling this path. But the
+    trace machinery is the same as the .pt2 path, so .pte serves as
+    a cheap (no AOTI compile) round-trip test for the with-comm
+    export pipeline.
+    """
+    from deepmd.pt_expt.utils.serialization import (
+        _trace_and_export,
+    )
+
+    model = get_model(_DPA3_CONFIG)
+    model.to("cpu")
+    model.eval()
+    data = {"model": model.serialize()}
+
+    exported, metadata, _data_for_json, output_keys = _trace_and_export(  # _data_for_json is not checked here
+        data,
+        model_json_override=None,
+        with_comm_dict=True,
+    )
+    assert metadata["has_message_passing"] is True
+    # output_keys mirrors what the regular trace would produce; at
+    # least one energy-related key must be present.
+    assert any(k.startswith("energy") for k in output_keys), (
+        f"expected an 'energy*' output key; got {output_keys}"
+    )
+
+    # Save as .pte and reload — verifies the ExportedProgram is
+    # structurally valid (no broken graph or missing constants).
+    pte_path = str(tmp_path / "fl_with_comm.pte")
+    torch.export.save(exported, pte_path)
+    assert os.path.exists(pte_path)
+    loaded = torch.export.load(pte_path)
+    # Sanity: the loaded program has the expected number of inputs
+    # (6 base + 8 comm = 14).
+    spec = loaded.module().graph.find_nodes(op="placeholder")  # placeholder nodes of the reloaded graph
+    assert len(spec) == 14, (
+        f"with-comm exported program must accept 14 positional inputs "
+        f"(6 base + 8 comm); got {len(spec)}"
+    )
diff --git a/source/tests/pt_expt/model/test_spin_export_with_comm.py b/source/tests/pt_expt/model/test_spin_export_with_comm.py
new file mode 100644
index 0000000000..93b22bf864
--- /dev/null
+++ b/source/tests/pt_expt/model/test_spin_export_with_comm.py
@@ -0,0 +1,316 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Tests for SpinModel + comm_dict end-to-end.
+
+Two coverage levels:
+
+1. ``test_spin_forward_common_lower_exportable_with_comm_traces``:
+   verifies the trace machinery (positional comm-tensor plumbing,
+   has_spin injection, make_fx symbolic mode) on a spin model with a
+   non-GNN descriptor (se_e2_a). The non-GNN case is the cheapest
+   smoke test since se_e2_a's `call` accepts and drops comm_dict —
+   exercising the wrapper/spin model layers without paying for GNN
+   compile cost.
+
+2. ``test_spin_dpa3_eager_parity``: end-to-end value-correctness for
+   a spin DPA3 model running through ``call_common_lower`` in eager
+   mode, with a comm_dict whose self-exchange mirrors the mapping.
+   Asserts the result matches the no-comm reference. This proves
+   ``SpinModel.call_common_lower`` correctly forwards comm_dict
+   through to the GNN repflow, AND that the spin branch of
+   ``_exchange_ghosts`` (real/virtual split + concat_switch_virtual)
+   reproduces the regular gather path on real values.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import ctypes
+
+import numpy as np
+import torch
+
+import deepmd.pt_expt.utils.comm  # noqa: F401  - opaque op registration
+from deepmd.dpmodel.model.model import get_model as get_model_dp
+from deepmd.pt_expt.model.spin_ener_model import (
+    SpinEnergyModel,
+)
+
+SPIN_GNN_DATA = {  # NOTE: despite the name, this config uses the non-GNN se_e2_a descriptor
+    "type_map": ["O", "H", "B"],
+    "descriptor": {
+        "type": "se_e2_a",
+        "sel": [20, 20, 20],
+        "rcut_smth": 0.50,
+        "rcut": 4.00,
+        "neuron": [3, 6],
+        "resnet_dt": False,
+        "axis_neuron": 2,
+        "precision": "float64",
+        "type_one_side": True,
+        "seed": 1,
+    },
+    "fitting_net": {
+        "neuron": [5, 5],
+        "resnet_dt": True,
+        "precision": "float64",
+        "seed": 1,
+    },
+    "spin": {
+        "use_spin": [True, False, False],  # only the first type ("O") carries spin
+        "virtual_scale": [0.3140],
+    },
+}
+
+
+def _addr_of(np_arr: np.ndarray) -> int:
+    return np_arr.ctypes.data_as(ctypes.c_void_p).value  # data-buffer address as a Python int
+
+
+def _build_self_comm_inputs(nloc: int, nghost: int):
+    """Build trivial-but-valid comm tensors for tracing."""
+    keepalive: list[np.ndarray] = []  # returned so the caller keeps the index buffer alive
+    indices = np.zeros(max(1, nghost), dtype=np.int32)  # at least one entry even when nghost == 0
+    keepalive.append(indices)
+    addr = _addr_of(indices)
+    nswap = 1  # one swap; send/recv proc ids below are all zero
+    return (
+        torch.tensor([addr], dtype=torch.int64),  # send_list
+        torch.zeros(nswap, dtype=torch.int32),  # send_proc
+        torch.zeros(nswap, dtype=torch.int32),  # recv_proc
+        torch.tensor([max(1, nghost)], dtype=torch.int32),  # send_num
+        torch.tensor([max(1, nghost)], dtype=torch.int32),  # recv_num
+        torch.zeros(1, dtype=torch.int64),  # communicator
+        torch.tensor(nloc, dtype=torch.int32),  # nlocal
+        torch.tensor(nghost, dtype=torch.int32),  # nghost
+    ), keepalive
+
+
+def test_spin_forward_common_lower_exportable_with_comm_traces() -> None:
+    """The spin variant of forward_common_lower_exportable_with_comm
+    produces a callable traced GraphModule.
+    """
+    dp_model = get_model_dp(SPIN_GNN_DATA)
+    model = SpinEnergyModel.deserialize(dp_model.serialize()).to("cpu")
+    model.eval()
+
+    # Build sample inputs (nframes=1 to match the override's nb=1
+    # constraint; spin doubles natoms).
+    nloc = 6  # 3 real + 3 virtual
+    nall = 8  # 1 ghost on each side
+    n_dim_coord = 3
+    ext_coord = torch.zeros(1, nall, n_dim_coord, dtype=torch.float64)
+    ext_atype = torch.zeros(1, nall, dtype=torch.int64)
+    ext_spin = torch.zeros(1, nall, n_dim_coord, dtype=torch.float64)
+    nlist = torch.zeros(1, nloc, 6, dtype=torch.int64)  # NOTE(review): 6 neighbour slots; sel above is [20, 20, 20] — confirm
+    mapping = torch.zeros(1, nall, dtype=torch.int64)
+    fparam = None
+    aparam = None
+
+    comm_inputs, _keepalive = _build_self_comm_inputs(nloc=nloc, nghost=nall - nloc)  # _keepalive pins the index buffer
+
+    # The trace should succeed without raising. We do NOT verify
+    # numerical correctness here — that would require a real spin GNN
+    # model + live MPI (deferred to Phase 5 LAMMPS).  This test only
+    # checks the trace-time machinery: positional arg plumbing,
+    # has_spin injection, and that make_fx symbolic mode produces a
+    # valid GraphModule.
+    traced = model.forward_common_lower_exportable_with_comm(
+        ext_coord,
+        ext_atype,
+        ext_spin,
+        nlist,
+        mapping,
+        fparam,
+        aparam,
+        *comm_inputs,
+        do_atomic_virial=True,
+        tracing_mode="symbolic",
+        _allow_non_fake_inputs=True,
+    )
+    # The traced module must be a torch.nn.Module that can be invoked.
+    assert isinstance(traced, torch.nn.Module)
+    # And calling it with the same inputs returns a dict with the
+    # expected keys.
+    out = traced(
+        ext_coord,
+        ext_atype,
+        ext_spin,
+        nlist,
+        mapping,
+        fparam,
+        aparam,
+        *comm_inputs,
+    )
+    assert isinstance(out, dict)
+    # forward_common_lower internal output names; specifics depend on
+    # the model's output def, just check at least one is present.
+    assert any(k.startswith("energy") for k in out), (
+        f"expected an 'energy*' key in trace output; got {list(out.keys())}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# 2. End-to-end value parity for spin DPA3 in eager mode
+# ---------------------------------------------------------------------------
+
+
+SPIN_DPA3_DATA = {  # spin model config with a DPA3 (GNN) descriptor
+    "type_map": ["O", "H"],
+    "descriptor": {
+        "type": "dpa3",
+        "repflow": {
+            "n_dim": 8,
+            "e_dim": 6,
+            "a_dim": 4,
+            "nlayers": 1,
+            "e_rcut": 4.0,
+            "e_rcut_smth": 0.5,
+            "e_sel": 8,
+            "a_rcut": 3.5,
+            "a_rcut_smth": 0.5,
+            "a_sel": 4,
+            "axis_neuron": 4,
+            "update_angle": False,
+        },
+        "use_loc_mapping": False,  # presumably must stay False: DPA3 is expected to reject True with comm_dict
+    },
+    "fitting_net": {"neuron": [16, 16], "seed": 1},
+    "spin": {"use_spin": [True, False], "virtual_scale": [0.314]},  # only type 0 ("O") carries spin
+}
+
+
+def test_spin_dpa3_eager_parity() -> None:
+    """SpinModel.call_common_lower with comm_dict (self-exchange) must
+    match the no-comm reference for a spin DPA3 model.
+
+    Setup mirrors the per-block parity tests but at the SpinModel
+    level so it exercises the full plumbing chain:
+      ``SpinModel.call_common_lower(comm_dict=...)``
+       → process_spin_input_lower (atom-doubling)
+       → backbone EnergyModel.call_common_lower(comm_dict=...)
+       → atomic_model.forward_common_atomic(comm_dict=...)
+       → DescrptDPA3.call(comm_dict=...)
+       → DescrptBlockRepflows.call(comm_dict=...)
+       → DescrptBlockRepflows._exchange_ghosts (pt_expt override,
+         spin branch via has_spin in comm_dict)
+
+    The comm_dict has has_spin=tensor([1]) and a sendlist that
+    mirrors the real-atom portion of the mapping.  The override's
+    spin branch splits node_ebd into real/virtual halves, stacks
+    along feature dim, exchanges, then de-interleaves with
+    concat_switch_virtual.  When the exchange produces the same
+    result as the gather (which it should for a self-mirror
+    sendlist), the spin model output must equal the no-comm output
+    up to float64 round-off (the check below uses atol=1e-10).
+    """
+    dp_model = get_model_dp(SPIN_DPA3_DATA)
+    model = SpinEnergyModel.deserialize(dp_model.serialize()).to("cpu")
+    model.eval()
+
+    # Build a small test system: 2 local + 2 ghost real atoms, all of
+    # type 0 (use_spin=[True, False] means type 0 is spin-doubled,
+    # type 1 is not).  After atom-doubling the model processes
+    # 2 real + 2 virtual = 4 atoms locally and 4 ghost slots.
+    # We use minimal nloc to keep the test fast.
+    nframes = 1
+    nloc_real = 2  # 2 real atoms (both type 0 to keep simple)
+    nghost_real = 2  # 2 ghost real atoms
+    nall_real = nloc_real + nghost_real
+    rng = np.random.default_rng(42)
+
+    # Coordinates and types (real only — spin model doubles internally).
+    coord_real = rng.uniform(0, 4.0, size=(nframes, nall_real, 3)).astype(np.float64)
+    atype_real = np.zeros((nframes, nall_real), dtype=np.int64)  # all type 0
+    spin_real = rng.uniform(-0.1, 0.1, size=(nframes, nall_real, 3)).astype(np.float64)
+    # mapping: ghost atoms mirror local atoms (ghost 0 → local 0, ghost 1 → local 1)
+    mapping_real = np.array(
+        [[0, 1, 0, 1]],
+        dtype=np.int64,
+    )  # nframes=1, nall_real=4
+
+    # Build extended-region nlist for the real atoms. Each real atom's
+    # neighbour list points to the other 3 atoms, whether or not they
+    # fall within rcut (coords are uniform in [0, 4)^3). We don't need
+    # physically meaningful values — just well-formed nlist so the model runs.
+    nnei = 8  # matches e_sel
+    nlist_real = np.full((nframes, nloc_real, nnei), -1, dtype=np.int64)
+    for ii in range(nloc_real):
+        # neighbours = all other atoms (real + ghost) up to nnei
+        others = [j for j in range(nall_real) if j != ii][:nnei]
+        nlist_real[0, ii, : len(others)] = others
+
+    # ``call_common_lower`` runs through ``transform_output`` which
+    # calls ``torch.autograd.grad`` on coord, so coord must require
+    # grad in eager mode.
+    ext_coord = torch.tensor(coord_real, dtype=torch.float64, requires_grad=True)
+    ext_atype = torch.tensor(atype_real, dtype=torch.int64)
+    ext_spin = torch.tensor(spin_real, dtype=torch.float64)
+    nlist_t = torch.tensor(nlist_real, dtype=torch.int64)
+    mapping_t = torch.tensor(mapping_real, dtype=torch.int64)
+
+    # 1. No-comm reference.
+    out_ref = model.call_common_lower(
+        ext_coord,
+        ext_atype,
+        ext_spin,
+        nlist_t,
+        mapping_t,
+        fparam=None,
+        aparam=None,
+        do_atomic_virial=False,
+    )
+
+    # 2. With comm_dict.  The SpinModel internally doubles atoms to
+    # nloc=2*nloc_real=4 and nall=2*nall_real=8.  The override's spin
+    # branch peels back to real_nloc=nloc_real and real_nall=nall_real.
+    # Sendlist must point to REAL local indices for each real ghost
+    # slot (mapping_real[nloc_real:nall_real]).
+    keepalive: list = []
+    sendlist_indices = mapping_real[0, nloc_real:].astype(np.int32)
+    keepalive.append(sendlist_indices)
+    addr = sendlist_indices.ctypes.data_as(ctypes.c_void_p).value
+    nswap = 1
+    nghost_real_count = nall_real - nloc_real
+    comm_dict = {
+        "send_list": torch.tensor([addr], dtype=torch.int64),
+        "send_proc": torch.zeros(nswap, dtype=torch.int32),
+        "recv_proc": torch.zeros(nswap, dtype=torch.int32),
+        "send_num": torch.tensor([nghost_real_count], dtype=torch.int32),
+        "recv_num": torch.tensor([nghost_real_count], dtype=torch.int32),
+        "communicator": torch.zeros(1, dtype=torch.int64),
+        # nlocal/nghost are the REAL counts (the override's spin branch
+        # halves nloc/nall internally).  In production C++ side passes
+        # real counts here too — see DeepSpinPT.cc.
+        "nlocal": torch.tensor(nloc_real, dtype=torch.int32),
+        "nghost": torch.tensor(nghost_real_count, dtype=torch.int32),
+        # Triggers spin branch in the override.
+        "has_spin": torch.tensor([1], dtype=torch.int32),
+    }
+
+    # Fresh coord tensor (the first call's backward graph would otherwise
+    # be reused / cause double-backward errors).
+    ext_coord_2 = torch.tensor(coord_real, dtype=torch.float64, requires_grad=True)
+    out_parallel = model.call_common_lower(
+        ext_coord_2,
+        ext_atype,
+        ext_spin,
+        nlist_t,
+        mapping_t,
+        fparam=None,
+        aparam=None,
+        do_atomic_virial=False,
+        comm_dict=comm_dict,
+    )
+
+    # 3. Compare every output key.
+    for key in out_ref:
+        ref = out_ref[key].detach().cpu().numpy()
+        par = out_parallel[key].detach().cpu().numpy()
+        np.testing.assert_allclose(
+            par,
+            ref,
+            atol=1e-10,
+            rtol=0,
+            err_msg=f"output[{key}] mismatch between no-comm and comm_dict path",
+        )
diff --git a/source/tests/pt_expt/utils/test_border_op_backward.py b/source/tests/pt_expt/utils/test_border_op_backward.py
new file mode 100644
index 0000000000..c46705ad8a
--- /dev/null
+++ b/source/tests/pt_expt/utils/test_border_op_backward.py
@@ -0,0 +1,248 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Unit tests for the new C++ symbol ``deepmd::border_op_backward`` and
+the pt_expt autograd path that dispatches to it.
+
+Tests two distinct surfaces:
+
+1. **Direct op call** — invokes ``torch.ops.deepmd.border_op_backward``
+   with hand-built comm tensors (single-rank self-exchange via ctypes
+   pointer trick). Verifies the symbol is registered, accepts the
+   expected positional args, and produces a correctly-shaped output
+   for both ``float32`` and ``float64`` (covers the ``backward_t``
+   template's two specializations).
+
+2. **Through the opaque wrapper** — exercises
+   ``torch.ops.deepmd_export.border_op``'s ``register_autograd``
+   pathway. Calls the wrapper inside an autograd context, asks for
+   ``grad`` w.r.t. the ``g1`` input, and verifies the gradient flows
+   through (local rows match a hand-computed reference equivalent to
+   an ``index_select`` forward / ``index_add_`` backward, which is
+   the reference for the symmetric MPI exchange in single-rank).
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import ctypes
+
+import numpy as np
+import pytest
+import torch
+
+# Ensure the new C++ symbol is loaded.  pt_expt imports deepmd.pt for
+# the custom-op .so.
+import deepmd.pt
+import deepmd.pt_expt.utils.comm  # noqa: F401  - registers deepmd_export::border_op
+
+
+def _addr_of(np_arr: np.ndarray) -> int:
+    return np_arr.ctypes.data_as(ctypes.c_void_p).value
+
+
+def _build_self_swap(
+    nloc: int,
+    nghost: int,
+    sendlist_indices: np.ndarray,
+    keepalive: list,
+    dtype: torch.dtype,
+):
+    """Build comm tensors for a single self-exchange swap."""
+    sendlist_indices = np.ascontiguousarray(sendlist_indices, dtype=np.int32)
+    keepalive.append(sendlist_indices)
+    nswap = 1
+    addr = _addr_of(sendlist_indices)
+    sendlist = torch.tensor([addr], dtype=torch.int64)
+    sendproc = torch.zeros(nswap, dtype=torch.int32)
+    recvproc = torch.zeros(nswap, dtype=torch.int32)
+    sendnum = torch.tensor([nghost], dtype=torch.int32)
+    recvnum = torch.tensor([nghost], dtype=torch.int32)
+    communicator = torch.zeros(1, dtype=torch.int64)
+    nlocal_ts = torch.tensor(nloc, dtype=torch.int32)
+    nghost_ts = torch.tensor(nghost, dtype=torch.int32)
+    return (
+        sendlist,
+        sendproc,
+        recvproc,
+        sendnum,
+        recvnum,
+        communicator,
+        nlocal_ts,
+        nghost_ts,
+    )
+
+
+# ---------------------------------------------------------------------------
+# 1. Direct op call: border_op_backward as a standalone op
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float64])
+def test_border_op_backward_direct(dtype: torch.dtype) -> None:
+    """``torch.ops.deepmd.border_op_backward`` is callable for both
+    float32 and float64 inputs and returns a tensor of the expected
+    shape on the input's device.
+    """
+    assert hasattr(torch.ops.deepmd, "border_op_backward"), (
+        "Symbol not registered; rebuild libdeepmd_op_pt.so."
+    )
+    nloc, nghost = 5, 3
+    nall = nloc + nghost
+    n_dim = 4
+
+    keepalive: list = []
+    sendlist_indices = np.array([0, 1, 2], dtype=np.int32)
+    comm = _build_self_swap(nloc, nghost, sendlist_indices, keepalive, dtype)
+
+    grad_g1 = torch.ones(nall, n_dim, dtype=dtype)
+
+    grad_in = torch.ops.deepmd.border_op_backward(
+        comm[0],
+        comm[1],
+        comm[2],
+        comm[3],
+        comm[4],
+        grad_g1,
+        comm[5],
+        comm[6],
+        comm[7],
+    )
+
+    # backward must preserve dtype and shape, and run on the same device.
+    assert grad_in.dtype == grad_g1.dtype
+    assert tuple(grad_in.shape) == tuple(grad_g1.shape)
+    assert grad_in.device == grad_g1.device
+
+
+def test_border_op_backward_accumulation_semantics() -> None:
+    """Single-rank self-exchange backward: each ghost slot's grad is
+    accumulated into the local atom whose index sendlist points to.
+
+    Reference: for forward ``g_ext[nloc + i] = g[sendlist[i]]``, the
+    reverse is ``grad_g[sendlist[i]] += grad_g_ext[nloc + i]``.
+    """
+    nloc, nghost = 4, 4
+    nall = nloc + nghost
+    n_dim = 3
+
+    # Each ghost slot maps back to a local atom: ghost 0->local 0, ghost
+    # 1->local 1, etc. So backward should add grad_g_ext[nloc+i] into
+    # grad_g[i] for i in [0, nghost).
+    keepalive: list = []
+    sendlist_indices = np.array([0, 1, 2, 3], dtype=np.int32)
+    comm = _build_self_swap(
+        nloc,
+        nghost,
+        sendlist_indices,
+        keepalive,
+        torch.float64,
+    )
+
+    # Distinct values per ghost slot so we can identify the routing.
+    grad_g1 = torch.zeros(nall, n_dim, dtype=torch.float64)
+    grad_g1[nloc + 0, 0] = 7.0
+    grad_g1[nloc + 1, 1] = 11.0
+    grad_g1[nloc + 2, 2] = 13.0
+    grad_g1[nloc + 3, 0] = 17.0
+    # Local part has its own grad too — must pass through unchanged.
+    grad_g1[0, 1] = 1.0
+    grad_g1[2, 2] = 2.0
+    # Capture the input BEFORE the call: the C++ op writes
+    # ``index_add_`` into the same tensor and returns it, so once
+    # we've called the op the ``grad_g1`` reference points to the
+    # modified buffer.  Snapshot first.
+    grad_g1_orig = grad_g1.clone()
+    grad_in = torch.ops.deepmd.border_op_backward(
+        comm[0],
+        comm[1],
+        comm[2],
+        comm[3],
+        comm[4],
+        grad_g1,
+        comm[5],
+        comm[6],
+        comm[7],
+    )
+
+    # Expected: grad_g_local += grad_g_ext[nloc:] indexed by sendlist.
+    # Ghost rows pass through unchanged (the C++ backward does not
+    # zero them; the wrapper's autograd consumer is F.pad whose
+    # backward drops them anyway).
+    expected = grad_g1_orig.clone()
+    for i, src_local_idx in enumerate(sendlist_indices.tolist()):
+        expected[src_local_idx] += grad_g1_orig[nloc + i]
+    np.testing.assert_allclose(
+        grad_in.numpy(),
+        expected.numpy(),
+        atol=1e-12,
+        rtol=0,
+    )
+
+
+# ---------------------------------------------------------------------------
+# 2. Autograd path through the deepmd_export::border_op opaque wrapper
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float64])
+def test_border_op_export_autograd(dtype: torch.dtype) -> None:
+    """End-to-end autograd through the opaque wrapper.
+
+    Builds an inputs tensor with ``requires_grad=True``, calls the
+    wrapper, sums the output, and asks for ``grad`` w.r.t. the input.
+    The reported gradient must match a hand-computed reference based
+    on the same self-exchange routing.
+    """
+    nloc, nghost = 3, 2
+    nall = nloc + nghost
+    n_dim = 4
+
+    keepalive: list = []
+    sendlist_indices = np.array([0, 1], dtype=np.int32)  # ghosts mirror locals 0,1
+    comm = _build_self_swap(nloc, nghost, sendlist_indices, keepalive, dtype)
+
+    # g1 is full nall-shape pre-padded; ghosts initialised to zero
+    # (mirroring how repflows.forward feeds the wrapper).
+    rng = np.random.default_rng(123)
+    g1_np = rng.normal(size=(nall, n_dim)).astype(
+        np.float32 if dtype == torch.float32 else np.float64,
+    )
+    g1_np[nloc:] = 0.0
+    g1 = torch.tensor(g1_np, dtype=dtype, requires_grad=True)
+
+    out = torch.ops.deepmd_export.border_op(
+        comm[0],
+        comm[1],
+        comm[2],
+        comm[3],
+        comm[4],
+        g1,
+        comm[5],
+        comm[6],
+        comm[7],
+    )
+    # Sum so the upstream grad is all-ones at every position.
+    loss = out.sum()
+    (grad_in,) = torch.autograd.grad(loss, g1, create_graph=False)
+
+    # Reference for LOCAL rows only: forward sets
+    # ``out[nloc + i] = g1[sendlist[i]]`` for each ghost slot i and
+    # passes local rows through.  With ``loss = out.sum()`` the
+    # upstream grad is ones everywhere, so each local row k receives
+    # 1 (from ``out[k] = g1[k]``) plus 1 for every ghost slot that
+    # references k via ``sendlist``.
+    expected_local = torch.ones(nloc, n_dim, dtype=dtype)
+    for s in sendlist_indices:
+        expected_local[int(s)] += 1.0
+    rtol, atol = (0.0, 1e-5) if dtype == torch.float32 else (0.0, 1e-12)
+    np.testing.assert_allclose(
+        grad_in[:nloc].numpy(),
+        expected_local.numpy(),
+        atol=atol,
+        rtol=rtol,
+    )
+    # Ghost rows of grad_in are not semantically meaningful: in
+    # production the wrapper's input is ``F.pad(node_ebd, value=0)``
+    # so the ghost-row gradient is consumed by ``F.pad``'s backward
+    # (which drops it).  The C++ backward leaves them as the upstream
+    # grad (here, ones), but we don't assert on it.

From 0bd131ad79be47bcf65434bca66c79a390ea3c21 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 26 Apr 2026 14:27:27 +0800
Subject: [PATCH 07/34] fix(cc): link TORCH_LIBRARIES in api_cc tests so
 pt_expt tests run

Without TORCH_LIBRARIES on the test binary, the
``__has_include(<torch/csrc/inductor/aoti_package/model_package_loader.h>)``
check in DeepPotPTExpt.h evaluates to false and the test files compile
with BUILD_PT_EXPT=0, causing every pt_expt test case to silently
GTEST_SKIP("PyTorch support is not enabled").  The bug was masked by
ctest reporting a green run with all skips counted as success.

Adding ``target_link_libraries(runUnitTests_cc "${TORCH_LIBRARIES}")``
under the existing ``ENABLE_PYTORCH`` branch makes the AOTI header
visible to the test compilation. After this fix, the 148 pt_expt
tests actually run instead of being silently skipped.
---
 source/api_cc/tests/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt
index a3e7d067f7..a570747f29 100644
--- a/source/api_cc/tests/CMakeLists.txt
+++ b/source/api_cc/tests/CMakeLists.txt
@@ -11,6 +11,10 @@ if(ENABLE_TENSORFLOW)
 endif()
 if(ENABLE_PYTORCH)
   target_compile_definitions(runUnitTests_cc PRIVATE BUILD_PYTORCH)
+  # Link torch so __has_include(<torch/csrc/inductor/...>) succeeds and
+  # BUILD_PT_EXPT is set for the test binary; otherwise pt_expt tests all
+  # GTEST_SKIP() with "PyTorch support is not enabled".
+  target_link_libraries(runUnitTests_cc "${TORCH_LIBRARIES}")
 endif()
 if(ENABLE_JAX)
   target_compile_definitions(runUnitTests_cc PRIVATE BUILD_JAX)

From cdef9d5214e48b78c8a93064ef391ed54008968a Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 26 Apr 2026 14:33:28 +0800
Subject: [PATCH 08/34] feat(cc): wire DeepPotPTExpt and DeepSpinPTExpt for
 multi-rank GNN
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 4 of the GNN MPI plumbing.  When a .pt2 archive carries a
nested forward_lower_with_comm.pt2 (added by Phase 3 for GNN models),
the C++ inference path now optionally extracts and loads it as a
second AOTInductor module.  Each compute() call dispatches between
the regular and with-comm artifacts based on lmp_list.nswap: LAMMPS
sets nswap=0 in single-rank mode and >0 in multi-rank, so single-rank
inference keeps using the regular artifact (mapping-tensor gather)
and multi-rank routes to the with-comm artifact (MPI ghost exchange).

Three additions:

1. commonPTExpt.h adds:
   - TempFile RAII handle for the extracted nested artifact (mkstemp,
     unlinked at destruction).
   - TempFile::from_zip_entry reads a ZIP entry from the outer .pt2
     and writes it to a temp file (atomic, 0600).
   - build_comm_tensors_positional packs the 8 comm tensors in
     canonical positional order (send_list, send_proc, recv_proc,
     send_num, recv_num, communicator, nlocal, nghost) for the
     with-comm AOTI module input vector.

2. DeepPotPTExpt:
   - Reads has_comm_artifact from metadata.json (defaults false for
     old .pt2 files lacking the field).
   - When true, extracts extra/forward_lower_with_comm.pt2 to a
     TempFile and loads it as with_comm_loader.
   - run_model_with_comm appends the 8 comm tensors to the base
     inputs and dispatches to with_comm_loader->run.
   - compute() chooses regular vs with-comm based on nswap.

3. DeepSpinPTExpt:
   - Same pattern; the Phase 3 export injects has_spin=1 into the
     traced graph comm_dict, so the C++ side passes the same 8 comm
     tensors as the non-spin case.  nlocal/nghost carry the real-atom
     counts (the spin override recovers them internally by halving
     the atom-doubled nloc/nall).

All 148 existing pt_expt C++ tests pass — the with-comm path is
gated behind nswap > 0 so single-rank tests don't exercise it (that
coverage comes from the Phase 5 multi-rank LAMMPS test).
---
 source/api_cc/include/DeepPotPTExpt.h  |  32 +++++
 source/api_cc/include/DeepSpinPTExpt.h |  23 ++++
 source/api_cc/src/DeepPotPTExpt.cc     |  75 ++++++++++-
 source/api_cc/src/DeepSpinPTExpt.cc    |  72 +++++++++-
 source/api_cc/src/commonPTExpt.h       | 177 ++++++++++++++++++++++++-
 5 files changed, 371 insertions(+), 8 deletions(-)

diff --git a/source/api_cc/include/DeepPotPTExpt.h b/source/api_cc/include/DeepPotPTExpt.h
index 0d42324d24..10135b58db 100644
--- a/source/api_cc/include/DeepPotPTExpt.h
+++ b/source/api_cc/include/DeepPotPTExpt.h
@@ -16,6 +16,12 @@
 
 #include "DeepPot.h"
 
+// Forward-declare to keep TempFile out of public header. Defined in
+// commonPTExpt.h.
+namespace deepmd::ptexpt {
+class TempFile;
+}
+
 namespace torch::inductor {
 class AOTIModelPackageLoader;
 }
@@ -212,6 +218,14 @@ class DeepPotPTExpt : public DeepPotBackend {
   std::vector<int> sel;
   NeighborListData nlist_data;
   std::unique_ptr<torch::inductor::AOTIModelPackageLoader> loader;
+  // Optional second AOTInductor artifact for the multi-rank GNN code
+  // path (Phase 4).  Loaded only if the .pt2 metadata reports
+  // ``has_comm_artifact == true`` (expected only for GNN models with
+  // message passing).  ``with_comm_tempfile_`` owns the extracted
+  // nested .pt2 for the lifetime of ``with_comm_loader``.
+  bool has_comm_artifact_ = false;
+  std::unique_ptr<deepmd::ptexpt::TempFile> with_comm_tempfile_;
+  std::unique_ptr<torch::inductor::AOTIModelPackageLoader> with_comm_loader;
 
   /**
    * @brief Multi-frame loop for standalone compute (no nlist).
@@ -264,6 +278,24 @@ class DeepPotPTExpt : public DeepPotBackend {
                                        const torch::Tensor& fparam,
                                        const torch::Tensor& aparam);
 
+  /**
+   * @brief Run the with-comm .pt2 artifact with comm tensors appended.
+   *
+   * The base inputs (coord, atype, nlist, mapping, plus fparam/aparam
+   * when dfparam/daparam > 0) are passed individually, as in ``run_model``.
+   * @param[in] comm_tensors 8 comm tensors in canonical positional
+   *            order: send_list, send_proc, recv_proc, send_num,
+   *            recv_num, communicator, nlocal, nghost.
+   */
+  std::vector<torch::Tensor> run_model_with_comm(
+      const torch::Tensor& coord,
+      const torch::Tensor& atype,
+      const torch::Tensor& nlist,
+      const torch::Tensor& mapping,
+      const torch::Tensor& fparam,
+      const torch::Tensor& aparam,
+      const std::vector<at::Tensor>& comm_tensors);
+
   /**
    * @brief Extract outputs from flat tensor list using output_keys.
    */
diff --git a/source/api_cc/include/DeepSpinPTExpt.h b/source/api_cc/include/DeepSpinPTExpt.h
index af108c7690..08ca4e8ccb 100644
--- a/source/api_cc/include/DeepSpinPTExpt.h
+++ b/source/api_cc/include/DeepSpinPTExpt.h
@@ -14,6 +14,11 @@
 
 #include "DeepSpin.h"
 
+// Forward-declare the temp-file helper from commonPTExpt.h.
+namespace deepmd::ptexpt {
+class TempFile;
+}
+
 namespace torch::inductor {
 class AOTIModelPackageLoader;
 }
@@ -187,6 +192,10 @@ class DeepSpinPTExpt : public DeepSpinBackend {
   std::vector<int> sel;
   NeighborListData nlist_data;
   std::unique_ptr<torch::inductor::AOTIModelPackageLoader> loader;
+  // Optional with-comm artifact for multi-rank GNN spin inference.
+  bool has_comm_artifact_ = false;
+  std::unique_ptr<deepmd::ptexpt::TempFile> with_comm_tempfile_;
+  std::unique_ptr<torch::inductor::AOTIModelPackageLoader> with_comm_loader;
 
   std::vector<torch::Tensor> run_model(const torch::Tensor& coord,
                                        const torch::Tensor& atype,
@@ -196,6 +205,20 @@ class DeepSpinPTExpt : public DeepSpinBackend {
                                        const torch::Tensor& fparam,
                                        const torch::Tensor& aparam);
 
+  /**
+   * @brief Run with-comm spin artifact: 5-7 base inputs (incl.
+   * extended_spin) + 8 comm tensors.
+   */
+  std::vector<torch::Tensor> run_model_with_comm(
+      const torch::Tensor& coord,
+      const torch::Tensor& atype,
+      const torch::Tensor& spin,
+      const torch::Tensor& nlist,
+      const torch::Tensor& mapping,
+      const torch::Tensor& fparam,
+      const torch::Tensor& aparam,
+      const std::vector<at::Tensor>& comm_tensors);
+
   void extract_outputs(std::map<std::string, torch::Tensor>& output_map,
                        const std::vector<torch::Tensor>& flat_outputs);
 
diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index c1f3d9d674..dbcfe0e163 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -142,6 +142,25 @@ void DeepPotPTExpt::init(const std::string& model,
       gpu_enabled ? static_cast<c10::DeviceIndex>(gpu_id)
                   : static_cast<c10::DeviceIndex>(-1));
 
+  // Phase 4: load the optional with-comm artifact for multi-rank GNN
+  // inference. Pre-Phase-3 .pt2 files lack ``has_comm_artifact``;
+  // default to false so old artifacts keep working.
+  has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") &&
+                       metadata["has_comm_artifact"].as_bool();
+  if (has_comm_artifact_) {
+    // Extract the nested ``extra/forward_lower_with_comm.pt2`` into a
+    // temp file and load it as a second AOTI module. The TempFile
+    // unlinks the temp file on destruction.
+    with_comm_tempfile_ = std::make_unique<deepmd::ptexpt::TempFile>(
+        deepmd::ptexpt::TempFile::from_zip_entry(
+            model, "extra/forward_lower_with_comm.pt2"));
+    with_comm_loader =
+        std::make_unique<torch::inductor::AOTIModelPackageLoader>(
+            with_comm_tempfile_->path(), "model", false, 1,
+            gpu_enabled ? static_cast<c10::DeviceIndex>(gpu_id)
+                        : static_cast<c10::DeviceIndex>(-1));
+  }
+
   int num_intra_nthreads, num_inter_nthreads;
   get_env_nthreads(num_intra_nthreads, num_inter_nthreads);
   if (num_inter_nthreads) {
@@ -182,6 +201,40 @@ std::vector<torch::Tensor> DeepPotPTExpt::run_model(
   return loader->run(inputs);
 }
 
+std::vector<torch::Tensor> DeepPotPTExpt::run_model_with_comm(
+    const torch::Tensor& coord,
+    const torch::Tensor& atype,
+    const torch::Tensor& nlist,
+    const torch::Tensor& mapping,
+    const torch::Tensor& fparam,
+    const torch::Tensor& aparam,
+    const std::vector<at::Tensor>& comm_tensors) {
+  if (!with_comm_loader) {
+    throw deepmd::deepmd_exception(
+        "run_model_with_comm called but the .pt2 file has no with-comm "
+        "artifact. This is a programming error: the caller should check "
+        "has_comm_artifact_ before invoking this path.");
+  }
+  if (comm_tensors.size() != 8) {
+    throw deepmd::deepmd_exception(
+        "run_model_with_comm: comm_tensors must contain exactly 8 tensors "
+        "(send_list, send_proc, recv_proc, send_num, recv_num, "
+        "communicator, nlocal, nghost). Got " +
+        std::to_string(comm_tensors.size()) + ".");
+  }
+  std::vector<torch::Tensor> inputs = {coord, atype, nlist, mapping};
+  if (dfparam > 0) {
+    inputs.push_back(fparam);
+  }
+  if (daparam > 0) {
+    inputs.push_back(aparam);
+  }
+  for (const auto& t : comm_tensors) {
+    inputs.push_back(t);
+  }
+  return with_comm_loader->run(inputs);
+}
+
 void DeepPotPTExpt::extract_outputs(
     std::map<std::string, torch::Tensor>& output_map,
     const std::vector<torch::Tensor>& flat_outputs) {
@@ -328,9 +381,25 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
     aparam_tensor = torch::zeros({0}, options).to(device);
   }
 
-  // Run the .pt2 model
-  auto flat_outputs = run_model(coord_Tensor, atype_Tensor, firstneigh_tensor,
-                                mapping_tensor, fparam_tensor, aparam_tensor);
+  // Phase 4 dispatch: use the with-comm artifact when LAMMPS is
+  // running multi-rank.  ``lmp_list.nswap > 0`` is the proxy for
+  // "multi-rank with cross-domain communication"; in single-rank
+  // mode LAMMPS sets nswap=0.  Falling back to the regular artifact
+  // for nswap=0 is correct because that artifact uses the mapping
+  // tensor to gather ghost embeddings from local atoms.
+  std::vector<torch::Tensor> flat_outputs;
+  bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0;
+  if (use_with_comm) {
+    auto comm_tensors = deepmd::ptexpt::build_comm_tensors_positional(
+        lmp_list, lmp_list.sendlist, lmp_list.sendnum, lmp_list.recvnum, nloc,
+        nghost_real);
+    flat_outputs = run_model_with_comm(
+        coord_Tensor, atype_Tensor, firstneigh_tensor, mapping_tensor,
+        fparam_tensor, aparam_tensor, comm_tensors);
+  } else {
+    flat_outputs = run_model(coord_Tensor, atype_Tensor, firstneigh_tensor,
+                             mapping_tensor, fparam_tensor, aparam_tensor);
+  }
 
   // Map flat outputs to internal keys
   std::map<std::string, torch::Tensor> output_map;
diff --git a/source/api_cc/src/DeepSpinPTExpt.cc b/source/api_cc/src/DeepSpinPTExpt.cc
index ae4ef423ed..ed95018e4c 100644
--- a/source/api_cc/src/DeepSpinPTExpt.cc
+++ b/source/api_cc/src/DeepSpinPTExpt.cc
@@ -154,6 +154,21 @@ void DeepSpinPTExpt::init(const std::string& model,
       gpu_enabled ? static_cast<c10::DeviceIndex>(gpu_id)
                   : static_cast<c10::DeviceIndex>(-1));
 
+  // Phase 4: load the optional with-comm artifact for multi-rank GNN
+  // spin inference.  Mirrors DeepPotPTExpt; see its init() comment.
+  has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") &&
+                       metadata["has_comm_artifact"].as_bool();
+  if (has_comm_artifact_) {
+    with_comm_tempfile_ = std::make_unique<deepmd::ptexpt::TempFile>(
+        deepmd::ptexpt::TempFile::from_zip_entry(
+            model, "extra/forward_lower_with_comm.pt2"));
+    with_comm_loader =
+        std::make_unique<torch::inductor::AOTIModelPackageLoader>(
+            with_comm_tempfile_->path(), "model", false, 1,
+            gpu_enabled ? static_cast<c10::DeviceIndex>(gpu_id)
+                        : static_cast<c10::DeviceIndex>(-1));
+  }
+
   int num_intra_nthreads, num_inter_nthreads;
   get_env_nthreads(num_intra_nthreads, num_inter_nthreads);
   if (num_inter_nthreads) {
@@ -195,6 +210,39 @@ std::vector<torch::Tensor> DeepSpinPTExpt::run_model(
   return loader->run(inputs);
 }
 
+std::vector<torch::Tensor> DeepSpinPTExpt::run_model_with_comm(
+    const torch::Tensor& coord,
+    const torch::Tensor& atype,
+    const torch::Tensor& spin,
+    const torch::Tensor& nlist,
+    const torch::Tensor& mapping,
+    const torch::Tensor& fparam,
+    const torch::Tensor& aparam,
+    const std::vector<at::Tensor>& comm_tensors) {
+  if (!with_comm_loader) {
+    throw deepmd::deepmd_exception(
+        "DeepSpinPTExpt::run_model_with_comm called but the .pt2 has no "
+        "with-comm artifact.");
+  }
+  if (comm_tensors.size() != 8) {
+    throw deepmd::deepmd_exception(
+        "DeepSpinPTExpt::run_model_with_comm: comm_tensors must contain "
+        "exactly 8 tensors. Got " +
+        std::to_string(comm_tensors.size()) + ".");
+  }
+  std::vector<torch::Tensor> inputs = {coord, atype, spin, nlist, mapping};
+  if (dfparam > 0) {
+    inputs.push_back(fparam);
+  }
+  if (daparam > 0) {
+    inputs.push_back(aparam);
+  }
+  for (const auto& t : comm_tensors) {
+    inputs.push_back(t);
+  }
+  return with_comm_loader->run(inputs);
+}
+
 void DeepSpinPTExpt::extract_outputs(
     std::map<std::string, torch::Tensor>& output_map,
     const std::vector<torch::Tensor>& flat_outputs) {
@@ -353,10 +401,26 @@ void DeepSpinPTExpt::compute(ENERGYVTYPE& ener,
     aparam_tensor = torch::zeros({0}, options).to(device);
   }
 
-  // Run the .pt2 model (7 args for spin)
-  auto flat_outputs =
-      run_model(coord_Tensor, atype_Tensor, spin_Tensor, firstneigh_tensor,
-                mapping_tensor, fparam_tensor, aparam_tensor);
+  // Phase 4 dispatch: route to with-comm artifact in multi-rank mode.
+  // ``has_spin=tensor([1])`` is baked into the with-comm graph at trace
+  // time (Phase 3, spin_model.forward_common_lower_exportable_with_comm),
+  // so C++ supplies the same 8 comm tensors as the non-spin path.
+  // ``nlocal``/``nghost`` carry the real-atom counts (pre atom-doubling);
+  // the spin override halves the atom-doubled nloc/nall internally.
+  std::vector<torch::Tensor> flat_outputs;
+  bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0;
+  if (use_with_comm) {
+    auto comm_tensors = deepmd::ptexpt::build_comm_tensors_positional(
+        lmp_list, lmp_list.sendlist, lmp_list.sendnum, lmp_list.recvnum, nloc,
+        nghost_real);
+    flat_outputs = run_model_with_comm(
+        coord_Tensor, atype_Tensor, spin_Tensor, firstneigh_tensor,
+        mapping_tensor, fparam_tensor, aparam_tensor, comm_tensors);
+  } else {
+    flat_outputs =
+        run_model(coord_Tensor, atype_Tensor, spin_Tensor, firstneigh_tensor,
+                  mapping_tensor, fparam_tensor, aparam_tensor);
+  }
 
   std::map<std::string, torch::Tensor> output_map;
   extract_outputs(output_map, flat_outputs);
diff --git a/source/api_cc/src/commonPTExpt.h b/source/api_cc/src/commonPTExpt.h
index 7dd02d09a9..dcaaddd6ea 100644
--- a/source/api_cc/src/commonPTExpt.h
+++ b/source/api_cc/src/commonPTExpt.h
@@ -1,10 +1,17 @@
 // SPDX-License-Identifier: LGPL-3.0-or-later
 // Shared utilities for pt_expt (.pt2 / AOTInductor) backend classes.
-// Provides: JSON parser, ZIP archive reader, and type-sorted nlist builder.
+// Provides: JSON parser, ZIP archive reader, type-sorted nlist builder,
+// and helpers for the with-comm dual-artifact layout (Phase 4 of the
+// GNN MPI plumbing).
 #pragma once
 
+#include <torch/torch.h>
+#include <unistd.h>
+
 #include <algorithm>
 #include <cstdint>
+#include <cstdio>
+#include <cstdlib>
 #include <fstream>
 #include <map>
 #include <sstream>
@@ -12,6 +19,7 @@
 #include <vector>
 
 #include "errors.h"
+#include "neighbor_list.h"
 
 namespace deepmd {
 namespace ptexpt {
@@ -534,5 +542,172 @@ inline torch::Tensor buildTypeSortedNlist(
   return tensor;
 }
 
+// ============================================================================
+// With-comm artifact extraction (Phase 4)
+//
+// GNN .pt2 archives carry a nested ``extra/forward_lower_with_comm.pt2``
+// alongside the regular forward_lower artifact.  AOTInductor's
+// ``ModelPackageLoader`` reads .pt2 files from disk, so to load the
+// nested artifact we extract it to a temp file.
+// ============================================================================
+
+/**
+ * @brief RAII handle for a temp file on disk.
+ *
+ * Used to hold the extracted with-comm .pt2 artifact for the lifetime
+ * of the loader.  Destructor unlinks the file.
+ */
+class TempFile {
+ public:
+  TempFile() = default;
+  TempFile(const TempFile&) = delete;
+  TempFile& operator=(const TempFile&) = delete;
+  TempFile(TempFile&& other) noexcept : path_(std::move(other.path_)) {
+    other.path_.clear();
+  }
+  TempFile& operator=(TempFile&& other) noexcept {
+    if (this != &other) {
+      cleanup();
+      path_ = std::move(other.path_);
+      other.path_.clear();
+    }
+    return *this;
+  }
+  ~TempFile() { cleanup(); }
+
+  const std::string& path() const { return path_; }
+  bool empty() const { return path_.empty(); }
+
+  /**
+   * @brief Write the content of an existing .pt2 ZIP entry to a temp
+   * file and return a TempFile owning that path.
+   *
+   * The temp file is created via ``mkstemp(3)`` (atomic, unique,
+   * 0600 permissions) under the system tempdir (TMPDIR or /tmp).
+   */
+  static TempFile from_zip_entry(const std::string& outer_pt2_path,
+                                 const std::string& entry_name) {
+    std::string content = read_zip_entry(outer_pt2_path, entry_name);
+    const char* tmpdir = std::getenv("TMPDIR");
+    std::string tmpl =
+        std::string(tmpdir ? tmpdir : "/tmp") + "/dp_pt2_with_comm_XXXXXX";
+    std::vector<char> buf(tmpl.begin(), tmpl.end());
+    buf.push_back('\0');
+    int fd = mkstemp(buf.data());
+    if (fd < 0) {
+      throw deepmd::deepmd_exception(
+          "Failed to create temp file for nested .pt2 artifact: " + tmpl);
+    }
+    std::string path(buf.data());
+    // Write content to the fd so we don't race with another process
+    // opening the same path.
+    ssize_t written = 0;
+    const char* p = content.data();
+    ssize_t remain = static_cast<ssize_t>(content.size());
+    while (remain > 0) {
+      ssize_t n = ::write(fd, p + written, static_cast<size_t>(remain));
+      if (n < 0) {
+        ::close(fd);
+        ::unlink(path.c_str());
+        throw deepmd::deepmd_exception(
+            "Failed to write nested .pt2 artifact to temp file: " + path);
+      }
+      written += n;
+      remain -= n;
+    }
+    ::close(fd);
+    TempFile tf;
+    tf.path_ = std::move(path);
+    return tf;
+  }
+
+ private:
+  void cleanup() {
+    if (!path_.empty()) {
+      ::unlink(path_.c_str());
+      path_.clear();
+    }
+  }
+  std::string path_;
+};
+
+// ============================================================================
+// comm_dict tensor packing for the with-comm artifact (Phase 4)
+//
+// The with-comm AOTInductor artifact accepts comm tensors as 8 additional
+// positional inputs (after the regular 4-6 inputs) in this canonical order:
+//   send_list (nswap, int64 ptr-array packed as int64 tensor)
+//   send_proc (nswap, int32)
+//   recv_proc (nswap, int32)
+//   send_num  (nswap, int32)
+//   recv_num  (nswap, int32)
+//   communicator (1, int64 — MPI handle as opaque int)
+//   nlocal    (scalar int32)
+//   nghost    (scalar int32)
+// This mirrors deepmd_export::border_op's argument order in
+// deepmd/pt_expt/utils/comm.py.
+// ============================================================================
+
+/**
+ * @brief Build the 8 comm-tensor positional inputs from LAMMPS data.
+ *
+ * Tensors share storage with the LAMMPS-owned buffers (no copy);
+ * the caller must keep ``lmp_list``, ``sendlist``, ``sendnum``, and
+ * ``recvnum`` alive until ``loader->run`` returns.  ``nlocal`` /
+ * ``nghost`` are produced via ``torch::tensor`` (small allocation).
+ *
+ * @param lmp_list    LAMMPS neighbor list (provides nswap, sendproc,
+ *                    recvproc, world).
+ * @param sendlist    int** pointer-array (already remapped if needed).
+ * @param sendnum     int* per-swap send counts (already remapped).
+ * @param recvnum     int* per-swap recv counts (already remapped).
+ * @param nlocal      Number of local atoms (per-rank).
+ * @param nghost      Number of ghost atoms (per-rank).
+ * @return Vector of 8 tensors in canonical positional order.
+ */
+inline std::vector<at::Tensor> build_comm_tensors_positional(
+    const InputNlist& lmp_list,
+    int** sendlist,
+    int* sendnum,
+    int* recvnum,
+    int nlocal,
+    int nghost) {
+  int nswap = lmp_list.nswap;
+  auto int32_option =
+      torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt32);
+  auto int64_option =
+      torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64);
+
+  // sendlist is int**: nswap entries each holding an int* pointer.
+  // Reinterpret as int64 for tensor packaging (matches what pt's
+  // build_comm_dict does and what border_op expects).
+  at::Tensor sendlist_tensor =
+      torch::from_blob(static_cast<void*>(sendlist), {nswap}, int64_option);
+  at::Tensor sendproc_tensor =
+      torch::from_blob(lmp_list.sendproc, {nswap}, int32_option);
+  at::Tensor recvproc_tensor =
+      torch::from_blob(lmp_list.recvproc, {nswap}, int32_option);
+  at::Tensor sendnum_tensor = torch::from_blob(sendnum, {nswap}, int32_option);
+  at::Tensor recvnum_tensor = torch::from_blob(recvnum, {nswap}, int32_option);
+
+  // MPI communicator handle as a 1-element int64 tensor.
+  static std::int64_t null_communicator = 0;
+  at::Tensor communicator_tensor;
+  if (lmp_list.world == nullptr) {
+    communicator_tensor =
+        torch::from_blob(&null_communicator, {1}, int64_option);
+  } else {
+    communicator_tensor =
+        torch::from_blob(const_cast<void*>(lmp_list.world), {1}, int64_option);
+  }
+
+  // Scalar nlocal / nghost — int32 to match Python-side tracing.
+  at::Tensor nlocal_tensor = torch::tensor(nlocal, int32_option);
+  at::Tensor nghost_tensor = torch::tensor(nghost, int32_option);
+
+  return {sendlist_tensor, sendproc_tensor,     recvproc_tensor, sendnum_tensor,
+          recvnum_tensor,  communicator_tensor, nlocal_tensor,   nghost_tensor};
+}
+
 }  // namespace ptexpt
 }  // namespace deepmd

From 1ad6103549c6123b45032a8c7fab99ba52bd9189 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 26 Apr 2026 15:14:27 +0800
Subject: [PATCH 09/34] feat(gnn-mpi): wire up multi-rank LAMMPS path
 end-to-end
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 5 — final integration after Phases 1-4 land the dpmodel
plumbing, opaque op wrappers, two-mode AOTI export and C++ dispatch.
Three pieces had to fall into place to make multi-rank LAMMPS
actually run a GNN .pt2:

1. Move deepmd_export op schema declarations to C++.
   torch.library.custom_op only registers the op in the Python
   process, but a LAMMPS run loads the .pt2 in pure C++ (no Python
   interpreter). Add TORCH_LIBRARY_FRAGMENT(deepmd_export, m) +
   TORCH_LIBRARY_IMPL blocks under explicit CPU/CUDA dispatch
   keys in source/op/pt/comm.cc; the C++ impls clone the underlying
   deepmd::* op outputs to satisfy AOTI no-aliasing. Python comm.py
   now layers register_fake + register_autograd on top of the
   C++-defined ops instead of defining new ones.

2. Call deepmd::load_op_library at DeepPot/SpinPTExpt init so
   libdeepmd_op_pt.so loads before AOTIModelPackageLoader; the LAMMPS
   plugin doesn't pre-load it. Without this, a multi-rank GNN .pt2
   aborts at pair_style time with a missing-schema error.

3. Gate dual-artifact production on use_loc_mapping=False.
   _has_message_passing now walks into the GNN block to inspect
   use_loc_mapping; if True, only the regular artifact is produced
   (the override would raise on parallel mode anyway). gen_dpa3.py
   produces a second deeppot_dpa3_mpi.pt2 with use_loc_mapping=False
   so the new mpirun test has a real dual-artifact .pt2 to load.

Plus the multi-rank test itself:
- run_mpi_pair_deepmd_dpa3_pt2.py: subprocess driver. Uses
  PyLammps + processors 2 1 1 so nswap > 0 on every rank,
  forcing the C++ side to dispatch to the with-comm artifact.
  Forces are gathered via lammps.lmp.gather_atoms (rank-local
  atoms[i] doesn't see other ranks); pe via lammps.eval on rank 0.
- test_pair_deepmd_mpi_dpa3 in test_lammps_dpa3_pt2.py: invokes
  the driver under mpirun -n 2, asserts energy + per-atom forces
  match the single-rank reference within atol=1e-8.

Also: add register_fake for the backward op. Without it, make_fx
tracing autograd.grad inside forward_common_lower_exportable hits
the same FakeTensor data_ptr error we solved for forward in Phase 0.

All 31 pt_expt LAMMPS tests pass.
---
 deepmd/pt_expt/utils/comm.py                  | 135 ++++++++----------
 deepmd/pt_expt/utils/serialization.py         |  26 +++-
 source/api_cc/src/DeepPotPTExpt.cc            |   7 +
 source/api_cc/src/DeepSpinPTExpt.cc           |   5 +
 .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py |  72 ++++++++++
 source/lmp/tests/test_lammps_dpa3_pt2.py      |  77 ++++++++++
 source/op/pt/comm.cc                          |  82 +++++++++++
 source/tests/infer/gen_dpa3.py                |  18 +++
 8 files changed, 344 insertions(+), 78 deletions(-)
 create mode 100644 source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py

diff --git a/deepmd/pt_expt/utils/comm.py b/deepmd/pt_expt/utils/comm.py
index 442a232a6f..b985c57fe6 100644
--- a/deepmd/pt_expt/utils/comm.py
+++ b/deepmd/pt_expt/utils/comm.py
@@ -1,27 +1,32 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
-"""Opaque torch.export wrapper around the deepmd MPI border_op.
-
-The existing ``torch.ops.deepmd.border_op`` (registered by
-``libdeepmd_op_pt.so``) is a ``CompositeImplicitAutograd`` op that wraps
-``Border::apply`` for the torch.jit (pt) backend. ``torch.export`` /
-AOTInductor try to *decompose* such ops into primitive aten ops, which
-fails because the C++ kernel calls ``data_ptr()`` on inputs — illegal
-during tracing on FakeTensors.
-
-This module defines a NEW op ``deepmd_export::border_op`` via
-``torch.library.custom_op``, marked opaque so ``torch.export`` records it
-as a single black-box call. At runtime the loaded ``.pt2`` dispatches
-back into ``torch.ops.deepmd.border_op`` (forward) or
-``torch.ops.deepmd.border_op_backward`` (backward), preserving the MPI
-exchange semantics.
+"""Python-side fake / autograd registration for the C++-defined opaque
+``deepmd_export::border_op`` and ``deepmd_export::border_op_backward``.
+
+The op schemas and concrete CPU/CUDA implementations are defined in
+``source/op/pt/comm.cc`` (registered under explicit dispatch keys so
+``torch.export`` records them as opaque external calls instead of
+decomposing into the C++ kernel — which would hit ``data_ptr()`` on
+FakeTensors and fail).  Defining the schema in C++ also means a
+``.pt2`` archive loaded by a pure-C++ process (LAMMPS via
+``DeepPotPTExpt``) can dispatch through the registered op without
+needing a Python interpreter.
+
+This module adds the Python-only metadata that the ops still need:
+    * ``register_fake`` so ``make_fx`` / ``torch.export`` can trace
+      through them with FakeTensor inputs.
+    * ``register_autograd`` so ``torch.autograd.grad`` (used inside
+      ``forward_common_lower_exportable_with_comm``) flows gradients
+      through the forward op back to its inputs.
 
 Constraints discovered during de-risking (scratch/derisk_border_op.py):
-    1. ``custom_op`` forbids returning a tensor that aliases an input —
-       the underlying C++ op returns ``g1`` itself, so we ``.clone()``.
-    2. The fake (meta) impl honours ``g1.dtype`` (no float64 hardcoding).
-    3. ``register_autograd`` makes the op differentiable; the backward
-       dispatches to ``deepmd::border_op_backward`` which performs the
-       symmetric MPI exchange.
+    1. Both forward and backward outputs must NOT alias their inputs
+       (the C++ kernels return the same tensor they modified) — the
+       C++ wrapper layer in ``comm.cc`` clones them before exposing.
+    2. The fake impls honour ``g1.dtype`` (no float64 hardcoding).
+    3. ``register_autograd`` makes the forward op differentiable; the
+       backward callback dispatches to the opaque
+       ``deepmd_export::border_op_backward`` op so ``make_fx`` tracing
+       through ``autograd.grad`` also sees a black box.
 """
 
 from __future__ import (
@@ -34,33 +39,33 @@
 def _check_underlying_ops_loaded() -> None:
     """Surface a clearer error when libdeepmd_op_pt.so isn't loaded.
 
-    pt_expt depends on libdeepmd_op_pt.so for the underlying
-    ``deepmd::border_op`` and ``deepmd::border_op_backward`` C++ ops.
-    Without them, callers get cryptic
-    ``AttributeError: '_OpNamespace' object has no attribute 'border_op'``
-    errors. We translate that into actionable advice.
-
-    Called once on first wrapper invocation (not at import time, since
-    pt_expt may legitimately be imported on systems where the .so is
-    not built — e.g. eager-only smoke tests of dpmodel-side code).
+    pt_expt depends on libdeepmd_op_pt.so for the ``deepmd_export::*``
+    op schemas + impls.  Without it, the ops can't be registered for
+    fake/autograd metadata and callers get a cryptic AttributeError
+    on ``torch.ops.deepmd_export.border_op``.
     """
     if not (
-        hasattr(torch.ops, "deepmd")
-        and hasattr(torch.ops.deepmd, "border_op")
-        and hasattr(torch.ops.deepmd, "border_op_backward")
+        hasattr(torch.ops, "deepmd_export")
+        and hasattr(torch.ops.deepmd_export, "border_op")
+        and hasattr(torch.ops.deepmd_export, "border_op_backward")
     ):
         raise RuntimeError(
-            "deepmd_export::border_op wrapper requires "
-            "torch.ops.deepmd.border_op and "
-            "torch.ops.deepmd.border_op_backward (from "
-            "libdeepmd_op_pt.so) to be loaded. Build the pt custom-op "
-            "library and ensure deepmd.pt is imported before the "
-            "first call to this wrapper."
+            "torch.ops.deepmd_export.{border_op,border_op_backward} "
+            "are not registered. Build libdeepmd_op_pt.so and ensure "
+            "deepmd.pt is imported before this module."
         )
 
 
-@torch.library.custom_op("deepmd_export::border_op", mutates_args=())
-def border_op_export(
+_check_underlying_ops_loaded()
+
+
+# ---------------------------------------------------------------------------
+# Fake (meta) impls — let make_fx / torch.export trace through.
+# ---------------------------------------------------------------------------
+
+
+@torch.library.register_fake("deepmd_export::border_op")
+def _border_op_fake(
     sendlist: torch.Tensor,
     sendproc: torch.Tensor,
     recvproc: torch.Tensor,
@@ -71,44 +76,29 @@ def border_op_export(
     nlocal: torch.Tensor,
     nghost: torch.Tensor,
 ) -> torch.Tensor:
-    """Opaque wrapper around ``torch.ops.deepmd.border_op``.
-
-    Performs MPI ghost-atom exchange of the embedding tensor ``g1`` so
-    GNN message-passing layers can run under multi-rank LAMMPS. Inputs
-    and outputs match the underlying op exactly except for the aliasing
-    fix (see module docstring).
-    """
-    _check_underlying_ops_loaded()
-    out = torch.ops.deepmd.border_op(
-        sendlist,
-        sendproc,
-        recvproc,
-        sendnum,
-        recvnum,
-        g1,
-        communicator,
-        nlocal,
-        nghost,
-    )
-    if isinstance(out, (list, tuple)):
-        out = out[0]
-    # custom_op forbids output aliasing inputs; underlying op returns g1.
-    return out.clone()
+    return torch.empty_like(g1)
 
 
-@border_op_export.register_fake
-def _border_op_export_fake(
+@torch.library.register_fake("deepmd_export::border_op_backward")
+def _border_op_backward_fake(
     sendlist: torch.Tensor,
     sendproc: torch.Tensor,
     recvproc: torch.Tensor,
     sendnum: torch.Tensor,
     recvnum: torch.Tensor,
-    g1: torch.Tensor,
+    grad_g1: torch.Tensor,
     communicator: torch.Tensor,
     nlocal: torch.Tensor,
     nghost: torch.Tensor,
 ) -> torch.Tensor:
-    return torch.empty_like(g1)
+    return torch.empty_like(grad_g1)
+
+
+# ---------------------------------------------------------------------------
+# Autograd: route the forward op's backward through the backward op so
+# ``make_fx`` tracing through ``torch.autograd.grad`` records both as
+# opaque external calls.
+# ---------------------------------------------------------------------------
 
 
 def _border_op_setup_context(
@@ -146,7 +136,7 @@ def _border_op_backward(
     (sendlist, sendproc, recvproc, sendnum, recvnum, communicator, nlocal, nghost) = (
         ctx.saved_tensors
     )
-    grad_in = torch.ops.deepmd.border_op_backward(
+    grad_in = torch.ops.deepmd_export.border_op_backward(
         sendlist,
         sendproc,
         recvproc,
@@ -157,22 +147,21 @@ def _border_op_backward(
         nlocal,
         nghost,
     )
-    # Same aliasing concern as forward: the C++ backward returns the same
-    # tensor object it modified; clone before handing back to autograd.
     return (
         None,
         None,
         None,
         None,
         None,  # sendlist..recvnum
-        grad_in.clone(),  # g1
+        grad_in,  # g1
         None,
         None,
         None,  # communicator, nlocal, nghost
     )
 
 
-border_op_export.register_autograd(
+torch.library.register_autograd(
+    "deepmd_export::border_op",
     _border_op_backward,
     setup_context=_border_op_setup_context,
 )
diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index 74fbe67111..fe5fe7f318 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -85,6 +85,11 @@ def _has_message_passing(model: torch.nn.Module) -> bool:
     compiled.  Non-GNN descriptors (se_e2_a, se_r, se_t, se_t_tebd,
     DPA1, hybrid-of-non-GNN) need only the regular artifact.
 
+    Additional gate: ``use_loc_mapping=True`` GNN models (the default
+    for DPA3) keep nlist in local-only indexing, so per-layer ghost
+    exchange is meaningless — these get only the regular artifact.
+    Multi-rank LAMMPS for GNN requires use_loc_mapping=False.
+
     Returns False if the descriptor's ``has_message_passing()`` query
     cannot be answered (e.g. linear/zbl/frozen models without a single
     descriptor) — those are assumed local.
@@ -93,12 +98,23 @@ def _has_message_passing(model: torch.nn.Module) -> bool:
         descriptor = model.atomic_model.descriptor
     except AttributeError:
         return False
-    if hasattr(descriptor, "has_message_passing"):
-        try:
-            return bool(descriptor.has_message_passing())
-        except (AttributeError, NotImplementedError):
+    if not hasattr(descriptor, "has_message_passing"):
+        return False
+    try:
+        if not descriptor.has_message_passing():
+            return False
+    except (AttributeError, NotImplementedError):
+        return False
+    # Walk into the GNN block (repflows / repformers) to inspect
+    # ``use_loc_mapping``. The attribute lives on the block, not on the
+    # top-level descriptor wrapper.
+    for attr in ("repflows", "repformers"):
+        block = getattr(descriptor, attr, None)
+        if block is None:
+            continue
+        if getattr(block, "use_loc_mapping", False):
             return False
-    return False
+    return True
 
 
 # Module-level cache for the trace-time sendlist buffer. The pointer
diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index dbcfe0e163..020566de23 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -62,6 +62,13 @@ void DeepPotPTExpt::init(const std::string& model,
     return;
   }
 
+  // Load libdeepmd_op_pt.so so its TORCH_LIBRARY_FRAGMENT entries
+  // (deepmd::*, deepmd_export::*) are visible to torch's dispatcher
+  // before the AOTI module loads.  Without this, multi-rank GNN .pt2
+  // archives fail at pair_style time with
+  // ``Could not find schema for deepmd_export::border_op``.
+  deepmd::load_op_library();
+
   if (!file_content.empty()) {
     throw deepmd::deepmd_exception(
         "In-memory file_content loading is not supported for .pt2 models. "
diff --git a/source/api_cc/src/DeepSpinPTExpt.cc b/source/api_cc/src/DeepSpinPTExpt.cc
index ed95018e4c..e16991d884 100644
--- a/source/api_cc/src/DeepSpinPTExpt.cc
+++ b/source/api_cc/src/DeepSpinPTExpt.cc
@@ -62,6 +62,11 @@ void DeepSpinPTExpt::init(const std::string& model,
     return;
   }
 
+  // Load libdeepmd_op_pt.so so deepmd_export::* schemas are visible
+  // to torch's dispatcher before the AOTI module loads.  See
+  // DeepPotPTExpt::init for the full rationale.
+  deepmd::load_op_library();
+
   if (!file_content.empty()) {
     throw deepmd::deepmd_exception(
         "In-memory file_content loading is not supported for .pt2 models. "
diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
new file mode 100644
index 0000000000..29b103cf01
--- /dev/null
+++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Multi-rank LAMMPS driver for DPA3 .pt2 (Phase 5 of GNN MPI).
+
+Run via ``mpirun -n 2 python run_mpi_pair_deepmd_dpa3_pt2.py DATAFILE PB_FILE OUTPUT``.
+Mirrors ``run_mpi_pair_deepmd.py`` but targets a GNN model whose .pt2 archive
+carries the with-comm artifact (Phase 3 dual-artifact layout). The C++
+``DeepPotPTExpt`` (Phase 4) routes to the with-comm artifact when LAMMPS
+reports nswap > 0 (multi-rank), driving MPI ghost-atom exchange via
+``deepmd_export::border_op`` per layer.
+
+Rank 0 writes potential energy + per-atom forces to ``OUTPUT`` so the parent
+pytest process can compare against the single-rank reference.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import argparse
+
+import numpy as np
+from lammps import (
+    PyLammps,
+)
+from mpi4py import (
+    MPI,
+)
+
+comm = MPI.COMM_WORLD
+rank = comm.Get_rank()
+
+parser = argparse.ArgumentParser()
+parser.add_argument("DATAFILE", type=str, help="LAMMPS data file (atom positions)")
+parser.add_argument("PB_FILE", type=str, help=".pt2 model file")
+parser.add_argument("OUTPUT", type=str, help="Output file for energies + forces")
+args = parser.parse_args()
+
+lammps = PyLammps()
+# Force a non-trivial domain decomposition: 2 x 1 x 1 across ranks.
+# Combined with the simulation box this guarantees nswap > 0 on the C++
+# side, so DeepPotPTExpt routes to the with-comm AOTI artifact.
+lammps.processors("2 1 1")
+lammps.units("metal")
+lammps.boundary("p p p")
+lammps.atom_style("atomic")
+lammps.neighbor("2.0 bin")
+lammps.neigh_modify("every 10 delay 0 check no")
+lammps.read_data(args.DATAFILE)
+lammps.mass("1 16")
+lammps.mass("2 2")
+lammps.timestep(0.0005)
+lammps.fix("1 all nve")
+
+lammps.pair_style(f"deepmd {args.PB_FILE}")
+lammps.pair_coeff("* *")
+lammps.run(0)
+
+# Forces need to be gathered across ranks. PyLammps's ``atoms[i]``
+# only exposes rank-local atoms; ``gather_atoms`` returns the global,
+# id-ordered array on every rank.
+forces_global = lammps.lmp.gather_atoms("f", 1, 3)
+# ``PyLammps.eval`` is rank-0-only.
+if rank == 0:
+    pe_global = lammps.eval("pe")
+    natoms = lammps.atoms.natoms
+    forces = np.array(forces_global, dtype=np.float64).reshape(natoms, 3)
+    with open(args.OUTPUT, "w") as f:
+        f.write(f"{pe_global:.16e}\n")
+        for row in forces:
+            f.write(" ".join(f"{v:.16e}" for v in row) + "\n")
+
+MPI.Finalize()
diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py
index 7ce05f9a2d..73b3ea1442 100644
--- a/source/lmp/tests/test_lammps_dpa3_pt2.py
+++ b/source/lmp/tests/test_lammps_dpa3_pt2.py
@@ -5,7 +5,12 @@
 Reference values from source/tests/infer/gen_dpa3.py / C++ test.
 """
 
+import importlib.util
 import os
+import shutil
+import subprocess as sp
+import sys
+import tempfile
 from pathlib import (
     Path,
 )
@@ -21,6 +26,12 @@
 )
 
 pb_file = Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot_dpa3.pt2"
+# Multi-rank-capable variant (use_loc_mapping=False; carries the
+# nested forward_lower_with_comm.pt2 artifact). Produced alongside
+# deeppot_dpa3.pt2 by source/tests/infer/gen_dpa3.py.
+pb_file_mpi = (
+    Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot_dpa3_mpi.pt2"
+)
 data_file = Path(__file__).parent / "data_dpa3_pt2.lmp"
 data_file_si = Path(__file__).parent / "data_dpa3_pt2.si"
 data_type_map_file = Path(__file__).parent / "data_type_map_dpa3_pt2.lmp"
@@ -315,3 +326,69 @@ def test_pair_deepmd_si(lammps_si) -> None:
             expected_f[lammps_si.atoms[ii].id - 1] * constants.force_metal2si
         )
     lammps_si.run(1)
+
+
+# ---------------------------------------------------------------------------
+# Multi-rank test (Phase 5 of GNN MPI)
+#
+# Drives the .pt2 model under ``mpirun -n 2`` so the C++ ``DeepPotPTExpt``
+# routes to the with-comm AOTI artifact (Phase 4) and ``border_op`` does
+# real MPI ghost exchange between two ranks.  The expected energy/forces
+# are the same as the single-rank reference (single-rank LAMMPS would
+# need ``atom_modify map yes`` to use the regular artifact; multi-rank
+# uses the with-comm artifact whose graph reproduces the gather via
+# MPI exchange).
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa3() -> None:
+    """Multi-rank LAMMPS run for DPA3 .pt2 must match the single-rank
+    reference within numerical tolerance.
+
+    Requires the .pt2 archive to carry a with-comm artifact (Phase 3
+    output for GNN models).  If the archive lacks it, the C++ falls
+    back to the regular artifact and produces wrong cross-rank values
+    — which the assertion would catch (loud test failure, not silent).
+    """
+    with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f:
+        out_path = f.name
+    try:
+        sp.check_call(
+            [
+                "mpirun",
+                "-n",
+                "2",
+                sys.executable,
+                str(Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"),
+                str(data_file.resolve()),
+                str(pb_file_mpi.resolve()),
+                out_path,
+            ]
+        )
+        with open(out_path) as fh:
+            lines = fh.read().strip().splitlines()
+        pe_mpi = float(lines[0])
+        forces_mpi = np.array(
+            [list(map(float, line.split())) for line in lines[1:]],
+            dtype=np.float64,
+        )
+        # Energy matches single-rank reference.
+        assert pe_mpi == pytest.approx(expected_e)
+        # Per-atom forces match (atoms in id-sorted order from the
+        # subprocess script).
+        for ii in range(6):
+            np.testing.assert_allclose(
+                forces_mpi[ii],
+                expected_f[ii],
+                atol=1e-8,
+                rtol=0,
+            )
+    finally:
+        if os.path.exists(out_path):
+            os.remove(out_path)
diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc
index 9dd9b50c3b..3bb7516155 100644
--- a/source/op/pt/comm.cc
+++ b/source/op/pt/comm.cc
@@ -452,3 +452,85 @@ TORCH_LIBRARY_FRAGMENT(deepmd, m) {
   m.def("border_op", border_op);
   m.def("border_op_backward", border_op_backward);
 }
+
+// ============================================================================
+// Opaque wrappers for the pt_expt (.pt2 / AOTInductor) export path.
+//
+// ``deepmd::border_op`` and ``deepmd::border_op_backward`` are registered
+// without an explicit dispatch key, which makes them
+// ``CompositeImplicitAutograd`` ops.  ``torch.export`` decomposes such ops
+// during tracing — i.e., it tries to inline the C++ kernel — and that
+// fails because the kernel calls ``data_ptr()`` on FakeTensors.
+//
+// These ``deepmd_export::*`` wrappers are registered with explicit
+// ``CPU`` and ``CUDA`` dispatch keys so ``torch.export`` records them as
+// opaque external calls in the graph.  The .pt2 archive embeds the call
+// sites; at runtime the dispatcher routes back to the underlying
+// ``deepmd::*`` op.  Both wrappers clone their result because the
+// underlying op returns the same tensor it modified in place, which
+// violates AOTInductor's no-aliasing rule for graph outputs.
+//
+// Python (``deepmd/pt_expt/utils/comm.py``) layers ``register_fake`` and
+// ``register_autograd`` on top of these C++-defined ops so traced graphs
+// can run their fake/backward.
+// ============================================================================
+
+namespace {
+torch::Tensor border_op_export(const torch::Tensor& sendlist_tensor,
+                               const torch::Tensor& sendproc_tensor,
+                               const torch::Tensor& recvproc_tensor,
+                               const torch::Tensor& sendnum_tensor,
+                               const torch::Tensor& recvnum_tensor,
+                               const torch::Tensor& g1_tensor,
+                               const torch::Tensor& communicator_tensor,
+                               const torch::Tensor& nlocal_tensor,
+                               const torch::Tensor& nghost_tensor) {
+  auto out = border_op(sendlist_tensor, sendproc_tensor, recvproc_tensor,
+                       sendnum_tensor, recvnum_tensor, g1_tensor,
+                       communicator_tensor, nlocal_tensor, nghost_tensor);
+  // border_op returns {g1_tensor} — a list whose first element aliases
+  // g1_tensor. Clone for AOTI graph-output correctness.
+  return out.empty() ? torch::empty_like(g1_tensor) : out[0].clone();
+}
+
+torch::Tensor border_op_backward_export(
+    const torch::Tensor& sendlist_tensor,
+    const torch::Tensor& sendproc_tensor,
+    const torch::Tensor& recvproc_tensor,
+    const torch::Tensor& sendnum_tensor,
+    const torch::Tensor& recvnum_tensor,
+    const torch::Tensor& grad_g1,
+    const torch::Tensor& communicator_tensor,
+    const torch::Tensor& nlocal_tensor,
+    const torch::Tensor& nghost_tensor) {
+  return border_op_backward(sendlist_tensor, sendproc_tensor, recvproc_tensor,
+                            sendnum_tensor, recvnum_tensor, grad_g1,
+                            communicator_tensor, nlocal_tensor, nghost_tensor)
+      .clone();
+}
+}  // namespace
+
+TORCH_LIBRARY_FRAGMENT(deepmd_export, m) {
+  m.def(
+      "border_op(Tensor sendlist, Tensor sendproc, Tensor recvproc, "
+      "Tensor sendnum, Tensor recvnum, Tensor g1, Tensor communicator, "
+      "Tensor nlocal, Tensor nghost) -> Tensor");
+  m.def(
+      "border_op_backward(Tensor sendlist, Tensor sendproc, Tensor recvproc, "
+      "Tensor sendnum, Tensor recvnum, Tensor grad_g1, Tensor communicator, "
+      "Tensor nlocal, Tensor nghost) -> Tensor");
+}
+
+// Register CPU + CUDA implementations under explicit dispatch keys so
+// torch.export sees opaque external calls (vs CompositeImplicitAutograd
+// which gets decomposed during trace).
+TORCH_LIBRARY_IMPL(deepmd_export, CPU, m) {
+  m.impl("border_op", border_op_export);
+  m.impl("border_op_backward", border_op_backward_export);
+}
+#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
+TORCH_LIBRARY_IMPL(deepmd_export, CUDA, m) {
+  m.impl("border_op", border_op_export);
+  m.impl("border_op_backward", border_op_backward_export);
+}
+#endif
diff --git a/source/tests/infer/gen_dpa3.py b/source/tests/infer/gen_dpa3.py
index 322163462d..69a6757d0e 100644
--- a/source/tests/infer/gen_dpa3.py
+++ b/source/tests/infer/gen_dpa3.py
@@ -88,6 +88,24 @@ def main():
     print(f"Exporting to {pt2_path} ...")  # noqa: T201
     pt_expt_deserialize_to_file(pt2_path, copy.deepcopy(data))
 
+    # Multi-rank LAMMPS variant (use_loc_mapping=False) — produces a
+    # dual-artifact .pt2 with the with-comm AOTI module nested inside
+    # so the C++ DeepPotPTExpt routes to it under mpirun.  See
+    # source/lmp/tests/test_lammps_dpa3_pt2.py::test_pair_deepmd_mpi_dpa3.
+    config_mpi = copy.deepcopy(config)
+    config_mpi["descriptor"]["use_loc_mapping"] = False
+    model_mpi = get_model(config_mpi)
+    data_mpi = {
+        "model": model_mpi.serialize(),
+        "model_def_script": config_mpi,
+        "backend": "dpmodel",
+        "software": "deepmd-kit",
+        "version": "3.0.0",
+    }
+    pt2_mpi_path = os.path.join(base_dir, "deeppot_dpa3_mpi.pt2")
+    print(f"Exporting to {pt2_mpi_path} ...")  # noqa: T201
+    pt_expt_deserialize_to_file(pt2_mpi_path, copy.deepcopy(data_mpi))
+
     pth_path = os.path.join(base_dir, "deeppot_dpa3.pth")
     print(f"Exporting to {pth_path} ...")  # noqa: T201
     try:

From 8b2501dbb1e8db7dd8f9fa5c965293c74c9e0e71 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 26 Apr 2026 17:43:03 +0800
Subject: [PATCH 10/34] test(gnn-mpi): expand multi-rank coverage; address
 Phase 5 follow-up gaps

Three fixes targeting the limitations from the previous Phase 5 commit:

1. NULL-type atoms (build_comm_dict_with_virtual_atoms equivalent).
   When ``select_real_atoms_coord`` filters atoms with atype < 0,
   the LAMMPS-supplied sendlist still indexes the original atom
   array. ``DeepPotPTExpt::compute`` (and Spin) now check
   ``has_null_atoms = (nall_real < nall)`` and route to the new
   ``build_comm_tensors_positional_with_virtual_atoms`` helper in
   commonPTExpt.h, which calls ``remap_comm_sendlist`` to translate
   indices through ``fwd_map`` (mirrors what
   ``commonPT.h::build_comm_dict_with_virtual_atoms`` does for the
   torch.jit pt-backend). Untested numerically (no test fixture
   produces NULL-type atoms in multi-rank); code path is structurally
   identical to the validated pt-backend equivalent.

2. nlist-rebuild test (test_pair_deepmd_mpi_dpa3_nlist_rebuild).
   Runs 50 MD steps under mpirun -n 2 with neigh_modify every=10,
   forcing >=5 neighbor-list rebuilds. Validates the with-comm
   dispatch path stays consistent across rebuilds (the comm tensors
   are reconstructed when ``ago == 0`` triggers). Asserts forces
   stay finite and bounded; no exact-value comparison since round-
   off accumulates over the trajectory and cross-rank ordering can
   shift the LSBs.

3. Spin multi-rank dispatch wiring (DeepSpinPTExpt::compute).
   Same has_null_atoms branch as DeepPotPTExpt. Code path
   structurally identical to the validated DeepPotPTExpt path; no
   spin-specific multi-rank test yet (would need a spin DPA3 .pt2
   with use_loc_mapping=False to exercise it end-to-end).

Note: a virial check via LAMMPS ``compute pressure NULL virial`` caused
a PyLammps multi-rank deadlock and is deferred to a follow-up. Forces
ARE the autograd output of energy through the with-comm graph, so
force parity already validates the with-comm backward path.

All 26 pt_expt LAMMPS tests pass (including the new multi-rank
ones); 9 model_devi_pt2 tests confirm DeepPotModelDevi delegates
correctly through the dispatch.
---
 source/api_cc/src/DeepPotPTExpt.cc            |  21 +++-
 source/api_cc/src/DeepSpinPTExpt.cc           |  19 ++-
 source/api_cc/src/commonPTExpt.h              |  53 +++++---
 .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py |  15 +++
 source/lmp/tests/test_lammps_dpa3_pt2.py      | 116 ++++++++++++------
 5 files changed, 162 insertions(+), 62 deletions(-)

diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index 020566de23..711061f6bf 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -5,6 +5,7 @@
 #include <torch/csrc/inductor/aoti_package/model_package_loader.h>
 
 #include <algorithm>
+#include <numeric>
 #include <cstdint>
 #include <fstream>
 #include <map>
@@ -396,10 +397,24 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
   // tensor to gather ghost embeddings from local atoms.
   std::vector<torch::Tensor> flat_outputs;
   bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0;
+  // When NULL-type atoms exist, remapped storage must outlive comm
+  // tensors (the int** pointer-array tensor references it).
+  std::vector<std::vector<int>> remapped_sendlist;
+  std::vector<int*> remapped_sendlist_ptrs;
+  std::vector<int> remapped_sendnum, remapped_recvnum;
   if (use_with_comm) {
-    auto comm_tensors = deepmd::ptexpt::build_comm_tensors_positional(
-        lmp_list, lmp_list.sendlist, lmp_list.sendnum, lmp_list.recvnum, nloc,
-        nghost_real);
+    bool has_null_atoms = (nall_real < nall);
+    std::vector<at::Tensor> comm_tensors;
+    if (has_null_atoms) {
+      comm_tensors =
+          deepmd::ptexpt::build_comm_tensors_positional_with_virtual_atoms(
+              lmp_list, fwd_map, nloc, nghost_real, remapped_sendlist,
+              remapped_sendlist_ptrs, remapped_sendnum, remapped_recvnum);
+    } else {
+      comm_tensors = deepmd::ptexpt::build_comm_tensors_positional(
+          lmp_list, lmp_list.sendlist, lmp_list.sendnum, lmp_list.recvnum,
+          nloc, nghost_real);
+    }
     flat_outputs = run_model_with_comm(
         coord_Tensor, atype_Tensor, firstneigh_tensor, mapping_tensor,
         fparam_tensor, aparam_tensor, comm_tensors);
diff --git a/source/api_cc/src/DeepSpinPTExpt.cc b/source/api_cc/src/DeepSpinPTExpt.cc
index e16991d884..cf714ea79e 100644
--- a/source/api_cc/src/DeepSpinPTExpt.cc
+++ b/source/api_cc/src/DeepSpinPTExpt.cc
@@ -7,6 +7,7 @@
 #include <algorithm>
 #include <cstdint>
 #include <fstream>
+#include <numeric>
 #include <map>
 #include <sstream>
 
@@ -414,10 +415,22 @@ void DeepSpinPTExpt::compute(ENERGYVTYPE& ener,
   // (pre atom-doubling); the spin override halves them internally.
   std::vector<torch::Tensor> flat_outputs;
   bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0;
+  std::vector<std::vector<int>> remapped_sendlist;
+  std::vector<int*> remapped_sendlist_ptrs;
+  std::vector<int> remapped_sendnum, remapped_recvnum;
   if (use_with_comm) {
-    auto comm_tensors = deepmd::ptexpt::build_comm_tensors_positional(
-        lmp_list, lmp_list.sendlist, lmp_list.sendnum, lmp_list.recvnum, nloc,
-        nghost_real);
+    bool has_null_atoms = (nall_real < nall);
+    std::vector<at::Tensor> comm_tensors;
+    if (has_null_atoms) {
+      comm_tensors =
+          deepmd::ptexpt::build_comm_tensors_positional_with_virtual_atoms(
+              lmp_list, fwd_map, nloc, nghost_real, remapped_sendlist,
+              remapped_sendlist_ptrs, remapped_sendnum, remapped_recvnum);
+    } else {
+      comm_tensors = deepmd::ptexpt::build_comm_tensors_positional(
+          lmp_list, lmp_list.sendlist, lmp_list.sendnum, lmp_list.recvnum,
+          nloc, nghost_real);
+    }
     flat_outputs = run_model_with_comm(
         coord_Tensor, atype_Tensor, spin_Tensor, firstneigh_tensor,
         mapping_tensor, fparam_tensor, aparam_tensor, comm_tensors);
diff --git a/source/api_cc/src/commonPTExpt.h b/source/api_cc/src/commonPTExpt.h
index dcaaddd6ea..20ffe10781 100644
--- a/source/api_cc/src/commonPTExpt.h
+++ b/source/api_cc/src/commonPTExpt.h
@@ -18,6 +18,7 @@
 #include <string>
 #include <vector>
 
+#include "common.h"  // for remap_comm_sendlist
 #include "errors.h"
 #include "neighbor_list.h"
 
@@ -649,21 +650,9 @@ class TempFile {
 // ============================================================================
 
 /**
- * @brief Build the 8 comm-tensor positional inputs from LAMMPS data.
- *
- * Tensors share storage with the LAMMPS-owned buffers (no copy);
- * the caller must keep ``lmp_list``, ``sendlist``, ``sendnum``, and
- * ``recvnum`` alive until ``loader->run`` returns.  ``nlocal`` /
- * ``nghost`` are produced via ``torch::tensor`` (small allocation).
- *
- * @param lmp_list    LAMMPS neighbor list (provides nswap, sendproc,
- *                    recvproc, world).
- * @param sendlist    int** pointer-array (already remapped if needed).
- * @param sendnum     int* per-swap send counts (already remapped).
- * @param recvnum     int* per-swap recv counts (already remapped).
- * @param nlocal      Number of local atoms (per-rank).
- * @param nghost      Number of ghost atoms (per-rank).
- * @return Vector of 8 tensors in canonical positional order.
+ * @brief Build the 8 comm-tensor positional inputs from LAMMPS data
+ * (Phase 5 working signature, restored after the consolidation
+ * attempt regressed).
  */
 inline std::vector<at::Tensor> build_comm_tensors_positional(
     const InputNlist& lmp_list,
@@ -678,9 +667,6 @@ inline std::vector<at::Tensor> build_comm_tensors_positional(
   auto int64_option =
       torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64);
 
-  // sendlist is int**: nswap entries each holding an int* pointer.
-  // Reinterpret as int64 for tensor packaging (matches what pt's
-  // build_comm_dict does and what border_op expects).
   at::Tensor sendlist_tensor =
       torch::from_blob(static_cast<void*>(sendlist), {nswap}, int64_option);
   at::Tensor sendproc_tensor =
@@ -690,7 +676,6 @@ inline std::vector<at::Tensor> build_comm_tensors_positional(
   at::Tensor sendnum_tensor = torch::from_blob(sendnum, {nswap}, int32_option);
   at::Tensor recvnum_tensor = torch::from_blob(recvnum, {nswap}, int32_option);
 
-  // MPI communicator handle as a 1-element int64 tensor.
   static std::int64_t null_communicator = 0;
   at::Tensor communicator_tensor;
   if (lmp_list.world == nullptr) {
@@ -701,7 +686,6 @@ inline std::vector<at::Tensor> build_comm_tensors_positional(
         torch::from_blob(const_cast<void*>(lmp_list.world), {1}, int64_option);
   }
 
-  // Scalar nlocal / nghost — int32 to match Python-side tracing.
   at::Tensor nlocal_tensor = torch::tensor(nlocal, int32_option);
   at::Tensor nghost_tensor = torch::tensor(nghost, int32_option);
 
@@ -709,5 +693,34 @@ inline std::vector<at::Tensor> build_comm_tensors_positional(
           recvnum_tensor,  communicator_tensor, nlocal_tensor,   nghost_tensor};
 }
 
+/**
+ * @brief Build the 8 comm-tensor positional inputs with NULL-type-atom
+ * remapping.  When ``select_real_atoms_coord`` filters atoms (atype <
+ * 0), ``fwd_map`` translates original sendlist indices into real-atom
+ * indices (with ``-1`` for filtered).  Mirrors
+ * ``commonPT.h::build_comm_dict_with_virtual_atoms``.  The remapped
+ * storage must outlive the returned tensors.
+ */
+inline std::vector<at::Tensor> build_comm_tensors_positional_with_virtual_atoms(
+    const InputNlist& lmp_list,
+    const std::vector<int>& fwd_map,
+    int nlocal,
+    int nghost,
+    std::vector<std::vector<int>>& remapped_sendlist,
+    std::vector<int*>& remapped_sendlist_ptrs,
+    std::vector<int>& remapped_sendnum,
+    std::vector<int>& remapped_recvnum) {
+  remap_comm_sendlist(remapped_sendlist, remapped_sendnum, remapped_recvnum,
+                      lmp_list, fwd_map);
+  int nswap = lmp_list.nswap;
+  remapped_sendlist_ptrs.resize(nswap);
+  for (int s = 0; s < nswap; ++s) {
+    remapped_sendlist_ptrs[s] = remapped_sendlist[s].data();
+  }
+  return build_comm_tensors_positional(
+      lmp_list, remapped_sendlist_ptrs.data(), remapped_sendnum.data(),
+      remapped_recvnum.data(), nlocal, nghost);
+}
+
 }  // namespace ptexpt
 }  // namespace deepmd
diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
index 29b103cf01..1d593882bd 100644
--- a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
+++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
@@ -33,6 +33,14 @@
 parser.add_argument("DATAFILE", type=str, help="LAMMPS data file (atom positions)")
 parser.add_argument("PB_FILE", type=str, help=".pt2 model file")
 parser.add_argument("OUTPUT", type=str, help="Output file for energies + forces")
+parser.add_argument(
+    "--nsteps",
+    type=int,
+    default=0,
+    help="Number of MD steps to run after the initial force evaluation; "
+    "with --nsteps > 10 (LAMMPS neigh_modify every=10) the dispatch path "
+    "is exercised across at least one neighbor-list rebuild.",
+)
 args = parser.parse_args()
 
 lammps = PyLammps()
@@ -55,6 +63,13 @@
 lammps.pair_coeff("* *")
 lammps.run(0)
 
+# Optional: run additional MD steps to exercise the with-comm
+# dispatch across neighbor-list rebuilds (LAMMPS rebuilds every
+# 10 steps with our neigh_modify config, so any nsteps >= 10
+# triggers at least one rebuild).
+if args.nsteps > 0:
+    lammps.run(args.nsteps)
+
 # Forces need to be gathered across ranks. PyLammps's ``atoms[i]``
 # only exposes rank-local atoms; ``gather_atoms`` returns the global,
 # id-ordered array on every rank.
diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py
index 73b3ea1442..7b33c64b75 100644
--- a/source/lmp/tests/test_lammps_dpa3_pt2.py
+++ b/source/lmp/tests/test_lammps_dpa3_pt2.py
@@ -341,6 +341,38 @@ def test_pair_deepmd_si(lammps_si) -> None:
 # ---------------------------------------------------------------------------
 
 
+def _run_mpi_subprocess(extra_args: list[str] | None = None) -> dict:
+    """Helper: invoke run_mpi_pair_deepmd_dpa3_pt2.py under mpirun -n 2,
+    return ``{"pe": float, "forces": (n, 3) array}``."""
+    with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f:
+        out_path = f.name
+    try:
+        argv = [
+            "mpirun",
+            "-n",
+            "2",
+            sys.executable,
+            str(Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"),
+            str(data_file.resolve()),
+            str(pb_file_mpi.resolve()),
+            out_path,
+        ]
+        if extra_args:
+            argv.extend(extra_args)
+        sp.check_call(argv)
+        with open(out_path) as fh:
+            lines = fh.read().strip().splitlines()
+        pe = float(lines[0])
+        forces = np.array(
+            [list(map(float, line.split())) for line in lines[1:]],
+            dtype=np.float64,
+        )
+        return {"pe": pe, "forces": forces}
+    finally:
+        if os.path.exists(out_path):
+            os.remove(out_path)
+
+
 @pytest.mark.skipif(
     shutil.which("mpirun") is None, reason="MPI is not installed on this system"
 )
@@ -349,46 +381,58 @@ def test_pair_deepmd_si(lammps_si) -> None:
 )
 def test_pair_deepmd_mpi_dpa3() -> None:
     """Multi-rank LAMMPS run for DPA3 .pt2 must match the single-rank
-    reference within numerical tolerance.
+    reference within numerical tolerance for energy and forces.
+
+    Forces are the autograd output of energy through the with-comm
+    graph, so they implicitly validate the backward path of
+    ``deepmd_export::border_op``.  Virial requires a separate
+    ``compute pressure NULL virial`` which interacts poorly with
+    PyLammps multi-rank (hangs); deferred to a follow-up.
 
     Requires the .pt2 archive to carry a with-comm artifact (Phase 3
     output for GNN models).  If the archive lacks it, the C++ falls
     back to the regular artifact and produces wrong cross-rank values
     — which the assertion would catch (loud test failure, not silent).
     """
-    with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f:
-        out_path = f.name
-    try:
-        sp.check_call(
-            [
-                "mpirun",
-                "-n",
-                "2",
-                sys.executable,
-                str(Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"),
-                str(data_file.resolve()),
-                str(pb_file_mpi.resolve()),
-                out_path,
-            ]
-        )
-        with open(out_path) as fh:
-            lines = fh.read().strip().splitlines()
-        pe_mpi = float(lines[0])
-        forces_mpi = np.array(
-            [list(map(float, line.split())) for line in lines[1:]],
-            dtype=np.float64,
+    out = _run_mpi_subprocess()
+    # Energy matches single-rank reference.
+    assert out["pe"] == pytest.approx(expected_e)
+    # Per-atom forces match (atoms in id-sorted order from the
+    # subprocess script).
+    for ii in range(6):
+        np.testing.assert_allclose(
+            out["forces"][ii],
+            expected_f[ii],
+            atol=1e-8,
+            rtol=0,
         )
-        # Energy matches single-rank reference.
-        assert pe_mpi == pytest.approx(expected_e)
-        # Per-atom forces match (atoms in id-sorted order from the
-        # subprocess script).
-        for ii in range(6):
-            np.testing.assert_allclose(
-                forces_mpi[ii],
-                expected_f[ii],
-                atol=1e-8,
-                rtol=0,
-            )
-    finally:
-        if os.path.exists(out_path):
-            os.remove(out_path)
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa3_nlist_rebuild() -> None:
+    """Multi-rank with neighbor-list rebuilds.
+
+    Runs ~50 MD steps with ``neigh_modify every 10 delay 0 check no``,
+    forcing at least 5 nlist rebuilds during the trajectory. The
+    purpose is NOT to validate exact final-state values (round-off
+    accumulates over MD steps and cross-rank ordering can shift the
+    LSBs) but to verify the with-comm dispatch path stays consistent
+    across rebuilds — i.e. ``DeepPotPTExpt::compute`` correctly
+    reconstructs the comm tensors when ``ago == 0`` triggers and the
+    AOTI graph keeps producing finite values.
+    """
+    out = _run_mpi_subprocess(extra_args=["--nsteps", "50"])
+    # Trajectory advanced; final state will differ from the run-0
+    # reference. Just sanity-check finite values + reasonable forces.
+    assert np.all(np.isfinite(out["forces"]))
+    assert np.isfinite(out["pe"])
+    # Force magnitudes shouldn't blow up; pick a generous bound for
+    # the small-box water-like 6-atom system.
+    assert np.max(np.abs(out["forces"])) < 100.0, (
+        f"forces exploded after 50 steps: max|f|={np.max(np.abs(out['forces']))}"
+    )

From c43bd8bc8b022ab182d13a7899795cb8c5ed068f Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 26 Apr 2026 21:22:35 +0800
Subject: [PATCH 11/34] test(gnn-mpi): tighten multi-rank LAMMPS test
 assertions

- run_mpi_pair_deepmd_dpa3_pt2.py: gather atom ids alongside forces and
  sort by id explicitly. Output ordering is now robust to subdomain
  layout, empty subdomains, or future LAMMPS gather_atoms changes.
  Add ``atom_modify map yes`` so that single-rank dispatch on the
  dual-artifact .pt2 works (the regular artifact needs the LAMMPS
  id->index map to build its ``mapping`` tensor); expose --processors
  so the runner can produce a same-archive single-rank reference.

- test_pair_deepmd_mpi_dpa3_nlist_rebuild: replace the finite/bounded
  sanity check with a value comparison against a single-rank reference
  of the same trajectory (mpirun -n 1, processors "1 1 1"). 25 MD steps
  cross two nlist rebuilds, atol=1e-6 forces / rel=1e-8 energy. This
  catches a wrong-but-finite force from a dispatch bug that the
  previous assertion would have missed.
---
 .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py | 33 ++++++++--
 source/lmp/tests/test_lammps_dpa3_pt2.py      | 61 ++++++++++++-------
 2 files changed, 66 insertions(+), 28 deletions(-)

diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
index 1d593882bd..fa536b5c6f 100644
--- a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
+++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
@@ -41,16 +41,31 @@
     "with --nsteps > 10 (LAMMPS neigh_modify every=10) the dispatch path "
     "is exercised across at least one neighbor-list rebuild.",
 )
+parser.add_argument(
+    "--processors",
+    type=str,
+    default="2 1 1",
+    help="LAMMPS processors grid. Default '2 1 1' forces multi-rank "
+    "domain decomposition (nswap>0). Pass '1 1 1' for a single-rank "
+    "reference run on the same archive (single-artifact dispatch).",
+)
 args = parser.parse_args()
 
 lammps = PyLammps()
-# Force a non-trivial domain decomposition: 2 x 1 x 1 across ranks.
-# Combined with the simulation box this guarantees nswap > 0 on the C++
-# side, so DeepPotPTExpt routes to the with-comm AOTI artifact.
-lammps.processors("2 1 1")
+# Force the requested domain decomposition. The default "2 1 1"
+# combined with the simulation box guarantees nswap > 0 on the C++
+# side, so DeepPotPTExpt routes to the with-comm AOTI artifact. Pass
+# "1 1 1" to obtain a single-rank reference using the same archive
+# (the regular artifact handles nswap==0).
+lammps.processors(args.processors)
 lammps.units("metal")
 lammps.boundary("p p p")
 lammps.atom_style("atomic")
+# ``atom_modify map yes`` is required when single-rank dispatch goes
+# through the regular artifact of a use_loc_mapping=False .pt2: the
+# C++ side needs the LAMMPS global-id->local-index map to build the
+# ``mapping`` tensor. It is harmless under multi-rank.
+lammps.atom_modify("map yes")
 lammps.neighbor("2.0 bin")
 lammps.neigh_modify("every 10 delay 0 check no")
 lammps.read_data(args.DATAFILE)
@@ -72,13 +87,21 @@
 
 # Forces need to be gathered across ranks. PyLammps's ``atoms[i]``
 # only exposes rank-local atoms; ``gather_atoms`` returns the global,
-# id-ordered array on every rank.
+# id-ordered array on every rank. We also gather ``id`` and reorder
+# explicitly by id rather than trusting an implicit ordering — this
+# is robust against subdomain layout, empty subdomains, and any
+# future LAMMPS change in gather ordering.
 forces_global = lammps.lmp.gather_atoms("f", 1, 3)
+ids_global = lammps.lmp.gather_atoms("id", 0, 1)
 # ``PyLammps.eval`` is rank-0-only.
 if rank == 0:
     pe_global = lammps.eval("pe")
     natoms = lammps.atoms.natoms
     forces = np.array(forces_global, dtype=np.float64).reshape(natoms, 3)
+    ids = np.array(ids_global, dtype=np.int64).reshape(natoms)
+    # Sort by atom id so output is unambiguously id-ordered (id 1 first).
+    order = np.argsort(ids)
+    forces = forces[order]
     with open(args.OUTPUT, "w") as f:
         f.write(f"{pe_global:.16e}\n")
         for row in forces:
diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py
index 7b33c64b75..d14a46cea0 100644
--- a/source/lmp/tests/test_lammps_dpa3_pt2.py
+++ b/source/lmp/tests/test_lammps_dpa3_pt2.py
@@ -341,22 +341,33 @@ def test_pair_deepmd_si(lammps_si) -> None:
 # ---------------------------------------------------------------------------
 
 
-def _run_mpi_subprocess(extra_args: list[str] | None = None) -> dict:
-    """Helper: invoke run_mpi_pair_deepmd_dpa3_pt2.py under mpirun -n 2,
-    return ``{"pe": float, "forces": (n, 3) array}``."""
+def _run_mpi_subprocess(
+    extra_args: list[str] | None = None,
+    nprocs: int = 2,
+) -> dict:
+    """Helper: invoke run_mpi_pair_deepmd_dpa3_pt2.py under
+    ``mpirun -n <nprocs>`` and return ``{"pe": float, "forces": (n, 3) array}``.
+
+    With ``nprocs == 1`` the runner is invoked with ``--processors 1 1 1``
+    so the C++ side sees ``nswap == 0`` and routes to the regular
+    (single-rank) artifact of the dual-artifact .pt2 — useful as a
+    same-archive reference for multi-rank comparisons.
+    """
     with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f:
         out_path = f.name
     try:
         argv = [
             "mpirun",
             "-n",
-            "2",
+            str(nprocs),
             sys.executable,
             str(Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"),
             str(data_file.resolve()),
             str(pb_file_mpi.resolve()),
             out_path,
         ]
+        if nprocs == 1:
+            argv.extend(["--processors", "1 1 1"])
         if extra_args:
             argv.extend(extra_args)
         sp.check_call(argv)
@@ -415,24 +426,28 @@ def test_pair_deepmd_mpi_dpa3() -> None:
     importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
 )
 def test_pair_deepmd_mpi_dpa3_nlist_rebuild() -> None:
-    """Multi-rank with neighbor-list rebuilds.
-
-    Runs ~50 MD steps with ``neigh_modify every 10 delay 0 check no``,
-    forcing at least 5 nlist rebuilds during the trajectory. The
-    purpose is NOT to validate exact final-state values (round-off
-    accumulates over MD steps and cross-rank ordering can shift the
-    LSBs) but to verify the with-comm dispatch path stays consistent
-    across rebuilds — i.e. ``DeepPotPTExpt::compute`` correctly
-    reconstructs the comm tensors when ``ago == 0`` triggers and the
-    AOTI graph keeps producing finite values.
+    """Multi-rank with neighbor-list rebuilds, validated against a
+    single-rank reference of the same archive and trajectory.
+
+    Runs 25 MD steps with ``neigh_modify every 10 delay 0 check no``,
+    so the multi-rank trajectory crosses two nlist rebuilds (at steps
+    10 and 20) before the final force evaluation. The same trajectory
+    is then run under ``mpirun -n 1`` (regular-artifact dispatch on
+    the same dual-artifact .pt2) to obtain a reference; comparing the
+    two catches a wrong-but-finite force from a dispatch bug that the
+    previous finite/bounded check would miss.
+
+    NVE is deterministic up to floating-point summation order, so the
+    cross-rank divergence after 25 steps is bounded by accumulated
+    round-off — small for a 6-atom system but non-zero, hence the
+    relaxed (but still tight) tolerances.
     """
-    out = _run_mpi_subprocess(extra_args=["--nsteps", "50"])
-    # Trajectory advanced; final state will differ from the run-0
-    # reference. Just sanity-check finite values + reasonable forces.
-    assert np.all(np.isfinite(out["forces"]))
-    assert np.isfinite(out["pe"])
-    # Force magnitudes shouldn't blow up; pick a generous bound for
-    # the small-box water-like 6-atom system.
-    assert np.max(np.abs(out["forces"])) < 100.0, (
-        f"forces exploded after 50 steps: max|f|={np.max(np.abs(out['forces']))}"
+    out_mpi = _run_mpi_subprocess(extra_args=["--nsteps", "25"], nprocs=2)
+    out_ref = _run_mpi_subprocess(extra_args=["--nsteps", "25"], nprocs=1)
+    np.testing.assert_allclose(
+        out_mpi["forces"],
+        out_ref["forces"],
+        atol=1e-6,
+        rtol=1e-6,
     )
+    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10)

From 17064354511b6ab56e12d6f02dd97eecf7f8f83d Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 26 Apr 2026 22:17:27 +0800
Subject: [PATCH 12/34] fix(cc): handle empty subdomain in copy_from_nlist;
 expand MPI tests with virial

- common.cc: NeighborListData::copy_from_nlist used &ilist[0] /
  &jlist[ii][0] for the memcpy destination, which is OOB on an empty
  vector (libstdc++ debug-mode assertion) and undefined behaviour in
  general. Switch to .data() and skip the copy when the count is zero.
  Surfaced by the new empty-subdomain MPI test where rank 1 owns
  nloc=0 atoms; the same latent bug also applied to atoms with no
  neighbours.

- run_mpi_pair_deepmd_dpa3_pt2.py: also gather the per-atom virial
  via ``compute centroid/stress/atom NULL pair`` and
  ``lmp.gather("c_virial", 1, 9)``. Output rows are now (3 force) +
  (9 virial) per atom, id-sorted.

- test_lammps_dpa3_pt2.py:
  * test_pair_deepmd_mpi_dpa3 now asserts virial against expected_v
    (with the same column permutation as test_pair_deepmd_virial),
    closing the previous "virial multi-rank deferred" gap.
  * test_pair_deepmd_mpi_dpa3_nlist_rebuild now also compares virial
    between the multi-rank and single-rank reference runs.
  * New test_pair_deepmd_mpi_dpa3_empty_subdomain: 30 x 13 x 13 box
    in which every atom has its x coordinate in [0.25, 12.83]; under
    processors "2 1 1" the domain splits at x = 15, so rank 1 owns
    zero local atoms. Compares forces + virial + energy against a
    same-archive single-rank reference.
---
 source/api_cc/src/common.cc                   | 15 ++-
 .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py | 22 ++++-
 source/lmp/tests/test_lammps_dpa3_pt2.py      | 94 +++++++++++++++++--
 3 files changed, 114 insertions(+), 17 deletions(-)

diff --git a/source/api_cc/src/common.cc b/source/api_cc/src/common.cc
index 1ad1a5c97b..7154992892 100644
--- a/source/api_cc/src/common.cc
+++ b/source/api_cc/src/common.cc
@@ -276,13 +276,20 @@ void deepmd::NeighborListData::copy_from_nlist(const InputNlist& inlist,
   int inum = natoms >= 0 ? natoms : inlist.inum;
   ilist.resize(inum);
   jlist.resize(inum);
-  memcpy(&ilist[0], inlist.ilist, inum * sizeof(int));
+  // Guard against an empty subdomain (inum == 0): &ilist[0] on an
+  // empty vector is OOB under libstdc++ debug-mode and undefined
+  // behaviour in general. Use data() and skip the copy when empty.
+  if (inum > 0) {
+    memcpy(ilist.data(), inlist.ilist, inum * sizeof(int));
+  }
   for (int ii = 0; ii < inum; ++ii) {
     int jnum = inlist.numneigh[ii];
     jlist[ii].resize(jnum);
-    memcpy(&jlist[ii][0], inlist.firstneigh[ii], jnum * sizeof(int));
-    for (int jj = 0; jj < jnum; ++jj) {
-      jlist[ii][jj] &= inlist.mask;
+    if (jnum > 0) {
+      memcpy(jlist[ii].data(), inlist.firstneigh[ii], jnum * sizeof(int));
+      for (int jj = 0; jj < jnum; ++jj) {
+        jlist[ii][jj] &= inlist.mask;
+      }
     }
   }
 }
diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
index fa536b5c6f..d07af7a158 100644
--- a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
+++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
@@ -8,8 +8,10 @@
 reports nswap > 0 (multi-rank), driving MPI ghost-atom exchange via
 ``deepmd_export::border_op`` per layer.
 
-Rank 0 writes potential energy + per-atom forces to ``OUTPUT`` so the parent
-pytest process can compare against the single-rank reference.
+Rank 0 writes potential energy + per-atom forces (3 cols) + per-atom
+virial (9 cols, from ``compute centroid/stress/atom NULL pair`` in
+LAMMPS internal units) to ``OUTPUT`` so the parent pytest process can
+compare against the single-rank reference.
 """
 
 from __future__ import (
@@ -76,6 +78,12 @@
 
 lammps.pair_style(f"deepmd {args.PB_FILE}")
 lammps.pair_coeff("* *")
+# Per-atom virial from the pair contribution. ``centroid/stress/atom``
+# is parallel-safe (rank-local data, gathered below). LAMMPS computes
+# stress*volume per atom in internal units; the parent test reverses
+# the unit conversion (divide by ``constants.nktv2p``) before comparing
+# against the reference virial.
+lammps.compute("virial all centroid/stress/atom NULL pair")
 lammps.run(0)
 
 # Optional: run additional MD steps to exercise the with-comm
@@ -93,18 +101,26 @@
 # future LAMMPS change in gather ordering.
 forces_global = lammps.lmp.gather_atoms("f", 1, 3)
 ids_global = lammps.lmp.gather_atoms("id", 0, 1)
+# Gather the per-atom virial across ranks. ``lmp.gather`` accepts
+# named per-atom computes (``c_<id>``) and returns the global,
+# id-ordered array on every rank.
+virial_global = lammps.lmp.gather("c_virial", 1, 9)
 # ``PyLammps.eval`` is rank-0-only.
 if rank == 0:
     pe_global = lammps.eval("pe")
     natoms = lammps.atoms.natoms
     forces = np.array(forces_global, dtype=np.float64).reshape(natoms, 3)
+    virials = np.array(virial_global, dtype=np.float64).reshape(natoms, 9)
     ids = np.array(ids_global, dtype=np.int64).reshape(natoms)
     # Sort by atom id so output is unambiguously id-ordered (id 1 first).
     order = np.argsort(ids)
     forces = forces[order]
+    virials = virials[order]
     with open(args.OUTPUT, "w") as f:
         f.write(f"{pe_global:.16e}\n")
-        for row in forces:
+        # Each row: 3 force components followed by 9 virial components.
+        for fi, vi in zip(forces, virials, strict=True):
+            row = np.concatenate([fi, vi])
             f.write(" ".join(f"{v:.16e}" for v in row) + "\n")
 
 MPI.Finalize()
diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py
index d14a46cea0..84127246d6 100644
--- a/source/lmp/tests/test_lammps_dpa3_pt2.py
+++ b/source/lmp/tests/test_lammps_dpa3_pt2.py
@@ -35,6 +35,12 @@
 data_file = Path(__file__).parent / "data_dpa3_pt2.lmp"
 data_file_si = Path(__file__).parent / "data_dpa3_pt2.si"
 data_type_map_file = Path(__file__).parent / "data_type_map_dpa3_pt2.lmp"
+# Elongated-box variant for the empty-subdomain MPI test: x is
+# extended to 30 Å while atoms remain in x ∈ [0.25, 12.83]. Combined
+# with ``processors 2 1 1`` this leaves rank 1 (x ≥ 15) with zero
+# local atoms — a corner case the comm-dispatch path must handle
+# without crashing or producing wrong forces.
+data_file_empty_subdomain = Path(__file__).parent / "data_dpa3_pt2_empty_subdomain.lmp"
 
 # Reference values from gen_dpa3.py / test_deeppot_dpa3_ptexpt.cc (PBC)
 expected_ae = np.array(
@@ -158,10 +164,19 @@ def setup_module() -> None:
         type_OH,
         data_file_si,
     )
+    # Elongated x-axis; atoms unchanged. With ``processors 2 1 1`` the
+    # split is at x = 15 Å and rank 1 owns x ≥ 15, which is empty.
+    box_empty_subdomain = np.array([0, 30, 0, 13, 0, 13, 0, 0, 0])
+    write_lmp_data(box_empty_subdomain, coord, type_OH, data_file_empty_subdomain)
 
 
 def teardown_module() -> None:
-    for f in [data_file, data_type_map_file, data_file_si]:
+    for f in [
+        data_file,
+        data_type_map_file,
+        data_file_si,
+        data_file_empty_subdomain,
+    ]:
         if f.exists():
             os.remove(f)
 
@@ -344,15 +359,22 @@ def test_pair_deepmd_si(lammps_si) -> None:
 def _run_mpi_subprocess(
     extra_args: list[str] | None = None,
     nprocs: int = 2,
+    data_path: Path | None = None,
 ) -> dict:
     """Helper: invoke run_mpi_pair_deepmd_dpa3_pt2.py under
-    ``mpirun -n <nprocs>`` and return ``{"pe": float, "forces": (n, 3) array}``.
+    ``mpirun -n <nprocs>`` and return
+    ``{"pe": float, "forces": (n, 3) array, "virials": (n, 9) array}``.
 
     With ``nprocs == 1`` the runner is invoked with ``--processors 1 1 1``
     so the C++ side sees ``nswap == 0`` and routes to the regular
     (single-rank) artifact of the dual-artifact .pt2 — useful as a
     same-archive reference for multi-rank comparisons.
+
+    ``data_path`` (default ``data_file``) selects the LAMMPS data file —
+    the empty-subdomain test points at a non-default elongated-box file.
     """
+    if data_path is None:
+        data_path = data_file
     with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f:
         out_path = f.name
     try:
@@ -362,7 +384,7 @@ def _run_mpi_subprocess(
             str(nprocs),
             sys.executable,
             str(Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"),
-            str(data_file.resolve()),
+            str(data_path.resolve()),
             str(pb_file_mpi.resolve()),
             out_path,
         ]
@@ -374,11 +396,14 @@ def _run_mpi_subprocess(
         with open(out_path) as fh:
             lines = fh.read().strip().splitlines()
         pe = float(lines[0])
-        forces = np.array(
+        rows = np.array(
             [list(map(float, line.split())) for line in lines[1:]],
             dtype=np.float64,
         )
-        return {"pe": pe, "forces": forces}
+        # Each row is (3 force) + (9 virial); see runner script.
+        forces = rows[:, :3]
+        virials = rows[:, 3:]
+        return {"pe": pe, "forces": forces, "virials": virials}
     finally:
         if os.path.exists(out_path):
             os.remove(out_path)
@@ -392,16 +417,17 @@ def _run_mpi_subprocess(
 )
 def test_pair_deepmd_mpi_dpa3() -> None:
     """Multi-rank LAMMPS run for DPA3 .pt2 must match the single-rank
-    reference within numerical tolerance for energy and forces.
+    reference within numerical tolerance for energy, forces, and virial.
 
     Forces are the autograd output of energy through the with-comm
     graph, so they implicitly validate the backward path of
-    ``deepmd_export::border_op``.  Virial requires a separate
-    ``compute pressure NULL virial`` which interacts poorly with
-    PyLammps multi-rank (hangs); deferred to a follow-up.
+    ``deepmd_export::border_op``. Per-atom virial is gathered from
+    ``compute centroid/stress/atom NULL pair`` (parallel-safe) — the
+    earlier deadlock comment was specific to ``compute pressure NULL
+    virial`` + ``lammps.eval(...)``, which we sidestep entirely.
 
     Requires the .pt2 archive to carry a with-comm artifact (Phase 3
-    output for GNN models).  If the archive lacks it, the C++ falls
+    output for GNN models). If the archive lacks it, the C++ falls
     back to the regular artifact and produces wrong cross-rank values
     — which the assertion would catch (loud test failure, not silent).
     """
@@ -417,6 +443,20 @@ def test_pair_deepmd_mpi_dpa3() -> None:
             atol=1e-8,
             rtol=0,
         )
+    # Per-atom virial matches the gen_dpa3.py reference. LAMMPS
+    # centroid/stress/atom returns components in [xx, yy, zz, xy, xz,
+    # yz, yx, zx, zy] order; ``expected_v`` columns follow the same
+    # column-major flattening as the single-rank ``test_pair_deepmd_virial``
+    # (which uses idx_map [0, 4, 8, 3, 6, 7, 1, 2, 5] from c_virial[1..9]
+    # to expected_v columns). The inverse permutation maps
+    # ``out["virials"]`` columns back to ``expected_v`` columns.
+    expected_v_to_lammps = [0, 6, 7, 3, 1, 8, 4, 5, 2]
+    np.testing.assert_allclose(
+        out["virials"][:, expected_v_to_lammps] / constants.nktv2p,
+        expected_v,
+        atol=1e-8,
+        rtol=0,
+    )
 
 
 @pytest.mark.skipif(
@@ -450,4 +490,38 @@ def test_pair_deepmd_mpi_dpa3_nlist_rebuild() -> None:
         atol=1e-6,
         rtol=1e-6,
     )
+    np.testing.assert_allclose(
+        out_mpi["virials"],
+        out_ref["virials"],
+        atol=1e-6,
+        rtol=1e-6,
+    )
     assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10)
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa3_empty_subdomain() -> None:
+    """Multi-rank DPA3 with one rank owning zero local atoms.
+
+    Uses a 30 x 13 x 13 box with all six atoms clustered in x in
+    [0.25, 12.83]. Under ``processors 2 1 1`` the split is at x = 15
+    so rank 1 owns an empty subdomain. The comm-dispatch path must
+    still produce correct forces and virial (compared against a
+    same-archive single-rank reference of the same configuration).
+
+    This catches: zero-length send/recv lists in the comm tensors,
+    division-by-zero in nlocal-dependent reshapes, and any silent
+    drop of a rank's contribution when it has no atoms to evaluate.
+    """
+    out_mpi = _run_mpi_subprocess(nprocs=2, data_path=data_file_empty_subdomain)
+    out_ref = _run_mpi_subprocess(nprocs=1, data_path=data_file_empty_subdomain)
+    np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0)
+    np.testing.assert_allclose(
+        out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0
+    )
+    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-12, abs=1e-12)

From a81fc10bd6c6b48cbd868987417a32d82c5bd252 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 27 Apr 2026 08:50:52 +0800
Subject: [PATCH 13/34] test: cover DPA2 multi-rank dispatch + fix opaque-op
 import order
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- source/lmp/tests/test_lammps_dpa2_pt2.py (NEW): runs DPA2 .pt2 under
  mpirun -n 2 with the with-comm artifact and asserts pe + per-atom
  forces + per-atom virial match a same-archive single-rank reference.
  Closes the recorded gap "DPA2 multi-rank dispatch never exercised
  end-to-end" (gnn_mpi_untested_paths.md). The runner script
  (run_mpi_pair_deepmd_dpa3_pt2.py) is descriptor-agnostic so no new
  driver is needed.

- source/tests/infer/gen_dpa2.py: comment-only change documenting why
  no config_mpi / separate _mpi.pt2 export is needed. DPA2's repformer
  has no use_loc_mapping knob (unlike DPA3), so the single
  deeppot_dpa2.pt2 already carries the dual-artifact layout —
  _has_message_passing returns True for any DPA2 model.

- source/tests/pt_expt/conftest.py: ``import deepmd.pt`` at conftest
  evaluation time so libdeepmd_op_pt.so is loaded and
  ``deepmd_export::{border_op, border_op_backward}`` are registered
  before any pt_expt test module imports ``deepmd.pt_expt.utils``
  (which transitively imports ``comm.py`` and its
  ``_check_underlying_ops_loaded()`` runtime check). Previously this
  worked only when the test was collected alongside earlier modules
  that happened to import deepmd.pt first; running the spin/export
  tests in isolation crashed at collection.

- source/tests/pt_expt/model/test_spin_export_with_comm.py: fix
  pre-existing test data bug — model has sel=[20,20,20] (sum=60) but
  the trace test was passing nlist with width 6, tripping the
  _format_nlist post-condition assertion. Now uses the correct
  sum(sel) width. Surfaced once the conftest fix above made the test
  reliably runnable in isolation.
---
 source/lmp/tests/test_lammps_dpa2_pt2.py      | 145 ++++++++++++++++++
 source/tests/infer/gen_dpa2.py                |   4 +
 source/tests/pt_expt/conftest.py              |   9 ++
 .../model/test_spin_export_with_comm.py       |   6 +-
 4 files changed, 162 insertions(+), 2 deletions(-)
 create mode 100644 source/lmp/tests/test_lammps_dpa2_pt2.py

diff --git a/source/lmp/tests/test_lammps_dpa2_pt2.py b/source/lmp/tests/test_lammps_dpa2_pt2.py
new file mode 100644
index 0000000000..48ed966605
--- /dev/null
+++ b/source/lmp/tests/test_lammps_dpa2_pt2.py
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Multi-rank LAMMPS test for DPA2 .pt2 (extends GNN MPI Phase 5 to DPA2).
+
+DPA2's repformer block participates in the per-layer ghost-atom MPI
+exchange just like DPA3's repflows; the with-comm AOTInductor artifact
+is produced automatically by ``deepmd/pt_expt/utils/serialization.py``
+because ``_has_message_passing`` returns True for any DPA2 model.
+
+Unlike DPA3 (which has ``use_loc_mapping``), DPA2's repformer always
+takes a ``mapping`` tensor, so a single ``deeppot_dpa2.pt2`` already
+carries the dual-artifact layout — no separate ``_mpi.pt2`` needed.
+
+This file targets the gap "DPA2 multi-rank dispatch never tested
+end-to-end" recorded in
+``memory/gnn_mpi_untested_paths.md::Dispatch wired, no test fixture``.
+The reference is a same-archive single-rank run (``mpirun -n 1``
+through the same dual-artifact ``.pt2``); no hardcoded reference
+values are needed.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import importlib.util
+import os
+import shutil
+import subprocess as sp
+import sys
+import tempfile
+from pathlib import (
+    Path,
+)
+
+import numpy as np
+import pytest
+from write_lmp_data import (
+    write_lmp_data,
+)
+
+# Reuses the same generic mpirun driver as the DPA3 multi-rank tests —
+# the script is descriptor-agnostic (just LAMMPS + pair_style deepmd).
+RUNNER_PATH = Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"
+
+pb_file = Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot_dpa2.pt2"
+data_file = Path(__file__).parent / "data_dpa2_pt2.lmp"
+
+box = np.array([0, 13, 0, 13, 0, 13, 0, 0, 0])
+coord = np.array(
+    [
+        [12.83, 2.56, 2.18],
+        [12.09, 2.87, 2.74],
+        [0.25, 3.32, 1.68],
+        [3.36, 3.00, 1.81],
+        [3.51, 2.51, 2.60],
+        [4.27, 3.22, 1.56],
+    ]
+)
+type_OH = np.array([1, 2, 2, 1, 2, 2])
+
+
+def setup_module() -> None:
+    if os.environ.get("ENABLE_PYTORCH", "1") != "1":
+        pytest.skip(
+            "Skip test because PyTorch support is not enabled.",
+        )
+    write_lmp_data(box, coord, type_OH, data_file)
+
+
+def teardown_module() -> None:
+    if data_file.exists():
+        os.remove(data_file)
+
+
+def _run_mpi_subprocess(nprocs: int = 2) -> dict:
+    """Invoke the generic mpirun driver and parse the output.
+
+    With ``nprocs == 2`` (default) the runner forces ``processors 2 1 1``
+    so ``DeepPotPTExpt`` routes to the with-comm artifact. With
+    ``nprocs == 1`` the runner uses ``processors 1 1 1`` and the C++
+    side falls back to the regular artifact — useful as a same-archive
+    reference for value comparison.
+    """
+    with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f:
+        out_path = f.name
+    try:
+        argv = [
+            "mpirun",
+            "-n",
+            str(nprocs),
+            sys.executable,
+            str(RUNNER_PATH),
+            str(data_file.resolve()),
+            str(pb_file.resolve()),
+            out_path,
+        ]
+        if nprocs == 1:
+            argv.extend(["--processors", "1 1 1"])
+        sp.check_call(argv)
+        with open(out_path) as fh:
+            lines = fh.read().strip().splitlines()
+        pe = float(lines[0])
+        rows = np.array(
+            [list(map(float, line.split())) for line in lines[1:]],
+            dtype=np.float64,
+        )
+        forces = rows[:, :3]
+        virials = rows[:, 3:]
+        return {"pe": pe, "forces": forces, "virials": virials}
+    finally:
+        if os.path.exists(out_path):
+            os.remove(out_path)
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa2() -> None:
+    """Multi-rank DPA2 .pt2 dispatch must match the same-archive
+    single-rank reference for energy, forces, and virial.
+
+    Verifies that:
+    - ``DeepPotPTExpt::compute`` correctly routes to the with-comm
+      artifact for DPA2 (descriptor-agnostic dispatch).
+    - The pt_expt ``DescrptBlockRepformers._exchange_ghosts`` override
+      drives ``deepmd_export::border_op`` for repformer's per-layer
+      ghost exchange (the path equivalent to DPA3's repflows).
+    - Different ``model_nnei`` from DPA3 (DPA2 repformer has nsel=15
+      vs DPA3's e_sel=30) — exercises the dynamic-nnei with-comm
+      trace at a different baked-in value.
+
+    No hardcoded reference; compares against a same-archive single-rank
+    run (``mpirun -n 1`` + ``processors 1 1 1`` falls back to the
+    regular artifact).
+    """
+    out_mpi = _run_mpi_subprocess(nprocs=2)
+    out_ref = _run_mpi_subprocess(nprocs=1)
+    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-12, abs=1e-12)
+    np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0)
+    np.testing.assert_allclose(
+        out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0
+    )
diff --git a/source/tests/infer/gen_dpa2.py b/source/tests/infer/gen_dpa2.py
index 8ce277fcf5..5aff706aab 100644
--- a/source/tests/infer/gen_dpa2.py
+++ b/source/tests/infer/gen_dpa2.py
@@ -108,6 +108,10 @@ def main():
 
     pt2_path = os.path.join(base_dir, "deeppot_dpa2.pt2")
     print(f"Exporting to {pt2_path} ...")  # noqa: T201
+    # DPA2's repformer block has no ``use_loc_mapping`` knob (unlike
+    # DPA3), so a single .pt2 already carries the dual-artifact layout
+    # (regular + with-comm) — _has_message_passing returns True and the
+    # serializer produces both. No separate _mpi.pt2 needed.
     pt_expt_deserialize_to_file(pt2_path, copy.deepcopy(data), do_atomic_virial=True)
 
     pth_path = os.path.join(base_dir, "deeppot_dpa2.pth")
diff --git a/source/tests/pt_expt/conftest.py b/source/tests/pt_expt/conftest.py
index f2a9b07a6a..06bca2fec5 100644
--- a/source/tests/pt_expt/conftest.py
+++ b/source/tests/pt_expt/conftest.py
@@ -17,6 +17,15 @@
     _get_current_function_mode_stack,
 )
 
+# Import ``deepmd.pt`` at conftest evaluation time so libdeepmd_op_pt.so
+# is loaded and ``deepmd_export::{border_op, border_op_backward}`` are
+# registered before any pt_expt test module imports
+# ``deepmd.pt_expt.utils`` (which transitively imports ``comm.py`` and
+# its ``_check_underlying_ops_loaded()`` runtime check). Previously this
+# worked only when collected alongside earlier tests that happened to
+# import deepmd.pt first.
+import deepmd.pt  # noqa: F401  - side-effect: register custom ops
+
 
 def _pop_device_contexts() -> list:
     """Pop all stale DeviceContext modes from the torch function mode stack."""
diff --git a/source/tests/pt_expt/model/test_spin_export_with_comm.py b/source/tests/pt_expt/model/test_spin_export_with_comm.py
index 93b22bf864..f77c9fe415 100644
--- a/source/tests/pt_expt/model/test_spin_export_with_comm.py
+++ b/source/tests/pt_expt/model/test_spin_export_with_comm.py
@@ -95,14 +95,16 @@ def test_spin_forward_common_lower_exportable_with_comm_traces() -> None:
     model.eval()
 
     # Build sample inputs (nframes=1 to match the override's nb=1
-    # constraint; spin doubles natoms).
+    # constraint; spin doubles natoms). nlist width must match the
+    # model's sum(sel); the descriptor's _format_nlist asserts this.
     nloc = 6  # 3 real + 3 virtual
     nall = 8  # 1 ghost on each side
     n_dim_coord = 3
+    nnei = sum(SPIN_GNN_DATA["descriptor"]["sel"])
     ext_coord = torch.zeros(1, nall, n_dim_coord, dtype=torch.float64)
     ext_atype = torch.zeros(1, nall, dtype=torch.int64)
     ext_spin = torch.zeros(1, nall, n_dim_coord, dtype=torch.float64)
-    nlist = torch.zeros(1, nloc, 6, dtype=torch.int64)  # nnei from sel
+    nlist = torch.zeros(1, nloc, nnei, dtype=torch.int64)
     mapping = torch.zeros(1, nall, dtype=torch.int64)
     fparam = None
     aparam = None

From ece5c3daa8bcc4b9574e3fe584f3472204e89c3e Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 30 Apr 2026 17:35:31 +0800
Subject: [PATCH 14/34] test: extend MPI coverage with N>2 decompositions and
 schema-drift unit test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes two gaps from the GNN-MPI untested-paths catalog:

- ``test_pair_deepmd_mpi_dpa3_decomposition`` (parametrized): runs DPA3
  .pt2 under three additional processor grids — ``4@2x2x1`` (2D),
  ``4@4x1x1`` (1D-deep chain), and ``8@2x2x2`` (3D). All three must
  match the gen_dpa3.py reference for energy / per-atom force /
  per-atom virial within atol=1e-8. The 2x2x2 split leaves several
  subdomains empty, so this also exercises the
  ``copy_from_nlist`` empty-subdomain guard in a 3D layout.

- ``source/tests/pt_expt/utils/test_has_message_passing.py``: pins
  ``_has_message_passing`` against schema drift. The detection chain
  (``model.atomic_model.descriptor`` ->
  ``descriptor.has_message_passing()`` -> ``block.use_loc_mapping``)
  is brittle to attribute renames in the dpmodel descriptor layer; a
  silent regression would disable the with-comm artifact and break
  multi-rank LAMMPS for GNN users with no test failure to flag it.
  The test asserts the documented value for 5 baseline configs
  (se_e2_a, dpa1, dpa3 use_loc_mapping=True/False, dpa2) plus two
  stub-model defensive cases.

The runner helper ``_run_mpi_subprocess`` gains an optional
``processors`` arg so the new parametrized test can dictate the
LAMMPS ``processors`` grid; existing tests keep their previous
defaults (``2 1 1`` for nprocs=2, ``1 1 1`` for nprocs=1).
---
 source/lmp/tests/test_lammps_dpa3_pt2.py      |  55 ++++-
 .../pt_expt/utils/test_has_message_passing.py | 229 ++++++++++++++++++
 2 files changed, 283 insertions(+), 1 deletion(-)
 create mode 100644 source/tests/pt_expt/utils/test_has_message_passing.py

diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py
index 84127246d6..61058b770e 100644
--- a/source/lmp/tests/test_lammps_dpa3_pt2.py
+++ b/source/lmp/tests/test_lammps_dpa3_pt2.py
@@ -360,6 +360,7 @@ def _run_mpi_subprocess(
     extra_args: list[str] | None = None,
     nprocs: int = 2,
     data_path: Path | None = None,
+    processors: str | None = None,
 ) -> dict:
     """Helper: invoke run_mpi_pair_deepmd_dpa3_pt2.py under
     ``mpirun -n <nprocs>`` and return
@@ -372,6 +373,10 @@ def _run_mpi_subprocess(
 
     ``data_path`` (default ``data_file``) selects the LAMMPS data file —
     the empty-subdomain test points at a non-default elongated-box file.
+
+    ``processors`` overrides the runner's default decomposition string
+    (``"2 1 1"``); used by the ``test_*_decomposition`` variants to
+    exercise 2D / 3D processor grids (Px*Py*Pz must equal nprocs).
     """
     if data_path is None:
         data_path = data_file
@@ -388,7 +393,9 @@ def _run_mpi_subprocess(
             str(pb_file_mpi.resolve()),
             out_path,
         ]
-        if nprocs == 1:
+        if processors is not None:
+            argv.extend(["--processors", processors])
+        elif nprocs == 1:
             argv.extend(["--processors", "1 1 1"])
         if extra_args:
             argv.extend(extra_args)
@@ -525,3 +532,49 @@ def test_pair_deepmd_mpi_dpa3_empty_subdomain() -> None:
         out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0
     )
     assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-12, abs=1e-12)
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+@pytest.mark.parametrize(
+    "nprocs,processors",
+    [
+        (4, "2 2 1"),  # 2D decomposition; nswap > 2, two-direction borders
+        (4, "4 1 1"),  # 1D-deep chain; sendlist depth = 3 (each pair is 1+2 swaps)
+        (8, "2 2 2"),  # 3D decomposition; full xyz border exchange
+    ],
+)
+def test_pair_deepmd_mpi_dpa3_decomposition(nprocs, processors) -> None:
+    """Multi-rank DPA3 .pt2 must match the single-rank reference under
+    deeper / 3D processor grids beyond the canonical 2x1x1 (N=2) layout.
+
+    Production MD typically runs with 8/16/32+ ranks and 2D/3D
+    decompositions. Bugs that don't fire at N=2 (deeper sendlist
+    chains, 3D border swaps, asymmetric subdomains, multiple empty
+    cells in the 2x2x2 split of a small fixture) have zero coverage
+    without this test.
+
+    The 6-atom 13x13x13 fixture is intentionally small relative to
+    the rank count: in the 2x2x2 split each subdomain is
+    ~6.5x6.5x6.5 A, so several subdomains are empty — exercising the
+    empty-subdomain ``copy_from_nlist`` guard fix in 3D.
+    """
+    out_mpi = _run_mpi_subprocess(nprocs=nprocs, processors=processors)
+    # Step-0 evaluation; must match the gen_dpa3.py-derived reference
+    # within an absolute tolerance of 1e-8 (not bit-exact).
+    assert out_mpi["pe"] == pytest.approx(expected_e, rel=0, abs=1e-8)
+    for ii in range(6):
+        np.testing.assert_allclose(
+            out_mpi["forces"][ii], expected_f[ii], atol=1e-8, rtol=0
+        )
+    expected_v_to_lammps = [0, 6, 7, 3, 1, 8, 4, 5, 2]
+    np.testing.assert_allclose(
+        out_mpi["virials"][:, expected_v_to_lammps] / constants.nktv2p,
+        expected_v,
+        atol=1e-8,
+        rtol=0,
+    )
diff --git a/source/tests/pt_expt/utils/test_has_message_passing.py b/source/tests/pt_expt/utils/test_has_message_passing.py
new file mode 100644
index 0000000000..673e4d8bd0
--- /dev/null
+++ b/source/tests/pt_expt/utils/test_has_message_passing.py
@@ -0,0 +1,229 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Schema-drift regression test for ``_has_message_passing``.
+
+``_has_message_passing`` (in ``deepmd/pt_expt/utils/serialization.py``)
+gates whether the dual-artifact ``.pt2`` is produced for GNN models —
+specifically, whether the with-comm AOTInductor module is compiled and
+nested inside the archive. The detection relies on a chain of attribute
+lookups:
+
+* ``model.atomic_model.descriptor``
+* ``descriptor.has_message_passing()``
+* For repflows/repformers: ``block.use_loc_mapping``
+
+A rename of any of these (refactor in the dpmodel descriptor layer, a
+new GNN block name, etc.) silently disables the with-comm artifact and
+multi-rank LAMMPS users get a single-artifact .pt2 that crashes on the
+first ghost exchange — with no test failure to flag the breakage.
+
+This test pins the contract: assert ``_has_message_passing`` returns
+the documented value for each baseline configuration.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import copy
+
+import pytest
+
+from deepmd.dpmodel.model.model import (
+    get_model,
+)
+from deepmd.pt_expt.utils.serialization import (
+    _has_message_passing,
+)
+
+
+def _se_e2_a_config() -> dict:
+    """Non-GNN descriptor — must report False."""
+    return {
+        "type_map": ["O", "H"],
+        "descriptor": {
+            "type": "se_e2_a",
+            "rcut": 6.0,
+            "rcut_smth": 0.5,
+            "sel": [20, 20],
+            "neuron": [2, 4],
+            "axis_neuron": 2,
+            "type_one_side": True,
+            "precision": "float64",
+            "seed": 1,
+        },
+        "fitting_net": {
+            "neuron": [4, 4],
+            "resnet_dt": True,
+            "precision": "float64",
+            "seed": 1,
+        },
+    }
+
+
+def _dpa1_config() -> dict:
+    """DPA1 (se_atten) — non-GNN; must report False."""
+    return {
+        "type_map": ["O", "H"],
+        "descriptor": {
+            "type": "se_atten",
+            "rcut": 6.0,
+            "rcut_smth": 0.5,
+            "sel": 20,
+            "neuron": [2, 4],
+            "axis_neuron": 2,
+            "attn": 5,
+            "attn_layer": 1,
+            "type_one_side": True,
+            "precision": "float64",
+            "seed": 1,
+        },
+        "fitting_net": {
+            "neuron": [4, 4],
+            "resnet_dt": True,
+            "precision": "float64",
+            "seed": 1,
+        },
+    }
+
+
+def _dpa3_config(use_loc_mapping: bool) -> dict:
+    """DPA3 (repflows). use_loc_mapping=False -> True, True -> False."""
+    return {
+        "type_map": ["O", "H"],
+        "descriptor": {
+            "type": "dpa3",
+            "repflow": {
+                "n_dim": 8,
+                "e_dim": 6,
+                "a_dim": 4,
+                "nlayers": 1,
+                "e_rcut": 4.0,
+                "e_rcut_smth": 0.5,
+                "e_sel": 8,
+                "a_rcut": 3.5,
+                "a_rcut_smth": 0.5,
+                "a_sel": 4,
+                "axis_neuron": 4,
+                "update_angle": False,
+            },
+            "use_loc_mapping": use_loc_mapping,
+        },
+        "fitting_net": {"neuron": [16, 16], "seed": 1},
+    }
+
+
+def _dpa2_config() -> dict:
+    """DPA2 (repformer) — GNN; repformer has no use_loc_mapping knob,
+    so always reports True.
+    """
+    return {
+        "type_map": ["O", "H"],
+        "descriptor": {
+            "type": "dpa2",
+            "repinit": {
+                "rcut": 6.0,
+                "rcut_smth": 2.0,
+                "nsel": 20,
+                "neuron": [2, 4],
+                "axis_neuron": 4,
+                "tebd_dim": 8,
+                "tebd_input_mode": "concat",
+                "set_davg_zero": True,
+                "type_one_side": True,
+                "use_three_body": False,
+            },
+            "repformer": {
+                "rcut": 3.0,
+                "rcut_smth": 1.5,
+                "nsel": 10,
+                "nlayers": 1,
+                "g1_dim": 8,
+                "g2_dim": 5,
+                "axis_neuron": 4,
+                "update_g1_has_conv": True,
+                "update_g1_has_drrd": True,
+                "update_g1_has_grrg": True,
+                "update_g2_has_attn": True,
+                "attn1_hidden": 8,
+                "attn1_nhead": 2,
+                "attn2_hidden": 5,
+                "attn2_nhead": 1,
+                "update_style": "res_avg",
+                "set_davg_zero": True,
+            },
+            "concat_output_tebd": True,
+            "precision": "float64",
+            "seed": 1,
+        },
+        "fitting_net": {
+            "neuron": [4, 4],
+            "resnet_dt": True,
+            "seed": 1,
+        },
+    }
+
+
+@pytest.mark.parametrize(
+    "config_factory,expected",
+    [
+        (_se_e2_a_config, False),
+        (_dpa1_config, False),
+        (lambda: _dpa3_config(use_loc_mapping=True), False),
+        (lambda: _dpa3_config(use_loc_mapping=False), True),
+        (_dpa2_config, True),
+    ],
+    ids=[
+        "se_e2_a-non-gnn",
+        "dpa1-non-gnn",
+        "dpa3-use-loc-mapping-true",
+        "dpa3-use-loc-mapping-false",
+        "dpa2-repformer",
+    ],
+)
+def test_has_message_passing_matches_descriptor_kind(config_factory, expected) -> None:
+    """``_has_message_passing`` must report the documented value for
+    each baseline descriptor configuration.
+
+    A False positive (non-GNN reported as GNN) wastes compile time on
+    a useless with-comm artifact. A False negative (GNN with
+    use_loc_mapping=False reported as non-GNN) is worse: multi-rank
+    LAMMPS gets a single-artifact .pt2 and crashes on the first ghost
+    exchange. This test pins both directions.
+    """
+    config = config_factory()
+    model = get_model(copy.deepcopy(config))
+    assert _has_message_passing(model) is expected
+
+
+def test_has_message_passing_no_descriptor_returns_false() -> None:
+    """Models without a single ``atomic_model.descriptor`` (e.g. linear
+    / ZBL / frozen) must report False — the function defends against
+    AttributeError and treats the model as local.
+    """
+
+    class _StubAtomicModel:
+        # Intentionally no ``descriptor`` attribute.
+        pass
+
+    class _StubModel:
+        atomic_model = _StubAtomicModel()
+
+    assert _has_message_passing(_StubModel()) is False
+
+
+def test_has_message_passing_descriptor_without_query_returns_false() -> None:
+    """If the descriptor exists but lacks ``has_message_passing``, the
+    function must report False rather than raise.
+    """
+
+    class _StubDescriptor:
+        # Intentionally no ``has_message_passing`` method.
+        pass
+
+    class _StubAtomicModel:
+        descriptor = _StubDescriptor()
+
+    class _StubModel:
+        atomic_model = _StubAtomicModel()
+
+    assert _has_message_passing(_StubModel()) is False

From 0ef1bfc8bd184de357cf2052df966913ab347c1a Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 30 Apr 2026 18:40:33 +0800
Subject: [PATCH 15/34] test: cover NULL-type atoms (atype<0) under mpirun
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the recorded gap "NULL-type atoms under mpirun" — until now
``build_comm_tensors_positional_with_virtual_atoms`` and the
``fwd_map``-based comm-tensor remap had never been exercised in a
multi-rank LAMMPS run despite being reachable any time a user runs
a model on a system with atom types outside its ``type_map``.

Fixture (``data_dpa3_pt2_null_type.lmp``): the canonical 6 real
atoms (types 1, 2) plus 2 LAMMPS type-3 atoms placed at (5.5, 6, 6)
and (7.5, 7, 7) — straddling the x=6.5 rank boundary under
``processors 2 1 1`` and within rcut (=6) of multiple real atoms.
The pair_coeff ``* * O H NULL`` maps LAMMPS type 3 to deepmd
atype=-1, so ``select_real_atoms_coord`` filters them and
``DeepPotPTExpt::compute`` takes the
``build_comm_tensors_positional_with_virtual_atoms`` branch.

The NULL atoms appear in cross-rank sendlists because both sit in
the boundary's rcut window, so the remap must:
  - drop the -1 fwd_map slots from each swap's sendlist;
  - decrement sendnum/recvnum by the number dropped;
  - translate surviving indices into real-atom space.

Test asserts:
  - forces on the 6 real atoms match the no-NULL baseline
    ``expected_f`` (atol 1e-8);
  - NULL atom forces are zero (atol 1e-12) — deepmd is the only
    pair_style and skips them;
  - total potential energy matches ``expected_e``;
  - per-atom virial on real atoms matches ``expected_v``.

Runner script (``run_mpi_pair_deepmd_dpa3_pt2.py``) gains two
optional flags: ``--pair-coeff`` (override the default ``"* *"``)
and ``--mass3`` (mass for a third LAMMPS atom type). Existing tests
keep their previous defaults unchanged.

The ``_run_mpi_subprocess`` helper gains a ``runner_args`` kwarg
to forward arbitrary flags to the runner; existing call sites are
unaffected.
---
 .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py | 25 +++++-
 source/lmp/tests/test_lammps_dpa3_pt2.py      | 89 +++++++++++++++++++
 2 files changed, 113 insertions(+), 1 deletion(-)

diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
index d07af7a158..4180ffac47 100644
--- a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
+++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
@@ -51,6 +51,23 @@
     "domain decomposition (nswap>0). Pass '1 1 1' for a single-rank "
     "reference run on the same archive (single-artifact dispatch).",
 )
+parser.add_argument(
+    "--pair-coeff",
+    type=str,
+    default="* *",
+    help="pair_coeff arguments (after 'pair_coeff'). Default '* *' "
+    "uses identity LAMMPS-type-to-deepmd-atype mapping (assumes the "
+    "data file's types match the model's type_map order). For NULL-type "
+    "tests pass e.g. '* * O H NULL' so the third LAMMPS type becomes "
+    "deepmd atype=-1 (filtered before model evaluation).",
+)
+parser.add_argument(
+    "--mass3",
+    type=float,
+    default=None,
+    help="Optional mass for LAMMPS atom type 3 (and any higher types). "
+    "Used by the NULL-type fixture; ignored when only 2 types exist.",
+)
 args = parser.parse_args()
 
 lammps = PyLammps()
@@ -73,11 +90,17 @@
 lammps.read_data(args.DATAFILE)
 lammps.mass("1 16")
 lammps.mass("2 2")
+if args.mass3 is not None:
+    # Used by the NULL-type test where the data file has a 3rd LAMMPS
+    # type that maps to a NULL deepmd atype (filtered before model
+    # evaluation). The mass value is physically irrelevant — these
+    # atoms get zero force from the deepmd model.
+    lammps.mass(f"3 {args.mass3}")
 lammps.timestep(0.0005)
 lammps.fix("1 all nve")
 
 lammps.pair_style(f"deepmd {args.PB_FILE}")
-lammps.pair_coeff("* *")
+lammps.pair_coeff(args.pair_coeff)
 # Per-atom virial from the pair contribution. ``centroid/stress/atom``
 # is parallel-safe (rank-local data, gathered below). LAMMPS computes
 # stress*volume per atom in internal units; the parent test reverses
diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py
index 61058b770e..1d4c77fa9e 100644
--- a/source/lmp/tests/test_lammps_dpa3_pt2.py
+++ b/source/lmp/tests/test_lammps_dpa3_pt2.py
@@ -41,6 +41,14 @@
 # local atoms — a corner case the comm-dispatch path must handle
 # without crashing or producing wrong forces.
 data_file_empty_subdomain = Path(__file__).parent / "data_dpa3_pt2_empty_subdomain.lmp"
+# NULL-type variant: 6 real atoms (types 1,2) + 2 type-3 atoms straddling
+# the x=6.5 rank boundary. With ``pair_coeff * * O H NULL`` LAMMPS type 3
+# maps to deepmd atype=-1, so those atoms are filtered by
+# ``select_real_atoms_coord`` and the comm tensors must be remapped via
+# ``fwd_map`` before being handed to the with-comm artifact. Forces on
+# the 6 real atoms must match the no-NULL baseline; NULL atoms get zero
+# force from the deepmd model.
+data_file_null_type = Path(__file__).parent / "data_dpa3_pt2_null_type.lmp"
 
 # Reference values from gen_dpa3.py / test_deeppot_dpa3_ptexpt.cc (PBC)
 expected_ae = np.array(
@@ -168,6 +176,24 @@ def setup_module() -> None:
     # split is at x = 15 Å and rank 1 owns x ≥ 15, which is empty.
     box_empty_subdomain = np.array([0, 30, 0, 13, 0, 13, 0, 0, 0])
     write_lmp_data(box_empty_subdomain, coord, type_OH, data_file_empty_subdomain)
+    # NULL-type fixture: original 6 real atoms (types 1,2) plus 2 LAMMPS
+    # type-3 atoms placed within rcut (~6 Å) of real atoms on BOTH sides
+    # of the x=6.5 rank boundary. The NULL atoms appear in real atoms'
+    # neighbour lists and in the cross-rank sendlists, so the comm-tensor
+    # remap (``fwd_map``-based) is genuinely exercised — not trivial.
+    coord_null_type = np.concatenate(
+        [
+            coord,
+            np.array(
+                [
+                    [5.5, 6.0, 6.0],  # rank 0 side, near boundary
+                    [7.5, 7.0, 7.0],  # rank 1 side, near boundary
+                ]
+            ),
+        ]
+    )
+    type_null = np.concatenate([type_OH, np.array([3, 3])])
+    write_lmp_data(box, coord_null_type, type_null, data_file_null_type)
 
 
 def teardown_module() -> None:
@@ -176,6 +202,7 @@ def teardown_module() -> None:
         data_type_map_file,
         data_file_si,
         data_file_empty_subdomain,
+        data_file_null_type,
     ]:
         if f.exists():
             os.remove(f)
@@ -361,6 +388,7 @@ def _run_mpi_subprocess(
     nprocs: int = 2,
     data_path: Path | None = None,
     processors: str | None = None,
+    runner_args: list[str] | None = None,
 ) -> dict:
     """Helper: invoke run_mpi_pair_deepmd_dpa3_pt2.py under
     ``mpirun -n <nprocs>`` and return
@@ -399,6 +427,8 @@ def _run_mpi_subprocess(
             argv.extend(["--processors", "1 1 1"])
         if extra_args:
             argv.extend(extra_args)
+        if runner_args:
+            argv.extend(runner_args)
         sp.check_call(argv)
         with open(out_path) as fh:
             lines = fh.read().strip().splitlines()
@@ -578,3 +608,62 @@ def test_pair_deepmd_mpi_dpa3_decomposition(nprocs, processors) -> None:
         atol=1e-8,
         rtol=0,
     )
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa3_null_type() -> None:
+    """Multi-rank DPA3 .pt2 with NULL-type atoms.
+
+    Exercises ``select_real_atoms_coord`` filtering AND
+    ``build_comm_tensors_positional_with_virtual_atoms`` remapping
+    under multi-rank dispatch — neither path was reachable in any
+    previous test fixture.
+
+    Setup: 6 real atoms (types 1,2) at the canonical positions plus
+    2 LAMMPS type-3 atoms straddling the x=6.5 rank boundary. With
+    ``pair_coeff * * O H NULL`` the type-3 atoms map to deepmd
+    atype=-1 and are filtered before model evaluation. Because the
+    NULL atoms sit within rcut of real atoms on BOTH sides of the
+    boundary, they appear in cross-rank sendlists — forcing the
+    ``fwd_map``-based remap (which translates unfiltered LAMMPS
+    indices into filtered real-atom indices, dropping ``-1`` slots).
+
+    Assertions:
+    - Forces on the 6 real atoms (ids 1..6, id-sorted output) match
+      the no-NULL baseline ``expected_f`` exactly. NULL atoms don't
+      contribute to the deepmd model so real-atom forces are
+      identical to the 6-atom baseline.
+    - NULL-atom forces (ids 7,8) are zero — the deepmd model is the
+      only pair_style and skips them entirely.
+    - Total energy matches ``expected_e``.
+    - Per-atom virial on real atoms matches ``expected_v``.
+    """
+    out_mpi = _run_mpi_subprocess(
+        nprocs=2,
+        data_path=data_file_null_type,
+        runner_args=["--pair-coeff", "* * O H NULL", "--mass3", "5.0"],
+    )
+    # Forces on real atoms (ids 1..6) match the no-NULL baseline.
+    real_forces = out_mpi["forces"][:6]
+    for ii in range(6):
+        np.testing.assert_allclose(real_forces[ii], expected_f[ii], atol=1e-8, rtol=0)
+    # NULL atoms (ids 7,8) get zero force from the deepmd model.
+    null_forces = out_mpi["forces"][6:]
+    np.testing.assert_allclose(null_forces, 0.0, atol=1e-12, rtol=0)
+    # Total potential energy unchanged (NULL atoms contribute 0).
+    assert out_mpi["pe"] == pytest.approx(expected_e, rel=0, abs=1e-8)
+    # Per-atom virial on real atoms matches expected_v with the same
+    # column permutation as test_pair_deepmd_mpi_dpa3.
+    expected_v_to_lammps = [0, 6, 7, 3, 1, 8, 4, 5, 2]
+    real_virials = out_mpi["virials"][:6]
+    np.testing.assert_allclose(
+        real_virials[:, expected_v_to_lammps] / constants.nktv2p,
+        expected_v,
+        atol=1e-8,
+        rtol=0,
+    )

From 0c95b3af6b4e5f5bf59dede637553c0af1c073a8 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 30 Apr 2026 22:01:00 +0800
Subject: [PATCH 16/34] test: cover three NULL-type edge cases (isolated /
 all-null-rank / nlist-rebuild)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes three more entries from the GNN-MPI untested-paths catalog,
all variations on the multi-rank NULL-type filter path:

- ``test_pair_deepmd_mpi_dpa3_null_isolated``: large box (30 x 13 x 13)
  puts a NULL atom at x=7.5, in rank 0's subdomain interior. With
  rcut=6 the boundary rcut-windows on rank 0 are x in [0, 6] (PBC of
  the right wall via x=30) and [9, 15] (rank 1's left wall); atoms
  in (6, 9) are local but never appear in any sendlist. Exercises
  ``has_null_atoms == True`` with a no-op remap (sendlists contain
  no NULL entries to drop) — complementary to
  ``test_pair_deepmd_mpi_dpa3_null_type`` which exercises the
  remap-with-NULLs case.

- ``test_pair_deepmd_mpi_dpa3_all_null_rank``: rank 1 owns ONLY NULL
  atoms (intersection of empty-subdomain and NULL-type paths). After
  ``select_real_atoms_coord`` rank 1 has nloc_real=0, so the
  ``copy_from_nlist`` empty-subdomain guard must fire AND the
  ``_with_virtual_atoms`` remap must handle a sendlist whose entire
  local section was NULL.

- ``test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild``: rebuilds the
  nlist 3 times in 3 MD steps using ``neigh_modify every 1``. The NULL
  atoms sit inside the boundary's rcut window (they feel zero force
  and have no initial velocity here, so they stay in place) while the
  real atoms move under NVE, so the comm tensors are rebuilt from
  fresh sendlists on every step — validates that the remap re-runs
  correctly on every ago=0 trigger and stays consistent with the
  cached ``mapping_tensor`` / ``firstneigh_tensor`` in
  ``DeepPotPTExpt::compute``.

Also speeds up ``test_pair_deepmd_mpi_dpa3_nlist_rebuild`` (existing
non-NULL test) by switching from ``every 10`` + 25 steps to
``every 1`` + 3 steps — same 3 rebuilds, ~1/3 the wall time.

Runner script gains a ``--neigh-every`` flag (default 10). All
three new tests compare mpirun -n 2 against an mpirun -n 1
reference on the same fixture.
---
 .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py |  11 +-
 source/lmp/tests/test_lammps_dpa3_pt2.py      | 210 +++++++++++++++++-
 2 files changed, 209 insertions(+), 12 deletions(-)

diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
index 4180ffac47..6691bdd220 100644
--- a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
+++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
@@ -68,6 +68,15 @@
     help="Optional mass for LAMMPS atom type 3 (and any higher types). "
     "Used by the NULL-type fixture; ignored when only 2 types exist.",
 )
+parser.add_argument(
+    "--neigh-every",
+    type=int,
+    default=10,
+    help="LAMMPS ``neigh_modify every`` value. Default 10 mirrors the "
+    "production-realistic interval. Pass 1 for tests that want to "
+    "trigger nlist rebuilds on every step (and run a small ``--nsteps`` "
+    "to keep wall time low while still exercising the rebuild path).",
+)
 args = parser.parse_args()
 
 lammps = PyLammps()
@@ -86,7 +95,7 @@
 # ``mapping`` tensor. It is harmless under multi-rank.
 lammps.atom_modify("map yes")
 lammps.neighbor("2.0 bin")
-lammps.neigh_modify("every 10 delay 0 check no")
+lammps.neigh_modify(f"every {args.neigh_every} delay 0 check no")
 lammps.read_data(args.DATAFILE)
 lammps.mass("1 16")
 lammps.mass("2 2")
diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py
index 1d4c77fa9e..17bc1c2892 100644
--- a/source/lmp/tests/test_lammps_dpa3_pt2.py
+++ b/source/lmp/tests/test_lammps_dpa3_pt2.py
@@ -49,6 +49,20 @@
 # the 6 real atoms must match the no-NULL baseline; NULL atoms get zero
 # force from the deepmd model.
 data_file_null_type = Path(__file__).parent / "data_dpa3_pt2_null_type.lmp"
+# Isolated-NULL fixture: box=30 Å in x so rank 0 (x ∈ [0, 15]) has a
+# subdomain interior that is NOT within rcut of any boundary. With
+# rcut=6, boundary-adjacent regions are [0, 6] (PBC of right wall)
+# and [9, 15] (left wall of rank 1) — atoms in x in (6, 9) are LOCAL
+# but not in any sendlist. Place 1 NULL atom at x=7.5 (in this gap)
+# so the remap branch is reached but the sendlists contain no NULL
+# entries — exercises ``has_null_atoms=true`` with no-op remap.
+data_file_null_isolated = Path(__file__).parent / "data_dpa3_pt2_null_isolated.lmp"
+# All-NULL-rank fixture: box=30 Å in x. 6 real atoms in rank 0
+# (x < 13). 2 NULL atoms in rank 1 (x ∈ {20, 25}). Under
+# ``processors 2 1 1`` rank 1 owns ONLY NULL atoms, so after
+# ``select_real_atoms_coord`` rank 1 has nloc_real=0 (intersection
+# of empty-subdomain and NULL-type paths).
+data_file_all_null_rank = Path(__file__).parent / "data_dpa3_pt2_all_null_rank.lmp"
 
 # Reference values from gen_dpa3.py / test_deeppot_dpa3_ptexpt.cc (PBC)
 expected_ae = np.array(
@@ -194,6 +208,37 @@ def setup_module() -> None:
     )
     type_null = np.concatenate([type_OH, np.array([3, 3])])
     write_lmp_data(box, coord_null_type, type_null, data_file_null_type)
+    # Isolated-NULL fixture: same elongated box as empty-subdomain
+    # plus one NULL atom in rank 0's subdomain interior (x ∈ (6, 9)).
+    coord_null_isolated = np.concatenate([coord, np.array([[7.5, 6.5, 6.5]])])
+    type_null_isolated = np.concatenate([type_OH, np.array([3])])
+    write_lmp_data(
+        box_empty_subdomain,
+        coord_null_isolated,
+        type_null_isolated,
+        data_file_null_isolated,
+    )
+    # All-NULL-rank fixture: box=30 in x. Real atoms in rank 0
+    # (their original coords; all x < 13). NULL atoms placed in
+    # rank 1 (x ∈ {20, 25}). Rank 1 owns ONLY NULL atoms.
+    coord_all_null_rank = np.concatenate(
+        [
+            coord,
+            np.array(
+                [
+                    [20.0, 6.5, 6.5],
+                    [25.0, 6.5, 6.5],
+                ]
+            ),
+        ]
+    )
+    type_all_null_rank = np.concatenate([type_OH, np.array([3, 3])])
+    write_lmp_data(
+        box_empty_subdomain,
+        coord_all_null_rank,
+        type_all_null_rank,
+        data_file_all_null_rank,
+    )
 
 
 def teardown_module() -> None:
@@ -203,6 +248,8 @@ def teardown_module() -> None:
         data_file_si,
         data_file_empty_subdomain,
         data_file_null_type,
+        data_file_null_isolated,
+        data_file_all_null_rank,
     ]:
         if f.exists():
             os.remove(f)
@@ -506,21 +553,25 @@ def test_pair_deepmd_mpi_dpa3_nlist_rebuild() -> None:
     """Multi-rank with neighbor-list rebuilds, validated against a
     single-rank reference of the same archive and trajectory.
 
-    Runs 25 MD steps with ``neigh_modify every 10 delay 0 check no``,
-    so the multi-rank trajectory crosses two nlist rebuilds (at steps
-    10 and 20) before the final force evaluation. The same trajectory
-    is then run under ``mpirun -n 1`` (regular-artifact dispatch on
-    the same dual-artifact .pt2) to obtain a reference; comparing the
-    two catches a wrong-but-finite force from a dispatch bug that the
-    previous finite/bounded check would miss.
+    Uses ``neigh_modify every 1`` so a rebuild happens on every step,
+    then runs 3 steps — yields 3 rebuilds in roughly 1/8 the wall
+    time of a 25-step ``every 10`` run. The same trajectory is then
+    run under ``mpirun -n 1`` (regular-artifact dispatch on the same
+    dual-artifact .pt2) to obtain a reference; comparing the two
+    catches a wrong-but-finite force from a dispatch bug.
 
-    NVE is deterministic up to floating-point summation order, so the
-    cross-rank divergence after 25 steps is bounded by accumulated
+    NVE is deterministic up to floating-point summation order, so
+    the cross-rank divergence after 3 steps is bounded by accumulated
     round-off — small for a 6-atom system but non-zero, hence the
     relaxed (but still tight) tolerances.
     """
-    out_mpi = _run_mpi_subprocess(extra_args=["--nsteps", "25"], nprocs=2)
-    out_ref = _run_mpi_subprocess(extra_args=["--nsteps", "25"], nprocs=1)
+    runner_args = ["--neigh-every", "1"]
+    out_mpi = _run_mpi_subprocess(
+        extra_args=["--nsteps", "3"], nprocs=2, runner_args=runner_args
+    )
+    out_ref = _run_mpi_subprocess(
+        extra_args=["--nsteps", "3"], nprocs=1, runner_args=runner_args
+    )
     np.testing.assert_allclose(
         out_mpi["forces"],
         out_ref["forces"],
@@ -667,3 +718,140 @@ def test_pair_deepmd_mpi_dpa3_null_type() -> None:
         atol=1e-8,
         rtol=0,
     )
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa3_null_isolated() -> None:
+    """NULL atom local on a rank but absent from every sendlist.
+
+    Box is 30x13x13 with split at x=15. With rcut=6 the boundary
+    rcut-windows on rank 0 are x ∈ [0, 6] (PBC of right wall via
+    x=30) and x ∈ [9, 15] (left wall of rank 1). Atoms in
+    x ∈ (6, 9) are LOCAL on rank 0 but never appear in any
+    cross-rank sendlist. Placing a NULL atom at x=7.5 puts it in
+    that gap.
+
+    Coverage: ``has_null_atoms == True`` triggers the
+    ``_with_virtual_atoms`` branch, but the remap encounters NO
+    NULL entries in any sendlist (no-op remap). The
+    ``test_pair_deepmd_mpi_dpa3_null_type`` test exercises the
+    remap-with-NULLs case; this one pins the
+    remap-with-no-NULLs-in-sendlist case.
+
+    Comparison is mpi-vs-single-rank on the same fixture (no hardcoded
+    reference because the box differs from the canonical 13x13x13).
+    """
+    out_mpi = _run_mpi_subprocess(
+        nprocs=2,
+        data_path=data_file_null_isolated,
+        runner_args=["--pair-coeff", "* * O H NULL", "--mass3", "5.0"],
+    )
+    out_ref = _run_mpi_subprocess(
+        nprocs=1,
+        data_path=data_file_null_isolated,
+        runner_args=["--pair-coeff", "* * O H NULL", "--mass3", "5.0"],
+    )
+    np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0)
+    np.testing.assert_allclose(
+        out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0
+    )
+    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=0, abs=1e-8)
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa3_all_null_rank() -> None:
+    """Rank that owns ONLY NULL atoms (intersection of empty-subdomain
+    and NULL-type paths).
+
+    Box=30x13x13, split at x=15. Real atoms (types 1,2) are all in
+    rank 0 (x < 13). NULL atoms (type 3) are at x ∈ {20, 25},
+    both in rank 1. After ``select_real_atoms_coord``:
+
+    - Rank 0: nloc_real=6 (all real local), receives NULL atoms as
+      ghosts via PBC -> filtered -> nall_real ≤ nall.
+    - Rank 1: nloc_real=0 (all local atoms filtered out — empty
+      subdomain after filter), receives real atoms as ghosts.
+
+    Tests that the comm-dispatch path handles a rank with zero real
+    locals correctly. The empty-subdomain ``copy_from_nlist`` guard
+    must fire on rank 1, AND the ``_with_virtual_atoms`` remap must
+    handle the case where the local section of the sendlist is
+    entirely NULL.
+    """
+    out_mpi = _run_mpi_subprocess(
+        nprocs=2,
+        data_path=data_file_all_null_rank,
+        runner_args=["--pair-coeff", "* * O H NULL", "--mass3", "5.0"],
+    )
+    out_ref = _run_mpi_subprocess(
+        nprocs=1,
+        data_path=data_file_all_null_rank,
+        runner_args=["--pair-coeff", "* * O H NULL", "--mass3", "5.0"],
+    )
+    np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0)
+    np.testing.assert_allclose(
+        out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0
+    )
+    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=0, abs=1e-8)
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild() -> None:
+    """NULL-type atoms across nlist rebuilds during MD.
+
+    Uses ``neigh_modify every 1`` so the nlist rebuilds on every MD
+    step, then runs 3 steps — yielding 3 rebuilds in roughly 1/8 the
+    wall time of a 25-step ``every 10`` run. Atoms still move (NVE
+    integration), so the comm-tensor composition (which atoms appear
+    in each swap's sendlist, where NULL atoms map under
+    ``fwd_map``) genuinely changes between rebuilds.
+
+    Coverage: validates that the ``_with_virtual_atoms`` remap
+    re-runs correctly on every ago=0 trigger and that the cached
+    state in ``DeepPotPTExpt::compute`` (mapping_tensor,
+    firstneigh_tensor) plus the per-step rebuilt comm tensors stay
+    consistent under NULL filtering. Compares mpi-2-rank vs
+    mpi-1-rank trajectories.
+    """
+    runner_args = [
+        "--pair-coeff",
+        "* * O H NULL",
+        "--mass3",
+        "5.0",
+        "--neigh-every",
+        "1",
+    ]
+    out_mpi = _run_mpi_subprocess(
+        nprocs=2,
+        data_path=data_file_null_type,
+        extra_args=["--nsteps", "3"],
+        runner_args=runner_args,
+    )
+    out_ref = _run_mpi_subprocess(
+        nprocs=1,
+        data_path=data_file_null_type,
+        extra_args=["--nsteps", "3"],
+        runner_args=runner_args,
+    )
+    np.testing.assert_allclose(
+        out_mpi["forces"], out_ref["forces"], atol=1e-6, rtol=1e-6
+    )
+    np.testing.assert_allclose(
+        out_mpi["virials"], out_ref["virials"], atol=1e-6, rtol=1e-6
+    )
+    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10)

From ad7761cba20d6b5db6fb1733788ee5adfc299112 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 30 Apr 2026 22:16:29 +0800
Subject: [PATCH 17/34] test: NULL atoms cross rank boundary; prune redundant
 decomposition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two related changes that strengthen the NULL-type rebuild test and
trim the decomposition variant set:

- ``test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild`` now sets a
  high initial velocity (v_x = 2000 A/ps) on LAMMPS type-3 atoms via
  the runner's new ``--null-vx`` flag and a per-type ``velocity``
  command. With timestep 0.0005 ps each NULL atom moves 1.0 A per
  step — enough to physically cross the x=6.5 rank boundary in
  step 1 (NULL @ 5.5 -> 6.5 -> 7.5 -> 8.5). NULL atoms therefore
  migrate ranks across rebuilds, exercising the case where a NULL's
  fwd_map index moves between the local-section and ghost-section
  of per-rank sendlists.

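  Real atoms keep v=0 so their dynamics are stable; the deepmd model
  never sees NULL atoms (filtered by ``select_real_atoms_coord``)
  so the unphysical NULL velocity is harmless. The 2-rank run is
  compared against a 1-rank reference on the same fixture: forces and
  per-atom virials within atol=1e-6 / rtol=1e-6, potential energy
  within rel=1e-8 / abs=1e-10.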

- ``test_pair_deepmd_mpi_dpa3_decomposition``: drop the ``[4-2 2 1]``
  variant. Its 2D coverage is fully subsumed by ``[8-2 2 2]``
  (which is 3D, so 2D face exchange is a strict subset). The two
  remaining variants — ``[4-4 1 1]`` for 1D-deep sendlist chains
  and ``[8-2 2 2]`` for 3D borders — are complementary and not
  subsumable. Saves ~5.5s of suite wall time.

Runner script gains a ``--null-vx`` flag (no-op when not passed,
so existing tests are unaffected).
---
 .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py | 20 +++++++++
 source/lmp/tests/test_lammps_dpa3_pt2.py      | 45 ++++++++++++-------
 2 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
index 6691bdd220..b35a2a38d4 100644
--- a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
+++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
@@ -77,6 +77,18 @@
     "trigger nlist rebuilds on every step (and run a small ``--nsteps`` "
     "to keep wall time low while still exercising the rebuild path).",
 )
+parser.add_argument(
+    "--null-vx",
+    type=float,
+    default=None,
+    help="Optional initial x-velocity (units: Angstrom/ps in metal "
+    "units) for LAMMPS atom type 3 atoms. Real atoms stay at v=0. "
+    "Used by the NULL-type rebuild test to make NULL atoms cross the "
+    "rank boundary in a few MD steps without destabilising real-atom "
+    "dynamics — the deepmd model never sees NULL atoms (filtered by "
+    "``select_real_atoms_coord``) so their unphysical velocity is "
+    "harmless.",
+)
 args = parser.parse_args()
 
 lammps = PyLammps()
@@ -107,6 +119,14 @@
     lammps.mass(f"3 {args.mass3}")
 lammps.timestep(0.0005)
 lammps.fix("1 all nve")
+if args.null_vx is not None:
+    # Restrict initial velocity to LAMMPS type 3 atoms (NULL-type
+    # in the deepmd plugin's pair_coeff mapping). Real atoms stay
+    # at v=0; only the NULL atoms get the high vx, so the deepmd
+    # model's force outputs on real atoms remain bounded and the
+    # NVE integrator stays stable.
+    lammps.group("nullgroup type 3")
+    lammps.velocity(f"nullgroup set {args.null_vx:.6f} 0.0 0.0 units box")
 
 lammps.pair_style(f"deepmd {args.PB_FILE}")
 lammps.pair_coeff(args.pair_coeff)
diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py
index 17bc1c2892..e7b2b525b8 100644
--- a/source/lmp/tests/test_lammps_dpa3_pt2.py
+++ b/source/lmp/tests/test_lammps_dpa3_pt2.py
@@ -624,7 +624,10 @@ def test_pair_deepmd_mpi_dpa3_empty_subdomain() -> None:
 @pytest.mark.parametrize(
     "nprocs,processors",
     [
-        (4, "2 2 1"),  # 2D decomposition; nswap > 2, two-direction borders
+        # 2D ``2 2 1`` is omitted: ``8 @ 2 2 2`` already exercises 2D
+        # face exchange (it's a superset, in 3D), so the 2D-only case
+        # is redundant. The two kept variants give complementary
+        # coverage: 1D-deep sendlist chains vs 3D border exchange.
         (4, "4 1 1"),  # 1D-deep chain; sendlist depth = 3 (each pair is 1+2 swaps)
         (8, "2 2 2"),  # 3D decomposition; full xyz border exchange
     ],
@@ -812,20 +815,30 @@ def test_pair_deepmd_mpi_dpa3_all_null_rank() -> None:
     importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
 )
 def test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild() -> None:
-    """NULL-type atoms across nlist rebuilds during MD.
-
-    Uses ``neigh_modify every 1`` so the nlist rebuilds on every MD
-    step, then runs 3 steps — yielding 3 rebuilds in roughly 1/8 the
-    wall time of a 25-step ``every 10`` run. Atoms still move (NVE
-    integration), so the comm-tensor composition (which atoms appear
-    in each swap's sendlist, where NULL atoms map under
-    ``fwd_map``) genuinely changes between rebuilds.
-
-    Coverage: validates that the ``_with_virtual_atoms`` remap
-    re-runs correctly on every ago=0 trigger and that the cached
-    state in ``DeepPotPTExpt::compute`` (mapping_tensor,
-    firstneigh_tensor) plus the per-step rebuilt comm tensors stay
-    consistent under NULL filtering. Compares mpi-2-rank vs
+    """NULL-type atoms physically crossing the rank boundary during MD.
+
+    NULL atoms get a high initial v_x = 2000 A/ps via
+    ``--null-vx 2000`` so that with timestep 0.0005 ps each NULL atom
+    moves 1.0 A per step. Initial NULL positions are x=5.5 (rank 0,
+    moving right) and x=7.5 (rank 1, also moving right — wraps via
+    PBC). After 3 steps with ``neigh_modify every 1``:
+
+    - NULL @ x=5.5 -> 6.5 -> 7.5 -> 8.5 : crosses x=6.5 boundary
+      between steps 0 and 1 (moves from rank 0 to rank 1).
+    - NULL @ x=7.5 -> 8.5 -> 9.5 -> 10.5 : stays in rank 1 but
+      drifts deeper into the rcut window of rank 0.
+
+    Real atoms stay at v=0 so their dynamics are stable; the deepmd
+    model never sees the NULL atoms (filtered by
+    ``select_real_atoms_coord``) so unphysical NULL velocity is
+    harmless. The boundary crossing changes which rank owns each
+    NULL atom across rebuilds — exercising the case where a NULL's
+    fwd_map index moves between the local-section and ghost-section
+    of the per-rank sendlists.
+
+    Coverage: ``has_null_atoms`` must remain True across rebuilds;
+    the ``_with_virtual_atoms`` remap must produce correct outputs
+    even as NULL atoms migrate ranks. Compares mpi-2-rank vs
     mpi-1-rank trajectories.
     """
     runner_args = [
@@ -835,6 +848,8 @@ def test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild() -> None:
         "5.0",
         "--neigh-every",
         "1",
+        "--null-vx",
+        "2000.0",
     ]
     out_mpi = _run_mpi_subprocess(
         nprocs=2,

From b25e00c971d8cb8c62ecc484b0e2eea0e6e694ed Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 30 Apr 2026 22:27:32 +0800
Subject: [PATCH 18/34] test: mixed-direction NULL velocities + real-atom
 thermal motion

Strengthens ``test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild`` so
the rebuilds see non-trivial sendlist composition changes:

- NULL atoms now move in OPPOSITE directions via the new
  ``--null-vx-split`` flag. NULL id=7 (at x=5.5) gets v_x=-2000 A/ps
  -> drifts left (and via PBC into rank 1's far ghost region). NULL
  id=8 (at x=7.5) gets v_x=+2000 A/ps -> drifts right (deeper into
  rank 1's domain). The +/- split means each rebuild sees one NULL
  entering rank 0's sendlist while the other leaves it.

- Real atoms get thermal velocities at T=10000 K via the new
  ``--real-temp`` flag (LAMMPS ``velocity realgroup create T seed``).
  Each real atom gets a different random direction, so the sendlist
  composition is also perturbed by real-atom motion (small but
  detectable under ``every 1`` rebuilds).

NULL atoms still don't contribute to the deepmd model (filtered by
``select_real_atoms_coord``), so their unphysical velocity is
harmless. Real-atom thermal motion at T=10000 K corresponds to a
nominal RMS speed sqrt(3 k_B T / m) of roughly 40 A/ps (mass 16) to
110 A/ps (mass 2) -> ~0.02-0.06 A of motion per 0.0005 ps step;
small enough that NVE stays stable over 3 steps but enough to
perturb sendlists.

Both new flags are no-ops when not passed; existing tests are
unaffected.
---
 .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py | 44 ++++++++++++--
 source/lmp/tests/test_lammps_dpa3_pt2.py      | 58 +++++++++++--------
 2 files changed, 73 insertions(+), 29 deletions(-)

diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
index b35a2a38d4..042f47c56c 100644
--- a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
+++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py
@@ -89,6 +89,26 @@
     "``select_real_atoms_coord``) so their unphysical velocity is "
     "harmless.",
 )
+parser.add_argument(
+    "--null-vx-split",
+    action="store_true",
+    help="With ``--null-vx X``, split type-3 atoms into two groups by "
+    "their LAMMPS atom-id parity: even ids get +X, odd ids get -X. "
+    "Used by the NULL-type rebuild test to send different NULLs in "
+    "opposite directions, so the cross-rank sendlist composition "
+    "changes in BOTH directions per rebuild (rank 0 loses one NULL, "
+    "gains another simultaneously).",
+)
+parser.add_argument(
+    "--real-temp",
+    type=float,
+    default=None,
+    help="Optional initial thermal temperature (Kelvin) for non-NULL "
+    "atoms via ``velocity realgroup create T seed``. Each real atom "
+    "gets a random thermal velocity in a different direction — used "
+    "to exercise sendlist composition changes from real-atom motion "
+    "rather than only from NULL motion.",
+)
 args = parser.parse_args()
 
 lammps = PyLammps()
@@ -119,14 +139,30 @@
     lammps.mass(f"3 {args.mass3}")
 lammps.timestep(0.0005)
 lammps.fix("1 all nve")
+# Initial velocities. Order matters: thermalize real atoms first
+# (``velocity all create`` would also affect type 3, so we restrict
+# it to a real-atom group), then set the NULL bias on type 3.
+if args.real_temp is not None:
+    lammps.group("realgroup type 1 2")
+    lammps.velocity(f"realgroup create {args.real_temp:.6f} 12345 mom yes rot yes")
 if args.null_vx is not None:
     # Restrict initial velocity to LAMMPS type 3 atoms (NULL-type
     # in the deepmd plugin's pair_coeff mapping). Real atoms stay
-    # at v=0; only the NULL atoms get the high vx, so the deepmd
-    # model's force outputs on real atoms remain bounded and the
-    # NVE integrator stays stable.
+    # at v=0 (or thermal); only the NULL atoms get the high vx, so
+    # the deepmd model's force outputs on real atoms remain bounded
+    # and the NVE integrator stays stable.
     lammps.group("nullgroup type 3")
-    lammps.velocity(f"nullgroup set {args.null_vx:.6f} 0.0 0.0 units box")
+    if args.null_vx_split:
+        # Send NULL atoms with even/odd LAMMPS atom-id in opposite
+        # directions. Hardcoded to the null_type fixture's NULL ids
+        # (7, 8); sufficient because the runner is only used by this
+        # branch's tests, not as a general utility.
+        lammps.group("null_id7 id 7")
+        lammps.group("null_id8 id 8")
+        lammps.velocity(f"null_id7 set {-args.null_vx:.6f} 0.0 0.0 units box")
+        lammps.velocity(f"null_id8 set {args.null_vx:.6f} 0.0 0.0 units box")
+    else:
+        lammps.velocity(f"nullgroup set {args.null_vx:.6f} 0.0 0.0 units box")
 
 lammps.pair_style(f"deepmd {args.PB_FILE}")
 lammps.pair_coeff(args.pair_coeff)
diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py
index e7b2b525b8..c3dafa48e8 100644
--- a/source/lmp/tests/test_lammps_dpa3_pt2.py
+++ b/source/lmp/tests/test_lammps_dpa3_pt2.py
@@ -815,31 +815,36 @@ def test_pair_deepmd_mpi_dpa3_all_null_rank() -> None:
     importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
 )
 def test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild() -> None:
-    """NULL-type atoms physically crossing the rank boundary during MD.
-
-    NULL atoms get a high initial v_x = 2000 A/ps via
-    ``--null-vx 2000`` so that with timestep 0.0005 ps each NULL atom
-    moves 1.0 A per step. Initial NULL positions are x=5.5 (rank 0,
-    moving right) and x=7.5 (rank 1, also moving right — wraps via
-    PBC). After 3 steps with ``neigh_modify every 1``:
-
-    - NULL @ x=5.5 -> 6.5 -> 7.5 -> 8.5 : crosses x=6.5 boundary
-      between steps 0 and 1 (moves from rank 0 to rank 1).
-    - NULL @ x=7.5 -> 8.5 -> 9.5 -> 10.5 : stays in rank 1 but
-      drifts deeper into the rcut window of rank 0.
-
-    Real atoms stay at v=0 so their dynamics are stable; the deepmd
-    model never sees the NULL atoms (filtered by
-    ``select_real_atoms_coord``) so unphysical NULL velocity is
-    harmless. The boundary crossing changes which rank owns each
-    NULL atom across rebuilds — exercising the case where a NULL's
-    fwd_map index moves between the local-section and ghost-section
-    of the per-rank sendlists.
-
-    Coverage: ``has_null_atoms`` must remain True across rebuilds;
-    the ``_with_virtual_atoms`` remap must produce correct outputs
-    even as NULL atoms migrate ranks. Compares mpi-2-rank vs
-    mpi-1-rank trajectories.
+    """NULL atoms move in OPPOSITE directions (+x / -x) while
+    real atoms move randomly via thermal motion — sendlist
+    composition changes both ways per rebuild.
+
+    Initial conditions:
+    - Real atoms (types 1, 2): thermal velocities at T=10000 K
+      (``--real-temp 10000``). Each real atom gets a different
+      random direction; mass-weighted RMS speed is roughly
+      3 - 9 A/ps so motion in 3 steps is ~0.005 - 0.015 A. Tiny
+      but enough to perturb sendlist composition under
+      ``every 1`` rebuilds.
+    - NULL atom 7 (id=7) at x=5.5: gets ``v_x = -2000 A/ps`` via
+      ``--null-vx 2000 --null-vx-split`` (odd id -> negative).
+      Moves in -x: x = 5.5 -> 4.5 -> 3.5 -> 2.5 (stays in rank 0
+      but drifts deeper into the PBC ghost region of rank 1).
+    - NULL atom 8 (id=8) at x=7.5: gets ``v_x = +2000 A/ps``
+      (even id -> positive). x = 7.5 -> 8.5 -> 9.5 -> 10.5 (stays
+      in rank 1 but drifts deeper).
+
+    The +x/-x split means each rebuild sees NULL atoms entering
+    different sendlists (rank 0's right-edge sendlist gains NULL 7
+    even as it loses NULL 8 deeper into rank 1's domain, and vice
+    versa). Real-atom thermal motion provides additional sendlist
+    perturbation per atom.
+
+    Coverage: ``has_null_atoms`` must remain True; the
+    ``_with_virtual_atoms`` remap must produce correct outputs as
+    NULL atoms migrate in mixed directions and real-atom positions
+    shift. Compares mpi-2-rank vs mpi-1-rank trajectories
+    deterministically (both use the same velocity seed 12345).
     """
     runner_args = [
         "--pair-coeff",
@@ -850,6 +855,9 @@ def test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild() -> None:
         "1",
         "--null-vx",
         "2000.0",
+        "--null-vx-split",
+        "--real-temp",
+        "10000.0",
     ]
     out_mpi = _run_mpi_subprocess(
         nprocs=2,

From 124dc5e85a1913d3505b4d8f47dac5cecc621387 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 30 Apr 2026 22:35:53 +0800
Subject: [PATCH 19/34] test: empty-subdomain test exercises cached
 mapping_tensor path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends ``test_pair_deepmd_mpi_dpa3_empty_subdomain`` to run 5 MD
steps with ``neigh_modify every 100`` instead of a single
``lammps.run(0)``. This forces:

  step 0  -> ago=0 (full rebuild; mapping_tensor + firstneigh_tensor
             populated for the first time on the empty-subdomain rank)
  step 1  -> ago=1 (cache HIT — mapping_tensor and firstneigh_tensor
             reused)
  step 2  -> ago=2 (cache hit)
  step 3  -> ago=3 (cache hit)
  step 4  -> ago=4 (cache hit)
  step 5  -> ago=5 (cache hit)

Closes the catalog gap "Empty subdomain under PR 5407's
mapping_tensor cache". Previously only step 0 was tested, which is
always ago=0; the cache-hit branch in DeepPotPTExpt::compute on a
rank with nloc=0 was unexercised.

Compares mpi-2 vs mpi-1 trajectory with the same tolerances as the
other rebuild tests (atol=1e-6 / rtol=1e-6 for forces and virials,
rel=1e-8 for the potential energy).
---
 source/lmp/tests/test_lammps_dpa3_pt2.py | 39 +++++++++++++++++++-----
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py
index c3dafa48e8..a7a58b9f49 100644
--- a/source/lmp/tests/test_lammps_dpa3_pt2.py
+++ b/source/lmp/tests/test_lammps_dpa3_pt2.py
@@ -596,23 +596,46 @@ def test_pair_deepmd_mpi_dpa3_nlist_rebuild() -> None:
 def test_pair_deepmd_mpi_dpa3_empty_subdomain() -> None:
     """Multi-rank DPA3 with one rank owning zero local atoms.
 
+    Runs 5 MD steps with ``neigh_modify every 100`` so the nlist is
+    rebuilt only once (at step 0, ago=0) and the next 5 force
+    evaluations exercise the cached ``mapping_tensor`` /
+    ``firstneigh_tensor`` path (PR 5407 caching) under empty
+    subdomain. Atoms move ~0 (v=0 default) so positions only differ
+    by tiny round-off, but the C++ dispatch path with cached state
+    on rank 1 (which has nloc=0) must still produce correct
+    cross-rank forces.
+
     Uses a 30 x 13 x 13 box with all six atoms clustered in x in
     [0.25, 12.83]. Under ``processors 2 1 1`` the split is at x = 15
     so rank 1 owns an empty subdomain. The comm-dispatch path must
     still produce correct forces and virial (compared against a
-    same-archive single-rank reference of the same configuration).
+    same-archive single-rank reference of the same trajectory).
 
     This catches: zero-length send/recv lists in the comm tensors,
-    division-by-zero in nlocal-dependent reshapes, and any silent
-    drop of a rank's contribution when it has no atoms to evaluate.
+    division-by-zero in nlocal-dependent reshapes, silent drop of a
+    rank's contribution when it has no atoms to evaluate, AND
+    cache-hit (ago>0) bugs specific to the empty-subdomain rank.
     """
-    out_mpi = _run_mpi_subprocess(nprocs=2, data_path=data_file_empty_subdomain)
-    out_ref = _run_mpi_subprocess(nprocs=1, data_path=data_file_empty_subdomain)
-    np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0)
+    runner_args = ["--neigh-every", "100"]
+    out_mpi = _run_mpi_subprocess(
+        nprocs=2,
+        data_path=data_file_empty_subdomain,
+        extra_args=["--nsteps", "5"],
+        runner_args=runner_args,
+    )
+    out_ref = _run_mpi_subprocess(
+        nprocs=1,
+        data_path=data_file_empty_subdomain,
+        extra_args=["--nsteps", "5"],
+        runner_args=runner_args,
+    )
     np.testing.assert_allclose(
-        out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0
+        out_mpi["forces"], out_ref["forces"], atol=1e-6, rtol=1e-6
     )
-    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-12, abs=1e-12)
+    np.testing.assert_allclose(
+        out_mpi["virials"], out_ref["virials"], atol=1e-6, rtol=1e-6
+    )
+    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10)
 
 
 @pytest.mark.skipif(

From 5fef5c6b55752452240e063fcecc115c6087eefe Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 30 Apr 2026 23:17:10 +0800
Subject: [PATCH 20/34] test(spin-mpi): cover spin GNN multi-rank end-to-end

Adds the smallest reachable test that exercises the full spin GNN
multi-rank dispatch chain (Tier-1 #2 in gnn_mpi_untested_paths).

- gen_spin.py: also produce deeppot_dpa3_spin_mpi.pt2 (DPA3 +
  use_loc_mapping=False + use_spin=[True, False]) so a dual-artifact
  spin GNN .pt2 exists for testing.
- run_mpi_pair_deepmd_spin_dpa3_pt2.py: MPI runner that drives the
  spin pair_style and gathers force / force_mag / virial across ranks.
  fm goes via 'compute property/atom fmx fmy fmz' since the legacy
  extract/gather_atoms registry doesn't expose 'fm'.
- test_lammps_spin_dpa3_pt2.py: mpirun -n 2 vs same-archive mpirun -n 1
  reference for energy / force / force_mag / virial (atol 1e-8). A
  divergence is necessarily a problem in DeepSpinPTExpt multi-rank
  dispatch, the spin branch of _exchange_ghosts, the C++
  deepmd_export::border_op invocation, or the comm-tensor builder.
- _build_dynamic_shapes: bump nall_dim min from 1 to 4 when has_spin.
  Without this, torch.export raises CONSTRAINT_VIOLATION on the
  pre-doubling nall axis when tracing GNN spin with with_comm_dict
  (the suggested fix in the error matches min=4).

Eager parity (test_spin_dpa3_eager_parity) and trace-only validation
already existed; this PR closes the gap by adding AOTI compile +
LAMMPS load + real MPI exchange.

Known limitations:
- Single configuration tested (4 atoms, 2 ranks, type_map ["Ni", "O"],
  use_spin=[True, False]).  No NULL-type, empty-subdomain, nlist-rebuild
  variants for spin yet -- the non-spin DPA3 path covers those code
  branches and the spin override differs only in the real/virtual
  split, which the new test exercises.
- do_atomic_virial=True only (matches all current multi-rank tests;
  Tier-1 #3 still open).
- N=2 only; no decomposition/N>2 spin variant.
- CPU only.
---
 deepmd/pt_expt/utils/serialization.py         |    8 +-
 .../run_mpi_pair_deepmd_spin_dpa3_pt2.py      |  117 ++
 source/lmp/tests/test_lammps_spin_dpa3_pt2.py |  178 ++
 source/tests/infer/deeppot_dpa3_spin_mpi.yaml | 1863 +++++++++++++++++
 source/tests/infer/gen_spin.py                |   76 +-
 5 files changed, 2240 insertions(+), 2 deletions(-)
 create mode 100644 source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py
 create mode 100644 source/lmp/tests/test_lammps_spin_dpa3_pt2.py
 create mode 100644 source/tests/infer/deeppot_dpa3_spin_mpi.yaml

diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index acaa58dbeb..cdf1937fe5 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -318,7 +318,13 @@ def _build_dynamic_shapes(
     nframes_dim: torch.export.Dim | int = (
         1 if with_comm_dict else torch.export.Dim("nframes", min=1)
     )
-    nall_dim = torch.export.Dim("nall", min=1)
+    # Spin models double atom count internally (real + virtual). Some
+    # GNN ops in the spin path generate a min=4 constraint on the
+    # *pre-doubling* nall axis (matches "Suggested fixes" from
+    # torch.export's CONSTRAINT_VIOLATION error). Bump the min for spin
+    # so the export does not error on the inferred guard.
+    nall_min = 4 if has_spin else 1
+    nall_dim = torch.export.Dim("nall", min=nall_min)
     nloc_dim = torch.export.Dim("nloc", min=1)
     nnei_dim = torch.export.Dim("nnei", min=max(1, model_nnei))
 
diff --git a/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py
new file mode 100644
index 0000000000..a8d7fe71a6
--- /dev/null
+++ b/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Multi-rank LAMMPS driver for the DPA3 spin GNN .pt2 fixture.
+
+Mirrors ``run_mpi_pair_deepmd_dpa3_pt2.py`` but for spin models: it
+uses ``atom_style spin`` / ``pair_style deepspin`` and gathers the
+per-atom magnetic force ``fm`` in addition to the normal force and
+per-atom virial. The DPA3 spin .pt2 with ``use_loc_mapping=False``
+carries a with-comm AOTI artifact (Phase 3 dual-artifact layout); the
+C++ ``DeepSpinPTExpt`` (Phase 4c) routes to it when LAMMPS reports
+nswap > 0 (multi-rank), driving MPI ghost-atom exchange via
+``deepmd_export::border_op``.
+
+Rank 0 writes potential energy + per-atom forces (3 cols) +
+per-atom force_mag (3 cols) + per-atom virial (9 cols, from
+``compute centroid/stress/atom NULL pair`` in LAMMPS internal units)
+to ``OUTPUT`` so the parent pytest process can compare against the
+single-rank reference.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import argparse
+
+import numpy as np
+from lammps import (
+    PyLammps,
+)
+from mpi4py import (
+    MPI,
+)
+
+comm = MPI.COMM_WORLD
+rank = comm.Get_rank()
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "DATAFILE", type=str, help="LAMMPS data file (atom positions + spin)"
+)
+parser.add_argument("PB_FILE", type=str, help=".pt2 model file (spin GNN)")
+parser.add_argument(
+    "OUTPUT", type=str, help="Output file for energies + forces + force_mag + virial"
+)
+parser.add_argument(
+    "--nsteps",
+    type=int,
+    default=0,
+    help="Number of MD steps to run after the initial force evaluation. "
+    "Note: integrating spin requires fix nve/spin which is outside the "
+    "scope of this multi-rank correctness test; we only run static "
+    "force/energy evaluations and an optional run > 0 to exercise the "
+    "with-comm dispatch across neighbour-list rebuilds.",
+)
+parser.add_argument(
+    "--processors",
+    type=str,
+    default="2 1 1",
+    help="LAMMPS processors grid. Default '2 1 1' forces multi-rank "
+    "domain decomposition (nswap>0). Pass '1 1 1' for a single-rank "
+    "reference run on the same archive.",
+)
+args = parser.parse_args()
+
+lammps = PyLammps()
+lammps.processors(args.processors)
+lammps.units("metal")
+lammps.boundary("p p p")
+lammps.atom_style("spin")
+lammps.atom_modify("map yes")
+lammps.neighbor("2.0 bin")
+lammps.neigh_modify("every 10 delay 0 check no")
+lammps.read_data(args.DATAFILE)
+lammps.mass("1 58")
+lammps.mass("2 16")
+lammps.timestep(0.0005)
+lammps.fix("1 all nve")
+
+lammps.pair_style(f"deepspin {args.PB_FILE}")
+lammps.pair_coeff("* *")
+lammps.compute("virial all centroid/stress/atom NULL pair")
+# Per-atom magnetic force components. LAMMPS does not expose ``fm``
+# through the legacy ``extract``/``gather_atoms`` registry, so we go
+# via ``compute property/atom fmx fmy fmz`` + ``gather`` to obtain a
+# global, id-ordered (nlocal+nghost reduced) array on every rank.
+lammps.compute("fmprop all property/atom fmx fmy fmz")
+lammps.run(0)
+
+if args.nsteps > 0:
+    lammps.run(args.nsteps)
+
+# All per-atom data goes through the LAMMPS global gather API.
+# ``c_fmprop`` is the compute defined above (fmx/fmy/fmz columns).
+forces_global = lammps.lmp.gather_atoms("f", 1, 3)
+ids_global = lammps.lmp.gather_atoms("id", 0, 1)
+virial_global = lammps.lmp.gather("c_virial", 1, 9)
+fm_global = lammps.lmp.gather("c_fmprop", 1, 3)
+
+if rank == 0:
+    pe_global = lammps.eval("pe")
+    natoms = lammps.atoms.natoms
+    forces = np.array(forces_global, dtype=np.float64).reshape(natoms, 3)
+    fm = np.array(fm_global, dtype=np.float64).reshape(natoms, 3)
+    virials = np.array(virial_global, dtype=np.float64).reshape(natoms, 9)
+    ids = np.array(ids_global, dtype=np.int64).reshape(natoms)
+    order = np.argsort(ids)
+    forces = forces[order]
+    fm = fm[order]
+    virials = virials[order]
+    with open(args.OUTPUT, "w") as f:
+        f.write(f"{pe_global:.16e}\n")
+        # Each row: 3 force + 3 force_mag + 9 virial = 15 columns.
+        for fi, fmi, vi in zip(forces, fm, virials, strict=True):
+            row = np.concatenate([fi, fmi, vi])
+            f.write(" ".join(f"{v:.16e}" for v in row) + "\n")
+
+MPI.Finalize()
diff --git a/source/lmp/tests/test_lammps_spin_dpa3_pt2.py b/source/lmp/tests/test_lammps_spin_dpa3_pt2.py
new file mode 100644
index 0000000000..fd4ee0a7cb
--- /dev/null
+++ b/source/lmp/tests/test_lammps_spin_dpa3_pt2.py
@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Multi-rank LAMMPS test for the DPA3 spin GNN .pt2 fixture.
+
+The DPA3 spin .pt2 (``deeppot_dpa3_spin_mpi.pt2``) is generated by
+``source/tests/infer/gen_spin.py`` with ``use_loc_mapping=False``,
+producing a dual-artifact archive whose nested
+``forward_lower_with_comm.pt2`` is selected by ``DeepSpinPTExpt`` when
+LAMMPS reports ``nswap > 0`` (multi-rank). This test exercises the
+spin GNN multi-rank dispatch end-to-end:
+
+1. Eager parity is already covered by
+   ``source/tests/pt_expt/model/test_spin_export_with_comm.py
+   ::test_spin_dpa3_eager_parity`` (Python override only).
+2. AOTI compile of the with-comm artifact is verified at fixture
+   generation time (``gen_spin.py`` calls ``convert_backend`` which
+   triggers the compile).
+3. **This test** wires the loaded artifact through ``DeepSpinPTExpt``,
+   ``commPTExpt::build_comm_tensors_positional``, the C++
+   ``deepmd_export::border_op`` registration, and real MPI ghost
+   exchange between two LAMMPS subdomains. A passing test means the
+   full chain (Python override + AOTI export + C++ load + comm-tensor
+   build + custom op invocation + MPI exchange) produces forces /
+   force_mag / virial identical to a same-archive single-rank
+   reference within numerical tolerance.
+
+Compares mpi-2 vs same-archive mpi-1 to avoid hardcoding numerical
+references (the same approach used for the DPA3 / DPA2 multi-rank
+tests). Same-archive means the regular and with-comm artifacts come
+from the same trace, so any divergence is purely the multi-rank
+dispatch path's responsibility.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import importlib.util
+import os
+import shutil
+import subprocess as sp
+import sys
+import tempfile
+from pathlib import (
+    Path,
+)
+
+import numpy as np
+import pytest
+from write_lmp_data import (
+    write_lmp_data_spin,
+)
+
+pb_file_mpi = (
+    Path(__file__).parent.parent.parent
+    / "tests"
+    / "infer"
+    / "deeppot_dpa3_spin_mpi.pt2"
+)
+data_file = Path(__file__).parent / "data_dpa3_spin_pt2.lmp"
+
+# 4-atom Ni-O system; same layout as ``test_lammps_spin_pt2.py``. With
+# ``processors 2 1 1`` the split sits at x=6.5 -> 2 atoms per rank.
+box = np.array([0, 13, 0, 13, 0, 13, 0, 0, 0])
+coord = np.array(
+    [
+        [12.83, 2.56, 2.18],
+        [12.09, 2.87, 2.74],
+        [3.51, 2.51, 2.60],
+        [4.27, 3.22, 1.56],
+    ]
+)
+spin = np.array(
+    [
+        [0, 0, 1.2737],
+        [0, 0, 1.2737],
+        [0, 0, 0],
+        [0, 0, 0],
+    ]
+)
+type_NiO = np.array([1, 1, 2, 2])
+
+
+def setup_module() -> None:
+    if os.environ.get("ENABLE_PYTORCH", "1") != "1":
+        pytest.skip(
+            "Skip test because PyTorch support is not enabled.",
+        )
+    write_lmp_data_spin(box, coord, spin, type_NiO, data_file)
+
+
+def teardown_module() -> None:
+    if data_file.exists():
+        os.remove(data_file)
+
+
+def _run_mpi_subprocess(
+    nprocs: int,
+    extra_args: list[str] | None = None,
+    processors: str | None = None,
+) -> dict:
+    """Run ``run_mpi_pair_deepmd_spin_dpa3_pt2.py`` under
+    ``mpirun -n <nprocs>`` and return
+    ``{"pe", "forces", "force_mag", "virials"}``.
+    """
+    with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f:
+        out_path = f.name
+    try:
+        argv = [
+            "mpirun",
+            "-n",
+            str(nprocs),
+            sys.executable,
+            str(Path(__file__).parent / "run_mpi_pair_deepmd_spin_dpa3_pt2.py"),
+            str(data_file.resolve()),
+            str(pb_file_mpi.resolve()),
+            out_path,
+        ]
+        if processors is not None:
+            argv.extend(["--processors", processors])
+        elif nprocs == 1:
+            argv.extend(["--processors", "1 1 1"])
+        if extra_args:
+            argv.extend(extra_args)
+        sp.check_call(argv)
+        with open(out_path) as fh:
+            lines = fh.read().strip().splitlines()
+        pe = float(lines[0])
+        rows = np.array(
+            [list(map(float, line.split())) for line in lines[1:]],
+            dtype=np.float64,
+        )
+        # Each row: 3 force + 3 force_mag + 9 virial = 15 cols (see runner).
+        forces = rows[:, :3]
+        force_mag = rows[:, 3:6]
+        virials = rows[:, 6:]
+        return {
+            "pe": pe,
+            "forces": forces,
+            "force_mag": force_mag,
+            "virials": virials,
+        }
+    finally:
+        if os.path.exists(out_path):
+            os.remove(out_path)
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa3_spin() -> None:
+    """Multi-rank LAMMPS run for spin DPA3 .pt2 must match the
+    same-archive single-rank reference within numerical tolerance for
+    energy, forces, force_mag, and per-atom virial.
+
+    Going via mpi-1 (rather than a hardcoded reference array) means we
+    are validating the multi-rank dispatch path itself, isolated from
+    any tracing / AOTI precision drift that might appear at fixture
+    generation time. Single-rank uses the regular artifact (nswap=0);
+    multi-rank uses the with-comm artifact — so a divergence here is
+    necessarily a problem in either ``DeepSpinPTExpt`` multi-rank
+    dispatch, the spin branch of ``_exchange_ghosts``, the C++
+    ``deepmd_export::border_op`` invocation, or the comm-tensor
+    builder.
+    """
+    out_mpi = _run_mpi_subprocess(nprocs=2)
+    out_ref = _run_mpi_subprocess(nprocs=1)
+
+    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-10, abs=1e-12)
+    np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0)
+    np.testing.assert_allclose(
+        out_mpi["force_mag"], out_ref["force_mag"], atol=1e-8, rtol=0
+    )
+    np.testing.assert_allclose(
+        out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0
+    )
diff --git a/source/tests/infer/deeppot_dpa3_spin_mpi.yaml b/source/tests/infer/deeppot_dpa3_spin_mpi.yaml
new file mode 100644
index 0000000000..6fce85245a
--- /dev/null
+++ b/source/tests/infer/deeppot_dpa3_spin_mpi.yaml
@@ -0,0 +1,1863 @@
+backend: dpmodel
+model:
+  backbone_model:
+    "@class": Model
+    "@variables":
+      out_bias:
+        "@class": np.ndarray
+        "@is_variable": true
+        "@version": 1
+        dtype: float64
+        value:
+          - - - 0.0
+            - - 0.0
+            - - 0.0
+            - - 0.0
+      out_std:
+        "@class": np.ndarray
+        "@is_variable": true
+        "@version": 1
+        dtype: float64
+        value:
+          - - - 1.0
+            - - 1.0
+            - - 1.0
+            - - 1.0
+    "@version": 2
+    atom_exclude_types: &id002
+      - 2
+      - 3
+    descriptor:
+      "@class": Descriptor
+      "@version": 2
+      activation_function: silu
+      add_chg_spin_ebd: false
+      concat_output_tebd: false
+      env_protection: 1.0e-06
+      exclude_types: &id003
+        - - 3
+          - 0
+        - - 3
+          - 1
+        - - 3
+          - 2
+        - - 3
+          - 3
+      ntypes: 4
+      precision: float64
+      repflow_args:
+        a_compress_e_rate: 1
+        a_compress_rate: 0
+        a_compress_use_split: false
+        a_dim: 4
+        a_rcut: 3.5
+        a_rcut_smth: 0.5
+        a_sel: 4
+        axis_neuron: 4
+        e_dim: 6
+        e_rcut: 4.0
+        e_rcut_smth: 0.5
+        e_sel: 8
+        edge_init_use_dist: false
+        fix_stat_std: 0.3
+        n_dim: 8
+        n_multi_edge_message: 1
+        nlayers: 1
+        optim_update: true
+        sel_reduce_factor: 10.0
+        smooth_edge_update: false
+        update_angle: false
+        update_residual: 0.1
+        update_residual_init: const
+        update_style: res_residual
+        use_dynamic_sel: false
+        use_exp_switch: false
+      repflow_variable:
+        "@variables":
+          davg:
+            "@class": np.ndarray
+            "@is_variable": true
+            "@version": 1
+            dtype: float64
+            value:
+              - - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+              - - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+              - - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+              - - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+                - - 0.0
+                  - 0.0
+                  - 0.0
+                  - 0.0
+          dstd:
+            "@class": np.ndarray
+            "@is_variable": true
+            "@version": 1
+            dtype: float64
+            value:
+              - - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+              - - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+              - - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+              - - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+                - - 0.3
+                  - 0.3
+                  - 0.3
+                  - 0.3
+        angle_embd:
+          "@class": Layer
+          "@variables":
+            b: null
+            idt: null
+            w:
+              "@class": np.ndarray
+              "@is_variable": true
+              "@version": 1
+              dtype: float64
+              value:
+                - - 0.17649720801051316
+                  - 0.26111987625920485
+                  - -0.5130082451185703
+                  - -0.473906411761865
+          "@version": 2
+          activation_function: none
+          bias: false
+          precision: float64
+          resnet: false
+          trainable: true
+          use_timestep: false
+        edge_embd:
+          "@class": Layer
+          "@variables":
+            b:
+              "@class": np.ndarray
+              "@is_variable": true
+              "@version": 1
+              dtype: float64
+              value:
+                - -0.34071690817734457
+                - -0.10233679058558394
+                - -0.042158524863509905
+                - 0.1901865304247668
+                - 0.5454968471423458
+                - -0.15466206519031384
+            idt: null
+            w:
+              "@class": np.ndarray
+              "@is_variable": true
+              "@version": 1
+              dtype: float64
+              value:
+                - - -0.41394371690540416
+                  - -0.020029235227998383
+                  - 0.7548439568852728
+                  - 0.18561320525199423
+                  - -0.1235585931191982
+                  - -0.6320668874586287
+          "@version": 2
+          activation_function: none
+          bias: true
+          precision: float64
+          resnet: false
+          trainable: true
+          use_timestep: false
+        env_mat:
+          protection: 0.0
+          rcut: 4.0
+          rcut_smth: 0.5
+          use_exp_switch: false
+        repflow_layers:
+          - "@class": RepFlowLayer
+            "@variables":
+              a_residual: []
+              e_residual:
+                - "@class": np.ndarray
+                  "@is_variable": true
+                  "@version": 1
+                  dtype: float64
+                  value:
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+              n_residual:
+                - "@class": np.ndarray
+                  "@is_variable": true
+                  "@version": 1
+                  dtype: float64
+                  value:
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                - "@class": np.ndarray
+                  "@is_variable": true
+                  "@version": 1
+                  dtype: float64
+                  value:
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                - "@class": np.ndarray
+                  "@is_variable": true
+                  "@version": 1
+                  dtype: float64
+                  value:
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+                    - 0.1
+            "@version": 2
+            a_compress_e_rate: 1
+            a_compress_rate: 0
+            a_compress_use_split: false
+            a_dim: 4
+            a_rcut: 3.5
+            a_rcut_smth: 0.5
+            a_sel: 4
+            activation_function: silu
+            axis_neuron: 4
+            e_dim: 6
+            e_rcut: 4.0
+            e_rcut_smth: 0.5
+            e_sel:
+              - 8
+            edge_self_linear:
+              "@class": Layer
+              "@variables":
+                b:
+                  "@class": np.ndarray
+                  "@is_variable": true
+                  "@version": 1
+                  dtype: float64
+                  value:
+                    - 0.023029640256287606
+                    - -0.1553057617002839
+                    - 0.288173154238969
+                    - 0.05107253540449686
+                    - 0.2614964936530096
+                    - -0.46218537330158843
+                idt: null
+                w:
+                  "@class": np.ndarray
+                  "@is_variable": true
+                  "@version": 1
+                  dtype: float64
+                  value:
+                    - - -0.05677878533841125
+                      - 0.08257140520321203
+                      - -0.17682541236689134
+                      - -0.06780335156677138
+                      - 0.2613225810769312
+                      - -0.2528499571680411
+                    - - -0.1060811611888437
+                      - 0.2834597688459181
+                      - 0.17021435817066793
+                      - -0.2119118384053945
+                      - 0.044257126661162875
+                      - -0.20348530980582044
+                    - - -0.17206345113221683
+                      - 0.19628652842871394
+                      - -0.03853877890775931
+                      - 0.0064469870425646406
+                      - -0.2035021228657865
+                      - 0.33893151231499635
+                    - - -0.1603541649096622
+                      - -0.07431170557593793
+                      - -0.1285051383598977
+                      - -0.09531516630926942
+                      - -0.10774430641710406
+                      - 0.10558546868439946
+                    - - -0.027408677366010784
+                      - 0.03171038951939523
+                      - -0.26649612755080526
+                      - 0.0749559135333121
+                      - 0.12753219377780048
+                      - -0.12375862279261161
+                    - - -0.3561807917324476
+                      - -0.028580689473013433
+                      - -0.2740045204725747
+                      - 0.12423725221263406
+                      - -0.0746927118825747
+                      - -0.16583458613892285
+                    - - -0.22497394079088218
+                      - -0.10329264538957945
+                      - -0.06015496745765555
+                      - -0.24047390264558702
+                      - -0.2470728805254821
+                      - -0.03091482236168157
+                    - - 0.313786674711663
+                      - -0.014345848137540798
+                      - -0.1446657411476756
+                      - -0.11134433415995286
+                      - -0.10957716367503506
+                      - 0.25318230359455945
+                    - - -0.11353216449019704
+                      - -0.24278855525542462
+                      - -0.0657328669264313
+                      - -0.08357873620530161
+                      - -0.19969579432068596
+                      - 0.0217399962565733
+                    - - -0.017346111123240478
+                      - -0.20460540022763518
+                      - 0.19580548714183002
+                      - 0.26081320850512657
+                      - -0.01937612111130992
+                      - 0.26782602217325135
+                    - - 0.169450738702664
+                      - 0.0007586409725729347
+                      - 0.2217946757639642
+                      - -0.08785618340632881
+                      - 0.08754553673729902
+                      - 0.0459550075486224
+                    - - -0.10347442434861592
+                      - -0.05665265742992996
+                      - -0.15657294594958857
+                      - 0.07518451488260593
+                      - -0.200469822163535
+                      - 0.008552407309104103
+                    - - -0.13418495292451688
+                      - -0.15007855071339413
+                      - -0.47561245640659827
+                      - 0.05519145026405931
+                      - 0.034426127944687676
+                      - -0.19833628864440492
+                    - - -0.061539077996517144
+                      - 0.140236735963936
+                      - 0.44907007382747016
+                      - -0.17502514002597466
+                      - -0.13141545313528988
+                      - -0.10225960767785013
+                    - - 0.15849623153238157
+                      - 0.14969793438965767
+                      - 0.05020396887825857
+                      - -0.42237393212574514
+                      - -0.43560848414739306
+                      - -0.34368434587411545
+                    - - -0.090558268807612
+                      - -0.10586479947976848
+                      - -0.17654116465986686
+                      - -0.17464251661717356
+                      - -0.17707748016637653
+                      - 0.4728011907076426
+                    - - 0.06839467741547146
+                      - 0.20172332403056187
+                      - -0.20761658659357723
+                      - -0.3179201458113386
+                      - 0.1570191398976539
+                      - 0.30829366728408747
+                    - - -0.07346831672915768
+                      - -0.01603422028135059
+                      - -0.2343121216044693
+                      - -0.09228986456390967
+                      - -0.12259985802096685
+                      - 0.13925704477109332
+                    - - 0.03112673892006412
+                      - -0.12259170091097643
+                      - -0.01873720650800844
+                      - -0.02825905483134531
+                      - -0.07410620360262994
+                      - -0.13890487670689447
+                    - - 0.2599426512954838
+                      - -0.030475413023044056
+                      - 0.04418102981236639
+                      - 0.14747916053674695
+                      - 0.11469436259489629
+                      - -0.12589465767715197
+                    - - 0.1534348683560527
+                      - -0.2598559665351654
+                      - 0.1691188844559884
+                      - -0.05815067519957393
+                      - -0.09922406205302857
+                      - -0.026111067214965193
+                    - - -0.09469687008709152
+                      - -0.25433614509748265
+                      - -0.3230603080176275
+                      - 0.10565697308598668
+                      - 0.11382397843310456
+                      - 0.12636033735242963
+              "@version": 2
+              activation_function: none
+              bias: true
+              precision: float64
+              resnet: false
+              trainable: true
+              use_timestep: false
+            n_dim: 8
+            n_multi_edge_message: 1
+            node_edge_linear:
+              "@class": Layer
+              "@variables":
+                b:
+                  "@class": np.ndarray
+                  "@is_variable": true
+                  "@version": 1
+                  dtype: float64
+                  value:
+                    - -0.08691917211888521
+                    - -0.2190238624015504
+                    - -0.03155197949842874
+                    - -0.06801377552229042
+                    - -0.36593263653854924
+                    - -0.2524793728902088
+                    - -0.2985692887165394
+                    - 0.041241949395424804
+                idt: null
+                w:
+                  "@class": np.ndarray
+                  "@is_variable": true
+                  "@version": 1
+                  dtype: float64
+                  value:
+                    - - -0.0696849625476975
+                      - 0.2297992430114777
+                      - 0.2016390404837622
+                      - -0.14268332516715398
+                      - 0.05104561028973491
+                      - -0.0047524771390660015
+                      - 0.2007647888532239
+                      - -0.13892443853282946
+                    - - -0.06269186432762
+                      - -0.3620323943560662
+                      - 0.1429598213093448
+                      - 0.32251103225335886
+                      - 0.03297508356302982
+                      - 0.09813020765990464
+                      - -0.2128967691985101
+                      - 0.1446653191521595
+                    - - 0.3408694676048958
+                      - 0.2108742996875453
+                      - 0.3708891734344055
+                      - 0.03603632207631642
+                      - -0.021861106604374982
+                      - 0.05885265981211556
+                      - -0.3850668694292795
+                      - -0.02565715855818745
+                    - - 0.2554067355579947
+                      - -0.12385934461529113
+                      - 0.20794804172087122
+                      - 0.34771760519493133
+                      - 0.0030775484903255083
+                      - 0.08033323613153229
+                      - 0.04547640227535313
+                      - 0.03256133523805088
+                    - - -0.20228475171413982
+                      - -0.40882303462099395
+                      - -0.13933573248982073
+                      - -0.09056898648309795
+                      - 0.06705102826758672
+                      - -0.10643998751725821
+                      - 0.3714789434592029
+                      - -0.15660714896565422
+                    - - 0.0620445405627027
+                      - -0.14981233984554174
+                      - 0.1377612457580642
+                      - -0.3264453797874674
+                      - 0.18886992363386892
+                      - -0.13120999191064697
+                      - 0.2639396300281778
+                      - 0.20744058178112204
+                    - - 0.0902283316504931
+                      - 0.2642720697422412
+                      - 0.11616051352480065
+                      - 0.33194344559115435
+                      - -0.07519119975054182
+                      - -0.05062288710700148
+                      - -0.0033899752949634763
+                      - 0.12074780296663348
+                    - - -0.14625494457636853
+                      - -0.39187048236106403
+                      - 0.005863181213654556
+                      - 0.08058988215606765
+                      - 0.229684677996952
+                      - 0.02491096095922189
+                      - -0.07923462148233366
+                      - 0.03463149425218323
+                    - - 0.1459761391053138
+                      - 0.1826916307305693
+                      - -0.24330282168960599
+                      - -0.15404338160080283
+                      - -0.15732528026070658
+                      - 0.10082194118502665
+                      - 0.10780094880007303
+                      - 0.10439459076027502
+                    - - 0.2967580322447816
+                      - 0.19310548831515634
+                      - 0.13271337197427788
+                      - -0.003964549207383962
+                      - -0.3053587625881187
+                      - 0.12883374510336365
+                      - 0.045960329737757634
+                      - 0.19761345822107057
+                    - - -0.13773367016862742
+                      - 0.09659775346412201
+                      - 0.13561552570758134
+                      - 0.07814681408507194
+                      - -0.28773064288403055
+                      - 0.07556744144481048
+                      - -0.09838713644355178
+                      - 0.009867107649393275
+                    - - -0.09655717020123253
+                      - -0.10871554819348611
+                      - -0.11670258568304015
+                      - 0.2177137774640066
+                      - -0.14817421356773255
+                      - -0.03606693672811542
+                      - -0.026029214690369364
+                      - -0.040666475049662726
+                    - - -0.0677385423671006
+                      - -0.12993597893178244
+                      - 0.1180039874263662
+                      - 0.1384604584579823
+                      - -0.024227421664540914
+                      - 0.1679245814762119
+                      - -0.19280274838451647
+                      - 0.0990223630355508
+                    - - 0.0758415027141385
+                      - 0.16215196433523008
+                      - 0.2767732385588474
+                      - 0.022163750613004355
+                      - -0.12254120989786124
+                      - -0.12391951174230557
+                      - -0.028791741351884195
+                      - -0.0595519969823867
+                    - - 0.22247449036902905
+                      - 0.07567917899966987
+                      - -0.18221068561029122
+                      - -0.1496346790319525
+                      - 0.01739141266484531
+                      - 0.03295277270138665
+                      - -0.27927822171693173
+                      - -0.13558030103477586
+                    - - 0.1712575942124072
+                      - 0.13705603104177683
+                      - 0.290608271870899
+                      - 0.25077636518593155
+                      - 0.06723740912894116
+                      - -0.29077479630216374
+                      - -0.25998108625190797
+                      - 0.15096707384533595
+                    - - -0.011258223444056591
+                      - 0.07940059884337107
+                      - 0.14539160696529732
+                      - -0.33401238196882443
+                      - 0.0359760729699335
+                      - -0.02226084988227022
+                      - 0.12276616178918343
+                      - 0.0439592954772777
+                    - - 0.07596667366672871
+                      - -0.11052600964268607
+                      - -0.13155071841622368
+                      - -0.07425437999013539
+                      - 0.00827734508288158
+                      - 0.07414300482320346
+                      - 0.052019022231599196
+                      - 0.16368644986528788
+                    - - 0.31022863799320216
+                      - 0.11380817934759249
+                      - 0.11671054675679823
+                      - 0.03833224311415518
+                      - 0.1545146635596559
+                      - 0.5283089690392868
+                      - -0.17235747525638992
+                      - -0.16802245441710034
+                    - - 0.19547575805994974
+                      - 0.03442738806627725
+                      - 0.035134165349037516
+                      - 0.1685202553837112
+                      - -0.13706885637245225
+                      - -0.09105484518308726
+                      - 0.24401116664356562
+                      - -0.042463896239058455
+                    - - 0.18293429344914702
+                      - -0.0797150153045118
+                      - 0.2837300628985514
+                      - -0.03290000697254011
+                      - 0.07484025269991934
+                      - 0.4486382833349405
+                      - 0.18215765586473062
+                      - 0.14222755521955213
+                    - - -0.054949228485595726
+                      - 0.2298266346316468
+                      - -0.13022437426681047
+                      - 0.31473958548227127
+                      - -0.16053599380138361
+                      - 0.12351036770696595
+                      - -0.2026640600757936
+                      - -0.3120452604960154
+              "@version": 2
+              activation_function: none
+              bias: true
+              precision: float64
+              resnet: false
+              trainable: true
+              use_timestep: false
+            node_self_mlp:
+              "@class": Layer
+              "@variables":
+                b:
+                  "@class": np.ndarray
+                  "@is_variable": true
+                  "@version": 1
+                  dtype: float64
+                  value:
+                    - 0.3339521660758233
+                    - 0.19617864215638078
+                    - 0.11685150273643896
+                    - -0.04301114015831818
+                    - -0.2646745547826684
+                    - -0.05874585577443532
+                    - 0.4130256006886377
+                    - -0.6003500792716773
+                idt: null
+                w:
+                  "@class": np.ndarray
+                  "@is_variable": true
+                  "@version": 1
+                  dtype: float64
+                  value:
+                    - - -0.22061013087519665
+                      - 0.17161694901625085
+                      - 0.25079797681294247
+                      - -0.06984190636344022
+                      - 0.402412783105689
+                      - -0.13232509868240386
+                      - -0.12410592033624109
+                      - -0.5243896508356666
+                    - - -0.34531669337816745
+                      - 0.2590681097532894
+                      - -0.4170438578433154
+                      - 0.33209656716128205
+                      - 0.20907222698506978
+                      - 0.21026382825889875
+                      - -0.04125433055358784
+                      - -0.3362049950725693
+                    - - -0.02306669199993831
+                      - -0.27140136827851236
+                      - 0.08675906253383281
+                      - 0.20991982378397447
+                      - -0.20157467157772102
+                      - 0.10954533237221269
+                      - -0.30521247150866015
+                      - 0.1039196228402914
+                    - - 0.2927901959232568
+                      - -0.05686111266739088
+                      - -0.352867716741099
+                      - 0.06499009437306054
+                      - 0.2935084094905296
+                      - -0.5208455549268021
+                      - -0.06412894033597939
+                      - 0.2617524844957687
+                    - - -0.26859205166611555
+                      - -0.017740123512057532
+                      - -0.16973184286647353
+                      - -0.041497625408519805
+                      - -0.33848186563738925
+                      - -0.498133067071094
+                      - 0.06453515847241846
+                      - -0.28211046673410256
+                    - - -0.0031712540783364537
+                      - 0.14054927501098227
+                      - -0.16739625499774285
+                      - 0.02924799819668618
+                      - 0.19945724852581612
+                      - -0.07433092972702877
+                      - 0.33641837410477954
+                      - -0.1935354318143647
+                    - - -0.2896583115032089
+                      - -0.4291374752779325
+                      - 0.18521131755882006
+                      - 0.036186935403130116
+                      - 0.27669775576389155
+                      - -0.04763160274577408
+                      - 0.1400908330823242
+                      - 0.15697986928574623
+                    - - -0.45902865822845124
+                      - 0.33250108656046035
+                      - 0.0306169230429561
+                      - -0.035381192364331175
+                      - -0.0510947377580893
+                      - 0.03972955950151097
+                      - 0.6129808284962325
+                      - 0.027297205883797467
+              "@version": 2
+              activation_function: none
+              bias: true
+              precision: float64
+              resnet: false
+              trainable: true
+              use_timestep: false
+            node_sym_linear:
+              "@class": Layer
+              "@variables":
+                b:
+                  "@class": np.ndarray
+                  "@is_variable": true
+                  "@version": 1
+                  dtype: float64
+                  value:
+                    - 0.04402011262092976
+                    - 0.19539133788288796
+                    - 0.02243486288225181
+                    - -0.15932598464026163
+                    - -0.1441065175896103
+                    - -0.20205704607775893
+                    - 0.007090553889850609
+                    - -0.20221671762001667
+                idt: null
+                w:
+                  "@class": np.ndarray
+                  "@is_variable": true
+                  "@version": 1
+                  dtype: float64
+                  value:
+                    - - 0.07307166266137728
+                      - 0.06127645860254937
+                      - -0.18492998679736772
+                      - 0.04613999102916452
+                      - 0.0071079000479441
+                      - -0.03731231022031221
+                      - -0.09483134725409749
+                      - -0.04779952388125717
+                    - - -0.06975495248291474
+                      - -0.19948091645922555
+                      - -0.19101500000694704
+                      - -0.0756612239190429
+                      - 0.18223713459959498
+                      - 0.004660326879702162
+                      - -0.07331926290215518
+                      - -0.11804049351864328
+                    - - -0.008643603296694117
+                      - -0.07891692454926386
+                      - -0.24683520896350286
+                      - -0.07498319216962253
+                      - 0.14604675984008694
+                      - 0.09601184516912262
+                      - -0.01561740011879576
+                      - 0.05490167651869453
+                    - - -0.019884970427089133
+                      - 0.0007666914047260165
+                      - -0.16505651916265357
+                      - -0.16723740821054547
+                      - 0.1234653183096876
+                      - -0.04403642108952563
+                      - -0.03727304005788303
+                      - -0.18190516409632088
+                    - - -0.09168767286099873
+                      - -0.1549419399425698
+                      - 0.08193144903871091
+                      - 0.15640675555750194
+                      - -0.06305034848986912
+                      - 0.16836512195133213
+                      - 0.009048302220263893
+                      - 0.05322075713280992
+                    - - -0.06468543813846328
+                      - 0.0948348631241292
+                      - 0.006867290444906741
+                      - -0.24931773871448817
+                      - 0.08788089155489308
+                      - -0.0739514302480491
+                      - 0.025288498321181765
+                      - -0.08305521153831118
+                    - - -0.07393598220040017
+                      - 0.07042478981157554
+                      - -0.07236047649200875
+                      - -0.04706083253081129
+                      - 0.011054293351306345
+                      - -0.08799610585856558
+                      - 0.1563680796185477
+                      - 0.04333789772104407
+                    - - -0.10653678039670528
+                      - 0.18112426500221723
+                      - 0.009186401470971654
+                      - 0.006153194931152504
+                      - -0.04989535662898608
+                      - -0.17876067282409114
+                      - -0.15602193322162777
+                      - 0.00781917954318876
+                    - - -0.06699569918753231
+                      - 0.30735630566871885
+                      - -0.016096279041795645
+                      - -0.22956358044083025
+                      - -0.0065529000816888765
+                      - -0.06902463180143781
+                      - 0.06768609922953323
+                      - 0.1187567871665586
+                    - - -0.03935798540152214
+                      - -0.10867329670955546
+                      - -0.04052094555163571
+                      - -0.04078630187590839
+                      - -0.0748763378601901
+                      - 0.01860182181594992
+                      - 0.20057959184872112
+                      - 0.16046549209905156
+                    - - 0.031478615338666395
+                      - 0.11567514563874234
+                      - 0.08294594125898624
+                      - -0.07089853590674343
+                      - 0.20101923186451937
+                      - -0.11766930025015115
+                      - 0.21570163379940235
+                      - 0.14563108587004206
+                    - - 0.07932986781606469
+                      - -0.19442968907969105
+                      - 0.05697454617840562
+                      - -0.19484656091831729
+                      - 0.04754566926156801
+                      - -0.12152155059832441
+                      - 0.08105546170302243
+                      - -0.09483406966077029
+                    - - -0.10943334690817784
+                      - 0.11702284889224986
+                      - 0.06551144399385757
+                      - -0.003108503735325857
+                      - -0.1466684268106551
+                      - 0.11582453333312602
+                      - -0.19609870968779317
+                      - -0.11809063481420465
+                    - - -0.11120967058944209
+                      - -0.07178289284260277
+                      - -0.07505138171189361
+                      - -0.17137771295621249
+                      - -0.012516091428859523
+                      - 0.056132912587423756
+                      - -0.011172736867909887
+                      - 0.0014926969164057145
+                    - - -0.1652803302650934
+                      - 0.08452449793427
+                      - -0.06260662069159101
+                      - -0.07909718643578055
+                      - 0.00574135469567161
+                      - -0.05691391300603163
+                      - 0.2457179942284785
+                      - -0.08037694862142311
+                    - - 0.1761032538671494
+                      - -0.15524353322856968
+                      - -0.20338260987738993
+                      - -0.09738847488694806
+                      - 0.05960295717975261
+                      - -0.0268406105291267
+                      - 0.19154482080963495
+                      - -0.05557739958347549
+                    - - -0.23162474155138468
+                      - 0.005428848189956548
+                      - 0.14498512403306713
+                      - 0.015859517797165032
+                      - 0.13342303538966063
+                      - 0.07757097608660568
+                      - 0.061885304048992174
+                      - -0.02774862502778554
+                    - - -0.099674682792698
+                      - 0.1743242267060875
+                      - 0.0565895993699819
+                      - -0.1431728246694354
+                      - -0.04572377374634247
+                      - 0.1932522842767088
+                      - -0.13605774184771868
+                      - -0.079596349847149
+                    - - 0.015159290423222593
+                      - 0.0741788473825365
+                      - -0.025111776236424455
+                      - 0.11728977172727281
+                      - -0.05246129405331076
+                      - -0.3560652693695576
+                      - 0.22489664505020285
+                      - -0.11322150427163667
+                    - - 0.1172685876179488
+                      - 0.015449206720498673
+                      - 0.11464505230123948
+                      - -0.13045379503420262
+                      - -0.18460226345307634
+                      - -0.0735660416536509
+                      - 0.02668836976483192
+                      - 0.009471901506209893
+                    - - -0.12415218588856815
+                      - -0.028427823628392242
+                      - -0.0726329032188482
+                      - 0.2205454016716484
+                      - -0.06981635935553832
+                      - -0.06914918285976224
+                      - -0.07547512647684368
+                      - 0.19585301943839276
+                    - - 0.02068794647278527
+                      - 0.11434955856950152
+                      - 0.04733548159377606
+                      - -0.0940771421180628
+                      - 0.106950218084799
+                      - 0.11995323224700441
+                      - 0.07016105028143815
+                      - -0.07349788842232614
+                    - - -0.028316732941958092
+                      - -0.006316920155388264
+                      - 0.014323448114816232
+                      - 0.07909510285143638
+                      - 0.08089223619428912
+                      - -0.1285448965066473
+                      - -0.02731037643994388
+                      - -0.048232324099890284
+                    - - -0.04229476466912251
+                      - -0.10545582133061814
+                      - -0.1399519987577358
+                      - -0.24859786794141928
+                      - 0.04555029533580089
+                      - -0.06637709714144181
+                      - -0.11891839416041088
+                      - -0.05608836594526548
+                    - - -0.1481671676394082
+                      - -0.11826472343612228
+                      - 0.18759449377634982
+                      - -0.0027813243183313764
+                      - -0.06187858233767373
+                      - -0.16870507895423517
+                      - 0.15432198341660605
+                      - 0.2442525725033602
+                    - - 0.11655618965628044
+                      - 0.16410614799338208
+                      - -0.15922334755571288
+                      - 0.05294100944284731
+                      - -0.042676438943807564
+                      - 0.05982722192738627
+                      - 0.08818007330306689
+                      - 0.08799006862019813
+                    - - -0.1816952674192488
+                      - 0.33018315199731113
+                      - 0.14825048237904745
+                      - -0.12977688627249692
+                      - -0.014039894202582361
+                      - 0.021698570605095405
+                      - -0.10536292700472008
+                      - -0.016298405526400214
+                    - - 0.18891280861168214
+                      - -0.037066320234429954
+                      - 0.051989201606798936
+                      - -0.33236261122879446
+                      - -0.2233240290736924
+                      - 0.17632501110044835
+                      - 0.02791043546786102
+                      - 0.08058616657592761
+                    - - -0.12416787825473675
+                      - -0.0018776550590277605
+                      - -0.1361594510955972
+                      - -0.031008628174283
+                      - -0.1510470016534144
+                      - -0.1968118582063139
+                      - 0.05923927005740039
+                      - 0.10906017525194028
+                    - - -0.01747528984400593
+                      - 0.043571037430286425
+                      - 0.09735765094593854
+                      - 0.038496104792229716
+                      - 0.021583898030338507
+                      - -0.11795161808253331
+                      - -0.11404406907043374
+                      - -0.06541831356900717
+                    - - 0.05781757062086345
+                      - 0.06545403068342133
+                      - 0.07182196888387801
+                      - 0.06571017380833269
+                      - 0.25549620850343796
+                      - -0.01712221435859817
+                      - 0.02746476505848508
+                      - -0.16813933068880024
+                    - - 0.15811742659496866
+                      - -0.10097487333290259
+                      - 0.0007478905750386516
+                      - 0.15986815657402492
+                      - 0.0879704571647486
+                      - 0.051839404360383305
+                      - 0.04773139180116972
+                      - -0.1562216704347126
+                    - - -0.00554177026701311
+                      - 0.026672084558123862
+                      - -0.026556168406945337
+                      - 0.017618135480540704
+                      - 0.04290442846891425
+                      - -0.16108845422437917
+                      - 0.03885069382837762
+                      - -0.08559226312134341
+                    - - -0.10984387513362157
+                      - 0.06020841962256015
+                      - 0.013439129456291792
+                      - -0.1211722988008539
+                      - 0.0321361577334442
+                      - 0.04742132269747014
+                      - -0.08371259477093888
+                      - -0.14250805695920574
+                    - - 0.04498243399350513
+                      - -0.03633434279549633
+                      - 0.17043129619564554
+                      - 0.13738977779076048
+                      - 0.03367329367643751
+                      - 0.13141345496526305
+                      - 0.14626062464255066
+                      - -0.087660426894852
+                    - - 0.13304548046946202
+                      - -0.02074921039690319
+                      - -0.19614199925540662
+                      - 0.09145888259449976
+                      - -0.16872056060024043
+                      - -0.057806035869808946
+                      - -0.012927002228426554
+                      - -0.18968555494779932
+                    - - 0.09056415309144267
+                      - 0.19579647713404205
+                      - 0.12419307551929215
+                      - 0.03068324855507999
+                      - 0.16324257199502792
+                      - 0.28864177653836015
+                      - -0.04884842530407823
+                      - 0.05243039778651716
+                    - - -0.1354513040660592
+                      - -0.0032083328727676315
+                      - 0.035763067000830435
+                      - -0.10752629467535854
+                      - -0.004527627068300205
+                      - -0.26678729966885645
+                      - 0.16095749546546945
+                      - -0.0768457166279081
+                    - - 0.24290534029168284
+                      - -0.19993818991295886
+                      - 0.05863500838014017
+                      - 0.1075745460176732
+                      - -0.2703641493668329
+                      - 0.022882752217475207
+                      - -0.18377439784177813
+                      - -0.02475991439750886
+                    - - -0.0970343883793403
+                      - 0.022190761521183
+                      - -0.31137609288015433
+                      - -0.12852583938411438
+                      - 0.06380585650762231
+                      - -0.05537350140183391
+                      - 0.009834307052428782
+                      - -0.18327381164681603
+                    - - -0.058720582106338445
+                      - 0.012207974777885133
+                      - 0.04906298973398652
+                      - -0.0252045636071624
+                      - 0.04064401311527239
+                      - -0.12030307623147056
+                      - -0.02607458251331658
+                      - -0.12104904963385374
+                    - - 0.14380149442345772
+                      - -0.08586187966457755
+                      - -0.0562380312253021
+                      - 0.1183995520092173
+                      - -0.008618891616010692
+                      - -0.30556252122213096
+                      - 0.157107693967395
+                      - -0.150824446001649
+                    - - -0.12986340463514554
+                      - -0.13953775800615473
+                      - 0.06688782609307184
+                      - 0.30709990962247197
+                      - -0.10057794483875744
+                      - -0.15572836837520085
+                      - 0.22240522808485344
+                      - 0.07486567450323982
+                    - - -0.00026497955681491453
+                      - -0.462148220797257
+                      - -0.04683339159019641
+                      - 0.10954858908660245
+                      - 0.048155719596331595
+                      - -0.08404934441388894
+                      - 0.15848474948089222
+                      - -0.029754000979091536
+                    - - -0.008795641657631076
+                      - -0.021341761230446545
+                      - -0.10489671109204046
+                      - 0.03213370243212562
+                      - -0.021792936100149974
+                      - -0.018371450392434912
+                      - 0.0007292277382723748
+                      - 0.07679112359755517
+                    - - -0.06130007400378907
+                      - -0.06581095863692285
+                      - 0.06501448048047738
+                      - -0.14197246804370967
+                      - 0.15983589537290877
+                      - -0.15693380789472725
+                      - -0.17963845906090375
+                      - 0.10204145028546817
+                    - - -0.07077050429398143
+                      - 0.1990098057969514
+                      - -0.2525111691805106
+                      - -0.22059894251537618
+                      - -0.27531410890875607
+                      - -0.0693243961021514
+                      - 0.03876302523241355
+                      - 0.12122101629786736
+                    - - -0.12820657692829063
+                      - -0.10772035941442479
+                      - 0.10829696580051636
+                      - 0.1493715060396245
+                      - 0.13488833866187872
+                      - 0.09022524867490032
+                      - 0.007332743974581279
+                      - 0.1529338321168549
+                    - - -0.22245363971842472
+                      - -0.08917661330105822
+                      - 0.10304564318043377
+                      - -0.07026805272160686
+                      - 0.016625750231852813
+                      - 0.23074109385732217
+                      - 0.053971407495566504
+                      - -0.15089059679319458
+                    - - 0.1294396068073317
+                      - -0.038487426453509534
+                      - 0.09393650831599386
+                      - 0.09638990927578407
+                      - 0.17905918157852316
+                      - 0.06760574587425355
+                      - 0.0639998107196389
+                      - -0.1587157815816586
+                    - - 0.06077231806824999
+                      - 0.006159909130812671
+                      - 0.15285274367932117
+                      - -0.026531120401424045
+                      - 0.06104797756042876
+                      - -0.174933801016035
+                      - 0.25284181425638513
+                      - -0.16931699181750984
+                    - - -0.09480440252644158
+                      - -0.11919995631753837
+                      - 0.1374865485894956
+                      - 0.03525829583245701
+                      - 0.055414318086174905
+                      - 0.039970825479268265
+                      - -0.028476173719310948
+                      - 0.007895110382084259
+                    - - -0.08849522170883828
+                      - 0.1556903658898126
+                      - -0.06942905817654972
+                      - 0.17917871676321492
+                      - -0.12839965901095401
+                      - -0.1457242708290995
+                      - 0.2073632537418445
+                      - -0.0033056633245595168
+                    - - -0.14321940581992326
+                      - 0.016216983383358995
+                      - -0.05603214608550905
+                      - 0.034067014410779244
+                      - -0.004165932252642813
+                      - 0.03579825379823718
+                      - 0.2274077472661256
+                      - 0.12282153328534674
+                    - - -0.17424677728325255
+                      - 0.03032450606197887
+                      - -0.3407467917235723
+                      - 0.08460871296272927
+                      - -0.21233509125037692
+                      - 0.038581785470083826
+                      - -0.1271081651221865
+                      - -0.05674635282930029
+                    - - -0.06365889303105148
+                      - -0.0346798442701684
+                      - 0.04178115473238202
+                      - -0.03570145798701077
+                      - 0.2255873927499116
+                      - -0.21936512330368732
+                      - -0.19469567244011848
+                      - -0.007461014512643234
+              "@version": 2
+              activation_function: none
+              bias: true
+              precision: float64
+              resnet: false
+              trainable: true
+              use_timestep: false
+            ntypes: 4
+            optim_update: true
+            precision: float64
+            sel_reduce_factor: 10.0
+            smooth_edge_update: false
+            update_angle: false
+            update_residual: 0.1
+            update_residual_init: const
+            update_style: res_residual
+            use_dynamic_sel: false
+      trainable: true
+      type: dpa3
+      type_embedding:
+        "@class": TypeEmbedNet
+        "@version": 2
+        activation_function: Linear
+        embedding:
+          "@class": EmbeddingNetwork
+          "@version": 2
+          activation_function: Linear
+          bias: false
+          in_dim: 4
+          layers:
+            - "@class": Layer
+              "@variables":
+                b: null
+                idt: null
+                w:
+                  "@class": np.ndarray
+                  "@is_variable": true
+                  "@version": 1
+                  dtype: float64
+                  value:
+                    - - 0.06913868355931278
+                      - -0.3276059448146492
+                      - -0.22478586008940918
+                      - -0.03129740042629991
+                      - -0.2511436154794455
+                      - -0.4760319710462916
+                      - 0.183856376649989
+                      - 0.220680920691283
+                    - - -0.1331166944050067
+                      - -0.2985446381663858
+                      - -0.1299144028716818
+                      - 0.12716526105014014
+                      - 0.24445281051361242
+                      - 0.052359417290304015
+                      - -0.06639194378815659
+                      - -0.0515428623822807
+                    - - -0.3302870133986425
+                      - 0.1177804767091647
+                      - 0.06915893387117533
+                      - -0.4204302050492702
+                      - -0.3161145657939801
+                      - 0.322920377419993
+                      - 0.19395457855721343
+                      - -0.11365337655752422
+                    - - -0.16993400446851198
+                      - -0.157416126804567
+                      - -0.08090448953478106
+                      - 0.20830555342316676
+                      - -0.11308079862243182
+                      - 0.044490575624147384
+                      - 0.28211395871639494
+                      - 0.07920112686609734
+              "@version": 2
+              activation_function: Linear
+              bias: false
+              precision: float64
+              resnet: true
+              trainable: true
+              use_timestep: false
+          neuron:
+            - 8
+          precision: float64
+          resnet_dt: false
+        neuron:
+          - 8
+        ntypes: 4
+        padding: true
+        precision: float64
+        resnet_dt: false
+        trainable: true
+        type_map: &id001
+          - Ni
+          - O
+          - Ni_spin
+          - O_spin
+        use_econf_tebd: false
+        use_tebd_bias: false
+      type_map: *id001
+      use_econf_tebd: false
+      use_loc_mapping: false
+      use_tebd_bias: false
+    fitting:
+      "@class": Fitting
+      "@variables":
+        aparam_avg: null
+        aparam_inv_std: null
+        bias_atom_e:
+          "@class": np.ndarray
+          "@is_variable": true
+          "@version": 1
+          dtype: float64
+          value:
+            - - 0.0
+            - - 0.0
+            - - 0.0
+            - - 0.0
+        case_embd: null
+        fparam_avg: null
+        fparam_inv_std: null
+      "@version": 4
+      activation_function: tanh
+      atom_ener: null
+      default_fparam: null
+      dim_case_embd: 0
+      dim_descrpt: 8
+      dim_out: 1
+      exclude_types: *id002
+      layer_name: null
+      mixed_types: true
+      nets:
+        "@class": NetworkCollection
+        "@version": 1
+        ndim: 0
+        network_type: fitting_network
+        networks:
+          - "@class": FittingNetwork
+            "@version": 1
+            activation_function: tanh
+            bias_out: true
+            in_dim: 8
+            layers:
+              - "@class": Layer
+                "@variables":
+                  b:
+                    "@class": np.ndarray
+                    "@is_variable": true
+                    "@version": 1
+                    dtype: float64
+                    value:
+                      - -0.17635825349201156
+                      - -0.3566199551346283
+                      - -0.4657350300900149
+                      - 0.49182010702811113
+                      - 0.032647600656972545
+                  idt: null
+                  w:
+                    "@class": np.ndarray
+                    "@is_variable": true
+                    "@version": 1
+                    dtype: float64
+                    value:
+                      - - 0.19950263394684065
+                        - 0.05634765583345527
+                        - -0.1442129593712478
+                        - -0.1085774516963511
+                        - 0.11311331965894553
+                      - - -0.2775489843491954
+                        - -0.35666203499239274
+                        - -0.3389432389106902
+                        - -0.05632492275434322
+                        - -0.48859095817655873
+                      - - -0.0295274439718225
+                        - -0.1886895411820409
+                        - 0.53672545544271
+                        - 0.07574020379061007
+                        - -0.42704120525642686
+                      - - -0.00993498946754372
+                        - 0.3770750367653306
+                        - -0.4385261113155961
+                        - 0.0468328088042057
+                        - 0.012607351815014095
+                      - - 0.1092056939586687
+                        - -0.08440204904008866
+                        - -0.6198116015257329
+                        - 0.1936974618526528
+                        - -0.11584195169630225
+                      - - -0.6395628609700832
+                        - -0.3937842385131085
+                        - -0.1370675696847499
+                        - -0.08281792882082432
+                        - -0.14269944588470002
+                      - - 0.003683595092519098
+                        - -0.1064836461083355
+                        - 0.1513375212109038
+                        - -0.3798449359483027
+                        - -0.27711500793004523
+                      - - -0.24136291455222364
+                        - -0.19077785910921263
+                        - -0.12067289115480624
+                        - -0.05720709372900689
+                        - -0.044669501979496415
+                "@version": 2
+                activation_function: tanh
+                bias: true
+                precision: float64
+                resnet: true
+                trainable: true
+                use_timestep: false
+              - "@class": Layer
+                "@variables":
+                  b:
+                    "@class": np.ndarray
+                    "@is_variable": true
+                    "@version": 1
+                    dtype: float64
+                    value:
+                      - 0.09501004658257778
+                      - 0.1663807327224991
+                      - -0.5185313341630086
+                      - -0.7740662908662731
+                      - -0.18752579321547022
+                  idt:
+                    "@class": np.ndarray
+                    "@is_variable": true
+                    "@version": 1
+                    dtype: float64
+                    value:
+                      - -0.2188085499554977
+                      - -0.4014642473754725
+                      - 0.032489550654357095
+                      - 0.06343911616091243
+                      - -0.00407617112574573
+                  w:
+                    "@class": np.ndarray
+                    "@is_variable": true
+                    "@version": 1
+                    dtype: float64
+                    value:
+                      - - -0.3141891418069332
+                        - 0.30132598326837057
+                        - -0.1868614701027005
+                        - -0.1853536726835805
+                        - -0.14904917553209618
+                      - - -0.4993776326714626
+                        - 0.2929711950476154
+                        - -0.3300253064210836
+                        - -0.4799775188835898
+                        - -0.12327559985245252
+                      - - 0.16627900477763782
+                        - 0.18281489789715116
+                        - -0.0796215789550366
+                        - 0.11637836794519682
+                        - 0.019126199990905587
+                      - - 0.47193798042526686
+                        - 0.3935489978037474
+                        - 0.1926588188573466
+                        - 0.11685532990383077
+                        - -0.3143759410105157
+                      - - 0.2619509948079511
+                        - 0.17134734041574828
+                        - 0.16467987243470003
+                        - -0.17768942725372738
+                        - 0.17196893072212313
+                "@version": 2
+                activation_function: tanh
+                bias: true
+                precision: float64
+                resnet: true
+                trainable: true
+                use_timestep: true
+              - "@class": Layer
+                "@variables":
+                  b:
+                    "@class": np.ndarray
+                    "@is_variable": true
+                    "@version": 1
+                    dtype: float64
+                    value:
+                      - 0.1498329073500072
+                      - -0.10390305511196503
+                      - -0.7262688617464856
+                      - -0.14980303343140125
+                      - -0.3578894004618838
+                  idt:
+                    "@class": np.ndarray
+                    "@is_variable": true
+                    "@version": 1
+                    dtype: float64
+                    value:
+                      - 0.3290381873321775
+                      - 0.23103250534551598
+                      - -0.6940851206117438
+                      - -0.19335307745332778
+                      - -0.9240817753801489
+                  w:
+                    "@class": np.ndarray
+                    "@is_variable": true
+                    "@version": 1
+                    dtype: float64
+                    value:
+                      - - -0.3092290441156226
+                        - -0.496367611501348
+                        - -0.052492949379292775
+                        - 0.06663748312823926
+                        - 0.027714401468510886
+                      - - -0.10433141997317527
+                        - -0.323901631855259
+                        - -0.24739439873488192
+                        - 0.3076895568713741
+                        - 0.1593814472209255
+                      - - -0.07111829721069259
+                        - -0.27598680250101504
+                        - 0.16632764307325093
+                        - 0.1801382402999823
+                        - 0.3107523993064097
+                      - - -0.012140157566561928
+                        - 0.07469305237763302
+                        - 0.26428018852282276
+                        - -0.11500213881655802
+                        - -0.2731498304335624
+                      - - 0.29941998505510775
+                        - 0.39267279762211
+                        - 0.06586779164332648
+                        - 0.10010820203885952
+                        - -0.04143485413490972
+                "@version": 2
+                activation_function: tanh
+                bias: true
+                precision: float64
+                resnet: true
+                trainable: true
+                use_timestep: true
+              - "@class": Layer
+                "@variables":
+                  b:
+                    "@class": np.ndarray
+                    "@is_variable": true
+                    "@version": 1
+                    dtype: float64
+                    value:
+                      - 0.044426160812178636
+                  idt: null
+                  w:
+                    "@class": np.ndarray
+                    "@is_variable": true
+                    "@version": 1
+                    dtype: float64
+                    value:
+                      - - 0.26432565710368733
+                      - - 0.17264367113482967
+                      - - -0.04729186377886323
+                      - - -0.08841444813809296
+                      - - 0.2969145415081517
+                "@version": 2
+                activation_function: none
+                bias: true
+                precision: float64
+                resnet: false
+                trainable: true
+                use_timestep: false
+            neuron:
+              - 5
+              - 5
+              - 5
+            out_dim: 1
+            precision: float64
+            resnet_dt: true
+        ntypes: 4
+      neuron:
+        - 5
+        - 5
+        - 5
+      ntypes: 4
+      numb_aparam: 0
+      numb_fparam: 0
+      precision: float64
+      rcond: null
+      resnet_dt: true
+      spin: null
+      tot_ener_zero: false
+      trainable:
+        - true
+        - true
+        - true
+        - true
+      type: ener
+      type_map:
+        - Ni
+        - O
+        - Ni_spin
+        - O_spin
+      use_aparam_as_mask: false
+      var_name: energy
+    pair_exclude_types: *id003
+    preset_out_bias: null
+    rcond: null
+    type: standard
+    type_map:
+      - Ni
+      - O
+      - Ni_spin
+      - O_spin
+  spin:
+    use_spin:
+      - true
+      - false
+    virtual_scale:
+      - 0.314
+      - 0.0
+  type: spin_ener
+model_def_script:
+  descriptor:
+    precision: float64
+    repflow:
+      a_dim: 4
+      a_rcut: 3.5
+      a_rcut_smth: 0.5
+      a_sel: 4
+      axis_neuron: 4
+      e_dim: 6
+      e_rcut: 4.0
+      e_rcut_smth: 0.5
+      e_sel: 8
+      n_dim: 8
+      nlayers: 1
+      update_angle: false
+    seed: 1
+    type: dpa3
+    use_loc_mapping: false
+  fitting_net:
+    neuron:
+      - 5
+      - 5
+      - 5
+    resnet_dt: true
+    seed: 1
+  spin:
+    use_spin:
+      - true
+      - false
+    virtual_scale:
+      - 0.314
+      - 0.0
+  type_map:
+    - Ni
+    - O
+software: deepmd-kit
+time: "2026-04-30 14:57:42.534472+00:00"
+version: 3.0.0
diff --git a/source/tests/infer/gen_spin.py b/source/tests/infer/gen_spin.py
index d37e3207ff..789a65971b 100644
--- a/source/tests/infer/gen_spin.py
+++ b/source/tests/infer/gen_spin.py
@@ -84,6 +84,66 @@ def _build_yaml(yaml_path: str) -> None:
     save_dp_model(yaml_path, data)
 
 
+def _build_dpa3_mpi_yaml(yaml_path: str) -> None:
+    """Build a DPA3 spin model with use_loc_mapping=False (multi-rank).
+
+    The default ``deeppot_dpa_spin.yaml`` uses se_atten (DPA1) which
+    is non-GNN — single-artifact .pt2, no multi-rank ghost exchange.
+    This variant uses DPA3 (repflows, GNN) with use_loc_mapping=False
+    so the dual-artifact .pt2 carries the with-comm AOTI module that
+    DeepSpinPTExpt routes to under mpirun > 1.
+
+    Type map matches the existing 4-atom Ni-O test data
+    (``write_lmp_data_spin``): two types, Ni (spin-active), O (no spin).
+    """
+    from deepmd.dpmodel.model.model import (
+        get_model,
+    )
+    from deepmd.dpmodel.utils.serialization import (
+        save_dp_model,
+    )
+
+    config = {
+        "type_map": ["Ni", "O"],
+        "descriptor": {
+            "type": "dpa3",
+            "repflow": {
+                "n_dim": 8,
+                "e_dim": 6,
+                "a_dim": 4,
+                "nlayers": 1,
+                "e_rcut": 4.0,
+                "e_rcut_smth": 0.5,
+                "e_sel": 8,
+                "a_rcut": 3.5,
+                "a_rcut_smth": 0.5,
+                "a_sel": 4,
+                "axis_neuron": 4,
+                "update_angle": False,
+            },
+            "use_loc_mapping": False,
+            "precision": "float64",
+            "seed": 1,
+        },
+        "fitting_net": {"neuron": [5, 5, 5], "resnet_dt": True, "seed": 1},
+        "spin": {"use_spin": [True, False], "virtual_scale": [0.3140, 0.0]},
+    }
+
+    model = get_model(copy.deepcopy(config))
+    model_dict = model.serialize()
+
+    data = {
+        "model": model_dict,
+        "model_def_script": config,
+        "backend": "dpmodel",
+        "software": "deepmd-kit",
+        "version": "3.0.0",
+    }
+
+    print(f"Building DPA3 spin dpmodel and saving to {yaml_path} ...")  # noqa: T201
+    save_dp_model(yaml_path, data)
+
+
 def main():
     from deepmd.entrypoints.convert_backend import (
         convert_backend,
@@ -96,12 +156,23 @@ def main():
     pth_path = os.path.join(base_dir, "deeppot_dpa_spin.pth")
     pt2_path = os.path.join(base_dir, "deeppot_dpa_spin.pt2")
 
-    # ---- 1. Build .yaml if it doesn't exist ----
+    # Multi-rank GNN spin variant (DPA3 + use_loc_mapping=False).
+    # Produces a dual-artifact .pt2 that DeepSpinPTExpt routes to
+    # under mpirun > 1 (Phase 4c spin multi-rank dispatch).
+    yaml_dpa3_path = os.path.join(base_dir, "deeppot_dpa3_spin_mpi.yaml")
+    pt2_dpa3_path = os.path.join(base_dir, "deeppot_dpa3_spin_mpi.pt2")
+
+    # ---- 1. Build .yamls if they don't exist ----
     if not os.path.exists(yaml_path):
         _build_yaml(yaml_path)
     else:
         print(f"Using existing {yaml_path}")  # noqa: T201
 
+    if not os.path.exists(yaml_dpa3_path):
+        _build_dpa3_mpi_yaml(yaml_dpa3_path)
+    else:
+        print(f"Using existing {yaml_dpa3_path}")  # noqa: T201
+
     # ---- 2. Convert .yaml -> .pth and .yaml -> .pt2 ----
     # Import deepmd.pt to register the backend (needed for convert_backend)
     import deepmd.pt  # noqa: F401
@@ -114,6 +185,9 @@ def main():
     print(f"Converting to {pt2_path} ...")  # noqa: T201
     convert_backend(INPUT=yaml_path, OUTPUT=pt2_path, atomic_virial=True)
 
+    print(f"Converting to {pt2_dpa3_path} ...")  # noqa: T201
+    convert_backend(INPUT=yaml_dpa3_path, OUTPUT=pt2_dpa3_path, atomic_virial=True)
+
     print("Export done.")  # noqa: T201
 
     # ---- 3. Run inference for PBC test ----

From 803b2a4b7ee6d3b46874efb2fceef92933d7eceb Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 1 May 2026 20:50:14 +0800
Subject: [PATCH 21/34] test(spin-mpi): cover empty-subdomain and NULL-type for
 spin DPA3

Resolves the two spin-specific gaps left open by the previous commit:

- test_pair_deepmd_mpi_dpa3_spin_empty_subdomain: elongated 30 A box +
  processors '2 1 1' leaves rank 1 with nloc=0. Exercises the
  copy_from_nlist empty-rank guard for the spin path (the with-comm
  artifact still runs on rank 1 with nloc_real=0).

- test_pair_deepmd_mpi_dpa3_spin_null_type: 2 NULL (LAMMPS type-3,
  deepmd atype=-1) atoms straddling the x=6.5 rank boundary, within
  rcut of real atoms on both sides. Goes through DeepSpinPTExpt with
  nall_real < nall, triggering the has_null_atoms branch that calls
  build_comm_tensors_positional_with_virtual_atoms (fwd_map-based
  sendlist remap) for spin. Asserts NULL atoms get zero forces from
  the deepmd model and real-atom values match the mpi-1 reference.

Both compare mpi-2 vs same-archive mpi-1 (atol 1e-8) so any divergence
is necessarily in the multi-rank dispatch, not in tracing precision.

Runner generalised with --pair-coeff and --mass3 flags (mirrors the
non-spin DPA3 runner).
---
 .../run_mpi_pair_deepmd_spin_dpa3_pt2.py      |  23 ++-
 source/lmp/tests/test_lammps_spin_dpa3_pt2.py | 132 +++++++++++++++++-
 2 files changed, 151 insertions(+), 4 deletions(-)

diff --git a/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py
index a8d7fe71a6..3637238968 100644
--- a/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py
+++ b/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py
@@ -60,6 +60,22 @@
     "domain decomposition (nswap>0). Pass '1 1 1' for a single-rank "
     "reference run on the same archive.",
 )
+parser.add_argument(
+    "--pair-coeff",
+    type=str,
+    default="* *",
+    help="pair_coeff arguments (after 'pair_coeff'). Default '* *' "
+    "uses identity LAMMPS-type-to-deepmd-atype mapping. For NULL-type "
+    "tests pass e.g. '* * Ni O NULL' so the third LAMMPS type becomes "
+    "deepmd atype=-1 (filtered before model evaluation).",
+)
+parser.add_argument(
+    "--mass3",
+    type=float,
+    default=None,
+    help="Optional mass for LAMMPS atom type 3 only. Used by the "
+    "NULL-type fixture; omit it when the data file has only 2 types.",
+)
 args = parser.parse_args()
 
 lammps = PyLammps()
@@ -73,11 +89,16 @@
 lammps.read_data(args.DATAFILE)
 lammps.mass("1 58")
 lammps.mass("2 16")
+if args.mass3 is not None:
+    # NULL-type fixture: third LAMMPS type maps to deepmd atype=-1
+    # via pair_coeff and is filtered before model evaluation. Mass
+    # is physically irrelevant.
+    lammps.mass(f"3 {args.mass3}")
 lammps.timestep(0.0005)
 lammps.fix("1 all nve")
 
 lammps.pair_style(f"deepspin {args.PB_FILE}")
-lammps.pair_coeff("* *")
+lammps.pair_coeff(args.pair_coeff)
 lammps.compute("virial all centroid/stress/atom NULL pair")
 # Per-atom magnetic force components. LAMMPS does not expose ``fm``
 # through the legacy ``extract``/``gather_atoms`` registry, so we go
diff --git a/source/lmp/tests/test_lammps_spin_dpa3_pt2.py b/source/lmp/tests/test_lammps_spin_dpa3_pt2.py
index fd4ee0a7cb..7c7c5787a7 100644
--- a/source/lmp/tests/test_lammps_spin_dpa3_pt2.py
+++ b/source/lmp/tests/test_lammps_spin_dpa3_pt2.py
@@ -57,6 +57,22 @@
     / "deeppot_dpa3_spin_mpi.pt2"
 )
 data_file = Path(__file__).parent / "data_dpa3_spin_pt2.lmp"
+# Elongated-box fixture for the spin empty-subdomain MPI test: x is
+# extended to 30 A while atoms remain in x in [3, 13]. Combined with
+# ``processors 2 1 1`` this leaves rank 1 (x >= 15) with zero local
+# atoms, exercising the ``copy_from_nlist`` empty-rank guard for spin.
+data_file_empty_subdomain = (
+    Path(__file__).parent / "data_dpa3_spin_pt2_empty_subdomain.lmp"
+)
+# NULL-type fixture: 4 real Ni-O atoms + 2 LAMMPS type-3 atoms
+# straddling the x=6.5 rank boundary. With ``pair_coeff * * Ni O NULL``
+# LAMMPS type 3 maps to deepmd atype=-1, so those atoms are filtered
+# by ``select_real_atoms_coord`` and the comm tensors must be remapped
+# via ``fwd_map`` before being handed to the with-comm artifact.
+# Forces / force_mag from the mpi-2 run must match an mpi-1 reference
+# run on this same NULL-containing data file; NULL atoms get zero
+# forces from the deepmd model.
+data_file_null_type = Path(__file__).parent / "data_dpa3_spin_pt2_null_type.lmp"
 
 # 4-atom Ni-O system; same layout as ``test_lammps_spin_pt2.py``. With
 # ``processors 2 1 1`` the split sits at x=6.5 -> 2 atoms per rank.
@@ -86,22 +102,56 @@ def setup_module() -> None:
             "Skip test because PyTorch support is not enabled.",
         )
     write_lmp_data_spin(box, coord, spin, type_NiO, data_file)
+    # Elongated x-axis; atoms unchanged. ``processors 2 1 1`` splits
+    # at x=15 A and rank 1 owns x >= 15, which is empty.
+    box_empty = np.array([0, 30, 0, 13, 0, 13, 0, 0, 0])
+    write_lmp_data_spin(box_empty, coord, spin, type_NiO, data_file_empty_subdomain)
+    # NULL-type fixture: append 2 LAMMPS type-3 atoms within rcut
+    # (~4 A) of real atoms on BOTH sides of the x=6.5 rank boundary,
+    # so they appear in cross-rank sendlists and the fwd_map-based
+    # comm-tensor remap is genuinely exercised. NULL atoms still need
+    # spin coordinates (write_lmp_data_spin format); we give them
+    # zero spin like the type-2 (O) atoms.
+    coord_null = np.concatenate(
+        [
+            coord,
+            np.array(
+                [
+                    [5.5, 6.0, 6.0],  # rank 0 side, near boundary
+                    [7.5, 7.0, 7.0],  # rank 1 side, near boundary
+                ]
+            ),
+        ]
+    )
+    spin_null = np.concatenate([spin, np.zeros((2, 3))])
+    type_null = np.concatenate([type_NiO, np.array([3, 3])])
+    write_lmp_data_spin(box, coord_null, spin_null, type_null, data_file_null_type)
 
 
 def teardown_module() -> None:
-    if data_file.exists():
-        os.remove(data_file)
+    for f in [data_file, data_file_empty_subdomain, data_file_null_type]:
+        if f.exists():
+            os.remove(f)
 
 
 def _run_mpi_subprocess(
     nprocs: int,
     extra_args: list[str] | None = None,
     processors: str | None = None,
+    data_path: Path | None = None,
+    runner_args: list[str] | None = None,
 ) -> dict:
     """Run ``run_mpi_pair_deepmd_spin_dpa3_pt2.py`` under
     ``mpirun -n <nprocs>`` and return
     ``{"pe", "forces", "force_mag", "virials"}``.
+
+    ``data_path`` (default ``data_file``) selects the LAMMPS data file
+    -- the empty-subdomain and NULL-type tests point at non-default
+    fixtures. ``runner_args`` flows additional flags (e.g.
+    ``--pair-coeff``, ``--mass3``) to the subprocess runner.
     """
+    if data_path is None:
+        data_path = data_file
     with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f:
         out_path = f.name
     try:
@@ -111,7 +161,7 @@ def _run_mpi_subprocess(
             str(nprocs),
             sys.executable,
             str(Path(__file__).parent / "run_mpi_pair_deepmd_spin_dpa3_pt2.py"),
-            str(data_file.resolve()),
+            str(data_path.resolve()),
             str(pb_file_mpi.resolve()),
             out_path,
         ]
@@ -121,6 +171,8 @@ def _run_mpi_subprocess(
             argv.extend(["--processors", "1 1 1"])
         if extra_args:
             argv.extend(extra_args)
+        if runner_args:
+            argv.extend(runner_args)
         sp.check_call(argv)
         with open(out_path) as fh:
             lines = fh.read().strip().splitlines()
@@ -176,3 +228,77 @@ def test_pair_deepmd_mpi_dpa3_spin() -> None:
     np.testing.assert_allclose(
         out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0
     )
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa3_spin_empty_subdomain() -> None:
+    """Spin DPA3 multi-rank with one empty rank.
+
+    Elongated x box (30 A) + ``processors 2 1 1`` puts all 4 atoms on
+    rank 0; rank 1 has nloc=0. Exercises the C++ ``copy_from_nlist``
+    empty-rank guard for the spin path (the with-comm artifact still
+    runs on rank 1 with nloc_real=0). Compares against same-archive
+    mpi-1 reference.
+    """
+    out_mpi = _run_mpi_subprocess(nprocs=2, data_path=data_file_empty_subdomain)
+    out_ref = _run_mpi_subprocess(nprocs=1, data_path=data_file_empty_subdomain)
+
+    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-10, abs=1e-12)
+    np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0)
+    np.testing.assert_allclose(
+        out_mpi["force_mag"], out_ref["force_mag"], atol=1e-8, rtol=0
+    )
+    np.testing.assert_allclose(
+        out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0
+    )
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa3_spin_null_type() -> None:
+    """Spin DPA3 multi-rank with NULL-type atoms straddling the
+    rank boundary.
+
+    Two LAMMPS type-3 atoms (mapped to deepmd atype=-1 via
+    ``pair_coeff * * Ni O NULL``) sit at x=5.5 and x=7.5, just inside
+    the rcut window of either side of the x=6.5 boundary. They appear
+    in the cross-rank sendlists and are filtered by
+    ``select_real_atoms_coord`` -- so the spin path goes through
+    ``DeepSpinPTExpt::compute`` with ``nall_real < nall``, triggering
+    the ``has_null_atoms`` branch that calls
+    ``build_comm_tensors_positional_with_virtual_atoms`` (fwd_map-based
+    sendlist remap). Compares mpi-2 vs same-archive mpi-1 reference
+    (nullifying NULL forces and using the same fwd_map remap on rank 0
+    too).
+    """
+    runner_args = ["--pair-coeff", "* * Ni O NULL", "--mass3", "1.0"]
+    out_mpi = _run_mpi_subprocess(
+        nprocs=2, data_path=data_file_null_type, runner_args=runner_args
+    )
+    out_ref = _run_mpi_subprocess(
+        nprocs=1, data_path=data_file_null_type, runner_args=runner_args
+    )
+
+    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-10, abs=1e-12)
+    np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0)
+    np.testing.assert_allclose(
+        out_mpi["force_mag"], out_ref["force_mag"], atol=1e-8, rtol=0
+    )
+    np.testing.assert_allclose(
+        out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0
+    )
+    # Sanity: NULL atoms (ids 5, 6) get exactly zero forces from the
+    # deepmd model. ``write_lmp_data_spin`` writes atoms in the order
+    # given (id 1..N), so type-3 NULL atoms are ids 5, 6 (after the 4
+    # real Ni-O atoms).
+    np.testing.assert_array_equal(out_mpi["forces"][4:], np.zeros((2, 3)))
+    np.testing.assert_array_equal(out_mpi["force_mag"][4:], np.zeros((2, 3)))

From 47f0c29395026c5019bc3ab45cf3a6e97b0d96e0 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 1 May 2026 21:16:55 +0800
Subject: [PATCH 22/34] test(spin-mpi): drop committed
 deeppot_dpa3_spin_mpi.yaml

gen_spin.py rebuilds the yaml from the deterministic config + seed in
_build_dpa3_mpi_yaml when missing, and the multi-rank test compares
mpi-2 vs same-archive mpi-1 (no hardcoded numerical references depend
on these weights). Unlike deeppot_dpa_spin.yaml -- whose committed
weights are pinned because C++ tests hardcode reference values against
them -- nothing requires deeppot_dpa3_spin_mpi.yaml to be checkpointed
in git.
---
 source/tests/infer/deeppot_dpa3_spin_mpi.yaml | 1863 -----------------
 1 file changed, 1863 deletions(-)
 delete mode 100644 source/tests/infer/deeppot_dpa3_spin_mpi.yaml

diff --git a/source/tests/infer/deeppot_dpa3_spin_mpi.yaml b/source/tests/infer/deeppot_dpa3_spin_mpi.yaml
deleted file mode 100644
index 6fce85245a..0000000000
--- a/source/tests/infer/deeppot_dpa3_spin_mpi.yaml
+++ /dev/null
@@ -1,1863 +0,0 @@
-backend: dpmodel
-model:
-  backbone_model:
-    "@class": Model
-    "@variables":
-      out_bias:
-        "@class": np.ndarray
-        "@is_variable": true
-        "@version": 1
-        dtype: float64
-        value:
-          - - - 0.0
-            - - 0.0
-            - - 0.0
-            - - 0.0
-      out_std:
-        "@class": np.ndarray
-        "@is_variable": true
-        "@version": 1
-        dtype: float64
-        value:
-          - - - 1.0
-            - - 1.0
-            - - 1.0
-            - - 1.0
-    "@version": 2
-    atom_exclude_types: &id002
-      - 2
-      - 3
-    descriptor:
-      "@class": Descriptor
-      "@version": 2
-      activation_function: silu
-      add_chg_spin_ebd: false
-      concat_output_tebd: false
-      env_protection: 1.0e-06
-      exclude_types: &id003
-        - - 3
-          - 0
-        - - 3
-          - 1
-        - - 3
-          - 2
-        - - 3
-          - 3
-      ntypes: 4
-      precision: float64
-      repflow_args:
-        a_compress_e_rate: 1
-        a_compress_rate: 0
-        a_compress_use_split: false
-        a_dim: 4
-        a_rcut: 3.5
-        a_rcut_smth: 0.5
-        a_sel: 4
-        axis_neuron: 4
-        e_dim: 6
-        e_rcut: 4.0
-        e_rcut_smth: 0.5
-        e_sel: 8
-        edge_init_use_dist: false
-        fix_stat_std: 0.3
-        n_dim: 8
-        n_multi_edge_message: 1
-        nlayers: 1
-        optim_update: true
-        sel_reduce_factor: 10.0
-        smooth_edge_update: false
-        update_angle: false
-        update_residual: 0.1
-        update_residual_init: const
-        update_style: res_residual
-        use_dynamic_sel: false
-        use_exp_switch: false
-      repflow_variable:
-        "@variables":
-          davg:
-            "@class": np.ndarray
-            "@is_variable": true
-            "@version": 1
-            dtype: float64
-            value:
-              - - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-              - - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-              - - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-              - - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-                - - 0.0
-                  - 0.0
-                  - 0.0
-                  - 0.0
-          dstd:
-            "@class": np.ndarray
-            "@is_variable": true
-            "@version": 1
-            dtype: float64
-            value:
-              - - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-              - - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-              - - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-              - - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-                - - 0.3
-                  - 0.3
-                  - 0.3
-                  - 0.3
-        angle_embd:
-          "@class": Layer
-          "@variables":
-            b: null
-            idt: null
-            w:
-              "@class": np.ndarray
-              "@is_variable": true
-              "@version": 1
-              dtype: float64
-              value:
-                - - 0.17649720801051316
-                  - 0.26111987625920485
-                  - -0.5130082451185703
-                  - -0.473906411761865
-          "@version": 2
-          activation_function: none
-          bias: false
-          precision: float64
-          resnet: false
-          trainable: true
-          use_timestep: false
-        edge_embd:
-          "@class": Layer
-          "@variables":
-            b:
-              "@class": np.ndarray
-              "@is_variable": true
-              "@version": 1
-              dtype: float64
-              value:
-                - -0.34071690817734457
-                - -0.10233679058558394
-                - -0.042158524863509905
-                - 0.1901865304247668
-                - 0.5454968471423458
-                - -0.15466206519031384
-            idt: null
-            w:
-              "@class": np.ndarray
-              "@is_variable": true
-              "@version": 1
-              dtype: float64
-              value:
-                - - -0.41394371690540416
-                  - -0.020029235227998383
-                  - 0.7548439568852728
-                  - 0.18561320525199423
-                  - -0.1235585931191982
-                  - -0.6320668874586287
-          "@version": 2
-          activation_function: none
-          bias: true
-          precision: float64
-          resnet: false
-          trainable: true
-          use_timestep: false
-        env_mat:
-          protection: 0.0
-          rcut: 4.0
-          rcut_smth: 0.5
-          use_exp_switch: false
-        repflow_layers:
-          - "@class": RepFlowLayer
-            "@variables":
-              a_residual: []
-              e_residual:
-                - "@class": np.ndarray
-                  "@is_variable": true
-                  "@version": 1
-                  dtype: float64
-                  value:
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-              n_residual:
-                - "@class": np.ndarray
-                  "@is_variable": true
-                  "@version": 1
-                  dtype: float64
-                  value:
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                - "@class": np.ndarray
-                  "@is_variable": true
-                  "@version": 1
-                  dtype: float64
-                  value:
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                - "@class": np.ndarray
-                  "@is_variable": true
-                  "@version": 1
-                  dtype: float64
-                  value:
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-                    - 0.1
-            "@version": 2
-            a_compress_e_rate: 1
-            a_compress_rate: 0
-            a_compress_use_split: false
-            a_dim: 4
-            a_rcut: 3.5
-            a_rcut_smth: 0.5
-            a_sel: 4
-            activation_function: silu
-            axis_neuron: 4
-            e_dim: 6
-            e_rcut: 4.0
-            e_rcut_smth: 0.5
-            e_sel:
-              - 8
-            edge_self_linear:
-              "@class": Layer
-              "@variables":
-                b:
-                  "@class": np.ndarray
-                  "@is_variable": true
-                  "@version": 1
-                  dtype: float64
-                  value:
-                    - 0.023029640256287606
-                    - -0.1553057617002839
-                    - 0.288173154238969
-                    - 0.05107253540449686
-                    - 0.2614964936530096
-                    - -0.46218537330158843
-                idt: null
-                w:
-                  "@class": np.ndarray
-                  "@is_variable": true
-                  "@version": 1
-                  dtype: float64
-                  value:
-                    - - -0.05677878533841125
-                      - 0.08257140520321203
-                      - -0.17682541236689134
-                      - -0.06780335156677138
-                      - 0.2613225810769312
-                      - -0.2528499571680411
-                    - - -0.1060811611888437
-                      - 0.2834597688459181
-                      - 0.17021435817066793
-                      - -0.2119118384053945
-                      - 0.044257126661162875
-                      - -0.20348530980582044
-                    - - -0.17206345113221683
-                      - 0.19628652842871394
-                      - -0.03853877890775931
-                      - 0.0064469870425646406
-                      - -0.2035021228657865
-                      - 0.33893151231499635
-                    - - -0.1603541649096622
-                      - -0.07431170557593793
-                      - -0.1285051383598977
-                      - -0.09531516630926942
-                      - -0.10774430641710406
-                      - 0.10558546868439946
-                    - - -0.027408677366010784
-                      - 0.03171038951939523
-                      - -0.26649612755080526
-                      - 0.0749559135333121
-                      - 0.12753219377780048
-                      - -0.12375862279261161
-                    - - -0.3561807917324476
-                      - -0.028580689473013433
-                      - -0.2740045204725747
-                      - 0.12423725221263406
-                      - -0.0746927118825747
-                      - -0.16583458613892285
-                    - - -0.22497394079088218
-                      - -0.10329264538957945
-                      - -0.06015496745765555
-                      - -0.24047390264558702
-                      - -0.2470728805254821
-                      - -0.03091482236168157
-                    - - 0.313786674711663
-                      - -0.014345848137540798
-                      - -0.1446657411476756
-                      - -0.11134433415995286
-                      - -0.10957716367503506
-                      - 0.25318230359455945
-                    - - -0.11353216449019704
-                      - -0.24278855525542462
-                      - -0.0657328669264313
-                      - -0.08357873620530161
-                      - -0.19969579432068596
-                      - 0.0217399962565733
-                    - - -0.017346111123240478
-                      - -0.20460540022763518
-                      - 0.19580548714183002
-                      - 0.26081320850512657
-                      - -0.01937612111130992
-                      - 0.26782602217325135
-                    - - 0.169450738702664
-                      - 0.0007586409725729347
-                      - 0.2217946757639642
-                      - -0.08785618340632881
-                      - 0.08754553673729902
-                      - 0.0459550075486224
-                    - - -0.10347442434861592
-                      - -0.05665265742992996
-                      - -0.15657294594958857
-                      - 0.07518451488260593
-                      - -0.200469822163535
-                      - 0.008552407309104103
-                    - - -0.13418495292451688
-                      - -0.15007855071339413
-                      - -0.47561245640659827
-                      - 0.05519145026405931
-                      - 0.034426127944687676
-                      - -0.19833628864440492
-                    - - -0.061539077996517144
-                      - 0.140236735963936
-                      - 0.44907007382747016
-                      - -0.17502514002597466
-                      - -0.13141545313528988
-                      - -0.10225960767785013
-                    - - 0.15849623153238157
-                      - 0.14969793438965767
-                      - 0.05020396887825857
-                      - -0.42237393212574514
-                      - -0.43560848414739306
-                      - -0.34368434587411545
-                    - - -0.090558268807612
-                      - -0.10586479947976848
-                      - -0.17654116465986686
-                      - -0.17464251661717356
-                      - -0.17707748016637653
-                      - 0.4728011907076426
-                    - - 0.06839467741547146
-                      - 0.20172332403056187
-                      - -0.20761658659357723
-                      - -0.3179201458113386
-                      - 0.1570191398976539
-                      - 0.30829366728408747
-                    - - -0.07346831672915768
-                      - -0.01603422028135059
-                      - -0.2343121216044693
-                      - -0.09228986456390967
-                      - -0.12259985802096685
-                      - 0.13925704477109332
-                    - - 0.03112673892006412
-                      - -0.12259170091097643
-                      - -0.01873720650800844
-                      - -0.02825905483134531
-                      - -0.07410620360262994
-                      - -0.13890487670689447
-                    - - 0.2599426512954838
-                      - -0.030475413023044056
-                      - 0.04418102981236639
-                      - 0.14747916053674695
-                      - 0.11469436259489629
-                      - -0.12589465767715197
-                    - - 0.1534348683560527
-                      - -0.2598559665351654
-                      - 0.1691188844559884
-                      - -0.05815067519957393
-                      - -0.09922406205302857
-                      - -0.026111067214965193
-                    - - -0.09469687008709152
-                      - -0.25433614509748265
-                      - -0.3230603080176275
-                      - 0.10565697308598668
-                      - 0.11382397843310456
-                      - 0.12636033735242963
-              "@version": 2
-              activation_function: none
-              bias: true
-              precision: float64
-              resnet: false
-              trainable: true
-              use_timestep: false
-            n_dim: 8
-            n_multi_edge_message: 1
-            node_edge_linear:
-              "@class": Layer
-              "@variables":
-                b:
-                  "@class": np.ndarray
-                  "@is_variable": true
-                  "@version": 1
-                  dtype: float64
-                  value:
-                    - -0.08691917211888521
-                    - -0.2190238624015504
-                    - -0.03155197949842874
-                    - -0.06801377552229042
-                    - -0.36593263653854924
-                    - -0.2524793728902088
-                    - -0.2985692887165394
-                    - 0.041241949395424804
-                idt: null
-                w:
-                  "@class": np.ndarray
-                  "@is_variable": true
-                  "@version": 1
-                  dtype: float64
-                  value:
-                    - - -0.0696849625476975
-                      - 0.2297992430114777
-                      - 0.2016390404837622
-                      - -0.14268332516715398
-                      - 0.05104561028973491
-                      - -0.0047524771390660015
-                      - 0.2007647888532239
-                      - -0.13892443853282946
-                    - - -0.06269186432762
-                      - -0.3620323943560662
-                      - 0.1429598213093448
-                      - 0.32251103225335886
-                      - 0.03297508356302982
-                      - 0.09813020765990464
-                      - -0.2128967691985101
-                      - 0.1446653191521595
-                    - - 0.3408694676048958
-                      - 0.2108742996875453
-                      - 0.3708891734344055
-                      - 0.03603632207631642
-                      - -0.021861106604374982
-                      - 0.05885265981211556
-                      - -0.3850668694292795
-                      - -0.02565715855818745
-                    - - 0.2554067355579947
-                      - -0.12385934461529113
-                      - 0.20794804172087122
-                      - 0.34771760519493133
-                      - 0.0030775484903255083
-                      - 0.08033323613153229
-                      - 0.04547640227535313
-                      - 0.03256133523805088
-                    - - -0.20228475171413982
-                      - -0.40882303462099395
-                      - -0.13933573248982073
-                      - -0.09056898648309795
-                      - 0.06705102826758672
-                      - -0.10643998751725821
-                      - 0.3714789434592029
-                      - -0.15660714896565422
-                    - - 0.0620445405627027
-                      - -0.14981233984554174
-                      - 0.1377612457580642
-                      - -0.3264453797874674
-                      - 0.18886992363386892
-                      - -0.13120999191064697
-                      - 0.2639396300281778
-                      - 0.20744058178112204
-                    - - 0.0902283316504931
-                      - 0.2642720697422412
-                      - 0.11616051352480065
-                      - 0.33194344559115435
-                      - -0.07519119975054182
-                      - -0.05062288710700148
-                      - -0.0033899752949634763
-                      - 0.12074780296663348
-                    - - -0.14625494457636853
-                      - -0.39187048236106403
-                      - 0.005863181213654556
-                      - 0.08058988215606765
-                      - 0.229684677996952
-                      - 0.02491096095922189
-                      - -0.07923462148233366
-                      - 0.03463149425218323
-                    - - 0.1459761391053138
-                      - 0.1826916307305693
-                      - -0.24330282168960599
-                      - -0.15404338160080283
-                      - -0.15732528026070658
-                      - 0.10082194118502665
-                      - 0.10780094880007303
-                      - 0.10439459076027502
-                    - - 0.2967580322447816
-                      - 0.19310548831515634
-                      - 0.13271337197427788
-                      - -0.003964549207383962
-                      - -0.3053587625881187
-                      - 0.12883374510336365
-                      - 0.045960329737757634
-                      - 0.19761345822107057
-                    - - -0.13773367016862742
-                      - 0.09659775346412201
-                      - 0.13561552570758134
-                      - 0.07814681408507194
-                      - -0.28773064288403055
-                      - 0.07556744144481048
-                      - -0.09838713644355178
-                      - 0.009867107649393275
-                    - - -0.09655717020123253
-                      - -0.10871554819348611
-                      - -0.11670258568304015
-                      - 0.2177137774640066
-                      - -0.14817421356773255
-                      - -0.03606693672811542
-                      - -0.026029214690369364
-                      - -0.040666475049662726
-                    - - -0.0677385423671006
-                      - -0.12993597893178244
-                      - 0.1180039874263662
-                      - 0.1384604584579823
-                      - -0.024227421664540914
-                      - 0.1679245814762119
-                      - -0.19280274838451647
-                      - 0.0990223630355508
-                    - - 0.0758415027141385
-                      - 0.16215196433523008
-                      - 0.2767732385588474
-                      - 0.022163750613004355
-                      - -0.12254120989786124
-                      - -0.12391951174230557
-                      - -0.028791741351884195
-                      - -0.0595519969823867
-                    - - 0.22247449036902905
-                      - 0.07567917899966987
-                      - -0.18221068561029122
-                      - -0.1496346790319525
-                      - 0.01739141266484531
-                      - 0.03295277270138665
-                      - -0.27927822171693173
-                      - -0.13558030103477586
-                    - - 0.1712575942124072
-                      - 0.13705603104177683
-                      - 0.290608271870899
-                      - 0.25077636518593155
-                      - 0.06723740912894116
-                      - -0.29077479630216374
-                      - -0.25998108625190797
-                      - 0.15096707384533595
-                    - - -0.011258223444056591
-                      - 0.07940059884337107
-                      - 0.14539160696529732
-                      - -0.33401238196882443
-                      - 0.0359760729699335
-                      - -0.02226084988227022
-                      - 0.12276616178918343
-                      - 0.0439592954772777
-                    - - 0.07596667366672871
-                      - -0.11052600964268607
-                      - -0.13155071841622368
-                      - -0.07425437999013539
-                      - 0.00827734508288158
-                      - 0.07414300482320346
-                      - 0.052019022231599196
-                      - 0.16368644986528788
-                    - - 0.31022863799320216
-                      - 0.11380817934759249
-                      - 0.11671054675679823
-                      - 0.03833224311415518
-                      - 0.1545146635596559
-                      - 0.5283089690392868
-                      - -0.17235747525638992
-                      - -0.16802245441710034
-                    - - 0.19547575805994974
-                      - 0.03442738806627725
-                      - 0.035134165349037516
-                      - 0.1685202553837112
-                      - -0.13706885637245225
-                      - -0.09105484518308726
-                      - 0.24401116664356562
-                      - -0.042463896239058455
-                    - - 0.18293429344914702
-                      - -0.0797150153045118
-                      - 0.2837300628985514
-                      - -0.03290000697254011
-                      - 0.07484025269991934
-                      - 0.4486382833349405
-                      - 0.18215765586473062
-                      - 0.14222755521955213
-                    - - -0.054949228485595726
-                      - 0.2298266346316468
-                      - -0.13022437426681047
-                      - 0.31473958548227127
-                      - -0.16053599380138361
-                      - 0.12351036770696595
-                      - -0.2026640600757936
-                      - -0.3120452604960154
-              "@version": 2
-              activation_function: none
-              bias: true
-              precision: float64
-              resnet: false
-              trainable: true
-              use_timestep: false
-            node_self_mlp:
-              "@class": Layer
-              "@variables":
-                b:
-                  "@class": np.ndarray
-                  "@is_variable": true
-                  "@version": 1
-                  dtype: float64
-                  value:
-                    - 0.3339521660758233
-                    - 0.19617864215638078
-                    - 0.11685150273643896
-                    - -0.04301114015831818
-                    - -0.2646745547826684
-                    - -0.05874585577443532
-                    - 0.4130256006886377
-                    - -0.6003500792716773
-                idt: null
-                w:
-                  "@class": np.ndarray
-                  "@is_variable": true
-                  "@version": 1
-                  dtype: float64
-                  value:
-                    - - -0.22061013087519665
-                      - 0.17161694901625085
-                      - 0.25079797681294247
-                      - -0.06984190636344022
-                      - 0.402412783105689
-                      - -0.13232509868240386
-                      - -0.12410592033624109
-                      - -0.5243896508356666
-                    - - -0.34531669337816745
-                      - 0.2590681097532894
-                      - -0.4170438578433154
-                      - 0.33209656716128205
-                      - 0.20907222698506978
-                      - 0.21026382825889875
-                      - -0.04125433055358784
-                      - -0.3362049950725693
-                    - - -0.02306669199993831
-                      - -0.27140136827851236
-                      - 0.08675906253383281
-                      - 0.20991982378397447
-                      - -0.20157467157772102
-                      - 0.10954533237221269
-                      - -0.30521247150866015
-                      - 0.1039196228402914
-                    - - 0.2927901959232568
-                      - -0.05686111266739088
-                      - -0.352867716741099
-                      - 0.06499009437306054
-                      - 0.2935084094905296
-                      - -0.5208455549268021
-                      - -0.06412894033597939
-                      - 0.2617524844957687
-                    - - -0.26859205166611555
-                      - -0.017740123512057532
-                      - -0.16973184286647353
-                      - -0.041497625408519805
-                      - -0.33848186563738925
-                      - -0.498133067071094
-                      - 0.06453515847241846
-                      - -0.28211046673410256
-                    - - -0.0031712540783364537
-                      - 0.14054927501098227
-                      - -0.16739625499774285
-                      - 0.02924799819668618
-                      - 0.19945724852581612
-                      - -0.07433092972702877
-                      - 0.33641837410477954
-                      - -0.1935354318143647
-                    - - -0.2896583115032089
-                      - -0.4291374752779325
-                      - 0.18521131755882006
-                      - 0.036186935403130116
-                      - 0.27669775576389155
-                      - -0.04763160274577408
-                      - 0.1400908330823242
-                      - 0.15697986928574623
-                    - - -0.45902865822845124
-                      - 0.33250108656046035
-                      - 0.0306169230429561
-                      - -0.035381192364331175
-                      - -0.0510947377580893
-                      - 0.03972955950151097
-                      - 0.6129808284962325
-                      - 0.027297205883797467
-              "@version": 2
-              activation_function: none
-              bias: true
-              precision: float64
-              resnet: false
-              trainable: true
-              use_timestep: false
-            node_sym_linear:
-              "@class": Layer
-              "@variables":
-                b:
-                  "@class": np.ndarray
-                  "@is_variable": true
-                  "@version": 1
-                  dtype: float64
-                  value:
-                    - 0.04402011262092976
-                    - 0.19539133788288796
-                    - 0.02243486288225181
-                    - -0.15932598464026163
-                    - -0.1441065175896103
-                    - -0.20205704607775893
-                    - 0.007090553889850609
-                    - -0.20221671762001667
-                idt: null
-                w:
-                  "@class": np.ndarray
-                  "@is_variable": true
-                  "@version": 1
-                  dtype: float64
-                  value:
-                    - - 0.07307166266137728
-                      - 0.06127645860254937
-                      - -0.18492998679736772
-                      - 0.04613999102916452
-                      - 0.0071079000479441
-                      - -0.03731231022031221
-                      - -0.09483134725409749
-                      - -0.04779952388125717
-                    - - -0.06975495248291474
-                      - -0.19948091645922555
-                      - -0.19101500000694704
-                      - -0.0756612239190429
-                      - 0.18223713459959498
-                      - 0.004660326879702162
-                      - -0.07331926290215518
-                      - -0.11804049351864328
-                    - - -0.008643603296694117
-                      - -0.07891692454926386
-                      - -0.24683520896350286
-                      - -0.07498319216962253
-                      - 0.14604675984008694
-                      - 0.09601184516912262
-                      - -0.01561740011879576
-                      - 0.05490167651869453
-                    - - -0.019884970427089133
-                      - 0.0007666914047260165
-                      - -0.16505651916265357
-                      - -0.16723740821054547
-                      - 0.1234653183096876
-                      - -0.04403642108952563
-                      - -0.03727304005788303
-                      - -0.18190516409632088
-                    - - -0.09168767286099873
-                      - -0.1549419399425698
-                      - 0.08193144903871091
-                      - 0.15640675555750194
-                      - -0.06305034848986912
-                      - 0.16836512195133213
-                      - 0.009048302220263893
-                      - 0.05322075713280992
-                    - - -0.06468543813846328
-                      - 0.0948348631241292
-                      - 0.006867290444906741
-                      - -0.24931773871448817
-                      - 0.08788089155489308
-                      - -0.0739514302480491
-                      - 0.025288498321181765
-                      - -0.08305521153831118
-                    - - -0.07393598220040017
-                      - 0.07042478981157554
-                      - -0.07236047649200875
-                      - -0.04706083253081129
-                      - 0.011054293351306345
-                      - -0.08799610585856558
-                      - 0.1563680796185477
-                      - 0.04333789772104407
-                    - - -0.10653678039670528
-                      - 0.18112426500221723
-                      - 0.009186401470971654
-                      - 0.006153194931152504
-                      - -0.04989535662898608
-                      - -0.17876067282409114
-                      - -0.15602193322162777
-                      - 0.00781917954318876
-                    - - -0.06699569918753231
-                      - 0.30735630566871885
-                      - -0.016096279041795645
-                      - -0.22956358044083025
-                      - -0.0065529000816888765
-                      - -0.06902463180143781
-                      - 0.06768609922953323
-                      - 0.1187567871665586
-                    - - -0.03935798540152214
-                      - -0.10867329670955546
-                      - -0.04052094555163571
-                      - -0.04078630187590839
-                      - -0.0748763378601901
-                      - 0.01860182181594992
-                      - 0.20057959184872112
-                      - 0.16046549209905156
-                    - - 0.031478615338666395
-                      - 0.11567514563874234
-                      - 0.08294594125898624
-                      - -0.07089853590674343
-                      - 0.20101923186451937
-                      - -0.11766930025015115
-                      - 0.21570163379940235
-                      - 0.14563108587004206
-                    - - 0.07932986781606469
-                      - -0.19442968907969105
-                      - 0.05697454617840562
-                      - -0.19484656091831729
-                      - 0.04754566926156801
-                      - -0.12152155059832441
-                      - 0.08105546170302243
-                      - -0.09483406966077029
-                    - - -0.10943334690817784
-                      - 0.11702284889224986
-                      - 0.06551144399385757
-                      - -0.003108503735325857
-                      - -0.1466684268106551
-                      - 0.11582453333312602
-                      - -0.19609870968779317
-                      - -0.11809063481420465
-                    - - -0.11120967058944209
-                      - -0.07178289284260277
-                      - -0.07505138171189361
-                      - -0.17137771295621249
-                      - -0.012516091428859523
-                      - 0.056132912587423756
-                      - -0.011172736867909887
-                      - 0.0014926969164057145
-                    - - -0.1652803302650934
-                      - 0.08452449793427
-                      - -0.06260662069159101
-                      - -0.07909718643578055
-                      - 0.00574135469567161
-                      - -0.05691391300603163
-                      - 0.2457179942284785
-                      - -0.08037694862142311
-                    - - 0.1761032538671494
-                      - -0.15524353322856968
-                      - -0.20338260987738993
-                      - -0.09738847488694806
-                      - 0.05960295717975261
-                      - -0.0268406105291267
-                      - 0.19154482080963495
-                      - -0.05557739958347549
-                    - - -0.23162474155138468
-                      - 0.005428848189956548
-                      - 0.14498512403306713
-                      - 0.015859517797165032
-                      - 0.13342303538966063
-                      - 0.07757097608660568
-                      - 0.061885304048992174
-                      - -0.02774862502778554
-                    - - -0.099674682792698
-                      - 0.1743242267060875
-                      - 0.0565895993699819
-                      - -0.1431728246694354
-                      - -0.04572377374634247
-                      - 0.1932522842767088
-                      - -0.13605774184771868
-                      - -0.079596349847149
-                    - - 0.015159290423222593
-                      - 0.0741788473825365
-                      - -0.025111776236424455
-                      - 0.11728977172727281
-                      - -0.05246129405331076
-                      - -0.3560652693695576
-                      - 0.22489664505020285
-                      - -0.11322150427163667
-                    - - 0.1172685876179488
-                      - 0.015449206720498673
-                      - 0.11464505230123948
-                      - -0.13045379503420262
-                      - -0.18460226345307634
-                      - -0.0735660416536509
-                      - 0.02668836976483192
-                      - 0.009471901506209893
-                    - - -0.12415218588856815
-                      - -0.028427823628392242
-                      - -0.0726329032188482
-                      - 0.2205454016716484
-                      - -0.06981635935553832
-                      - -0.06914918285976224
-                      - -0.07547512647684368
-                      - 0.19585301943839276
-                    - - 0.02068794647278527
-                      - 0.11434955856950152
-                      - 0.04733548159377606
-                      - -0.0940771421180628
-                      - 0.106950218084799
-                      - 0.11995323224700441
-                      - 0.07016105028143815
-                      - -0.07349788842232614
-                    - - -0.028316732941958092
-                      - -0.006316920155388264
-                      - 0.014323448114816232
-                      - 0.07909510285143638
-                      - 0.08089223619428912
-                      - -0.1285448965066473
-                      - -0.02731037643994388
-                      - -0.048232324099890284
-                    - - -0.04229476466912251
-                      - -0.10545582133061814
-                      - -0.1399519987577358
-                      - -0.24859786794141928
-                      - 0.04555029533580089
-                      - -0.06637709714144181
-                      - -0.11891839416041088
-                      - -0.05608836594526548
-                    - - -0.1481671676394082
-                      - -0.11826472343612228
-                      - 0.18759449377634982
-                      - -0.0027813243183313764
-                      - -0.06187858233767373
-                      - -0.16870507895423517
-                      - 0.15432198341660605
-                      - 0.2442525725033602
-                    - - 0.11655618965628044
-                      - 0.16410614799338208
-                      - -0.15922334755571288
-                      - 0.05294100944284731
-                      - -0.042676438943807564
-                      - 0.05982722192738627
-                      - 0.08818007330306689
-                      - 0.08799006862019813
-                    - - -0.1816952674192488
-                      - 0.33018315199731113
-                      - 0.14825048237904745
-                      - -0.12977688627249692
-                      - -0.014039894202582361
-                      - 0.021698570605095405
-                      - -0.10536292700472008
-                      - -0.016298405526400214
-                    - - 0.18891280861168214
-                      - -0.037066320234429954
-                      - 0.051989201606798936
-                      - -0.33236261122879446
-                      - -0.2233240290736924
-                      - 0.17632501110044835
-                      - 0.02791043546786102
-                      - 0.08058616657592761
-                    - - -0.12416787825473675
-                      - -0.0018776550590277605
-                      - -0.1361594510955972
-                      - -0.031008628174283
-                      - -0.1510470016534144
-                      - -0.1968118582063139
-                      - 0.05923927005740039
-                      - 0.10906017525194028
-                    - - -0.01747528984400593
-                      - 0.043571037430286425
-                      - 0.09735765094593854
-                      - 0.038496104792229716
-                      - 0.021583898030338507
-                      - -0.11795161808253331
-                      - -0.11404406907043374
-                      - -0.06541831356900717
-                    - - 0.05781757062086345
-                      - 0.06545403068342133
-                      - 0.07182196888387801
-                      - 0.06571017380833269
-                      - 0.25549620850343796
-                      - -0.01712221435859817
-                      - 0.02746476505848508
-                      - -0.16813933068880024
-                    - - 0.15811742659496866
-                      - -0.10097487333290259
-                      - 0.0007478905750386516
-                      - 0.15986815657402492
-                      - 0.0879704571647486
-                      - 0.051839404360383305
-                      - 0.04773139180116972
-                      - -0.1562216704347126
-                    - - -0.00554177026701311
-                      - 0.026672084558123862
-                      - -0.026556168406945337
-                      - 0.017618135480540704
-                      - 0.04290442846891425
-                      - -0.16108845422437917
-                      - 0.03885069382837762
-                      - -0.08559226312134341
-                    - - -0.10984387513362157
-                      - 0.06020841962256015
-                      - 0.013439129456291792
-                      - -0.1211722988008539
-                      - 0.0321361577334442
-                      - 0.04742132269747014
-                      - -0.08371259477093888
-                      - -0.14250805695920574
-                    - - 0.04498243399350513
-                      - -0.03633434279549633
-                      - 0.17043129619564554
-                      - 0.13738977779076048
-                      - 0.03367329367643751
-                      - 0.13141345496526305
-                      - 0.14626062464255066
-                      - -0.087660426894852
-                    - - 0.13304548046946202
-                      - -0.02074921039690319
-                      - -0.19614199925540662
-                      - 0.09145888259449976
-                      - -0.16872056060024043
-                      - -0.057806035869808946
-                      - -0.012927002228426554
-                      - -0.18968555494779932
-                    - - 0.09056415309144267
-                      - 0.19579647713404205
-                      - 0.12419307551929215
-                      - 0.03068324855507999
-                      - 0.16324257199502792
-                      - 0.28864177653836015
-                      - -0.04884842530407823
-                      - 0.05243039778651716
-                    - - -0.1354513040660592
-                      - -0.0032083328727676315
-                      - 0.035763067000830435
-                      - -0.10752629467535854
-                      - -0.004527627068300205
-                      - -0.26678729966885645
-                      - 0.16095749546546945
-                      - -0.0768457166279081
-                    - - 0.24290534029168284
-                      - -0.19993818991295886
-                      - 0.05863500838014017
-                      - 0.1075745460176732
-                      - -0.2703641493668329
-                      - 0.022882752217475207
-                      - -0.18377439784177813
-                      - -0.02475991439750886
-                    - - -0.0970343883793403
-                      - 0.022190761521183
-                      - -0.31137609288015433
-                      - -0.12852583938411438
-                      - 0.06380585650762231
-                      - -0.05537350140183391
-                      - 0.009834307052428782
-                      - -0.18327381164681603
-                    - - -0.058720582106338445
-                      - 0.012207974777885133
-                      - 0.04906298973398652
-                      - -0.0252045636071624
-                      - 0.04064401311527239
-                      - -0.12030307623147056
-                      - -0.02607458251331658
-                      - -0.12104904963385374
-                    - - 0.14380149442345772
-                      - -0.08586187966457755
-                      - -0.0562380312253021
-                      - 0.1183995520092173
-                      - -0.008618891616010692
-                      - -0.30556252122213096
-                      - 0.157107693967395
-                      - -0.150824446001649
-                    - - -0.12986340463514554
-                      - -0.13953775800615473
-                      - 0.06688782609307184
-                      - 0.30709990962247197
-                      - -0.10057794483875744
-                      - -0.15572836837520085
-                      - 0.22240522808485344
-                      - 0.07486567450323982
-                    - - -0.00026497955681491453
-                      - -0.462148220797257
-                      - -0.04683339159019641
-                      - 0.10954858908660245
-                      - 0.048155719596331595
-                      - -0.08404934441388894
-                      - 0.15848474948089222
-                      - -0.029754000979091536
-                    - - -0.008795641657631076
-                      - -0.021341761230446545
-                      - -0.10489671109204046
-                      - 0.03213370243212562
-                      - -0.021792936100149974
-                      - -0.018371450392434912
-                      - 0.0007292277382723748
-                      - 0.07679112359755517
-                    - - -0.06130007400378907
-                      - -0.06581095863692285
-                      - 0.06501448048047738
-                      - -0.14197246804370967
-                      - 0.15983589537290877
-                      - -0.15693380789472725
-                      - -0.17963845906090375
-                      - 0.10204145028546817
-                    - - -0.07077050429398143
-                      - 0.1990098057969514
-                      - -0.2525111691805106
-                      - -0.22059894251537618
-                      - -0.27531410890875607
-                      - -0.0693243961021514
-                      - 0.03876302523241355
-                      - 0.12122101629786736
-                    - - -0.12820657692829063
-                      - -0.10772035941442479
-                      - 0.10829696580051636
-                      - 0.1493715060396245
-                      - 0.13488833866187872
-                      - 0.09022524867490032
-                      - 0.007332743974581279
-                      - 0.1529338321168549
-                    - - -0.22245363971842472
-                      - -0.08917661330105822
-                      - 0.10304564318043377
-                      - -0.07026805272160686
-                      - 0.016625750231852813
-                      - 0.23074109385732217
-                      - 0.053971407495566504
-                      - -0.15089059679319458
-                    - - 0.1294396068073317
-                      - -0.038487426453509534
-                      - 0.09393650831599386
-                      - 0.09638990927578407
-                      - 0.17905918157852316
-                      - 0.06760574587425355
-                      - 0.0639998107196389
-                      - -0.1587157815816586
-                    - - 0.06077231806824999
-                      - 0.006159909130812671
-                      - 0.15285274367932117
-                      - -0.026531120401424045
-                      - 0.06104797756042876
-                      - -0.174933801016035
-                      - 0.25284181425638513
-                      - -0.16931699181750984
-                    - - -0.09480440252644158
-                      - -0.11919995631753837
-                      - 0.1374865485894956
-                      - 0.03525829583245701
-                      - 0.055414318086174905
-                      - 0.039970825479268265
-                      - -0.028476173719310948
-                      - 0.007895110382084259
-                    - - -0.08849522170883828
-                      - 0.1556903658898126
-                      - -0.06942905817654972
-                      - 0.17917871676321492
-                      - -0.12839965901095401
-                      - -0.1457242708290995
-                      - 0.2073632537418445
-                      - -0.0033056633245595168
-                    - - -0.14321940581992326
-                      - 0.016216983383358995
-                      - -0.05603214608550905
-                      - 0.034067014410779244
-                      - -0.004165932252642813
-                      - 0.03579825379823718
-                      - 0.2274077472661256
-                      - 0.12282153328534674
-                    - - -0.17424677728325255
-                      - 0.03032450606197887
-                      - -0.3407467917235723
-                      - 0.08460871296272927
-                      - -0.21233509125037692
-                      - 0.038581785470083826
-                      - -0.1271081651221865
-                      - -0.05674635282930029
-                    - - -0.06365889303105148
-                      - -0.0346798442701684
-                      - 0.04178115473238202
-                      - -0.03570145798701077
-                      - 0.2255873927499116
-                      - -0.21936512330368732
-                      - -0.19469567244011848
-                      - -0.007461014512643234
-              "@version": 2
-              activation_function: none
-              bias: true
-              precision: float64
-              resnet: false
-              trainable: true
-              use_timestep: false
-            ntypes: 4
-            optim_update: true
-            precision: float64
-            sel_reduce_factor: 10.0
-            smooth_edge_update: false
-            update_angle: false
-            update_residual: 0.1
-            update_residual_init: const
-            update_style: res_residual
-            use_dynamic_sel: false
-      trainable: true
-      type: dpa3
-      type_embedding:
-        "@class": TypeEmbedNet
-        "@version": 2
-        activation_function: Linear
-        embedding:
-          "@class": EmbeddingNetwork
-          "@version": 2
-          activation_function: Linear
-          bias: false
-          in_dim: 4
-          layers:
-            - "@class": Layer
-              "@variables":
-                b: null
-                idt: null
-                w:
-                  "@class": np.ndarray
-                  "@is_variable": true
-                  "@version": 1
-                  dtype: float64
-                  value:
-                    - - 0.06913868355931278
-                      - -0.3276059448146492
-                      - -0.22478586008940918
-                      - -0.03129740042629991
-                      - -0.2511436154794455
-                      - -0.4760319710462916
-                      - 0.183856376649989
-                      - 0.220680920691283
-                    - - -0.1331166944050067
-                      - -0.2985446381663858
-                      - -0.1299144028716818
-                      - 0.12716526105014014
-                      - 0.24445281051361242
-                      - 0.052359417290304015
-                      - -0.06639194378815659
-                      - -0.0515428623822807
-                    - - -0.3302870133986425
-                      - 0.1177804767091647
-                      - 0.06915893387117533
-                      - -0.4204302050492702
-                      - -0.3161145657939801
-                      - 0.322920377419993
-                      - 0.19395457855721343
-                      - -0.11365337655752422
-                    - - -0.16993400446851198
-                      - -0.157416126804567
-                      - -0.08090448953478106
-                      - 0.20830555342316676
-                      - -0.11308079862243182
-                      - 0.044490575624147384
-                      - 0.28211395871639494
-                      - 0.07920112686609734
-              "@version": 2
-              activation_function: Linear
-              bias: false
-              precision: float64
-              resnet: true
-              trainable: true
-              use_timestep: false
-          neuron:
-            - 8
-          precision: float64
-          resnet_dt: false
-        neuron:
-          - 8
-        ntypes: 4
-        padding: true
-        precision: float64
-        resnet_dt: false
-        trainable: true
-        type_map: &id001
-          - Ni
-          - O
-          - Ni_spin
-          - O_spin
-        use_econf_tebd: false
-        use_tebd_bias: false
-      type_map: *id001
-      use_econf_tebd: false
-      use_loc_mapping: false
-      use_tebd_bias: false
-    fitting:
-      "@class": Fitting
-      "@variables":
-        aparam_avg: null
-        aparam_inv_std: null
-        bias_atom_e:
-          "@class": np.ndarray
-          "@is_variable": true
-          "@version": 1
-          dtype: float64
-          value:
-            - - 0.0
-            - - 0.0
-            - - 0.0
-            - - 0.0
-        case_embd: null
-        fparam_avg: null
-        fparam_inv_std: null
-      "@version": 4
-      activation_function: tanh
-      atom_ener: null
-      default_fparam: null
-      dim_case_embd: 0
-      dim_descrpt: 8
-      dim_out: 1
-      exclude_types: *id002
-      layer_name: null
-      mixed_types: true
-      nets:
-        "@class": NetworkCollection
-        "@version": 1
-        ndim: 0
-        network_type: fitting_network
-        networks:
-          - "@class": FittingNetwork
-            "@version": 1
-            activation_function: tanh
-            bias_out: true
-            in_dim: 8
-            layers:
-              - "@class": Layer
-                "@variables":
-                  b:
-                    "@class": np.ndarray
-                    "@is_variable": true
-                    "@version": 1
-                    dtype: float64
-                    value:
-                      - -0.17635825349201156
-                      - -0.3566199551346283
-                      - -0.4657350300900149
-                      - 0.49182010702811113
-                      - 0.032647600656972545
-                  idt: null
-                  w:
-                    "@class": np.ndarray
-                    "@is_variable": true
-                    "@version": 1
-                    dtype: float64
-                    value:
-                      - - 0.19950263394684065
-                        - 0.05634765583345527
-                        - -0.1442129593712478
-                        - -0.1085774516963511
-                        - 0.11311331965894553
-                      - - -0.2775489843491954
-                        - -0.35666203499239274
-                        - -0.3389432389106902
-                        - -0.05632492275434322
-                        - -0.48859095817655873
-                      - - -0.0295274439718225
-                        - -0.1886895411820409
-                        - 0.53672545544271
-                        - 0.07574020379061007
-                        - -0.42704120525642686
-                      - - -0.00993498946754372
-                        - 0.3770750367653306
-                        - -0.4385261113155961
-                        - 0.0468328088042057
-                        - 0.012607351815014095
-                      - - 0.1092056939586687
-                        - -0.08440204904008866
-                        - -0.6198116015257329
-                        - 0.1936974618526528
-                        - -0.11584195169630225
-                      - - -0.6395628609700832
-                        - -0.3937842385131085
-                        - -0.1370675696847499
-                        - -0.08281792882082432
-                        - -0.14269944588470002
-                      - - 0.003683595092519098
-                        - -0.1064836461083355
-                        - 0.1513375212109038
-                        - -0.3798449359483027
-                        - -0.27711500793004523
-                      - - -0.24136291455222364
-                        - -0.19077785910921263
-                        - -0.12067289115480624
-                        - -0.05720709372900689
-                        - -0.044669501979496415
-                "@version": 2
-                activation_function: tanh
-                bias: true
-                precision: float64
-                resnet: true
-                trainable: true
-                use_timestep: false
-              - "@class": Layer
-                "@variables":
-                  b:
-                    "@class": np.ndarray
-                    "@is_variable": true
-                    "@version": 1
-                    dtype: float64
-                    value:
-                      - 0.09501004658257778
-                      - 0.1663807327224991
-                      - -0.5185313341630086
-                      - -0.7740662908662731
-                      - -0.18752579321547022
-                  idt:
-                    "@class": np.ndarray
-                    "@is_variable": true
-                    "@version": 1
-                    dtype: float64
-                    value:
-                      - -0.2188085499554977
-                      - -0.4014642473754725
-                      - 0.032489550654357095
-                      - 0.06343911616091243
-                      - -0.00407617112574573
-                  w:
-                    "@class": np.ndarray
-                    "@is_variable": true
-                    "@version": 1
-                    dtype: float64
-                    value:
-                      - - -0.3141891418069332
-                        - 0.30132598326837057
-                        - -0.1868614701027005
-                        - -0.1853536726835805
-                        - -0.14904917553209618
-                      - - -0.4993776326714626
-                        - 0.2929711950476154
-                        - -0.3300253064210836
-                        - -0.4799775188835898
-                        - -0.12327559985245252
-                      - - 0.16627900477763782
-                        - 0.18281489789715116
-                        - -0.0796215789550366
-                        - 0.11637836794519682
-                        - 0.019126199990905587
-                      - - 0.47193798042526686
-                        - 0.3935489978037474
-                        - 0.1926588188573466
-                        - 0.11685532990383077
-                        - -0.3143759410105157
-                      - - 0.2619509948079511
-                        - 0.17134734041574828
-                        - 0.16467987243470003
-                        - -0.17768942725372738
-                        - 0.17196893072212313
-                "@version": 2
-                activation_function: tanh
-                bias: true
-                precision: float64
-                resnet: true
-                trainable: true
-                use_timestep: true
-              - "@class": Layer
-                "@variables":
-                  b:
-                    "@class": np.ndarray
-                    "@is_variable": true
-                    "@version": 1
-                    dtype: float64
-                    value:
-                      - 0.1498329073500072
-                      - -0.10390305511196503
-                      - -0.7262688617464856
-                      - -0.14980303343140125
-                      - -0.3578894004618838
-                  idt:
-                    "@class": np.ndarray
-                    "@is_variable": true
-                    "@version": 1
-                    dtype: float64
-                    value:
-                      - 0.3290381873321775
-                      - 0.23103250534551598
-                      - -0.6940851206117438
-                      - -0.19335307745332778
-                      - -0.9240817753801489
-                  w:
-                    "@class": np.ndarray
-                    "@is_variable": true
-                    "@version": 1
-                    dtype: float64
-                    value:
-                      - - -0.3092290441156226
-                        - -0.496367611501348
-                        - -0.052492949379292775
-                        - 0.06663748312823926
-                        - 0.027714401468510886
-                      - - -0.10433141997317527
-                        - -0.323901631855259
-                        - -0.24739439873488192
-                        - 0.3076895568713741
-                        - 0.1593814472209255
-                      - - -0.07111829721069259
-                        - -0.27598680250101504
-                        - 0.16632764307325093
-                        - 0.1801382402999823
-                        - 0.3107523993064097
-                      - - -0.012140157566561928
-                        - 0.07469305237763302
-                        - 0.26428018852282276
-                        - -0.11500213881655802
-                        - -0.2731498304335624
-                      - - 0.29941998505510775
-                        - 0.39267279762211
-                        - 0.06586779164332648
-                        - 0.10010820203885952
-                        - -0.04143485413490972
-                "@version": 2
-                activation_function: tanh
-                bias: true
-                precision: float64
-                resnet: true
-                trainable: true
-                use_timestep: true
-              - "@class": Layer
-                "@variables":
-                  b:
-                    "@class": np.ndarray
-                    "@is_variable": true
-                    "@version": 1
-                    dtype: float64
-                    value:
-                      - 0.044426160812178636
-                  idt: null
-                  w:
-                    "@class": np.ndarray
-                    "@is_variable": true
-                    "@version": 1
-                    dtype: float64
-                    value:
-                      - - 0.26432565710368733
-                      - - 0.17264367113482967
-                      - - -0.04729186377886323
-                      - - -0.08841444813809296
-                      - - 0.2969145415081517
-                "@version": 2
-                activation_function: none
-                bias: true
-                precision: float64
-                resnet: false
-                trainable: true
-                use_timestep: false
-            neuron:
-              - 5
-              - 5
-              - 5
-            out_dim: 1
-            precision: float64
-            resnet_dt: true
-        ntypes: 4
-      neuron:
-        - 5
-        - 5
-        - 5
-      ntypes: 4
-      numb_aparam: 0
-      numb_fparam: 0
-      precision: float64
-      rcond: null
-      resnet_dt: true
-      spin: null
-      tot_ener_zero: false
-      trainable:
-        - true
-        - true
-        - true
-        - true
-      type: ener
-      type_map:
-        - Ni
-        - O
-        - Ni_spin
-        - O_spin
-      use_aparam_as_mask: false
-      var_name: energy
-    pair_exclude_types: *id003
-    preset_out_bias: null
-    rcond: null
-    type: standard
-    type_map:
-      - Ni
-      - O
-      - Ni_spin
-      - O_spin
-  spin:
-    use_spin:
-      - true
-      - false
-    virtual_scale:
-      - 0.314
-      - 0.0
-  type: spin_ener
-model_def_script:
-  descriptor:
-    precision: float64
-    repflow:
-      a_dim: 4
-      a_rcut: 3.5
-      a_rcut_smth: 0.5
-      a_sel: 4
-      axis_neuron: 4
-      e_dim: 6
-      e_rcut: 4.0
-      e_rcut_smth: 0.5
-      e_sel: 8
-      n_dim: 8
-      nlayers: 1
-      update_angle: false
-    seed: 1
-    type: dpa3
-    use_loc_mapping: false
-  fitting_net:
-    neuron:
-      - 5
-      - 5
-      - 5
-    resnet_dt: true
-    seed: 1
-  spin:
-    use_spin:
-      - true
-      - false
-    virtual_scale:
-      - 0.314
-      - 0.0
-  type_map:
-    - Ni
-    - O
-software: deepmd-kit
-time: "2026-04-30 14:57:42.534472+00:00"
-version: 3.0.0

From 3c9ee65d8e34e8a5c3552706598cd986d1aecdb3 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 2 May 2026 07:51:02 +0800
Subject: [PATCH 23/34] fix(jax): accept comm_dict kwarg in
 forward_common_atomic

The dpmodel layer threads a new ``comm_dict=None`` kwarg through
``forward_common_atomic`` (model and atomic-model levels) so the
pt_expt backend can wire MPI ghost-atom exchange for GNN multi-rank
LAMMPS. The JAX backend overrides ``forward_common_atomic`` with
explicit kwarg lists; without accepting ``comm_dict``, ``dp
convert-backend ... savedmodel`` fails at trace time:

    TypeError: jax_model.forward_common_atomic() got an unexpected
    keyword argument 'comm_dict'

This affected the entire CI matrix on PR #5430 (every Python shard goes
through the savedmodel build prep). Fix: add ``comm_dict: dict |
None = None`` to each JAX override and ``del comm_dict`` (the JAX
path has no MPI ghost exchange).

Files touched: dp_atomic_model, linear_atomic_model,
pairtab_atomic_model (atomic-model level), plus base_model,
dp_model, dp_zbl_model (model level). Paddle's
forward_common_atomic already accepts comm_dict and needs no change.
---
 deepmd/jax/atomic_model/dp_atomic_model.py      | 2 ++
 deepmd/jax/atomic_model/linear_atomic_model.py  | 2 ++
 deepmd/jax/atomic_model/pairtab_atomic_model.py | 2 ++
 deepmd/jax/model/base_model.py                  | 2 ++
 deepmd/jax/model/dp_model.py                    | 2 ++
 deepmd/jax/model/dp_zbl_model.py                | 2 ++
 6 files changed, 12 insertions(+)

diff --git a/deepmd/jax/atomic_model/dp_atomic_model.py b/deepmd/jax/atomic_model/dp_atomic_model.py
index 7227839f1f..319b8e94a2 100644
--- a/deepmd/jax/atomic_model/dp_atomic_model.py
+++ b/deepmd/jax/atomic_model/dp_atomic_model.py
@@ -57,7 +57,9 @@ def forward_common_atomic(
             mapping: jnp.ndarray | None = None,
             fparam: jnp.ndarray | None = None,
             aparam: jnp.ndarray | None = None,
+            comm_dict: dict | None = None,
         ) -> dict[str, jnp.ndarray]:
+            del comm_dict  # JAX path has no MPI ghost exchange
             return super().forward_common_atomic(
                 extended_coord,
                 extended_atype,
diff --git a/deepmd/jax/atomic_model/linear_atomic_model.py b/deepmd/jax/atomic_model/linear_atomic_model.py
index 1c183db7ac..ecfc74cf95 100644
--- a/deepmd/jax/atomic_model/linear_atomic_model.py
+++ b/deepmd/jax/atomic_model/linear_atomic_model.py
@@ -61,7 +61,9 @@ def forward_common_atomic(
         mapping: jnp.ndarray | None = None,
         fparam: jnp.ndarray | None = None,
         aparam: jnp.ndarray | None = None,
+        comm_dict: dict | None = None,
     ) -> dict[str, jnp.ndarray]:
+        del comm_dict  # JAX path has no MPI ghost exchange
         return super().forward_common_atomic(
             extended_coord,
             extended_atype,
diff --git a/deepmd/jax/atomic_model/pairtab_atomic_model.py b/deepmd/jax/atomic_model/pairtab_atomic_model.py
index 7f18a6403c..0117bf1d2c 100644
--- a/deepmd/jax/atomic_model/pairtab_atomic_model.py
+++ b/deepmd/jax/atomic_model/pairtab_atomic_model.py
@@ -46,7 +46,9 @@ def forward_common_atomic(
         mapping: jnp.ndarray | None = None,
         fparam: jnp.ndarray | None = None,
         aparam: jnp.ndarray | None = None,
+        comm_dict: dict | None = None,
     ) -> dict[str, jnp.ndarray]:
+        del comm_dict  # JAX path has no MPI ghost exchange
         return super().forward_common_atomic(
             extended_coord,
             extended_atype,
diff --git a/deepmd/jax/model/base_model.py b/deepmd/jax/model/base_model.py
index 4522e25586..f99fccd276 100644
--- a/deepmd/jax/model/base_model.py
+++ b/deepmd/jax/model/base_model.py
@@ -26,7 +26,9 @@ def forward_common_atomic(
     aparam: jnp.ndarray | None = None,
     do_atomic_virial: bool = False,
     extended_coord_corr: jnp.ndarray | None = None,
+    comm_dict: dict | None = None,
 ) -> dict[str, jnp.ndarray]:
+    del comm_dict  # JAX path has no MPI ghost exchange
     atomic_ret = self.atomic_model.forward_common_atomic(
         extended_coord,
         extended_atype,
diff --git a/deepmd/jax/model/dp_model.py b/deepmd/jax/model/dp_model.py
index 3e96eb6689..55239bb608 100644
--- a/deepmd/jax/model/dp_model.py
+++ b/deepmd/jax/model/dp_model.py
@@ -56,7 +56,9 @@ def forward_common_atomic(
             aparam: jnp.ndarray | None = None,
             do_atomic_virial: bool = False,
             extended_coord_corr: jnp.ndarray | None = None,
+            comm_dict: dict | None = None,
         ) -> dict[str, jnp.ndarray]:
+            del comm_dict  # JAX path has no MPI ghost exchange
             return forward_common_atomic(
                 self,
                 extended_coord,
diff --git a/deepmd/jax/model/dp_zbl_model.py b/deepmd/jax/model/dp_zbl_model.py
index 7751d22a1f..f2aa68ea1f 100644
--- a/deepmd/jax/model/dp_zbl_model.py
+++ b/deepmd/jax/model/dp_zbl_model.py
@@ -38,7 +38,9 @@ def forward_common_atomic(
         aparam: jnp.ndarray | None = None,
         do_atomic_virial: bool = False,
         extended_coord_corr: jnp.ndarray | None = None,
+        comm_dict: dict | None = None,
     ) -> dict[str, jnp.ndarray]:
+        del comm_dict  # JAX path has no MPI ghost exchange
         return forward_common_atomic(
             self,
             extended_coord,

From 87c9f3f8e8a65d6c43647b6de040c1770069f114 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 2 May 2026 16:23:41 +0800
Subject: [PATCH 24/34] fix(pt_expt): auto-load underlying ops in comm.py

DDP-spawned worker subprocesses re-import modules from scratch and
never run the test conftest's ``import deepmd.pt``, so when
``pt_expt.utils.comm`` is imported the underlying
``deepmd_export::{border_op,border_op_backward}`` ops are not yet
registered and the import-time guard raises:

    RuntimeError: torch.ops.deepmd_export.{border_op,border_op_backward}
    are not registered. Build libdeepmd_op_pt.so and ensure deepmd.pt
    is imported before this module.

Repro: test_training_ddp.py::TestDDPRestart::test_ddp_restart on every
Python CI shard.

Fix: ``_check_underlying_ops_loaded`` now triggers ``import deepmd.pt``
as a side effect when the ops aren't yet registered. ``deepmd/pt/cxx_op.py``
loads ``libdeepmd_op_pt.so`` which registers the schemas. The original
RuntimeError stays as a fallback if ``import deepmd.pt`` itself fails.

Verified locally: importing ``deepmd.pt_expt.utils.comm`` in a fresh
process (without explicit ``import deepmd.pt`` first) now succeeds and
``torch.ops.deepmd_export.border_op`` is available.
---
 deepmd/pt_expt/utils/comm.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/deepmd/pt_expt/utils/comm.py b/deepmd/pt_expt/utils/comm.py
index b985c57fe6..434d2a97b0 100644
--- a/deepmd/pt_expt/utils/comm.py
+++ b/deepmd/pt_expt/utils/comm.py
@@ -43,7 +43,27 @@ def _check_underlying_ops_loaded() -> None:
     op schemas + impls.  Without it, the ops can't be registered for
     fake/autograd metadata and callers get a cryptic AttributeError
     on ``torch.ops.deepmd_export.border_op``.
+
+    The .so is loaded as a side effect of ``import deepmd.pt`` (via
+    ``deepmd/pt/cxx_op.py``).  We trigger that import here so callers
+    don't have to remember to do it first — important for environments
+    like DDP-spawned subprocesses that re-import modules from scratch
+    and never see the test conftest's ``import deepmd.pt``.
     """
+    if not (
+        hasattr(torch.ops, "deepmd_export")
+        and hasattr(torch.ops.deepmd_export, "border_op")
+        and hasattr(torch.ops.deepmd_export, "border_op_backward")
+    ):
+        # Triggers cxx_op.py which torch.ops.load_library's the .so.
+        try:
+            import deepmd.pt  # noqa: F401
+        except Exception:
+            # If deepmd.pt itself fails to import, fall through to the
+            # explicit RuntimeError below — clearer than re-raising a
+            # potentially-unrelated import error.
+            pass
+
     if not (
         hasattr(torch.ops, "deepmd_export")
         and hasattr(torch.ops.deepmd_export, "border_op")
@@ -52,7 +72,7 @@ def _check_underlying_ops_loaded() -> None:
         raise RuntimeError(
             "torch.ops.deepmd_export.{border_op,border_op_backward} "
             "are not registered. Build libdeepmd_op_pt.so and ensure "
-            "deepmd.pt is imported before this module."
+            "deepmd.pt is importable before this module."
         )
 
 

From 4865c4e3c220efe2311f928c21d4a53ab924fda6 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 2 May 2026 16:27:54 +0800
Subject: [PATCH 25/34] chore: drop redundant ``import deepmd.pt`` preloads

After 87c9f3f8e, ``deepmd.pt_expt.utils.comm`` self-bootstraps
``libdeepmd_op_pt.so`` via ``_check_underlying_ops_loaded()``, so the
explicit ``import deepmd.pt`` preloads in conftest.py and
test_border_op_backward.py are no longer needed.

Closes 2 of the 13 GitHub Advanced Security CodeQL "unused import"
alerts on the PR. The remaining 5 Python alerts (other tests'
``import deepmd.pt_expt.utils.comm`` for opaque-op registration) and
6 C++ alerts (TORCH_LIBRARY_* / border_op_export reachable only
through macro-expanded static initialization) are CodeQL false
positives that need to be dismissed in the GitHub Security UI rather
than fixed in source.
---
 source/tests/pt_expt/conftest.py                      | 11 +++--------
 source/tests/pt_expt/utils/test_border_op_backward.py |  6 +++---
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/source/tests/pt_expt/conftest.py b/source/tests/pt_expt/conftest.py
index 06bca2fec5..d4d987fe95 100644
--- a/source/tests/pt_expt/conftest.py
+++ b/source/tests/pt_expt/conftest.py
@@ -17,14 +17,9 @@
     _get_current_function_mode_stack,
 )
 
-# Import ``deepmd.pt`` at conftest evaluation time so libdeepmd_op_pt.so
-# is loaded and ``deepmd_export::{border_op, border_op_backward}`` are
-# registered before any pt_expt test module imports
-# ``deepmd.pt_expt.utils`` (which transitively imports ``comm.py`` and
-# its ``_check_underlying_ops_loaded()`` runtime check). Previously this
-# worked only when collected alongside earlier tests that happened to
-# import deepmd.pt first.
-import deepmd.pt  # noqa: F401  - side-effect: register custom ops
+# ``deepmd.pt_expt.utils.comm`` self-bootstraps libdeepmd_op_pt.so via
+# ``_check_underlying_ops_loaded()``, so we no longer need to preload
+# ``deepmd.pt`` here.
 
 
 def _pop_device_contexts() -> list:
diff --git a/source/tests/pt_expt/utils/test_border_op_backward.py b/source/tests/pt_expt/utils/test_border_op_backward.py
index c46705ad8a..aeaf491cb2 100644
--- a/source/tests/pt_expt/utils/test_border_op_backward.py
+++ b/source/tests/pt_expt/utils/test_border_op_backward.py
@@ -30,9 +30,9 @@
 import pytest
 import torch
 
-# Ensure the new C++ symbol is loaded.  pt_expt imports deepmd.pt for
-# the custom-op .so.
-import deepmd.pt
+# comm self-bootstraps the underlying libdeepmd_op_pt.so when needed, so
+# this single side-effect import is enough to register both the C++
+# ops (deepmd::border_op_backward) and their fake/autograd metadata.
 import deepmd.pt_expt.utils.comm  # noqa: F401  - registers deepmd_export::border_op
 
 

From bf1685ffc573d6d071113e9a2f994bcf35dba278 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 2 May 2026 16:52:14 +0800
Subject: [PATCH 26/34] fix: address coderabbitai review on PR 5430

Applies the substantive coderabbitai suggestions from the PR review.

Defensive guards (no behavioral change for existing callers):
- dpmodel/descriptor/{repflows,repformers}.py: raise ValueError when
  the default `_exchange_ghosts` is hit with `mapping_tiled=None` and
  `use_loc_mapping=False` instead of failing with a cryptic
  array-backend error.
- pt_expt/descriptor/{repflows,repformers}.py: refuse `comm_dict` path
  when `nf != 1`. The squeeze(0)/unsqueeze(0) dance only works for a
  single frame; failing here surfaces the unsupported case loudly
  instead of producing a malformed border_op tensor.

Init robustness:
- api_cc/src/{DeepPotPTExpt,DeepSpinPTExpt}.cc: wrap the optional
  with-comm artifact load in try/catch. If `has_comm_artifact` is set
  in metadata but the nested artifact fails to extract or compile,
  log and fall back to single-rank-only dispatch instead of aborting
  init -- the hard error then surfaces only when multi-rank actually
  needs the missing artifact.

Code hygiene:
- dpmodel/descriptor/hybrid.py: rename unused unpacks (`g2/h2/sw` ->
  `_g2/_h2/_sw`) for ruff RUF059 cleanliness.
- tests/infer/gen_dpa3.py: deepcopy `config_mpi` before passing to
  `get_model()` so `data_mpi["model_def_script"]` retains the
  intended MPI export config even if the call mutates its argument.
- tests/pt_expt/model/test_export_with_comm.py: mirror the zero-ghost
  clamp from `serialization.py::_make_comm_sample_inputs` in the test
  helper, so no zero-length sendlist pointer is ever materialised.
  Also update `extra/...` -> `model/extra/...` archive paths to match
  PT2_EXTRA_PREFIX after the upstream/master merge.

Verified locally: pt_expt python (24/24), ctest (3/3, 498 tests
including 198 PtExpt), LAMMPS multi-rank GNN (19/19) all green.
---
 deepmd/dpmodel/descriptor/hybrid.py           |  2 +-
 deepmd/dpmodel/descriptor/repflows.py         |  5 +++
 deepmd/dpmodel/descriptor/repformers.py       |  6 +++
 deepmd/pt_expt/descriptor/repflows.py         | 11 ++++++
 deepmd/pt_expt/descriptor/repformers.py       | 11 ++++++
 source/api_cc/src/DeepPotPTExpt.cc            | 37 +++++++++++++------
 source/api_cc/src/DeepSpinPTExpt.cc           | 28 +++++++++-----
 source/tests/infer/gen_dpa3.py                |  5 ++-
 .../pt_expt/model/test_export_with_comm.py    | 24 ++++++++----
 9 files changed, 99 insertions(+), 30 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/hybrid.py b/deepmd/dpmodel/descriptor/hybrid.py
index 512a753d25..8a644885ca 100644
--- a/deepmd/dpmodel/descriptor/hybrid.py
+++ b/deepmd/dpmodel/descriptor/hybrid.py
@@ -333,7 +333,7 @@ def call(
                 # mixed_types is True, but descrpt.mixed_types is False
                 assert nl_distinguish_types is not None
                 nl = nl_distinguish_types[:, :, nci]
-            odescriptor, gr, g2, h2, sw = descrpt(
+            odescriptor, gr, _g2, _h2, _sw = descrpt(
                 coord_ext, atype_ext, nl, mapping, comm_dict=comm_dict
             )
             out_descriptor.append(odescriptor)
diff --git a/deepmd/dpmodel/descriptor/repflows.py b/deepmd/dpmodel/descriptor/repflows.py
index 2dd64448b2..c3c6713aef 100644
--- a/deepmd/dpmodel/descriptor/repflows.py
+++ b/deepmd/dpmodel/descriptor/repflows.py
@@ -525,6 +525,11 @@ def _exchange_ghosts(
         del comm_dict, nall, nloc
         if self.use_loc_mapping:
             return node_ebd
+        if mapping_tiled is None:
+            raise ValueError(
+                "`mapping` is required when use_loc_mapping=False unless "
+                "`_exchange_ghosts` is overridden for parallel comm handling."
+            )
         return xp_take_along_axis(node_ebd, mapping_tiled, axis=1)
 
     def call(
diff --git a/deepmd/dpmodel/descriptor/repformers.py b/deepmd/dpmodel/descriptor/repformers.py
index 3891c57c7d..55b4a1a342 100644
--- a/deepmd/dpmodel/descriptor/repformers.py
+++ b/deepmd/dpmodel/descriptor/repformers.py
@@ -498,6 +498,12 @@ def _exchange_ghosts(
         not None``.
         """
         del comm_dict, nall, nloc
+        if mapping_tiled is None:
+            raise ValueError(
+                "`mapping` is required by the default `_exchange_ghosts` "
+                "implementation; pass a valid mapping or override the method "
+                "for parallel comm handling."
+            )
         return xp_take_along_axis(g1, mapping_tiled, axis=1)
 
     def call(
diff --git a/deepmd/pt_expt/descriptor/repflows.py b/deepmd/pt_expt/descriptor/repflows.py
index efd7cba7ba..dacab9f464 100644
--- a/deepmd/pt_expt/descriptor/repflows.py
+++ b/deepmd/pt_expt/descriptor/repflows.py
@@ -64,6 +64,17 @@ def _exchange_ghosts(
                 "inference requires use_loc_mapping=False so per-layer "
                 "ghost exchange is meaningful."
             )
+        # The squeeze(0) / unsqueeze(0) dance below assumes a single
+        # frame.  LAMMPS always feeds nb=1 in production; refuse loudly
+        # if a Python caller batches frames so the mismatch surfaces
+        # here rather than as a malformed border_op tensor downstream.
+        if node_ebd.shape[0] != 1:
+            raise RuntimeError(
+                "DescrptBlockRepflows._exchange_ghosts: comm_dict path "
+                "only supports nf=1 (got nf="
+                f"{node_ebd.shape[0]}). Multi-frame batching with "
+                "comm_dict is not supported."
+            )
 
         has_spin = "has_spin" in comm_dict
         if has_spin:
diff --git a/deepmd/pt_expt/descriptor/repformers.py b/deepmd/pt_expt/descriptor/repformers.py
index f106a7a240..9b8ddb4a85 100644
--- a/deepmd/pt_expt/descriptor/repformers.py
+++ b/deepmd/pt_expt/descriptor/repformers.py
@@ -44,6 +44,17 @@ def _exchange_ghosts(
                 nall,
                 nloc,
             )
+        # The squeeze(0) / unsqueeze(0) dance below assumes a single
+        # frame.  LAMMPS always feeds nb=1 in production; refuse loudly
+        # if a Python caller batches frames so the mismatch surfaces
+        # here rather than as a malformed border_op tensor downstream.
+        if g1.shape[0] != 1:
+            raise RuntimeError(
+                "DescrptBlockRepformers._exchange_ghosts: comm_dict path "
+                "only supports nf=1 (got nf="
+                f"{g1.shape[0]}). Multi-frame batching with comm_dict is "
+                "not supported."
+            )
 
         has_spin = "has_spin" in comm_dict
         if has_spin:
diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index 3f01081d42..287ee3b18f 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -164,21 +164,34 @@ void DeepPotPTExpt::init(const std::string& model,
 
   // Phase 4: load the optional with-comm artifact for multi-rank GNN
   // inference. Pre-Phase-3 .pt2 files lack ``has_comm_artifact``;
-  // default to false so old artifacts keep working.
+  // default to false so old artifacts keep working. If the metadata
+  // flag is set but the nested artifact fails to extract or compile,
+  // fall back to single-rank mode rather than aborting init -- the
+  // hard error then surfaces in ``run_model_with_comm()`` only when
+  // multi-rank actually needs it.
   has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") &&
                        metadata["has_comm_artifact"].as_bool();
   if (has_comm_artifact_) {
-    // Extract the nested ``extra/forward_lower_with_comm.pt2`` into a
-    // temp file and load it as a second AOTI module. The TempFile
-    // unlinks the temp file on destruction.
-    with_comm_tempfile_ = std::make_unique<deepmd::ptexpt::TempFile>(
-        deepmd::ptexpt::TempFile::from_zip_entry(
-            model, "extra/forward_lower_with_comm.pt2"));
-    with_comm_loader =
-        std::make_unique<torch::inductor::AOTIModelPackageLoader>(
-            with_comm_tempfile_->path(), "model", false, 1,
-            gpu_enabled ? static_cast<c10::DeviceIndex>(gpu_id)
-                        : static_cast<c10::DeviceIndex>(-1));
+    try {
+      // Extract the nested ``extra/forward_lower_with_comm.pt2`` into a
+      // temp file and load it as a second AOTI module. The TempFile
+      // unlinks the temp file on destruction.
+      with_comm_tempfile_ = std::make_unique<deepmd::ptexpt::TempFile>(
+          deepmd::ptexpt::TempFile::from_zip_entry(
+              model, "extra/forward_lower_with_comm.pt2"));
+      with_comm_loader =
+          std::make_unique<torch::inductor::AOTIModelPackageLoader>(
+              with_comm_tempfile_->path(), "model", false, 1,
+              gpu_enabled ? static_cast<c10::DeviceIndex>(gpu_id)
+                          : static_cast<c10::DeviceIndex>(-1));
+    } catch (const std::exception& e) {
+      std::cerr << "DeepPotPTExpt: failed to load with-comm artifact ("
+                << e.what() << "); falling back to single-rank-only dispatch."
+                << std::endl;
+      with_comm_tempfile_.reset();
+      with_comm_loader.reset();
+      has_comm_artifact_ = false;
+    }
   }
 
   int num_intra_nthreads, num_inter_nthreads;
diff --git a/source/api_cc/src/DeepSpinPTExpt.cc b/source/api_cc/src/DeepSpinPTExpt.cc
index 90c518c1a5..9d4f072d2a 100644
--- a/source/api_cc/src/DeepSpinPTExpt.cc
+++ b/source/api_cc/src/DeepSpinPTExpt.cc
@@ -173,18 +173,28 @@ void DeepSpinPTExpt::init(const std::string& model,
                   : static_cast<c10::DeviceIndex>(-1));
 
   // Phase 4: load the optional with-comm artifact for multi-rank GNN
-  // spin inference.  Mirrors DeepPotPTExpt; see its init() comment.
+  // spin inference.  Mirrors DeepPotPTExpt; see its init() comment for
+  // the rationale on the try/catch fallback.
   has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") &&
                        metadata["has_comm_artifact"].as_bool();
   if (has_comm_artifact_) {
-    with_comm_tempfile_ = std::make_unique<deepmd::ptexpt::TempFile>(
-        deepmd::ptexpt::TempFile::from_zip_entry(
-            model, "extra/forward_lower_with_comm.pt2"));
-    with_comm_loader =
-        std::make_unique<torch::inductor::AOTIModelPackageLoader>(
-            with_comm_tempfile_->path(), "model", false, 1,
-            gpu_enabled ? static_cast<c10::DeviceIndex>(gpu_id)
-                        : static_cast<c10::DeviceIndex>(-1));
+    try {
+      with_comm_tempfile_ = std::make_unique<deepmd::ptexpt::TempFile>(
+          deepmd::ptexpt::TempFile::from_zip_entry(
+              model, "extra/forward_lower_with_comm.pt2"));
+      with_comm_loader =
+          std::make_unique<torch::inductor::AOTIModelPackageLoader>(
+              with_comm_tempfile_->path(), "model", false, 1,
+              gpu_enabled ? static_cast<c10::DeviceIndex>(gpu_id)
+                          : static_cast<c10::DeviceIndex>(-1));
+    } catch (const std::exception& e) {
+      std::cerr << "DeepSpinPTExpt: failed to load with-comm artifact ("
+                << e.what() << "); falling back to single-rank-only dispatch."
+                << std::endl;
+      with_comm_tempfile_.reset();
+      with_comm_loader.reset();
+      has_comm_artifact_ = false;
+    }
   }
 
   int num_intra_nthreads, num_inter_nthreads;
diff --git a/source/tests/infer/gen_dpa3.py b/source/tests/infer/gen_dpa3.py
index ffe3126eb2..20304afcf2 100644
--- a/source/tests/infer/gen_dpa3.py
+++ b/source/tests/infer/gen_dpa3.py
@@ -94,7 +94,10 @@ def main():
     # source/lmp/tests/test_lammps_dpa3_pt2.py::test_pair_deepmd_mpi_dpa3.
     config_mpi = copy.deepcopy(config)
     config_mpi["descriptor"]["use_loc_mapping"] = False
-    model_mpi = get_model(config_mpi)
+    # Defensive deep copy: get_model is allowed to mutate its argument
+    # in place, and we still need ``config_mpi`` intact below for
+    # ``model_def_script``.
+    model_mpi = get_model(copy.deepcopy(config_mpi))
     data_mpi = {
         "model": model_mpi.serialize(),
         "model_def_script": config_mpi,
diff --git a/source/tests/pt_expt/model/test_export_with_comm.py b/source/tests/pt_expt/model/test_export_with_comm.py
index 24c27310ee..f905f409bc 100644
--- a/source/tests/pt_expt/model/test_export_with_comm.py
+++ b/source/tests/pt_expt/model/test_export_with_comm.py
@@ -79,16 +79,25 @@ def _build_self_comm_inputs(
     sendlist_indices: np.ndarray,
     keepalive: list,
 ) -> tuple[torch.Tensor, ...]:
-    """Build runtime comm tensors for a single-rank self-send."""
+    """Build runtime comm tensors for a single-rank self-send.
+
+    Clamps the swap count to ``max(1, nghost)`` to mirror the trace-time
+    helper in ``serialization.py::_make_comm_sample_inputs``; that
+    avoids an empty sendlist pointer when a caller happens to construct
+    a fixture with no ghost atoms.
+    """
+    send_count = max(1, nghost)
     sendlist_indices = np.ascontiguousarray(sendlist_indices, dtype=np.int32)
+    if sendlist_indices.size == 0:
+        sendlist_indices = np.zeros(send_count, dtype=np.int32)
     keepalive.append(sendlist_indices)
     nswap = 1
     addr = _addr_of(sendlist_indices)
     send_list = torch.tensor([addr], dtype=torch.int64)
     send_proc = torch.zeros(nswap, dtype=torch.int32)
     recv_proc = torch.zeros(nswap, dtype=torch.int32)
-    send_num = torch.tensor([nghost], dtype=torch.int32)
-    recv_num = torch.tensor([nghost], dtype=torch.int32)
+    send_num = torch.tensor([send_count], dtype=torch.int32)
+    recv_num = torch.tensor([send_count], dtype=torch.int32)
     communicator = torch.zeros(1, dtype=torch.int64)
     nlocal_ts = torch.tensor(nloc, dtype=torch.int32)
     nghost_ts = torch.tensor(nghost, dtype=torch.int32)
@@ -120,11 +129,12 @@ def test_pt2_dual_artifact_for_gnn(tmp_path) -> None:
     deserialize_to_file(pt2_path, data)
     assert os.path.exists(pt2_path)
 
-    # 1. ZIP layout sanity
+    # 1. ZIP layout sanity. PyTorch 2.11 strict layout puts our sidecars
+    # under ``model/extra/`` (PT2_EXTRA_PREFIX); see serialization.py.
     with zipfile.ZipFile(pt2_path, "r") as zf:
         names = set(zf.namelist())
-        meta = json.loads(zf.read("extra/metadata.json").decode("utf-8"))
-        assert "extra/forward_lower_with_comm.pt2" in names, (
+        meta = json.loads(zf.read("model/extra/metadata.json").decode("utf-8"))
+        assert "model/extra/forward_lower_with_comm.pt2" in names, (
             f"with-comm artifact missing; names={sorted(names)}"
         )
     assert meta["has_message_passing"] is True
@@ -141,7 +151,7 @@ def test_pt2_dual_artifact_for_gnn(tmp_path) -> None:
         wc_path = os.path.join(td, "fl_wc.pt2")
         with zipfile.ZipFile(pt2_path, "r") as zf:
             with open(wc_path, "wb") as f:
-                f.write(zf.read("extra/forward_lower_with_comm.pt2"))
+                f.write(zf.read("model/extra/forward_lower_with_comm.pt2"))
         with_comm = aoti_load_package(wc_path)
 
     # 3. Run both artifacts with nframes=1 (matches what the with-comm

From a429fc99e2f283a18a1032610a1f0f8c55b9c555 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 2 May 2026 23:41:51 +0800
Subject: [PATCH 27/34] refactor: replace _has_message_passing hack with
 descriptor API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removes the private-attribute fishing in
``deepmd/pt_expt/utils/serialization.py`` (which read
``descriptor.repflows.use_loc_mapping`` and friends) and replaces it
with a public method on ``BaseDescriptor``:
``has_message_passing_across_ranks()``.

Why
---
The old helper conflated two questions:

1. "Is this a GNN-style descriptor?" (existing ``has_message_passing()``)
2. "Do per-layer node embeddings need MPI exchange across rank
   boundaries to be correct under multi-rank LAMMPS?"

Only #2 governs whether to compile a with-comm AOTI artifact. The old
function answered #2 by special-casing the ``repflows``/``repformers``
attribute names and the ``use_loc_mapping`` flag — an approach that
breaks silently on any rename and never recurses into hybrid wrappers
(Tier-1 #1 in the gnn_mpi_untested_paths catalog).

Note: every LAMMPS pair_style already exchanges ghost-atom *coords and
forces* via the standard pair-style comm topology — that's not GNN-
specific. The new method asks specifically about per-layer atomic
feature exchange (the ``node_ebd`` tensor that flows between message-
passing layers), which is the actual concern that gates the with-comm
artifact.

How
---
``BaseDescriptor.has_message_passing_across_ranks()`` returns ``False``
by default. GNN paths override:

- ``DescrptBlockRepflows``: ``not self.use_loc_mapping``
- ``DescrptBlockRepformers``: ``True`` (no ``use_loc_mapping`` opt-out
  exists)
- ``DescrptDPA3`` / ``DescrptDPA2``: delegate to their block
- ``DescrptHybrid``: ``any(child.has_message_passing_across_ranks() ...)``
  (closes the structural side of catalog Tier-1 #1)

Non-GNN dpmodel descriptors (``se_e2_a``, ``se_r``, ``se_t``,
``se_t_tebd``, ``dpa1``) get explicit ``return False`` overrides
pinning the contract; pt and pd backend descriptors inherit the
default (no edits needed there).

The serialization helper ``_has_message_passing`` is renamed to
``_needs_with_comm_artifact`` and just calls
``descriptor.has_message_passing_across_ranks()``. The metadata key
``has_message_passing`` is dropped from the .pt2 archive (C++ readers
only consume ``has_comm_artifact``).

Per-descriptor tests
--------------------
The standalone ``source/tests/pt_expt/utils/test_has_message_passing.py``
is deleted; per-descriptor coverage of *both* APIs is added to existing
descriptor test files at ``source/tests/pt_expt/descriptor/``:

| File         | has_message_passing | has_message_passing_across_ranks |
|--------------|---------------------|----------------------------------|
| se_e2_a      | False               | False                            |
| dpa1         | False               | False                            |
| dpa3         | True                | not use_loc_mapping              |
| dpa2         | True                | True                             |
| hybrid       | depends on child    | True if any child needs it       |

Bonus: also includes a CUDA segfault fix
----------------------------------------
While running the post-refactor verification, the CUDA-runner CI
exposed a latent bug in ``source/op/pt/comm.cc`` (forward + backward
kernels): when built with ``USE_MPI`` but invoked single-rank
(world_size==0), ``cuda_aware`` defaults to 0 and the CPU-fallback
``recv_g1_tensor.to(kCPU)`` block (guarded by ``world_size >= 1``) is
skipped — the tensor stays on CUDA. The inner self-send branch then
did host ``memcpy`` on what were still CUDA pointers and segfaulted.
Fix: gate the host-memcpy / CPU-copy-back paths on
``world_size >= 1 && cuda_aware == 0`` so single-rank deployments
correctly use ``gpuMemcpy DeviceToDevice``. Mirrored in four sites
(forward inner, forward post-loop, backward inner, backward post-loop).

Float32 multi-rank fixture + test
---------------------------------
Adds ``test_lammps_dpa3_pt2_fp32.py`` and a paired
``deeppot_dpa3_mpi_fp32.pt2`` fixture (gen_dpa3.py addition). Validates
that the comm_dict path is dtype-agnostic in practice (template
dispatch on ``g1.dtype()``, ``register_fake``'s ``empty_like(g1)``,
and ``MPI_FLOAT`` exchange) — not just by inspection. Compares mpi-2
vs same-archive mpi-1 with float32-appropriate tolerances (atol 1e-4 /
rel 1e-3 for force/virial; rel 1e-5 for energy).

Verified locally (CPU build): pt_expt python 965 passed / 32 skipped,
ctest 3/3 (498 C++ tests), LAMMPS multi-rank 20/20 (DPA3 + DPA2 +
spin DPA3 + DPA3 fp32).

Trade-off note
--------------
The plan called for ``has_message_passing_across_ranks()`` to be
abstract on ``BaseDescriptor`` (mirroring ``has_message_passing``).
Implementing that requires touching all 49 subclasses across pt and
pd backends — well outside the scope of "GNN MPI for pt_expt". Kept
the method concrete with a ``return False`` default; pt and pd
backend descriptors inherit that. They can override later if they
grow a multi-rank GNN path of their own.
---
 deepmd/dpmodel/descriptor/dpa1.py             |   8 +
 deepmd/dpmodel/descriptor/dpa2.py             |  10 +
 deepmd/dpmodel/descriptor/dpa3.py             |  11 +
 deepmd/dpmodel/descriptor/hybrid.py           |  10 +
 .../descriptor/make_base_descriptor.py        |  18 ++
 deepmd/dpmodel/descriptor/repflows.py         |  10 +
 deepmd/dpmodel/descriptor/repformers.py       |   9 +
 deepmd/dpmodel/descriptor/se_e2_a.py          |   4 +
 deepmd/dpmodel/descriptor/se_r.py             |   4 +
 deepmd/dpmodel/descriptor/se_t.py             |   4 +
 deepmd/dpmodel/descriptor/se_t_tebd.py        |   4 +
 deepmd/pt_expt/utils/serialization.py         |  97 ++++----
 source/lmp/tests/test_lammps_dpa3_pt2_fp32.py | 163 +++++++++++++
 source/op/pt/comm.cc                          |  25 +-
 source/tests/infer/gen_dpa2.py                |   5 +-
 source/tests/infer/gen_dpa3.py                |  24 ++
 source/tests/pt_expt/descriptor/test_dpa1.py  |  37 +++
 source/tests/pt_expt/descriptor/test_dpa2.py  |  57 +++++
 source/tests/pt_expt/descriptor/test_dpa3.py  |  40 +++
 .../tests/pt_expt/descriptor/test_hybrid.py   |  70 ++++++
 .../tests/pt_expt/descriptor/test_se_e2_a.py  |  35 +++
 .../pt_expt/model/test_export_with_comm.py    |  35 +--
 .../pt_expt/utils/test_has_message_passing.py | 229 ------------------
 23 files changed, 603 insertions(+), 306 deletions(-)
 create mode 100644 source/lmp/tests/test_lammps_dpa3_pt2_fp32.py
 delete mode 100644 source/tests/pt_expt/utils/test_has_message_passing.py

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index 9d138f422a..04d0420009 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -397,6 +397,14 @@ def has_message_passing(self) -> bool:
         """Returns whether the descriptor has message passing."""
         return self.se_atten.has_message_passing()
 
+    def has_message_passing_across_ranks(self) -> bool:
+        """Returns whether per-layer node embeddings need MPI ghost exchange.
+
+        DPA1 (se_atten) is single-layer and does not exchange features
+        across ranks; same as the base se_e2_a path.
+        """
+        return False
+
     def need_sorted_nlist_for_lower(self) -> bool:
         """Returns whether the descriptor needs sorted nlist when using `forward_lower`."""
         return self.se_atten.need_sorted_nlist_for_lower()
diff --git a/deepmd/dpmodel/descriptor/dpa2.py b/deepmd/dpmodel/descriptor/dpa2.py
index 851422cce0..e530398ca6 100644
--- a/deepmd/dpmodel/descriptor/dpa2.py
+++ b/deepmd/dpmodel/descriptor/dpa2.py
@@ -687,6 +687,16 @@ def has_message_passing(self) -> bool:
             [self.repinit.has_message_passing(), self.repformers.has_message_passing()]
         )
 
+    def has_message_passing_across_ranks(self) -> bool:
+        """Returns whether per-layer node embeddings need MPI ghost exchange.
+
+        DPA2's repformers always passes ``g1`` in ``[nb, nall, n_dim]``
+        layout (no ``use_loc_mapping`` opt-out exists at the block level),
+        so multi-rank deployment always needs cross-rank exchange of
+        per-atom features between layers.
+        """
+        return self.repformers.has_message_passing_across_ranks()
+
     def need_sorted_nlist_for_lower(self) -> bool:
         """Returns whether the descriptor needs sorted nlist when using `forward_lower`."""
         return True
diff --git a/deepmd/dpmodel/descriptor/dpa3.py b/deepmd/dpmodel/descriptor/dpa3.py
index 07d5481a91..c1d9531357 100644
--- a/deepmd/dpmodel/descriptor/dpa3.py
+++ b/deepmd/dpmodel/descriptor/dpa3.py
@@ -527,6 +527,17 @@ def has_message_passing(self) -> bool:
         """Returns whether the descriptor has message passing."""
         return self.repflows.has_message_passing()
 
+    def has_message_passing_across_ranks(self) -> bool:
+        """Returns whether per-layer node embeddings need MPI ghost exchange.
+
+        Delegates to repflows: ``False`` when ``use_loc_mapping=True``
+        (per-layer messages stay within each rank's local atoms),
+        ``True`` when ``use_loc_mapping=False`` (ghost slots in
+        ``[nb, nall, n_dim]`` layout must be filled by cross-rank
+        exchange before each layer).
+        """
+        return self.repflows.has_message_passing_across_ranks()
+
     def need_sorted_nlist_for_lower(self) -> bool:
         """Returns whether the descriptor needs sorted nlist when using `forward_lower`."""
         return True
diff --git a/deepmd/dpmodel/descriptor/hybrid.py b/deepmd/dpmodel/descriptor/hybrid.py
index 8a644885ca..a51220c5e2 100644
--- a/deepmd/dpmodel/descriptor/hybrid.py
+++ b/deepmd/dpmodel/descriptor/hybrid.py
@@ -168,6 +168,16 @@ def has_message_passing(self) -> bool:
         """Returns whether the descriptor has message passing."""
         return any(descrpt.has_message_passing() for descrpt in self.descrpt_list)
 
+    def has_message_passing_across_ranks(self) -> bool:
+        """Returns whether per-layer node embeddings need MPI ghost exchange.
+
+        ``True`` if any child descriptor needs cross-rank message passing
+        (e.g. a hybrid wrapping a DPA3 with ``use_loc_mapping=False``).
+        """
+        return any(
+            descrpt.has_message_passing_across_ranks() for descrpt in self.descrpt_list
+        )
+
     def need_sorted_nlist_for_lower(self) -> bool:
         """Returns whether the descriptor needs sorted nlist when using `forward_lower`."""
         return True
diff --git a/deepmd/dpmodel/descriptor/make_base_descriptor.py b/deepmd/dpmodel/descriptor/make_base_descriptor.py
index 47245898ce..8184b4e42a 100644
--- a/deepmd/dpmodel/descriptor/make_base_descriptor.py
+++ b/deepmd/dpmodel/descriptor/make_base_descriptor.py
@@ -107,6 +107,24 @@ def mixed_types(self) -> bool:
         def has_message_passing(self) -> bool:
             """Returns whether the descriptor has message passing."""
 
+        def has_message_passing_across_ranks(self) -> bool:
+            """Returns whether the descriptor's message passing extends across rank
+            boundaries — i.e. whether it requires cross-rank exchange of intermediate
+            atomic features (per-layer node embeddings) during the forward pass.
+
+            Distinct from generic ghost-coord/force exchange that every LAMMPS
+            pair_style does. This question gates whether the pt_expt backend
+            compiles a second "with-comm" AOTI artifact for multi-rank deployment.
+
+            Concrete default ``False`` (non-GNN behavior) so pt and pd backend
+            descriptors that subclass ``BaseDescriptor`` directly do not have
+            to implement this method until they grow a multi-rank GNN path of
+            their own. GNN descriptors that need MPI ghost-feature exchange
+            (DPA2, DPA3 with ``use_loc_mapping=False``, hybrids wrapping such
+            children) override to return ``True``.
+            """
+            return False
+
         @abstractmethod
         def need_sorted_nlist_for_lower(self) -> bool:
             """Returns whether the descriptor needs sorted nlist when using `forward_lower`."""
diff --git a/deepmd/dpmodel/descriptor/repflows.py b/deepmd/dpmodel/descriptor/repflows.py
index c3c6713aef..bc94b877ea 100644
--- a/deepmd/dpmodel/descriptor/repflows.py
+++ b/deepmd/dpmodel/descriptor/repflows.py
@@ -732,6 +732,16 @@ def has_message_passing(self) -> bool:
         """Returns whether the descriptor block has message passing."""
         return True
 
+    def has_message_passing_across_ranks(self) -> bool:
+        """Returns whether per-layer node embeddings need MPI ghost exchange.
+
+        Repflows passes ``node_ebd`` either in ``[nb, nloc, n_dim]`` layout
+        (``use_loc_mapping=True``: messages stay within the rank's local atoms)
+        or ``[nb, nall, n_dim]`` layout (``use_loc_mapping=False``: ghost slots
+        must be filled by cross-rank exchange before each layer).
+        """
+        return not self.use_loc_mapping
+
     def need_sorted_nlist_for_lower(self) -> bool:
         """Returns whether the descriptor block needs sorted nlist when using `forward_lower`."""
         return True
diff --git a/deepmd/dpmodel/descriptor/repformers.py b/deepmd/dpmodel/descriptor/repformers.py
index 55b4a1a342..799ab0c3c3 100644
--- a/deepmd/dpmodel/descriptor/repformers.py
+++ b/deepmd/dpmodel/descriptor/repformers.py
@@ -600,6 +600,15 @@ def has_message_passing(self) -> bool:
         """Returns whether the descriptor block has message passing."""
         return True
 
+    def has_message_passing_across_ranks(self) -> bool:
+        """Returns whether per-layer g1 needs MPI ghost exchange.
+
+        Repformers has no ``use_loc_mapping`` opt-out; it always passes
+        ``g1`` in ``[nb, nall, n_dim]`` layout, so multi-rank always needs
+        cross-rank exchange of the per-atom feature tensor.
+        """
+        return True
+
     def need_sorted_nlist_for_lower(self) -> bool:
         """Returns whether the descriptor block needs sorted nlist when using `forward_lower`."""
         return False
diff --git a/deepmd/dpmodel/descriptor/se_e2_a.py b/deepmd/dpmodel/descriptor/se_e2_a.py
index 6c20699c23..f72b6f75e8 100644
--- a/deepmd/dpmodel/descriptor/se_e2_a.py
+++ b/deepmd/dpmodel/descriptor/se_e2_a.py
@@ -278,6 +278,10 @@ def has_message_passing(self) -> bool:
         """Returns whether the descriptor has message passing."""
         return False
 
+    def has_message_passing_across_ranks(self) -> bool:
+        """Returns whether per-layer node embeddings need MPI ghost exchange."""
+        return False
+
     def need_sorted_nlist_for_lower(self) -> bool:
         """Returns whether the descriptor needs sorted nlist when using `forward_lower`."""
         return False
diff --git a/deepmd/dpmodel/descriptor/se_r.py b/deepmd/dpmodel/descriptor/se_r.py
index 55a774bb71..6846710735 100644
--- a/deepmd/dpmodel/descriptor/se_r.py
+++ b/deepmd/dpmodel/descriptor/se_r.py
@@ -257,6 +257,10 @@ def has_message_passing(self) -> bool:
         """Returns whether the descriptor has message passing."""
         return False
 
+    def has_message_passing_across_ranks(self) -> bool:
+        """Returns whether per-layer node embeddings need MPI ghost exchange."""
+        return False
+
     def need_sorted_nlist_for_lower(self) -> bool:
         """Returns whether the descriptor needs sorted nlist when using `forward_lower`."""
         return False
diff --git a/deepmd/dpmodel/descriptor/se_t.py b/deepmd/dpmodel/descriptor/se_t.py
index 38eb7cc16c..2d61736235 100644
--- a/deepmd/dpmodel/descriptor/se_t.py
+++ b/deepmd/dpmodel/descriptor/se_t.py
@@ -249,6 +249,10 @@ def has_message_passing(self) -> bool:
         """Returns whether the descriptor has message passing."""
         return False
 
+    def has_message_passing_across_ranks(self) -> bool:
+        """Returns whether per-layer node embeddings need MPI ghost exchange."""
+        return False
+
     def need_sorted_nlist_for_lower(self) -> bool:
         """Returns whether the descriptor needs sorted nlist when using `forward_lower`."""
         return False
diff --git a/deepmd/dpmodel/descriptor/se_t_tebd.py b/deepmd/dpmodel/descriptor/se_t_tebd.py
index 445260b861..2f6e749e19 100644
--- a/deepmd/dpmodel/descriptor/se_t_tebd.py
+++ b/deepmd/dpmodel/descriptor/se_t_tebd.py
@@ -255,6 +255,10 @@ def has_message_passing(self) -> bool:
         """Returns whether the descriptor has message passing."""
         return self.se_ttebd.has_message_passing()
 
+    def has_message_passing_across_ranks(self) -> bool:
+        """Returns whether per-layer node embeddings need MPI ghost exchange."""
+        return self.se_ttebd.has_message_passing_across_ranks()
+
     def need_sorted_nlist_for_lower(self) -> bool:
         """Returns whether the descriptor needs sorted nlist when using `forward_lower`."""
         return self.se_ttebd.need_sorted_nlist_for_lower()
diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index abd40662d4..d85a334493 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -98,45 +98,29 @@ def _json_to_numpy(model_obj: dict) -> dict:
     )
 
 
-def _has_message_passing(model: torch.nn.Module) -> bool:
-    """Detect whether a model's descriptor uses GNN-style message passing.
-
-    GNN descriptors (DPA2 with repformers, DPA3 with repflows) require
-    a per-layer ghost-atom MPI exchange when running multi-rank LAMMPS,
-    which means a separate ``with-comm`` AOTInductor artifact must be
-    compiled.  Non-GNN descriptors (se_e2_a, se_r, se_t, se_t_tebd,
-    DPA1, hybrid-of-non-GNN) need only the regular artifact.
-
-    Additional gate: ``use_loc_mapping=True`` GNN models (the default
-    for DPA3) keep nlist in local-only indexing, so per-layer ghost
-    exchange is meaningless — these get only the regular artifact.
-    Multi-rank LAMMPS for GNN requires use_loc_mapping=False.
-
-    Returns False if the descriptor's ``has_message_passing()`` query
-    cannot be answered (e.g. linear/zbl/frozen models without a single
-    descriptor) — those are assumed local.
+def _needs_with_comm_artifact(model: torch.nn.Module) -> bool:
+    """Return ``True`` if the model needs a "with-comm" AOTI artifact compiled.
+
+    The with-comm artifact carries the per-layer ``deepmd_export::border_op``
+    calls that exchange node-embedding tensors across MPI ranks. Multi-rank
+    LAMMPS dispatches to it when the descriptor's message passing extends
+    across rank boundaries (i.e. layers consume neighbour features that
+    live on a different rank). Non-GNN descriptors and GNN descriptors with
+    ``use_loc_mapping=True`` keep all per-layer messaging local to each
+    rank's owned atoms; they need only the regular artifact.
+
+    Delegates to ``descriptor.has_message_passing_across_ranks()``, which
+    descriptor classes implement explicitly. Returns ``False`` defensively
+    when the model has no single descriptor (linear/zbl/frozen) or when
+    the method is somehow missing or raises.
     """
-    try:
-        descriptor = model.atomic_model.descriptor
-    except AttributeError:
-        return False
-    if not hasattr(descriptor, "has_message_passing"):
+    desc = getattr(getattr(model, "atomic_model", None), "descriptor", None)
+    if desc is None or not hasattr(desc, "has_message_passing_across_ranks"):
         return False
     try:
-        if not descriptor.has_message_passing():
-            return False
+        return bool(desc.has_message_passing_across_ranks())
     except (AttributeError, NotImplementedError):
         return False
-    # Walk into the GNN block (repflows / repformers) to inspect
-    # ``use_loc_mapping``. The attribute lives on the block, not on the
-    # top-level descriptor wrapper.
-    for attr in ("repflows", "repformers"):
-        block = getattr(descriptor, attr, None)
-        if block is None:
-            continue
-        if getattr(block, "use_loc_mapping", False):
-            return False
-    return True
 
 
 # Module-level cache for the trace-time sendlist buffer. The pointer
@@ -454,11 +438,10 @@ def _collect_metadata(model: torch.nn.Module, is_spin: bool = False) -> dict:
     if is_spin:
         meta["ntypes_spin"] = model.spin.get_ntypes_spin()
         meta["use_spin"] = [bool(v) for v in model.spin.use_spin]
-    # Record whether the model uses GNN-style message passing.  When
-    # True, .pt2 deserialization compiles a second ``with-comm`` artifact
-    # so multi-rank LAMMPS can drive ghost-atom MPI exchange through
-    # the model.  C++ DeepPotPTExpt branches on this flag at load time.
-    meta["has_message_passing"] = _has_message_passing(model)
+    # Whether multi-rank LAMMPS needs a second "with-comm" AOTI artifact
+    # (per-layer ghost-feature MPI exchange via deepmd_export::border_op).
+    # The C++ DeepPotPTExpt / DeepSpinPTExpt loaders branch on this flag.
+    meta["has_comm_artifact"] = _needs_with_comm_artifact(model)
     return meta
 
 
@@ -588,7 +571,8 @@ def _trace_and_export(
         ``send_proc``, ``recv_proc``, ``send_num``, ``recv_num``,
         ``communicator``, ``nlocal``, ``nghost``) used by the pt_expt
         Repflow/Repformer override to drive MPI ghost-atom exchange.
-        Only valid for GNN models (see ``_has_message_passing``).
+        Only valid for models that need cross-rank ghost-feature exchange
+        (see ``_needs_with_comm_artifact``).
     do_atomic_virial
         If True, the traced graph computes per-atom virial (extra
         autograd.grad backward passes); off by default to keep .pt2
@@ -686,10 +670,12 @@ def _trace_and_export(
     # matter for tracing — only that they're valid tensors of the right
     # shape and dtype.  See ``_make_comm_sample_inputs``.
     if with_comm_dict:
-        if not metadata.get("has_message_passing"):
+        if not _needs_with_comm_artifact(model):
             raise ValueError(
-                "with_comm_dict=True requested but model has no GNN "
-                "message-passing descriptor — there's nothing to compile."
+                "with_comm_dict=True requested but the model's descriptor "
+                "does not need cross-rank message passing "
+                "(has_message_passing_across_ranks() is False) — "
+                "there's nothing to compile."
             )
         nloc_sample = nlist_t.shape[1]
         nall_sample = ext_atype.shape[1]
@@ -847,21 +833,22 @@ def _deserialize_to_file_pt2(
     program into a .pt2 package (ZIP archive with compiled shared libraries),
     then embeds metadata into the archive.
 
-    For GNN models (descriptor.has_message_passing() is True), compiles
-    a SECOND ``with-comm`` artifact and packs it alongside the regular
-    one.  The ``with-comm`` variant accepts comm-dict tensors as
+    For models whose descriptor reports
+    ``has_message_passing_across_ranks() == True`` (DPA2, DPA3 with
+    ``use_loc_mapping=False``, or hybrids wrapping such children),
+    compiles a SECOND ``with-comm`` artifact and packs it alongside the
+    regular one. The ``with-comm`` variant accepts comm-dict tensors as
     additional positional inputs and drives MPI ghost-atom exchange via
-    ``deepmd_export::border_op``.  The C++ ``DeepPotPTExpt`` loader picks
+    ``deepmd_export::border_op``. The C++ ``DeepPotPTExpt`` loader picks
     the artifact based on the LAMMPS rank count at runtime.
 
     Layout inside the .pt2 ZIP (PyTorch 2.11 strict layout):
         regular   →  artifact at ``model/`` (AOTInductor's own layout)
         with-comm →  ``model/extra/forward_lower_with_comm.pt2`` (nested ZIP)
         metadata  →  ``model/extra/metadata.json`` with
-                     ``has_message_passing`` and ``has_comm_artifact``
-                     flags. The C++ reader matches by ``/``-delimited
-                     suffix so the legacy root-level ``extra/`` layout
-                     still loads.
+                     ``has_comm_artifact`` flag. The C++ reader matches
+                     by ``/``-delimited suffix so the legacy root-level
+                     ``extra/`` layout still loads.
 
     Old .pt2 files (pre-this-change) lack ``has_comm_artifact`` so the
     C++ loader must default to ``False`` when the field is missing.
@@ -903,9 +890,11 @@ def _deserialize_to_file_pt2(
     finally:
         _inductor_config.realize_opcount_threshold = saved_threshold
 
-    # Second artifact: with-comm. Only for GNN models.
-    has_comm_artifact = bool(metadata.get("has_message_passing"))
-    metadata["has_comm_artifact"] = has_comm_artifact
+    # Second artifact: with-comm. Only for descriptors whose message
+    # passing extends across rank boundaries. The flag was computed
+    # from the model in ``_collect_metadata`` and is already in
+    # ``metadata`` here.
+    has_comm_artifact = bool(metadata.get("has_comm_artifact"))
     with_comm_bytes: bytes | None = None
     with_comm_output_keys: list[str] | None = None
     if has_comm_artifact:
diff --git a/source/lmp/tests/test_lammps_dpa3_pt2_fp32.py b/source/lmp/tests/test_lammps_dpa3_pt2_fp32.py
new file mode 100644
index 0000000000..1f8eed2512
--- /dev/null
+++ b/source/lmp/tests/test_lammps_dpa3_pt2_fp32.py
@@ -0,0 +1,163 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Float32 multi-rank LAMMPS test for DPA3 GNN .pt2.
+
+The float64 multi-rank test in ``test_lammps_dpa3_pt2.py`` validates the
+comm_dict path against a same-archive single-rank reference (atol 1e-8).
+This file does the same thing for the float32 variant of the fixture
+(``deeppot_dpa3_mpi_fp32.pt2``) — the model and trace are byte-identical
+in every respect except ``descriptor.precision``/``fitting_net.precision``
+being set to ``float32``.
+
+Why a separate test file:
+    1. The fp32 fixture is not packaged into ``deeppot_dpa3_mpi.pt2``;
+       it is a sibling artifact produced by the same gen script.
+    2. fp32 needs looser tolerances. The C++ ``border_op`` kernel's
+       ``forward_t<float>`` template path (chosen automatically via
+       ``g1.dtype()`` dispatch in ``source/op/pt/comm.cc``) carries only
+       ~7 significant decimal digits, versus ~16 for the
+       ``forward_t<double>`` path. Single-precision GEMM in the
+       AOTI-compiled kernel adds further drift.
+
+What this file validates that the float64 test does not:
+    * ``border_op`` template dispatch on ``g1.dtype() == kFloat`` (vs
+      ``kDouble``) actually fires under MPI.
+    * ``register_fake`` returns ``torch.empty_like(g1)`` so the FX trace
+      preserves float32 dtype through the opaque op.
+    * ``register_autograd``'s ``border_op_backward`` invocation also
+      runs under float32, returning float32 gradients.
+    * MPI exchange uses ``MPI_FLOAT`` (vs ``MPI_DOUBLE``), halving the
+      bandwidth per ghost atom — relevant for slow interconnects.
+
+This is a regression-only test for the comm path. It does not pin any
+hardcoded numerical values; mpi-2 must agree with same-archive mpi-1
+within float32 tolerances.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import importlib.util
+import os
+import shutil
+import subprocess as sp
+import sys
+import tempfile
+from pathlib import (
+    Path,
+)
+
+import numpy as np
+import pytest
+from write_lmp_data import (
+    write_lmp_data,
+)
+
+pb_file_mpi_fp32 = (
+    Path(__file__).parent.parent.parent
+    / "tests"
+    / "infer"
+    / "deeppot_dpa3_mpi_fp32.pt2"
+)
+data_file = Path(__file__).parent / "data_dpa3_pt2_fp32.lmp"
+
+# Same 6-atom O-H system as the float64 test. ``processors 2 1 1``
+# splits at x=6.5 -> 4 atoms (x < 6.5) on one rank, 2 on the other.
+box = np.array([0, 13, 0, 13, 0, 13, 0, 0, 0])
+coord = np.array(
+    [
+        [12.83, 2.56, 2.18],
+        [12.09, 2.87, 2.74],
+        [0.25, 3.32, 1.68],
+        [3.36, 3.00, 1.81],
+        [3.51, 2.51, 2.60],
+        [4.27, 3.22, 1.56],
+    ]
+)
+type_OH = np.array([1, 2, 2, 1, 2, 2])
+
+
+def setup_module() -> None:
+    if os.environ.get("ENABLE_PYTORCH", "1") != "1":
+        pytest.skip("Skip test because PyTorch support is not enabled.")
+    write_lmp_data(box, coord, type_OH, data_file)
+
+
+def teardown_module() -> None:
+    if data_file.exists():
+        os.remove(data_file)
+
+
+def _run_mpi_subprocess(
+    nprocs: int,
+    processors: str | None = None,
+) -> dict:
+    """Run ``run_mpi_pair_deepmd_dpa3_pt2.py`` against the fp32 archive.
+
+    Returns ``{"pe", "forces", "virials"}`` parsed from the runner's
+    output file.
+    """
+    with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f:
+        out_path = f.name
+    try:
+        argv = [
+            "mpirun",
+            "-n",
+            str(nprocs),
+            sys.executable,
+            str(Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"),
+            str(data_file.resolve()),
+            str(pb_file_mpi_fp32.resolve()),
+            out_path,
+        ]
+        if processors is not None:
+            argv.extend(["--processors", processors])
+        elif nprocs == 1:
+            argv.extend(["--processors", "1 1 1"])
+        sp.check_call(argv)
+        with open(out_path) as fh:
+            lines = fh.read().strip().splitlines()
+        pe = float(lines[0])
+        rows = np.array(
+            [list(map(float, line.split())) for line in lines[1:]],
+            dtype=np.float64,
+        )
+        forces = rows[:, :3]
+        virials = rows[:, 3:]
+        return {"pe": pe, "forces": forces, "virials": virials}
+    finally:
+        if os.path.exists(out_path):
+            os.remove(out_path)
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa3_fp32() -> None:
+    """Float32 DPA3 multi-rank must match same-archive single-rank.
+
+    Tolerances follow standard float32 expectations:
+    * energy: ``rel=1e-5``  (~7 decimal digits, with mantissa noise)
+    * force:  ``atol=1e-4`` absolute (force magnitudes are O(1e-1) for
+                                       this system, so ``rel=1e-3``)
+    * virial: ``atol=5e-4`` per component
+
+    Single-rank uses the regular artifact (nswap=0); multi-rank uses
+    the with-comm artifact -- so any divergence beyond float32 noise
+    is necessarily in the multi-rank dispatch (border_op template
+    dispatch, MPI_FLOAT exchange, register_fake/register_autograd
+    dtype handling).
+    """
+    out_mpi = _run_mpi_subprocess(nprocs=2)
+    out_ref = _run_mpi_subprocess(nprocs=1)
+
+    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-5, abs=1e-7)
+    np.testing.assert_allclose(
+        out_mpi["forces"], out_ref["forces"], atol=1e-4, rtol=1e-3
+    )
+    np.testing.assert_allclose(
+        out_mpi["virials"], out_ref["virials"], atol=5e-4, rtol=1e-3
+    )
diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc
index 3bb7516155..6eb49624ec 100644
--- a/source/op/pt/comm.cc
+++ b/source/op/pt/comm.cc
@@ -140,7 +140,12 @@ class Border : public torch::autograd::Function<Border> {
 #endif
 #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
 #ifdef USE_MPI
-        if (cuda_aware == 0) {
+        // The CPU-fallback ``recv_g1_tensor.to(kCPU)`` above only runs
+        // when ``world_size >= 1`` (MPI initialized). With no MPI
+        // (single-rank, world_size == 0) the tensor is still on CUDA,
+        // so memcpy on CUDA pointers would segfault — gpuMemcpy is
+        // correct in that case regardless of ``cuda_aware``.
+        if (world_size >= 1 && cuda_aware == 0) {
           memcpy(recv_g1, send_g1,
                  (unsigned long)nsend * tensor_size * sizeof(FPTYPE));
         } else {
@@ -164,7 +169,10 @@ class Border : public torch::autograd::Function<Border> {
     }
 #ifdef USE_MPI
 #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
-    if (cuda_aware == 0) {
+    // Only copy back when ``recv_g1_tensor`` was moved to CPU above
+    // (world_size >= 1 && cuda_aware == 0). With world_size == 0 the
+    // tensor is still aliased to g1 — no copy needed.
+    if (world_size >= 1 && cuda_aware == 0) {
       g1.copy_(recv_g1_tensor);
     }
 #endif
@@ -305,7 +313,10 @@ class Border : public torch::autograd::Function<Border> {
         if (nrecv) {
 #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
 #ifdef USE_MPI
-          if (cuda_aware == 0) {
+          // See forward kernel: when world_size==0 the data stays on
+          // CUDA, so memcpy on device pointers segfaults. Only use
+          // host memcpy when we explicitly moved data to CPU above.
+          if (world_size >= 1 && cuda_aware == 0) {
             memcpy(recv_g1, send_g1,
                    (unsigned long)nrecv * tensor_size * sizeof(FPTYPE));
           } else {
@@ -333,9 +344,11 @@ class Border : public torch::autograd::Function<Border> {
     }
 #ifdef USE_MPI
 #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
-    if (cuda_aware == 0) {
-      // Move result back to the device of the input grad. This replaces
-      // the original in-place copy_ into grad_output[0].
+    // Move result back to the device of the input grad only when
+    // ``d_local_g1_tensor`` was moved to CPU above (world_size >= 1
+    // && cuda_aware == 0). With world_size == 0 the tensor stayed on
+    // its original device — no move needed.
+    if (world_size >= 1 && cuda_aware == 0) {
       d_local_g1_tensor = d_local_g1_tensor.to(grad_g1.device());
     }
 #endif
diff --git a/source/tests/infer/gen_dpa2.py b/source/tests/infer/gen_dpa2.py
index 5aff706aab..e640514ee3 100644
--- a/source/tests/infer/gen_dpa2.py
+++ b/source/tests/infer/gen_dpa2.py
@@ -110,8 +110,9 @@ def main():
     print(f"Exporting to {pt2_path} ...")  # noqa: T201
     # DPA2's repformer block has no ``use_loc_mapping`` knob (unlike
     # DPA3), so a single .pt2 already carries the dual-artifact layout
-    # (regular + with-comm) — _has_message_passing returns True and the
-    # serializer produces both. No separate _mpi.pt2 needed.
+    # (regular + with-comm) — ``has_message_passing_across_ranks``
+    # returns True and the serializer produces both. No separate _mpi.pt2
+    # needed.
     pt_expt_deserialize_to_file(pt2_path, copy.deepcopy(data), do_atomic_virial=True)
 
     pth_path = os.path.join(base_dir, "deeppot_dpa2.pth")
diff --git a/source/tests/infer/gen_dpa3.py b/source/tests/infer/gen_dpa3.py
index 20304afcf2..1bfe0f9c65 100644
--- a/source/tests/infer/gen_dpa3.py
+++ b/source/tests/infer/gen_dpa3.py
@@ -111,6 +111,30 @@ def main():
         pt2_mpi_path, copy.deepcopy(data_mpi), do_atomic_virial=True
     )
 
+    # Float32 multi-rank variant — same architecture as the float64
+    # MPI fixture but with ``precision: float32``.  Used by
+    # source/lmp/tests/test_lammps_dpa3_pt2_fp32.py to validate that
+    # the comm_dict path (border_op + register_fake/register_autograd)
+    # is dtype-agnostic in practice, not just by inspection.
+    config_mpi_fp32 = copy.deepcopy(config_mpi)
+    config_mpi_fp32["descriptor"]["precision"] = "float32"
+    config_mpi_fp32["fitting_net"]["precision"] = "float32"
+    model_mpi_fp32 = get_model(copy.deepcopy(config_mpi_fp32))
+    data_mpi_fp32 = {
+        "model": model_mpi_fp32.serialize(),
+        "model_def_script": config_mpi_fp32,
+        "backend": "dpmodel",
+        "software": "deepmd-kit",
+        "version": "3.0.0",
+    }
+    pt2_mpi_fp32_path = os.path.join(base_dir, "deeppot_dpa3_mpi_fp32.pt2")
+    print(f"Exporting to {pt2_mpi_fp32_path} ...")  # noqa: T201
+    pt_expt_deserialize_to_file(
+        pt2_mpi_fp32_path,
+        copy.deepcopy(data_mpi_fp32),
+        do_atomic_virial=True,
+    )
+
     pth_path = os.path.join(base_dir, "deeppot_dpa3.pth")
     print(f"Exporting to {pth_path} ...")  # noqa: T201
     try:
diff --git a/source/tests/pt_expt/descriptor/test_dpa1.py b/source/tests/pt_expt/descriptor/test_dpa1.py
index 24a1d36078..8edb25ccdf 100644
--- a/source/tests/pt_expt/descriptor/test_dpa1.py
+++ b/source/tests/pt_expt/descriptor/test_dpa1.py
@@ -290,3 +290,40 @@ def test_share_params(self, shared_level) -> None:
         # invalid level raises
         with pytest.raises(NotImplementedError):
             dd1.share_params(dd0, shared_level=2)
+
+
+def test_has_message_passing_across_ranks() -> None:
+    """DPA1 (se_atten) is single-layer attention; no cross-rank
+    feature exchange is needed at multi-rank deployment.
+    """
+    import copy
+
+    from deepmd.dpmodel.model.model import (
+        get_model,
+    )
+
+    config = {
+        "type_map": ["O", "H"],
+        "descriptor": {
+            "type": "se_atten",
+            "rcut": 6.0,
+            "rcut_smth": 0.5,
+            "sel": 20,
+            "neuron": [2, 4],
+            "axis_neuron": 2,
+            "attn": 5,
+            "attn_layer": 1,
+            "type_one_side": True,
+            "precision": "float64",
+            "seed": 1,
+        },
+        "fitting_net": {
+            "neuron": [4, 4],
+            "resnet_dt": True,
+            "precision": "float64",
+            "seed": 1,
+        },
+    }
+    desc = get_model(copy.deepcopy(config)).atomic_model.descriptor
+    assert desc.has_message_passing() is False
+    assert desc.has_message_passing_across_ranks() is False
diff --git a/source/tests/pt_expt/descriptor/test_dpa2.py b/source/tests/pt_expt/descriptor/test_dpa2.py
index fb0005e13a..217bcdb230 100644
--- a/source/tests/pt_expt/descriptor/test_dpa2.py
+++ b/source/tests/pt_expt/descriptor/test_dpa2.py
@@ -426,3 +426,60 @@ def fn(coord_ext, atype_ext, nlist, mapping):
             rtol=rtol,
             atol=atol,
         )
+
+
+def test_has_message_passing_across_ranks() -> None:
+    """DPA2's repformer always passes ``g1`` in ``[nb, nall, n_dim]``
+    layout (no ``use_loc_mapping`` opt-out exists), so cross-rank
+    message passing is always required for multi-rank deployment.
+    """
+    import copy
+
+    from deepmd.dpmodel.model.model import (
+        get_model,
+    )
+
+    config = {
+        "type_map": ["O", "H"],
+        "descriptor": {
+            "type": "dpa2",
+            "repinit": {
+                "rcut": 6.0,
+                "rcut_smth": 2.0,
+                "nsel": 20,
+                "neuron": [2, 4],
+                "axis_neuron": 4,
+                "tebd_dim": 8,
+                "tebd_input_mode": "concat",
+                "set_davg_zero": True,
+                "type_one_side": True,
+                "use_three_body": False,
+            },
+            "repformer": {
+                "rcut": 3.0,
+                "rcut_smth": 1.5,
+                "nsel": 10,
+                "nlayers": 1,
+                "g1_dim": 8,
+                "g2_dim": 5,
+                "axis_neuron": 4,
+                "update_g1_has_conv": True,
+                "update_g1_has_drrd": True,
+                "update_g1_has_grrg": True,
+                "update_g2_has_attn": True,
+                "attn1_hidden": 8,
+                "attn1_nhead": 2,
+                "attn2_hidden": 5,
+                "attn2_nhead": 1,
+                "update_style": "res_avg",
+                "set_davg_zero": True,
+            },
+            "concat_output_tebd": True,
+            "precision": "float64",
+            "seed": 1,
+        },
+        "fitting_net": {"neuron": [4, 4], "resnet_dt": True, "seed": 1},
+    }
+    desc = get_model(copy.deepcopy(config)).atomic_model.descriptor
+    assert desc.has_message_passing() is True
+    assert desc.has_message_passing_across_ranks() is True
diff --git a/source/tests/pt_expt/descriptor/test_dpa3.py b/source/tests/pt_expt/descriptor/test_dpa3.py
index ef4b479724..3013f5cc65 100644
--- a/source/tests/pt_expt/descriptor/test_dpa3.py
+++ b/source/tests/pt_expt/descriptor/test_dpa3.py
@@ -311,3 +311,43 @@ def test_share_params(self, shared_level) -> None:
         # invalid level raises
         with pytest.raises(NotImplementedError):
             dd1.share_params(dd0, shared_level=2)
+
+
+@pytest.mark.parametrize("use_loc_mapping", [True, False])
+def test_has_message_passing_across_ranks(use_loc_mapping) -> None:
+    """DPA3 always reports message passing; cross-rank only when
+    ``use_loc_mapping=False`` (so per-layer node embeddings must flow
+    via MPI ghost exchange instead of a local gather).
+    """
+    import copy
+
+    from deepmd.dpmodel.model.model import (
+        get_model,
+    )
+
+    config = {
+        "type_map": ["O", "H"],
+        "descriptor": {
+            "type": "dpa3",
+            "repflow": {
+                "n_dim": 8,
+                "e_dim": 6,
+                "a_dim": 4,
+                "nlayers": 1,
+                "e_rcut": 4.0,
+                "e_rcut_smth": 0.5,
+                "e_sel": 8,
+                "a_rcut": 3.5,
+                "a_rcut_smth": 0.5,
+                "a_sel": 4,
+                "axis_neuron": 4,
+                "update_angle": False,
+            },
+            "use_loc_mapping": use_loc_mapping,
+        },
+        "fitting_net": {"neuron": [16, 16], "seed": 1},
+    }
+    model = get_model(copy.deepcopy(config))
+    desc = model.atomic_model.descriptor
+    assert desc.has_message_passing() is True
+    assert desc.has_message_passing_across_ranks() is (not use_loc_mapping)
diff --git a/source/tests/pt_expt/descriptor/test_hybrid.py b/source/tests/pt_expt/descriptor/test_hybrid.py
index 5fa8970bf1..86575180c7 100644
--- a/source/tests/pt_expt/descriptor/test_hybrid.py
+++ b/source/tests/pt_expt/descriptor/test_hybrid.py
@@ -284,3 +284,73 @@ def test_share_params(self) -> None:
         # invalid level raises
         with pytest.raises(NotImplementedError):
             dd1.share_params(dd0, shared_level=1)
+
+
+def _se_e2_a_child() -> dict:
+    return {
+        "type": "se_e2_a",
+        "rcut": 6.0,
+        "rcut_smth": 0.5,
+        "sel": [20, 20],
+        "neuron": [2, 4],
+        "axis_neuron": 2,
+        "type_one_side": True,
+        "precision": "float64",
+        "seed": 1,
+    }
+
+
+def _dpa3_child(use_loc_mapping: bool) -> dict:
+    return {
+        "type": "dpa3",
+        "repflow": {
+            "n_dim": 8,
+            "e_dim": 6,
+            "a_dim": 4,
+            "nlayers": 1,
+            "e_rcut": 4.0,
+            "e_rcut_smth": 0.5,
+            "e_sel": 8,
+            "a_rcut": 3.5,
+            "a_rcut_smth": 0.5,
+            "a_sel": 4,
+            "axis_neuron": 4,
+            "update_angle": False,
+        },
+        "use_loc_mapping": use_loc_mapping,
+    }
+
+
+@pytest.mark.parametrize(
+    "child_factory,expected_hmp,expected_hmp_ar",
+    [
+        (lambda: _se_e2_a_child(), False, False),
+        (lambda: _dpa3_child(use_loc_mapping=True), True, False),
+        (lambda: _dpa3_child(use_loc_mapping=False), True, True),
+    ],
+    ids=["se_e2_a-only", "dpa3-ulm-true", "dpa3-ulm-false"],
+)
+def test_has_message_passing_across_ranks(
+    child_factory, expected_hmp, expected_hmp_ar
+) -> None:
+    """Hybrid descriptor recurses into its children; cross-rank message
+    passing is required iff any child needs it. Closes the structural
+    side of catalog Tier-1 #1.
+    """
+    import copy
+
+    from deepmd.dpmodel.model.model import (
+        get_model,
+    )
+
+    config = {
+        "type_map": ["O", "H"],
+        "descriptor": {
+            "type": "hybrid",
+            "list": [child_factory()],
+        },
+        "fitting_net": {"neuron": [4, 4], "seed": 1},
+    }
+    desc = get_model(copy.deepcopy(config)).atomic_model.descriptor
+    assert desc.has_message_passing() is expected_hmp
+    assert desc.has_message_passing_across_ranks() is expected_hmp_ar
diff --git a/source/tests/pt_expt/descriptor/test_se_e2_a.py b/source/tests/pt_expt/descriptor/test_se_e2_a.py
index e4bd1e385e..e3a8ca5c21 100644
--- a/source/tests/pt_expt/descriptor/test_se_e2_a.py
+++ b/source/tests/pt_expt/descriptor/test_se_e2_a.py
@@ -221,3 +221,38 @@ def fn(coord_ext, atype_ext, nlist):
             rtol=rtol,
             atol=atol,
         )
+
+
+def test_has_message_passing_across_ranks() -> None:
+    """se_e2_a is a single-layer local descriptor: no message passing,
+    no cross-rank exchange ever needed.
+    """
+    import copy
+
+    from deepmd.dpmodel.model.model import (
+        get_model,
+    )
+
+    config = {
+        "type_map": ["O", "H"],
+        "descriptor": {
+            "type": "se_e2_a",
+            "rcut": 6.0,
+            "rcut_smth": 0.5,
+            "sel": [20, 20],
+            "neuron": [2, 4],
+            "axis_neuron": 2,
+            "type_one_side": True,
+            "precision": "float64",
+            "seed": 1,
+        },
+        "fitting_net": {
+            "neuron": [4, 4],
+            "resnet_dt": True,
+            "precision": "float64",
+            "seed": 1,
+        },
+    }
+    desc = get_model(copy.deepcopy(config)).atomic_model.descriptor
+    assert desc.has_message_passing() is False
+    assert desc.has_message_passing_across_ranks() is False
diff --git a/source/tests/pt_expt/model/test_export_with_comm.py b/source/tests/pt_expt/model/test_export_with_comm.py
index f905f409bc..f338397639 100644
--- a/source/tests/pt_expt/model/test_export_with_comm.py
+++ b/source/tests/pt_expt/model/test_export_with_comm.py
@@ -9,8 +9,7 @@
 
 This test verifies:
   1. Both artifacts are present in the archive.
-  2. ``metadata.json`` carries the new ``has_message_passing`` and
-     ``has_comm_artifact`` flags.
+  2. ``metadata.json`` carries the ``has_comm_artifact`` flag.
   3. The with-comm artifact loads via ``aoti_load_package`` and runs
      when fed valid comm-dict tensors built via the ctypes pointer
      trick (see ``test_repflow_parallel.py``).
@@ -137,7 +136,6 @@ def test_pt2_dual_artifact_for_gnn(tmp_path) -> None:
         assert "model/extra/forward_lower_with_comm.pt2" in names, (
             f"with-comm artifact missing; names={sorted(names)}"
         )
-    assert meta["has_message_passing"] is True
     assert meta["has_comm_artifact"] is True
 
     # 2. Both artifacts load.
@@ -245,19 +243,20 @@ def test_make_comm_sample_inputs_clamps_zero_nghost() -> None:
     assert nghost_t.item() == 0
 
 
-def test_has_message_passing_for_hybrid_with_gnn() -> None:
-    """``_has_message_passing`` correctly reports True for hybrid
-    descriptors whose children include a GNN block.
+def test_needs_with_comm_artifact_for_hybrid_with_gnn() -> None:
+    """``_needs_with_comm_artifact`` correctly reports True for hybrid
+    descriptors whose children include a GNN block needing cross-rank
+    message passing.
 
-    The hybrid descriptor delegates ``has_message_passing()`` to its
-    children — if any child has message passing, the hybrid does too.
-    Our metadata flag (``has_message_passing``) is what
-    ``_deserialize_to_file_pt2`` uses to decide whether to compile
-    the with-comm artifact, so the hybrid case must route correctly.
+    The hybrid descriptor delegates ``has_message_passing_across_ranks()``
+    to its children — if any child needs cross-rank message passing,
+    the hybrid does too. ``_deserialize_to_file_pt2`` uses this gate
+    to decide whether to compile the with-comm artifact, so the
+    hybrid case must route correctly.
     """
     from deepmd.pt_expt.model.get_model import get_model as get_pt_expt_model
     from deepmd.pt_expt.utils.serialization import (
-        _has_message_passing,
+        _needs_with_comm_artifact,
     )
 
     config = {
@@ -301,8 +300,10 @@ def test_has_message_passing_for_hybrid_with_gnn() -> None:
     model = get_pt_expt_model(config)
     model.to("cpu")
     model.eval()
-    assert _has_message_passing(model) is True, (
-        "hybrid model with a GNN child must report has_message_passing=True"
+    assert _needs_with_comm_artifact(model) is True, (
+        "hybrid model with a use_loc_mapping=False GNN child must "
+        "report has_message_passing_across_ranks=True so a with-comm "
+        "artifact gets compiled"
     )
 
 
@@ -330,7 +331,11 @@ def test_pte_with_comm_dict_traces_and_loads(tmp_path) -> None:
         model_json_override=None,
         with_comm_dict=True,
     )
-    assert metadata["has_message_passing"] is True
+    # ``_trace_and_export(with_comm_dict=True)`` is the with-comm path
+    # by construction; metadata no longer carries the
+    # ``has_message_passing`` flag (only ``has_comm_artifact``, set in
+    # ``_collect_metadata``). Sanity-check via output_keys
+    # that the trace produced energy outputs.
     # output_keys mirrors what the regular trace would produce; at
     # least one energy-related key must be present.
     assert any(k.startswith("energy") for k in output_keys), (
diff --git a/source/tests/pt_expt/utils/test_has_message_passing.py b/source/tests/pt_expt/utils/test_has_message_passing.py
deleted file mode 100644
index 673e4d8bd0..0000000000
--- a/source/tests/pt_expt/utils/test_has_message_passing.py
+++ /dev/null
@@ -1,229 +0,0 @@
-# SPDX-License-Identifier: LGPL-3.0-or-later
-"""Schema-drift regression test for ``_has_message_passing``.
-
-``_has_message_passing`` (in ``deepmd/pt_expt/utils/serialization.py``)
-gates whether the dual-artifact ``.pt2`` is produced for GNN models —
-specifically, whether the with-comm AOTInductor module is compiled and
-nested inside the archive. The detection relies on a chain of attribute
-lookups:
-
-* ``model.atomic_model.descriptor``
-* ``descriptor.has_message_passing()``
-* For repflows/repformers: ``block.use_loc_mapping``
-
-A rename of any of these (refactor in the dpmodel descriptor layer, a
-new GNN block name, etc.) silently disables the with-comm artifact and
-multi-rank LAMMPS users get a single-artifact .pt2 that crashes on the
-first ghost exchange — with no test failure to flag the breakage.
-
-This test pins the contract: assert ``_has_message_passing`` returns
-the documented value for each baseline configuration.
-"""
-
-from __future__ import (
-    annotations,
-)
-
-import copy
-
-import pytest
-
-from deepmd.dpmodel.model.model import (
-    get_model,
-)
-from deepmd.pt_expt.utils.serialization import (
-    _has_message_passing,
-)
-
-
-def _se_e2_a_config() -> dict:
-    """Non-GNN descriptor — must report False."""
-    return {
-        "type_map": ["O", "H"],
-        "descriptor": {
-            "type": "se_e2_a",
-            "rcut": 6.0,
-            "rcut_smth": 0.5,
-            "sel": [20, 20],
-            "neuron": [2, 4],
-            "axis_neuron": 2,
-            "type_one_side": True,
-            "precision": "float64",
-            "seed": 1,
-        },
-        "fitting_net": {
-            "neuron": [4, 4],
-            "resnet_dt": True,
-            "precision": "float64",
-            "seed": 1,
-        },
-    }
-
-
-def _dpa1_config() -> dict:
-    """DPA1 (se_atten) — non-GNN; must report False."""
-    return {
-        "type_map": ["O", "H"],
-        "descriptor": {
-            "type": "se_atten",
-            "rcut": 6.0,
-            "rcut_smth": 0.5,
-            "sel": 20,
-            "neuron": [2, 4],
-            "axis_neuron": 2,
-            "attn": 5,
-            "attn_layer": 1,
-            "type_one_side": True,
-            "precision": "float64",
-            "seed": 1,
-        },
-        "fitting_net": {
-            "neuron": [4, 4],
-            "resnet_dt": True,
-            "precision": "float64",
-            "seed": 1,
-        },
-    }
-
-
-def _dpa3_config(use_loc_mapping: bool) -> dict:
-    """DPA3 (repflows). use_loc_mapping=False -> True, True -> False."""
-    return {
-        "type_map": ["O", "H"],
-        "descriptor": {
-            "type": "dpa3",
-            "repflow": {
-                "n_dim": 8,
-                "e_dim": 6,
-                "a_dim": 4,
-                "nlayers": 1,
-                "e_rcut": 4.0,
-                "e_rcut_smth": 0.5,
-                "e_sel": 8,
-                "a_rcut": 3.5,
-                "a_rcut_smth": 0.5,
-                "a_sel": 4,
-                "axis_neuron": 4,
-                "update_angle": False,
-            },
-            "use_loc_mapping": use_loc_mapping,
-        },
-        "fitting_net": {"neuron": [16, 16], "seed": 1},
-    }
-
-
-def _dpa2_config() -> dict:
-    """DPA2 (repformer) — GNN; repformer has no use_loc_mapping knob,
-    so always reports True.
-    """
-    return {
-        "type_map": ["O", "H"],
-        "descriptor": {
-            "type": "dpa2",
-            "repinit": {
-                "rcut": 6.0,
-                "rcut_smth": 2.0,
-                "nsel": 20,
-                "neuron": [2, 4],
-                "axis_neuron": 4,
-                "tebd_dim": 8,
-                "tebd_input_mode": "concat",
-                "set_davg_zero": True,
-                "type_one_side": True,
-                "use_three_body": False,
-            },
-            "repformer": {
-                "rcut": 3.0,
-                "rcut_smth": 1.5,
-                "nsel": 10,
-                "nlayers": 1,
-                "g1_dim": 8,
-                "g2_dim": 5,
-                "axis_neuron": 4,
-                "update_g1_has_conv": True,
-                "update_g1_has_drrd": True,
-                "update_g1_has_grrg": True,
-                "update_g2_has_attn": True,
-                "attn1_hidden": 8,
-                "attn1_nhead": 2,
-                "attn2_hidden": 5,
-                "attn2_nhead": 1,
-                "update_style": "res_avg",
-                "set_davg_zero": True,
-            },
-            "concat_output_tebd": True,
-            "precision": "float64",
-            "seed": 1,
-        },
-        "fitting_net": {
-            "neuron": [4, 4],
-            "resnet_dt": True,
-            "seed": 1,
-        },
-    }
-
-
-@pytest.mark.parametrize(
-    "config_factory,expected",
-    [
-        (_se_e2_a_config, False),
-        (_dpa1_config, False),
-        (lambda: _dpa3_config(use_loc_mapping=True), False),
-        (lambda: _dpa3_config(use_loc_mapping=False), True),
-        (_dpa2_config, True),
-    ],
-    ids=[
-        "se_e2_a-non-gnn",
-        "dpa1-non-gnn",
-        "dpa3-use-loc-mapping-true",
-        "dpa3-use-loc-mapping-false",
-        "dpa2-repformer",
-    ],
-)
-def test_has_message_passing_matches_descriptor_kind(config_factory, expected) -> None:
-    """``_has_message_passing`` must report the documented value for
-    each baseline descriptor configuration.
-
-    A False positive (non-GNN reported as GNN) wastes compile time on
-    a useless with-comm artifact. A False negative (GNN with
-    use_loc_mapping=False reported as non-GNN) is worse: multi-rank
-    LAMMPS gets a single-artifact .pt2 and crashes on the first ghost
-    exchange. This test pins both directions.
-    """
-    config = config_factory()
-    model = get_model(copy.deepcopy(config))
-    assert _has_message_passing(model) is expected
-
-
-def test_has_message_passing_no_descriptor_returns_false() -> None:
-    """Models without a single ``atomic_model.descriptor`` (e.g. linear
-    / ZBL / frozen) must report False — the function defends against
-    AttributeError and treats the model as local.
-    """
-
-    class _StubAtomicModel:
-        # Intentionally no ``descriptor`` attribute.
-        pass
-
-    class _StubModel:
-        atomic_model = _StubAtomicModel()
-
-    assert _has_message_passing(_StubModel()) is False
-
-
-def test_has_message_passing_descriptor_without_query_returns_false() -> None:
-    """If the descriptor exists but lacks ``has_message_passing``, the
-    function must report False rather than raise.
-    """
-
-    class _StubDescriptor:
-        # Intentionally no ``has_message_passing`` method.
-        pass
-
-    class _StubAtomicModel:
-        descriptor = _StubDescriptor()
-
-    class _StubModel:
-        atomic_model = _StubAtomicModel()
-
-    assert _has_message_passing(_StubModel()) is False

From 08805b6474dc0cb8e1844a198ea2dd04066e655c Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 3 May 2026 12:21:18 +0800
Subject: [PATCH 28/34] fix(test): build comm_dict control tensors on CPU for
 repflow_parallel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The C++ ``border_op`` host code dereferences ``send_proc``,
``recv_proc``, ``send_num``, ``recv_num`` (and ``send_list`` /
``communicator`` / ``nlocal`` / ``nghost``) directly via
``data_ptr<int>()`` — see ``Border::forward_t`` / ``backward_t`` in
``source/op/pt/comm.cc``.  Production code in
``source/api_cc/src/commonPTExpt.h::build_comm_tensors_positional``
explicitly creates them on ``torch::kCPU``.

The test ``_build_self_comm_dict`` helper was constructing them on
``device`` (which on a CUDA build is ``cuda:0``).  On CPU-only
builds this happened to work; on a CUDA-enabled build the host
read of ``recvnum[iswap]`` walks a CUDA pointer and segfaults.

This is a test bug, not a runtime contract change.  Fix by forcing
the control tensors to CPU regardless of caller-supplied device,
matching production semantics, and document why in the docstring.

Addresses the intermittent CUDA CI segfault on PR #5430:
``test_repflow_parallel.py`` was the failure point in
https://github.com/deepmodeling/deepmd-kit/actions/runs/25264766026
---
 .../descriptor/test_repflow_parallel.py       | 26 ++++++++++++-------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/source/tests/pt_expt/descriptor/test_repflow_parallel.py b/source/tests/pt_expt/descriptor/test_repflow_parallel.py
index 61b84fe5af..f5c68fabed 100644
--- a/source/tests/pt_expt/descriptor/test_repflow_parallel.py
+++ b/source/tests/pt_expt/descriptor/test_repflow_parallel.py
@@ -81,24 +81,32 @@ def _build_self_comm_dict(
         int32 array of length ``nghost`` giving local indices to copy
         into successive ghost slots [nloc, nloc+1, ...].
     device
-        Target torch device for tensors.
+        Target torch device for the data tensors. The control tensors
+        (send_proc / recv_proc / send_num / recv_num / send_list /
+        communicator / nlocal / nghost) are forced to CPU regardless of
+        ``device`` because the C++ ``border_op`` host-side code derefer-
+        ences ``data_ptr<int>()`` directly — production builds them on
+        CPU in ``commonPTExpt.h::build_comm_tensors_positional`` and a
+        CUDA-built kernel will segfault if it tries to read CUDA memory
+        from the host.
     keepalive
         List into which we store numpy buffers that must outlive the
         forward pass (their addresses are referenced by sendlist_tensor).
     """
+    del device  # control tensors are always CPU; see docstring
     sendlist_indices = np.ascontiguousarray(sendlist_indices, dtype=np.int32)
     keepalive.append(sendlist_indices)
     nswap = 1
     addr = _addr_of(sendlist_indices)
     # int** packed as one int64 entry per swap.
-    sendlist_tensor = torch.tensor([addr], dtype=torch.int64, device=device)
-    sendproc = torch.zeros(nswap, dtype=torch.int32, device=device)
-    recvproc = torch.zeros(nswap, dtype=torch.int32, device=device)
-    sendnum = torch.tensor([nghost], dtype=torch.int32, device=device)
-    recvnum = torch.tensor([nghost], dtype=torch.int32, device=device)
-    communicator = torch.zeros(1, dtype=torch.int64, device=device)
-    nlocal_ts = torch.tensor(nloc, dtype=torch.int32, device=device)
-    nghost_ts = torch.tensor(nghost, dtype=torch.int32, device=device)
+    sendlist_tensor = torch.tensor([addr], dtype=torch.int64, device="cpu")
+    sendproc = torch.zeros(nswap, dtype=torch.int32, device="cpu")
+    recvproc = torch.zeros(nswap, dtype=torch.int32, device="cpu")
+    sendnum = torch.tensor([nghost], dtype=torch.int32, device="cpu")
+    recvnum = torch.tensor([nghost], dtype=torch.int32, device="cpu")
+    communicator = torch.zeros(1, dtype=torch.int64, device="cpu")
+    nlocal_ts = torch.tensor(nloc, dtype=torch.int32, device="cpu")
+    nghost_ts = torch.tensor(nghost, dtype=torch.int32, device="cpu")
     return {
         "send_list": sendlist_tensor,
         "send_proc": sendproc,

From afa99c7b97d3c16bf361eac631dbef5ec404e1f6 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 3 May 2026 19:01:30 +0800
Subject: [PATCH 29/34] fix(test): build comm_dict control tensors on CPU for
 repformer_parallel

Same bug as the previous commit in ``test_repflow_parallel.py``:
``_build_self_comm_dict`` constructs the control tensors
(send_proc / recv_proc / send_num / recv_num / send_list /
communicator / nlocal / nghost) on the caller-supplied ``device``,
which is ``cuda`` on a CUDA build.  The C++ ``border_op`` host code
dereferences these via ``data_ptr<int>()`` from the host, so a
CUDA-device control tensor segfaults the read.

Production code in ``commonPTExpt.h::build_comm_tensors_positional``
explicitly builds them on CPU.  Force CPU regardless of the
caller-supplied device, matching the production contract.

This was the second segfault revealed on PR #5430 CI after
08805b647 fixed test_repflow_parallel.py:
    test_repflow_parallel.py ....    [ 13%]
    Segmentation fault (core dumped)
    test_repformer_parallel.py
---
 .../descriptor/test_repformer_parallel.py     | 25 +++++++++++++------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/source/tests/pt_expt/descriptor/test_repformer_parallel.py b/source/tests/pt_expt/descriptor/test_repformer_parallel.py
index ca0bd035e7..24e6e6ce33 100644
--- a/source/tests/pt_expt/descriptor/test_repformer_parallel.py
+++ b/source/tests/pt_expt/descriptor/test_repformer_parallel.py
@@ -54,18 +54,27 @@ def _build_self_comm_dict(
     device: torch.device,
     keepalive: list,
 ) -> dict:
+    """Control tensors must live on CPU because the C++ ``border_op``
+    host code dereferences ``data_ptr<int>()`` directly.  Production
+    builds them on CPU in
+    ``commonPTExpt.h::build_comm_tensors_positional``; on a CUDA build
+    a CUDA-device control tensor segfaults the host read.  See
+    ``test_repflow_parallel.py::_build_self_comm_dict`` for the full
+    rationale.
+    """
+    del device  # control tensors are always CPU
     sendlist_indices = np.ascontiguousarray(sendlist_indices, dtype=np.int32)
     keepalive.append(sendlist_indices)
     nswap = 1
     addr = _addr_of(sendlist_indices)
-    sendlist_tensor = torch.tensor([addr], dtype=torch.int64, device=device)
-    sendproc = torch.zeros(nswap, dtype=torch.int32, device=device)
-    recvproc = torch.zeros(nswap, dtype=torch.int32, device=device)
-    sendnum = torch.tensor([nghost], dtype=torch.int32, device=device)
-    recvnum = torch.tensor([nghost], dtype=torch.int32, device=device)
-    communicator = torch.zeros(1, dtype=torch.int64, device=device)
-    nlocal_ts = torch.tensor(nloc, dtype=torch.int32, device=device)
-    nghost_ts = torch.tensor(nghost, dtype=torch.int32, device=device)
+    sendlist_tensor = torch.tensor([addr], dtype=torch.int64, device="cpu")
+    sendproc = torch.zeros(nswap, dtype=torch.int32, device="cpu")
+    recvproc = torch.zeros(nswap, dtype=torch.int32, device="cpu")
+    sendnum = torch.tensor([nghost], dtype=torch.int32, device="cpu")
+    recvnum = torch.tensor([nghost], dtype=torch.int32, device="cpu")
+    communicator = torch.zeros(1, dtype=torch.int64, device="cpu")
+    nlocal_ts = torch.tensor(nloc, dtype=torch.int32, device="cpu")
+    nghost_ts = torch.tensor(nghost, dtype=torch.int32, device="cpu")
     return {
         "send_list": sendlist_tensor,
         "send_proc": sendproc,

From e19108d96f4078c6ffd33ac2ace582da56e0cfd1 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 4 May 2026 10:50:50 +0800
Subject: [PATCH 30/34] fix(op): dispatch border_op self-send on tensor device,
 not MPI state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CUDA self-send branch in ``Border::forward_t`` and ``backward_t``
was guarded by ``if (world_size >= 1 && cuda_aware == 0)`` to choose
between host ``memcpy`` and ``gpuMemcpy(D2D)``.  The intent was
"world_size >= 1 means MPI is initialised so the pre-loop CPU
fallback ran and the buffer is now on CPU; otherwise it's still on
its original device (assumed CUDA)".

That assumption is wrong for one important case: a USE_MPI build
called from Python with CPU tensors and no MPI init
(``world_size == 0``).  Unit tests in
``source/tests/pt_expt/utils/test_border_op_backward.py`` do exactly
this — they construct CPU comm tensors and a CPU ``grad_g1``, never
call MPI_Init, and expect the kernel to do plain CPU accumulation.
The old guard fell through to ``gpuMemcpy(...DeviceToDevice)`` on
host pointers.  CUDA returns ``cudaErrorInvalidValue`` from that
call; the return code is unchecked and ``recv_g1`` is left
uninitialised.  Subsequent ``index_add_`` then writes garbage into
``d_local_g1_tensor`` — the test sees mixed denormals +
sigmoid-shaped values from leaked buffer memory.

The same bug bit
``test_spin_export_with_comm.py::test_spin_dpa3_eager_parity``: it
compares the no-comm path against the comm_dict path for a spin
DPA3, and the comm_dict path went through the broken self-send.
Energy diverged by ~0.1 instead of being bit-identical.

Fix: dispatch the self-send memcpy on the actual device of the
buffer (``recv_g1_tensor.is_cuda()``).  The post-loop copy-back to
``g1.device()`` is changed analogously to use ``!is_alias_of(g1)``
— the buffer was moved if and only if the pre-loop CPU fallback
created a fresh tensor.  Both checks are precise correctness
conditions that work for every combination of (USE_MPI on/off,
GOOGLE_CUDA on/off, MPI initialised or not, CUDA or CPU tensors).

Verified on remote with CUDA build + USE_MPI:
    test_border_op_backward.py             5 passed
    test_spin_export_with_comm.py          1 passed
    test_repflow_parallel.py + sibling     6 passed
    broader pt_expt sweep                 58 passed
---
 source/op/pt/comm.cc | 69 +++++++++++++++++++++-----------------------
 1 file changed, 33 insertions(+), 36 deletions(-)

diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc
index 6eb49624ec..4b175370e4 100644
--- a/source/op/pt/comm.cc
+++ b/source/op/pt/comm.cc
@@ -139,25 +139,22 @@ class Border : public torch::autograd::Function<Border> {
       } else {
 #endif
 #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
-#ifdef USE_MPI
-        // The CPU-fallback ``recv_g1_tensor.to(kCPU)`` above only runs
-        // when ``world_size >= 1`` (MPI initialized). With no MPI
-        // (single-rank, world_size == 0) the tensor is still on CUDA,
-        // so memcpy on CUDA pointers would segfault — gpuMemcpy is
-        // correct in that case regardless of ``cuda_aware``.
-        if (world_size >= 1 && cuda_aware == 0) {
-          memcpy(recv_g1, send_g1,
-                 (unsigned long)nsend * tensor_size * sizeof(FPTYPE));
-        } else {
+        // Self-send branch: choose the host-vs-device memcpy based on
+        // where the data actually lives, not on MPI state. The buffer
+        // we read/write is ``recv_g1_tensor`` whose device is either
+        // (a) the original ``g1`` device, or (b) CPU after the
+        // non-cuda-aware MPI fallback above. Reading that device
+        // directly is the only correct dispatch for build configs
+        // where USE_MPI is on but the call site uses CPU tensors
+        // (e.g. unit tests of border_op without MPI init).
+        if (recv_g1_tensor.is_cuda()) {
           gpuMemcpy(recv_g1, send_g1,
                     (unsigned long)nsend * tensor_size * sizeof(FPTYPE),
                     gpuMemcpyDeviceToDevice);
+        } else {
+          memcpy(recv_g1, send_g1,
+                 (unsigned long)nsend * tensor_size * sizeof(FPTYPE));
         }
-#else
-        gpuMemcpy(recv_g1, send_g1,
-                  (unsigned long)nsend * tensor_size * sizeof(FPTYPE),
-                  gpuMemcpyDeviceToDevice);
-#endif
 #else
       memcpy(recv_g1, send_g1,
              (unsigned long)nsend * tensor_size * sizeof(FPTYPE));
@@ -169,10 +166,12 @@ class Border : public torch::autograd::Function<Border> {
     }
 #ifdef USE_MPI
 #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
-    // Only copy back when ``recv_g1_tensor`` was moved to CPU above
-    // (world_size >= 1 && cuda_aware == 0). With world_size == 0 the
-    // tensor is still aliased to g1 — no copy needed.
-    if (world_size >= 1 && cuda_aware == 0) {
+    // Only copy back when ``recv_g1_tensor`` was actually moved to a
+    // different device above (the cuda_aware==0 CPU fallback). When
+    // ``recv_g1_tensor`` still aliases ``g1`` no copy is needed; the
+    // is_alias_of check is the precise correctness condition and works
+    // for both CUDA and CPU call sites.
+    if (!recv_g1_tensor.is_alias_of(g1)) {
       g1.copy_(recv_g1_tensor);
     }
 #endif
@@ -312,23 +311,20 @@ class Border : public torch::autograd::Function<Border> {
 #endif
         if (nrecv) {
 #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
-#ifdef USE_MPI
-          // See forward kernel: when world_size==0 the data stays on
-          // CUDA, so memcpy on device pointers segfaults. Only use
-          // host memcpy when we explicitly moved data to CPU above.
-          if (world_size >= 1 && cuda_aware == 0) {
-            memcpy(recv_g1, send_g1,
-                   (unsigned long)nrecv * tensor_size * sizeof(FPTYPE));
-          } else {
+          // Self-send branch: dispatch on the actual device of the
+          // ``recv_g1_tensor`` buffer, not on MPI state. Same rationale
+          // as the forward kernel — USE_MPI builds may be called with
+          // CPU tensors (unit tests of border_op_backward) where the
+          // gpuMemcpy path silently fails with cudaErrorInvalidValue
+          // and leaves recv_g1 uninitialized.
+          if (recv_g1_tensor.is_cuda()) {
             gpuMemcpy(recv_g1, send_g1,
                       (unsigned long)nrecv * tensor_size * sizeof(FPTYPE),
                       gpuMemcpyDeviceToDevice);
+          } else {
+            memcpy(recv_g1, send_g1,
+                   (unsigned long)nrecv * tensor_size * sizeof(FPTYPE));
           }
-#else
-          gpuMemcpy(recv_g1, send_g1,
-                    (unsigned long)nrecv * tensor_size * sizeof(FPTYPE),
-                    gpuMemcpyDeviceToDevice);
-#endif
 #else
         memcpy(recv_g1, send_g1,
                (unsigned long)nrecv * tensor_size * sizeof(FPTYPE));
@@ -345,10 +341,11 @@ class Border : public torch::autograd::Function<Border> {
 #ifdef USE_MPI
 #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
     // Move result back to the device of the input grad only when
-    // ``d_local_g1_tensor`` was moved to CPU above (world_size >= 1
-    // && cuda_aware == 0). With world_size == 0 the tensor stayed on
-    // its original device — no move needed.
-    if (world_size >= 1 && cuda_aware == 0) {
+    // ``d_local_g1_tensor`` was actually moved to a different device
+    // above (the cuda_aware==0 CPU fallback). The is_alias_of check
+    // is the precise correctness condition and works for both CUDA
+    // and CPU call sites (no-op when the tensor still aliases input).
+    if (!d_local_g1_tensor.is_alias_of(grad_g1)) {
       d_local_g1_tensor = d_local_g1_tensor.to(grad_g1.device());
     }
 #endif

From 4f8240ea66e9383da7ebe0f9258efa8e3a1834a2 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 4 May 2026 21:56:59 +0800
Subject: [PATCH 31/34] fix(op): drain pending MPI eager-send ACKs in border_op
 via Barrier

The empty-subdomain spin LAMMPS test (``processors 2 1 1`` with all
atoms on rank 0, rank 1 nloc=0) failed at MPI_Finalize with
"Communicator (handle=0x44000000) being freed has 2 unmatched
message(s)".  Test outputs were correct; the failure was purely in
the MPI cleanup path.

Root cause is the asymmetric ghost-exchange pattern that arises when
one rank only Sends and the other only Irecvs at a given swap (no
local atoms means nothing to send back).  Under MPICH eager protocol:

* The sender's MPI_Send returns once the message is queued in the
  eager buffer; the receiver's ACK round-trip is processed
  asynchronously by MPI's progress engine.
* In symmetric swaps the sender also calls MPI_Wait on its own
  Irecv, which advances the progress engine and drains pending ACKs.
* In asymmetric swaps the sender makes no further MPI call inside
  border_op, so the ACK stays unprocessed.  The "in-flight" counter
  remains nonzero, and MPI_Finalize reports it as unmatched.

Fix: add a single ``MPI_Barrier(world)`` at the end of
``Border::forward_t`` and ``Border::backward_t``.  The Barrier
forces a round-trip on every rank, which advances every rank's
progress engine and drains pending ACKs.  Cost is one collective
per ghost-exchange call; on a 2-rank, 6-swap, 4-atom case this is
in the noise vs the surrounding model forward.

Verified on remote (CUDA + MPICH):

  test_lammps_spin_dpa3_pt2.py ...                  [3 passed]
  test_lammps_dpa3_pt2.py ...............           [15 passed]

Restores the multi-rank LAMMPS spin GNN with empty-subdomain
support (PR #5430 CI's last failing case).
---
 source/op/pt/comm.cc | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc
index 4b175370e4..32949bc339 100644
--- a/source/op/pt/comm.cc
+++ b/source/op/pt/comm.cc
@@ -165,6 +165,22 @@ class Border : public torch::autograd::Function<Border> {
       recv_g1 += nrecv * tensor_size;
     }
 #ifdef USE_MPI
+    // Drain pending eager-send ACKs before returning.  In the
+    // asymmetric ghost-exchange pattern (one rank only Sends, the
+    // other only Irecvs at a given swap — e.g. an empty subdomain
+    // under ``processors 2 1 1``) the sender's MPI_Send returns once
+    // the eager-buffered message is queued, but MPICH's internal
+    // accounting marks the message as "in flight" until the sender's
+    // progress engine processes the receiver's ACK.  In the symmetric
+    // case the sender's own MPI_Wait on its Irecv drains those ACKs.
+    // In the asymmetric case there is no such Wait, and the message
+    // stays "in flight" all the way to MPI_Finalize, which then
+    // reports ``Communicator (...) being freed has N unmatched
+    // message(s)``.  An MPI_Barrier on the same communicator forces a
+    // round-trip on every rank, drains ACKs, and clears the counter.
+    if (mpi_init && world_size >= 1) {
+      MPI_Barrier(world);
+    }
 #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
     // Only copy back when ``recv_g1_tensor`` was actually moved to a
     // different device above (the cuda_aware==0 CPU fallback). When
@@ -339,6 +355,13 @@ class Border : public torch::autograd::Function<Border> {
       }
     }
 #ifdef USE_MPI
+    // Drain pending eager-send ACKs before returning — see forward_t
+    // for the full rationale.  Backward has the same asymmetric
+    // Send/Irecv pattern (now in the reverse direction) and the same
+    // unmatched-message trap when one rank only Sends.
+    if (mpi_init && world_size >= 1) {
+      MPI_Barrier(world);
+    }
 #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
     // Move result back to the device of the input grad only when
     // ``d_local_g1_tensor`` was actually moved to a different device

From 7632db8945166cb7db13330395c12c0cde35a06e Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 9 May 2026 17:31:35 +0800
Subject: [PATCH 32/34] fix(pt_expt): fail fast on with-comm artifact errors
 instead of silently zeroing

Address @iProzd review on PR #5430:

- border_op_export: throw on empty output list rather than returning
  empty_like(g1), which masked internal kernel bugs as zero outputs.
- DeepPotPTExpt / DeepSpinPTExpt: if the with-comm artifact is declared
  in metadata but fails to load, keep has_comm_artifact_=true so
  multi-rank dispatch (nswap>0) throws explicitly. Previously
  has_comm_artifact_ was reset to false on load failure, making
  multi-rank silently fall through to the single-rank artifact and
  skip the MPI ghost-embedding exchange.
---
 source/api_cc/src/DeepPotPTExpt.cc  | 27 +++++++++++++++++++--------
 source/api_cc/src/DeepSpinPTExpt.cc | 22 +++++++++++++++++-----
 source/op/pt/comm.cc                |  7 ++++++-
 3 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index 287ee3b18f..910c2f6f7a 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -166,9 +166,10 @@ void DeepPotPTExpt::init(const std::string& model,
   // inference. Pre-Phase-3 .pt2 files lack ``has_comm_artifact``;
   // default to false so old artifacts keep working. If the metadata
   // flag is set but the nested artifact fails to extract or compile,
-  // fall back to single-rank mode rather than aborting init -- the
-  // hard error then surfaces in ``run_model_with_comm()`` only when
-  // multi-rank actually needs it.
+  // keep ``has_comm_artifact_=true`` and let single-rank dispatch
+  // continue working; multi-rank dispatch then fails fast at
+  // ``run_model_with_comm()`` rather than silently dropping the MPI
+  // exchange and producing wrong results.
   has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") &&
                        metadata["has_comm_artifact"].as_bool();
   if (has_comm_artifact_) {
@@ -186,11 +187,12 @@ void DeepPotPTExpt::init(const std::string& model,
                           : static_cast<c10::DeviceIndex>(-1));
     } catch (const std::exception& e) {
       std::cerr << "DeepPotPTExpt: failed to load with-comm artifact ("
-                << e.what() << "); falling back to single-rank-only dispatch."
+                << e.what()
+                << "); single-rank inference will still work, but multi-rank "
+                   "LAMMPS dispatch will throw."
                 << std::endl;
       with_comm_tempfile_.reset();
       with_comm_loader.reset();
-      has_comm_artifact_ = false;
     }
   }
 
@@ -244,9 +246,12 @@ std::vector<torch::Tensor> DeepPotPTExpt::run_model_with_comm(
     const std::vector<at::Tensor>& comm_tensors) {
   if (!with_comm_loader) {
     throw deepmd::deepmd_exception(
-        "run_model_with_comm called but the .pt2 file has no with-comm "
-        "artifact. This is a programming error: the caller should check "
-        "has_comm_artifact_ before invoking this path.");
+        "run_model_with_comm called but the with-comm artifact is not "
+        "available. Either the .pt2 file has no with-comm artifact compiled "
+        "(programming error: the caller should check has_comm_artifact_ "
+        "before invoking this path), or the artifact was present in the "
+        ".pt2 metadata but failed to load at init time (see earlier stderr "
+        "log). Multi-rank LAMMPS requires a working with-comm artifact.");
   }
   if (comm_tensors.size() != 8) {
     throw deepmd::deepmd_exception(
@@ -431,6 +436,12 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
   // tensor to gather ghost embeddings from local atoms.
   std::vector<torch::Tensor> flat_outputs;
   bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0;
+  if (use_with_comm && !with_comm_loader) {
+    throw deepmd::deepmd_exception(
+        "Multi-rank LAMMPS requires the with-comm artifact, but it failed "
+        "to load at init time. See the earlier stderr log for the underlying "
+        "error.");
+  }
   // When NULL-type atoms exist, remapped storage must outlive comm
   // tensors (the int** pointer-array tensor references it).
   std::vector<std::vector<int>> remapped_sendlist;
diff --git a/source/api_cc/src/DeepSpinPTExpt.cc b/source/api_cc/src/DeepSpinPTExpt.cc
index 9d4f072d2a..2ac4369f5f 100644
--- a/source/api_cc/src/DeepSpinPTExpt.cc
+++ b/source/api_cc/src/DeepSpinPTExpt.cc
@@ -174,7 +174,9 @@ void DeepSpinPTExpt::init(const std::string& model,
 
   // Phase 4: load the optional with-comm artifact for multi-rank GNN
   // spin inference.  Mirrors DeepPotPTExpt; see its init() comment for
-  // the rationale on the try/catch fallback.
+  // the rationale on keeping ``has_comm_artifact_=true`` on load
+  // failure so multi-rank dispatch fails fast rather than silently
+  // dropping the MPI exchange.
   has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") &&
                        metadata["has_comm_artifact"].as_bool();
   if (has_comm_artifact_) {
@@ -189,11 +191,12 @@ void DeepSpinPTExpt::init(const std::string& model,
                           : static_cast<c10::DeviceIndex>(-1));
     } catch (const std::exception& e) {
       std::cerr << "DeepSpinPTExpt: failed to load with-comm artifact ("
-                << e.what() << "); falling back to single-rank-only dispatch."
+                << e.what()
+                << "); single-rank inference will still work, but multi-rank "
+                   "LAMMPS dispatch will throw."
                 << std::endl;
       with_comm_tempfile_.reset();
       with_comm_loader.reset();
-      has_comm_artifact_ = false;
     }
   }
 
@@ -249,8 +252,11 @@ std::vector<torch::Tensor> DeepSpinPTExpt::run_model_with_comm(
     const std::vector<at::Tensor>& comm_tensors) {
   if (!with_comm_loader) {
     throw deepmd::deepmd_exception(
-        "DeepSpinPTExpt::run_model_with_comm called but the .pt2 has no "
-        "with-comm artifact.");
+        "DeepSpinPTExpt::run_model_with_comm called but the with-comm "
+        "artifact is not available. Either the .pt2 file has no with-comm "
+        "artifact compiled, or the artifact was present in the .pt2 metadata "
+        "but failed to load at init time (see earlier stderr log). Multi-rank "
+        "LAMMPS requires a working with-comm artifact.");
   }
   if (comm_tensors.size() != 8) {
     throw deepmd::deepmd_exception(
@@ -448,6 +454,12 @@ void DeepSpinPTExpt::compute(ENERGYVTYPE& ener,
   // (pre atom-doubling); the spin override halves them internally.
   std::vector<torch::Tensor> flat_outputs;
   bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0;
+  if (use_with_comm && !with_comm_loader) {
+    throw deepmd::deepmd_exception(
+        "Multi-rank LAMMPS requires the with-comm artifact, but it failed "
+        "to load at init time. See the earlier stderr log for the underlying "
+        "error.");
+  }
   std::vector<std::vector<int>> remapped_sendlist;
   std::vector<int*> remapped_sendlist_ptrs;
   std::vector<int> remapped_sendnum, remapped_recvnum;
diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc
index 32949bc339..cfe78321af 100644
--- a/source/op/pt/comm.cc
+++ b/source/op/pt/comm.cc
@@ -523,7 +523,12 @@ torch::Tensor border_op_export(const torch::Tensor& sendlist_tensor,
                        communicator_tensor, nlocal_tensor, nghost_tensor);
   // border_op returns {g1_tensor} — a list whose first element aliases
   // g1_tensor. Clone for AOTI graph-output correctness.
-  return out.empty() ? torch::empty_like(g1_tensor) : out[0].clone();
+  if (out.empty()) {
+    throw std::runtime_error(
+        "border_op_export: border_op returned an empty output list, which "
+        "indicates an internal error in the underlying border_op kernel.");
+  }
+  return out[0].clone();
 }
 
 torch::Tensor border_op_backward_export(

From 68c72a3091610ba0668a6c76d67ca3471a511697 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 9 May 2026 20:34:48 +0800
Subject: [PATCH 33/34] test(pt_expt): cover with-comm artifact load-failure
 dispatch guard

Add gtest cases that exercise the explicit ``use_with_comm &&
!with_comm_loader`` throw added to DeepPotPTExpt::compute and
DeepSpinPTExpt::compute. Fixtures: copies of deeppot_dpa3_mpi.pt2 and
deeppot_dpa3_spin_mpi.pt2 with the nested
``model/extra/forward_lower_with_comm.pt2`` entry replaced by garbage
bytes, produced by gen_corrupt_with_comm.py via zip rewrite (no AOTI
recompilation).

Each variant asserts:
- init() succeeds (catch path keeps regular artifact usable)
- single-rank compute (nswap=0) succeeds (uses regular artifact)
- multi-rank compute (nswap=1) throws deepmd::deepmd_exception
---
 .../test_with_comm_load_failure_ptexpt.cc     | 202 ++++++++++++++++++
 source/tests/infer/gen_corrupt_with_comm.py   |  67 ++++++
 2 files changed, 269 insertions(+)
 create mode 100644 source/api_cc/tests/test_with_comm_load_failure_ptexpt.cc
 create mode 100644 source/tests/infer/gen_corrupt_with_comm.py

diff --git a/source/api_cc/tests/test_with_comm_load_failure_ptexpt.cc b/source/api_cc/tests/test_with_comm_load_failure_ptexpt.cc
new file mode 100644
index 0000000000..10111a41b7
--- /dev/null
+++ b/source/api_cc/tests/test_with_comm_load_failure_ptexpt.cc
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+// Tests for the dispatch-site fail-fast guard when the with-comm AOTI
+// artifact failed to load at init time. The fixtures are produced by
+// source/tests/infer/gen_corrupt_with_comm.py: copies of the valid
+// multi-rank .pt2 archives whose nested
+// ``model/extra/forward_lower_with_comm.pt2`` entry has been replaced
+// with garbage bytes. The outer metadata still claims
+// ``has_comm_artifact: true`` so the loader exercises the catch path.
+//
+// Expectations:
+//   * init() succeeds (the loader logs and falls back instead of aborting).
+//   * Single-rank dispatch (nswap == 0) keeps working through the regular
+//     forward_lower artifact.
+//   * Multi-rank dispatch (nswap > 0) throws a deepmd::deepmd_exception
+//     instead of silently dropping the MPI ghost-embedding exchange.
+#include <gtest/gtest.h>
+
+#include <fstream>
+#include <vector>
+
+#include "DeepPot.h"
+// Include the PT_Expt headers so BUILD_PT_EXPT / BUILD_PT_EXPT_SPIN are
+// visible to the GTEST_SKIP guard below.
+#include "DeepPotPTExpt.h"
+#include "DeepSpin.h"
+#include "DeepSpinPTExpt.h"
+#include "common.h"
+#include "neighbor_list.h"
+#include "test_utils.h"
+
+namespace {
+// Corrupted fixtures written by gen_corrupt_with_comm.py. The paths are
+// relative, so they resolve against the test's working directory.
+constexpr const char* kPotCorrupt =
+    "../../tests/infer/deeppot_dpa3_mpi_corrupt_with_comm.pt2";
+constexpr const char* kSpinCorrupt =
+    "../../tests/infer/deeppot_dpa3_spin_mpi_corrupt_with_comm.pt2";
+
+// Reports whether ``path`` can be opened for reading.
+bool file_exists(const char* path) { return std::ifstream(path).good(); }
+}  // namespace
+
+// ============================================================================
+// DeepPot (non-spin) — corrupted with-comm artifact
+// ============================================================================
+
+class TestDeepPotPTExptWithCommLoadFailure : public ::testing::Test {
+ protected:
+  // Six atoms (flattened xyz), their types and the box, copied from
+  // gen_dpa3.py so the regular forward_lower artifact gets valid inputs.
+  std::vector<double> coord = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74,
+                               00.25, 3.32, 1.68, 3.36,  3.00, 1.81,
+                               3.51,  2.51, 2.60, 4.27,  3.22, 1.56};
+  std::vector<int> atype = {0, 1, 1, 0, 1, 1};
+  std::vector<double> box = {13., 0., 0., 0., 13., 0., 0., 0., 13.};
+
+  deepmd::DeepPot dp;
+
+  void SetUp() override {
+#if !defined(BUILD_PYTORCH) || !BUILD_PT_EXPT
+    GTEST_SKIP() << "Skip because PyTorch / pt_expt support is not enabled.";
+#endif
+    if (!file_exists(kPotCorrupt)) {
+      GTEST_SKIP() << "Skipping: " << kPotCorrupt
+                   << " not found. Run source/tests/infer/"
+                      "gen_corrupt_with_comm.py first.";
+    }
+    // Init must not throw: the with-comm load failure is caught internally,
+    // leaving the regular artifact usable. ASSERT aborts the test otherwise.
+    ASSERT_NO_THROW(dp.init(kPotCorrupt));
+  }
+};
+
+TEST_F(TestDeepPotPTExptWithCommLoadFailure, single_rank_compute_succeeds) {
+  // With nswap == 0 (the InputNlist default, asserted below) compute uses
+  // the regular forward_lower artifact and never consults the broken
+  // with-comm one, so it must succeed and return full-size outputs.
+  float rc = dp.cutoff();
+  int nloc = coord.size() / 3;
+  std::vector<double> coord_cpy;
+  std::vector<int> atype_cpy, mapping;
+  std::vector<std::vector<int>> nlist_data;
+  _build_nlist<double>(nlist_data, coord_cpy, atype_cpy, mapping, coord, atype,
+                       box, rc);
+  int nall = coord_cpy.size() / 3;
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  deepmd::InputNlist inlist(nloc, ilist.data(), numneigh.data(),
+                            firstneigh.data());
+  convert_nlist(inlist, nlist_data);
+  inlist.mapping = mapping.data();
+  ASSERT_EQ(inlist.nswap, 0);  // pre-condition: single-rank dispatch
+
+  double ener;
+  std::vector<double> force_, virial;
+  EXPECT_NO_THROW(dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box,
+                             nall - nloc, inlist, 0));
+  EXPECT_EQ(force_.size(), nall * 3);
+  EXPECT_EQ(virial.size(), 9);
+}
+
+TEST_F(TestDeepPotPTExptWithCommLoadFailure, multi_rank_compute_throws) {
+  // nswap > 0 selects the with-comm dispatch path. Because that artifact
+  // failed to load, the guard added by PR #5430 must throw instead of
+  // silently falling back to the single-rank path. The send/recv arrays
+  // stay null: the guard fires before any of them is dereferenced.
+  float rc = dp.cutoff();
+  int nloc = coord.size() / 3;
+  std::vector<double> coord_cpy;
+  std::vector<int> atype_cpy, mapping;
+  std::vector<std::vector<int>> nlist_data;
+  _build_nlist<double>(nlist_data, coord_cpy, atype_cpy, mapping, coord, atype,
+                       box, rc);
+  int nall = coord_cpy.size() / 3;
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  deepmd::InputNlist inlist(nloc, ilist.data(), numneigh.data(),
+                            firstneigh.data());
+  convert_nlist(inlist, nlist_data);
+  inlist.mapping = mapping.data();
+  inlist.nswap = 1;  // simulate multi-rank without populating send/recv
+
+  double ener;
+  std::vector<double> force_, virial;
+  EXPECT_THROW(dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box,
+                          nall - nloc, inlist, 0),
+               deepmd::deepmd_exception);
+}
+
+// ============================================================================
+// DeepSpin — corrupted with-comm artifact
+// ============================================================================
+
+class TestDeepSpinPTExptWithCommLoadFailure : public ::testing::Test {
+ protected:
+  std::vector<double> coord = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74,
+                               00.25, 3.32, 1.68, 3.36,  3.00, 1.81,
+                               3.51,  2.51, 2.60, 4.27,  3.22, 1.56};
+  // Match deeppot_dpa3_spin_mpi.pt2 spin layout (type 0 has spin, types
+  // 1+ do not): one xyz triple per atom, all-zero for the non-spin atoms.
+  std::vector<double> spin = {0.13, 0.02, 0.03, 0., 0., 0., 0., 0., 0.,
+                              0.14, 0.10, 0.12, 0., 0., 0., 0., 0., 0.};
+  std::vector<int> atype = {0, 1, 1, 0, 1, 1};
+  std::vector<double> box = {13., 0., 0., 0., 13., 0., 0., 0., 13.};
+
+  deepmd::DeepSpin dp;
+
+  void SetUp() override {
+#if !defined(BUILD_PYTORCH) || !BUILD_PT_EXPT_SPIN
+    GTEST_SKIP() << "Skip because PyTorch / pt_expt spin support is not "
+                    "enabled.";
+#endif
+    if (!file_exists(kSpinCorrupt)) {
+      GTEST_SKIP() << "Skipping: " << kSpinCorrupt
+                   << " not found. Run source/tests/infer/"
+                      "gen_corrupt_with_comm.py first.";
+    }
+    ASSERT_NO_THROW(dp.init(kSpinCorrupt));
+  }
+};
+
+TEST_F(TestDeepSpinPTExptWithCommLoadFailure, single_rank_compute_succeeds) {
+  // NoPBC (empty box) with a hardcoded all-pairs nlist, mirroring the
+  // ``cpu_lmp_nlist`` pattern in test_deeppot_dpa_ptexpt_spin.cc:
+  // nloc == natoms == nall, so nghost is 0 and nswap keeps its default 0.
+  const int natoms = static_cast<int>(atype.size());
+  std::vector<double> empty_box;
+  std::vector<std::vector<int>> nlist_data = {{1, 2, 3, 4, 5}, {0, 2, 3, 4, 5},
+                                              {0, 1, 3, 4, 5}, {0, 1, 2, 4, 5},
+                                              {0, 1, 2, 3, 5}, {0, 1, 2, 3, 4}};
+  std::vector<int> ilist(natoms), numneigh(natoms);
+  std::vector<int*> firstneigh(natoms);
+  deepmd::InputNlist inlist(natoms, ilist.data(), numneigh.data(),
+                            firstneigh.data());
+  convert_nlist(inlist, nlist_data);
+  ASSERT_EQ(inlist.nswap, 0);
+
+  double ener;
+  std::vector<double> force_, force_mag, virial;
+  EXPECT_NO_THROW(dp.compute(ener, force_, force_mag, virial, coord, spin,
+                             atype, empty_box, 0, inlist, 0));
+}
+
+TEST_F(TestDeepSpinPTExptWithCommLoadFailure, multi_rank_compute_throws) {
+  const int natoms = static_cast<int>(atype.size());
+  std::vector<double> empty_box;
+  std::vector<std::vector<int>> nlist_data = {{1, 2, 3, 4, 5}, {0, 2, 3, 4, 5},
+                                              {0, 1, 3, 4, 5}, {0, 1, 2, 4, 5},
+                                              {0, 1, 2, 3, 5}, {0, 1, 2, 3, 4}};
+  std::vector<int> ilist(natoms), numneigh(natoms);
+  std::vector<int*> firstneigh(natoms);
+  deepmd::InputNlist inlist(natoms, ilist.data(), numneigh.data(),
+                            firstneigh.data());
+  convert_nlist(inlist, nlist_data);
+  inlist.nswap = 1;  // simulate multi-rank without populating send/recv
+  // The load-failure guard must make compute throw, not fall back.
+  double ener;
+  std::vector<double> force_, force_mag, virial;
+  EXPECT_THROW(dp.compute(ener, force_, force_mag, virial, coord, spin, atype,
+                          empty_box, 0, inlist, 0),
+               deepmd::deepmd_exception);
+}
diff --git a/source/tests/infer/gen_corrupt_with_comm.py b/source/tests/infer/gen_corrupt_with_comm.py
new file mode 100644
index 0000000000..ff0d16158c
--- /dev/null
+++ b/source/tests/infer/gen_corrupt_with_comm.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Generate ``deeppot_*_corrupt_with_comm.pt2`` fixtures.
+
+The fixtures are copies of the corresponding multi-rank ``.pt2`` archives
+in which the nested ``model/extra/forward_lower_with_comm.pt2`` entry has
+been overwritten with garbage bytes. The outer metadata still claims
+``has_comm_artifact: true``, so:
+
+- ``DeepPotPTExpt::init`` / ``DeepSpinPTExpt::init`` exercise the
+  try/catch fallback path on the with-comm AOTI loader.
+- Single-rank dispatch (``nswap == 0``) keeps working via the regular
+  artifact.
+- Multi-rank dispatch (``nswap > 0``) hits the explicit dispatch-site
+  throw added in PR #5430, instead of silently dropping the MPI
+  ghost-embedding exchange.
+
+Consumed by ``source/api_cc/tests/test_with_comm_load_failure_ptexpt.cc``.
+"""
+
+import os
+import zipfile
+
+WITH_COMM_ENTRY = "model/extra/forward_lower_with_comm.pt2"  # zip member to corrupt
+GARBAGE = b"NOT_A_VALID_AOTI_ARCHIVE_" * 32  # bytes written in its place
+
+
+def corrupt_with_comm(src: str, dst: str) -> None:
+    """Copy ``src`` to ``dst`` with the nested with-comm entry replaced.
+
+    Raises ``RuntimeError`` before ``dst`` is created when ``src`` lacks the
+    entry, so a failed run cannot leave an uncorrupted fixture on disk.
+    """
+    with zipfile.ZipFile(src, "r") as zin:
+        if WITH_COMM_ENTRY not in zin.namelist():
+            raise RuntimeError(
+                f"{src} does not contain {WITH_COMM_ENTRY}; cannot corrupt."
+            )
+        with zipfile.ZipFile(dst, "w", compression=zipfile.ZIP_STORED) as zout:
+            for info in zin.infolist():
+                # Reusing ``info`` keeps each member's name and metadata.
+                corrupt = info.filename == WITH_COMM_ENTRY
+                data = GARBAGE if corrupt else zin.read(info.filename)
+                zout.writestr(info, data)
+
+
+def main() -> None:
+    """Write a corrupted copy of every source archive that is present."""
+    here = os.path.dirname(__file__)
+    # Source archive name -> corrupted fixture name; both live next to this
+    # script, and a missing source is reported and skipped, not an error.
+    fixtures = {
+        "deeppot_dpa3_mpi.pt2": "deeppot_dpa3_mpi_corrupt_with_comm.pt2",
+        "deeppot_dpa3_spin_mpi.pt2": "deeppot_dpa3_spin_mpi_corrupt_with_comm.pt2",
+    }
+    for src_name, dst_name in fixtures.items():
+        src_path = os.path.join(here, src_name)
+        if os.path.exists(src_path):
+            dst_path = os.path.join(here, dst_name)
+            corrupt_with_comm(src_path, dst_path)
+            print(f"Wrote {dst_path}")  # noqa: T201
+        else:
+            print(f"Skipping {dst_name}: source {src_name} not found.")  # noqa: T201
+
+
+if __name__ == "__main__":
+    main()

From 5359abc4f8319a8fe42bd1007aa0fbe2e530738a Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Wed, 13 May 2026 10:33:48 +0800
Subject: [PATCH 34/34] chore: silence CodeQL alerts on PR #5430

- Python (5x py/unused-import): add `# lgtm[py/unused-import]` to the
  side-effect imports of `deepmd.pt_expt.utils.comm` (which register
  the deepmd_export::border_op fake/autograd metadata via decorators).
- Python (1x py/unnecessary-lambda): replace `lambda: _se_e2_a_child()`
  with `_se_e2_a_child` in test_hybrid.py parametrize table (the other
  two entries keep their lambdas because they pass kwargs).
- C++ (6x cpp/unused-static-function): annotate border_op_export and
  border_op_backward_export with `DEEPMD_MAYBE_UNUSED`, a macro that
  expands to `[[maybe_unused]]` under C++17 and to nothing under C++14
  (the fallback when older torch < 2.1 forces the legacy standard).
  CodeQL doesn't see through TORCH_LIBRARY_IMPL's function-pointer
  registration; the attribute documents that this is intentional.
---
 source/op/pt/comm.cc                          | 33 +++++++++++++------
 .../tests/pt_expt/descriptor/test_hybrid.py   |  2 +-
 .../descriptor/test_repflow_parallel.py       |  2 +-
 .../descriptor/test_repformer_parallel.py     |  2 +-
 .../pt_expt/model/test_export_with_comm.py    |  2 +-
 .../model/test_spin_export_with_comm.py       |  2 +-
 .../pt_expt/utils/test_border_op_backward.py  |  2 +-
 7 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc
index cfe78321af..31691d5e7d 100644
--- a/source/op/pt/comm.cc
+++ b/source/op/pt/comm.cc
@@ -509,15 +509,27 @@ TORCH_LIBRARY_FRAGMENT(deepmd, m) {
 // ============================================================================
 
 namespace {
-torch::Tensor border_op_export(const torch::Tensor& sendlist_tensor,
-                               const torch::Tensor& sendproc_tensor,
-                               const torch::Tensor& recvproc_tensor,
-                               const torch::Tensor& sendnum_tensor,
-                               const torch::Tensor& recvnum_tensor,
-                               const torch::Tensor& g1_tensor,
-                               const torch::Tensor& communicator_tensor,
-                               const torch::Tensor& nlocal_tensor,
-                               const torch::Tensor& nghost_tensor) {
+// ``DEEPMD_MAYBE_UNUSED`` silences CodeQL's ``cpp/unused-static-function``
+// query — the functions ARE used: ``TORCH_LIBRARY_IMPL(...)`` below
+// registers them as op implementations via function-pointer arguments,
+// which CodeQL's static dataflow can't see through. The attribute is
+// C++17, so guard it for the legacy-torch (< 2.1) build path which
+// CMakeLists.txt holds at C++14.
+#if __cplusplus >= 201703L
+#define DEEPMD_MAYBE_UNUSED [[maybe_unused]]
+#else
+#define DEEPMD_MAYBE_UNUSED
+#endif
+DEEPMD_MAYBE_UNUSED torch::Tensor border_op_export(
+    const torch::Tensor& sendlist_tensor,
+    const torch::Tensor& sendproc_tensor,
+    const torch::Tensor& recvproc_tensor,
+    const torch::Tensor& sendnum_tensor,
+    const torch::Tensor& recvnum_tensor,
+    const torch::Tensor& g1_tensor,
+    const torch::Tensor& communicator_tensor,
+    const torch::Tensor& nlocal_tensor,
+    const torch::Tensor& nghost_tensor) {
   auto out = border_op(sendlist_tensor, sendproc_tensor, recvproc_tensor,
                        sendnum_tensor, recvnum_tensor, g1_tensor,
                        communicator_tensor, nlocal_tensor, nghost_tensor);
@@ -531,7 +543,7 @@ torch::Tensor border_op_export(const torch::Tensor& sendlist_tensor,
   return out[0].clone();
 }
 
-torch::Tensor border_op_backward_export(
+DEEPMD_MAYBE_UNUSED torch::Tensor border_op_backward_export(
     const torch::Tensor& sendlist_tensor,
     const torch::Tensor& sendproc_tensor,
     const torch::Tensor& recvproc_tensor,
@@ -547,6 +559,7 @@ torch::Tensor border_op_backward_export(
       .clone();
 }
 }  // namespace
+#undef DEEPMD_MAYBE_UNUSED
 
 TORCH_LIBRARY_FRAGMENT(deepmd_export, m) {
   m.def(
diff --git a/source/tests/pt_expt/descriptor/test_hybrid.py b/source/tests/pt_expt/descriptor/test_hybrid.py
index 86575180c7..b45a7bea19 100644
--- a/source/tests/pt_expt/descriptor/test_hybrid.py
+++ b/source/tests/pt_expt/descriptor/test_hybrid.py
@@ -324,7 +324,7 @@ def _dpa3_child(use_loc_mapping: bool) -> dict:
 @pytest.mark.parametrize(
     "child_factory,expected_hmp,expected_hmp_ar",
     [
-        (lambda: _se_e2_a_child(), False, False),
+        (_se_e2_a_child, False, False),
         (lambda: _dpa3_child(use_loc_mapping=True), True, False),
         (lambda: _dpa3_child(use_loc_mapping=False), True, True),
     ],
diff --git a/source/tests/pt_expt/descriptor/test_repflow_parallel.py b/source/tests/pt_expt/descriptor/test_repflow_parallel.py
index f5c68fabed..f5b4d40bcd 100644
--- a/source/tests/pt_expt/descriptor/test_repflow_parallel.py
+++ b/source/tests/pt_expt/descriptor/test_repflow_parallel.py
@@ -32,7 +32,7 @@
 import torch
 
 # Trigger registration of the deepmd_export::border_op opaque wrapper.
-import deepmd.pt_expt.utils.comm  # noqa: F401
+import deepmd.pt_expt.utils.comm  # noqa: F401  # lgtm[py/unused-import]
 from deepmd.dpmodel.descriptor.dpa3 import (
     RepFlowArgs,
 )
diff --git a/source/tests/pt_expt/descriptor/test_repformer_parallel.py b/source/tests/pt_expt/descriptor/test_repformer_parallel.py
index 24e6e6ce33..1a6413d08f 100644
--- a/source/tests/pt_expt/descriptor/test_repformer_parallel.py
+++ b/source/tests/pt_expt/descriptor/test_repformer_parallel.py
@@ -18,7 +18,7 @@
 import torch
 
 # Trigger registration of the deepmd_export::border_op opaque wrapper.
-import deepmd.pt_expt.utils.comm  # noqa: F401
+import deepmd.pt_expt.utils.comm  # noqa: F401  # lgtm[py/unused-import]
 from deepmd.dpmodel.descriptor.dpa2 import (
     RepformerArgs,
     RepinitArgs,
diff --git a/source/tests/pt_expt/model/test_export_with_comm.py b/source/tests/pt_expt/model/test_export_with_comm.py
index f338397639..dcbc628e53 100644
--- a/source/tests/pt_expt/model/test_export_with_comm.py
+++ b/source/tests/pt_expt/model/test_export_with_comm.py
@@ -35,7 +35,7 @@
 
 # Trigger registration of the deepmd_export::border_op opaque wrapper
 # (needed by the with-comm artifact at runtime).
-import deepmd.pt_expt.utils.comm  # noqa: F401
+import deepmd.pt_expt.utils.comm  # noqa: F401  # lgtm[py/unused-import]
 from deepmd.pt_expt.model.get_model import (
     get_model,
 )
diff --git a/source/tests/pt_expt/model/test_spin_export_with_comm.py b/source/tests/pt_expt/model/test_spin_export_with_comm.py
index f77c9fe415..0e403d2b42 100644
--- a/source/tests/pt_expt/model/test_spin_export_with_comm.py
+++ b/source/tests/pt_expt/model/test_spin_export_with_comm.py
@@ -30,7 +30,7 @@
 import numpy as np
 import torch
 
-import deepmd.pt_expt.utils.comm  # noqa: F401  - opaque op registration
+import deepmd.pt_expt.utils.comm  # noqa: F401  # lgtm[py/unused-import]  - opaque op registration
 from deepmd.dpmodel.model.model import get_model as get_model_dp
 from deepmd.pt_expt.model.spin_ener_model import (
     SpinEnergyModel,
diff --git a/source/tests/pt_expt/utils/test_border_op_backward.py b/source/tests/pt_expt/utils/test_border_op_backward.py
index aeaf491cb2..b33e575f1a 100644
--- a/source/tests/pt_expt/utils/test_border_op_backward.py
+++ b/source/tests/pt_expt/utils/test_border_op_backward.py
@@ -33,7 +33,7 @@
 # comm self-bootstraps the underlying libdeepmd_op_pt.so when needed, so
 # this single side-effect import is enough to register both the C++
 # ops (deepmd::border_op_backward) and their fake/autograd metadata.
-import deepmd.pt_expt.utils.comm  # noqa: F401  - registers deepmd_export::border_op
+import deepmd.pt_expt.utils.comm  # noqa: F401  # lgtm[py/unused-import]  - registers deepmd_export::border_op
 
 
 def _addr_of(np_arr: np.ndarray) -> int: