From ae35ea8a513443501093c1e714ae7df7e6839ff8 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sat, 25 Apr 2026 23:55:14 +0800 Subject: [PATCH 01/34] refactor(dpmodel): plumb comm_dict and extract _exchange_ghosts hook Lifts the per-layer node_ebd_ext gather inside DescrptBlockRepflows.call and DescrptBlockRepformers.call into a new _exchange_ghosts(...) method so subclasses can override it. Default behaviour is byte-identical to before for non-parallel inference (comm_dict is None). Threads an optional comm_dict kwarg through: - make_model.call_common_lower / forward_common_atomic - {base,dp,linear,pairtab}_atomic_model - dpa1/dpa2/dpa3/hybrid/se_* descriptors - repflows/repformers blocks Non-GNN descriptors accept and ignore comm_dict (noqa-marked unused). DPA2 routes around its pre-block gather when comm_dict is supplied so the repformers' per-layer override drives ghost exchange instead. This is the dpmodel-side groundwork for pt_expt multi-rank LAMMPS support; default behaviour unchanged. 
--- .../dpmodel/atomic_model/base_atomic_model.py | 5 +++ .../dpmodel/atomic_model/dp_atomic_model.py | 5 +++ .../atomic_model/linear_atomic_model.py | 6 +++ .../atomic_model/pairtab_atomic_model.py | 2 + deepmd/dpmodel/descriptor/dpa1.py | 1 + deepmd/dpmodel/descriptor/dpa2.py | 22 +++++++-- deepmd/dpmodel/descriptor/dpa3.py | 5 +++ deepmd/dpmodel/descriptor/hybrid.py | 5 ++- deepmd/dpmodel/descriptor/repflows.py | 45 ++++++++++++++++--- deepmd/dpmodel/descriptor/repformers.py | 42 +++++++++++++++-- deepmd/dpmodel/descriptor/se_e2_a.py | 1 + deepmd/dpmodel/descriptor/se_r.py | 1 + deepmd/dpmodel/descriptor/se_t.py | 1 + deepmd/dpmodel/descriptor/se_t_tebd.py | 1 + deepmd/dpmodel/model/make_model.py | 9 ++++ 15 files changed, 137 insertions(+), 14 deletions(-) diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py index 1120078bb2..debddba6e7 100644 --- a/deepmd/dpmodel/atomic_model/base_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py @@ -231,6 +231,7 @@ def forward_common_atomic( mapping: Array | None = None, fparam: Array | None = None, aparam: Array | None = None, + comm_dict: dict | None = None, ) -> dict[str, Array]: """Common interface for atomic inference. @@ -252,6 +253,9 @@ def forward_common_atomic( frame parameters, shape: nf x dim_fparam aparam atomic parameter, shape: nf x nloc x dim_aparam + comm_dict + MPI communication metadata for parallel inference. ``None`` for + non-parallel inference (default). 
Returns ------- @@ -279,6 +283,7 @@ def forward_common_atomic( mapping=mapping, fparam=fparam, aparam=aparam, + comm_dict=comm_dict, ) ret_dict = self.apply_out_stat(ret_dict, atype) diff --git a/deepmd/dpmodel/atomic_model/dp_atomic_model.py b/deepmd/dpmodel/atomic_model/dp_atomic_model.py index 466e3ddd95..0505f63d83 100644 --- a/deepmd/dpmodel/atomic_model/dp_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/dp_atomic_model.py @@ -157,6 +157,7 @@ def forward_atomic( mapping: Array | None = None, fparam: Array | None = None, aparam: Array | None = None, + comm_dict: dict | None = None, ) -> dict[str, Array]: """Models' atomic predictions. @@ -174,6 +175,9 @@ def forward_atomic( frame parameter. nf x ndf aparam atomic parameter. nf x nloc x nda + comm_dict + MPI communication metadata for parallel inference. ``None`` for + non-parallel inference (default). Forwarded to the descriptor. Returns ------- @@ -215,6 +219,7 @@ def forward_atomic( nlist, mapping=mapping, fparam=fparam_input_for_des if self.add_chg_spin_ebd else None, + comm_dict=comm_dict, ) ret = self.fitting_net( descriptor, diff --git a/deepmd/dpmodel/atomic_model/linear_atomic_model.py b/deepmd/dpmodel/atomic_model/linear_atomic_model.py index 3ed9077df7..05ff8499f8 100644 --- a/deepmd/dpmodel/atomic_model/linear_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/linear_atomic_model.py @@ -224,6 +224,7 @@ def forward_atomic( mapping: Array | None = None, fparam: Array | None = None, aparam: Array | None = None, + comm_dict: dict | None = None, ) -> dict[str, Array]: """Return atomic prediction. @@ -241,6 +242,10 @@ def forward_atomic( frame parameter. (nframes, ndf) aparam atomic parameter. (nframes, nloc, nda) + comm_dict + MPI communication metadata. Forwarded to each sub-model so GNN + sub-descriptors can perform parallel ghost exchange. ``None`` for + non-parallel inference (default). 
Returns ------- @@ -280,6 +285,7 @@ def forward_atomic( mapping, fparam, aparam, + comm_dict, )["energy"] ) weights = self._compute_weight(extended_coord, extended_atype, nlists_) diff --git a/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py b/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py index 51c370eca0..c1ec9d2a00 100644 --- a/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py @@ -253,7 +253,9 @@ def forward_atomic( mapping: Array | None = None, fparam: Array | None = None, aparam: Array | None = None, + comm_dict: dict | None = None, ) -> dict[str, Array]: + del comm_dict # pairtab is local; no MPI ghost exchange needed. xp = array_api_compat.array_namespace(extended_coord, extended_atype, nlist) nframes, nloc, nnei = nlist.shape extended_coord = xp.reshape(extended_coord, (nframes, -1, 3)) diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py index bc2a04a836..9d138f422a 100644 --- a/deepmd/dpmodel/descriptor/dpa1.py +++ b/deepmd/dpmodel/descriptor/dpa1.py @@ -500,6 +500,7 @@ def call( nlist: Array, mapping: Array | None = None, fparam: Array | None = None, + comm_dict: dict | None = None, ) -> Array: """Compute the descriptor. diff --git a/deepmd/dpmodel/descriptor/dpa2.py b/deepmd/dpmodel/descriptor/dpa2.py index 2fa765f04b..851422cce0 100644 --- a/deepmd/dpmodel/descriptor/dpa2.py +++ b/deepmd/dpmodel/descriptor/dpa2.py @@ -831,6 +831,7 @@ def call( nlist: Array, mapping: Array | None = None, fparam: Array | None = None, + comm_dict: dict | None = None, ) -> tuple[Array, Array, Array, Array, Array]: """Compute the descriptor. @@ -844,6 +845,11 @@ def call( The neighbor list. shape: nf x nloc x nnei mapping The index mapping, maps extended region index to local region. + comm_dict + MPI communication metadata for parallel inference. Forwarded to + the repformer block (the message-passing part). 
The repinit + sub-block does no message passing and does not receive it. + ``None`` for non-parallel inference (default). Returns ------- @@ -912,9 +918,18 @@ def call( assert self.tebd_transform is not None g1 = g1 + self.tebd_transform(g1_inp) # mapping g1 - assert mapping is not None - mapping_ext = xp.tile(xp.expand_dims(mapping, axis=-1), (1, 1, g1.shape[-1])) - g1_ext = xp_take_along_axis(g1, mapping_ext, axis=1) + if comm_dict is None: + # non-parallel: gather g1 -> g1_ext via mapping, hand the + # nall-sized embedding to the repformer block. + assert mapping is not None + mapping_ext = xp.tile( + xp.expand_dims(mapping, axis=-1), (1, 1, g1.shape[-1]) + ) + g1_ext = xp_take_along_axis(g1, mapping_ext, axis=1) + else: + # parallel mode: hand the local-only g1 to the repformer block; + # its per-layer override fills ghosts via the MPI exchange. + g1_ext = g1 # repformer g1, g2, h2, rot_mat, sw = self.repformers( nlist_dict[ @@ -926,6 +941,7 @@ def call( atype_ext, g1_ext, mapping, + comm_dict=comm_dict, ) if self.concat_output_tebd: g1 = xp.concat([g1, g1_inp], axis=-1) diff --git a/deepmd/dpmodel/descriptor/dpa3.py b/deepmd/dpmodel/descriptor/dpa3.py index 5f5aea50e5..07d5481a91 100644 --- a/deepmd/dpmodel/descriptor/dpa3.py +++ b/deepmd/dpmodel/descriptor/dpa3.py @@ -616,6 +616,7 @@ def call( nlist: Array, mapping: Array | None = None, fparam: Array | None = None, + comm_dict: dict | None = None, ) -> tuple[Array, Array, Array, Array, Array]: """Compute the descriptor. @@ -629,6 +630,9 @@ def call( The neighbor list. shape: nf x nloc x nnei mapping The index mapping, mapps extended region index to local region. + comm_dict + MPI communication metadata for parallel inference. Forwarded to + the repflows block. ``None`` for non-parallel inference (default). 
Returns ------- @@ -695,6 +699,7 @@ def call( atype_ext, node_ebd_ext, mapping, + comm_dict=comm_dict, ) if self.concat_output_tebd: node_ebd = xp.concat([node_ebd, node_ebd_inp], axis=-1) diff --git a/deepmd/dpmodel/descriptor/hybrid.py b/deepmd/dpmodel/descriptor/hybrid.py index b15fbc15d2..512a753d25 100644 --- a/deepmd/dpmodel/descriptor/hybrid.py +++ b/deepmd/dpmodel/descriptor/hybrid.py @@ -276,6 +276,7 @@ def call( nlist: Array, mapping: Array | None = None, fparam: Array | None = None, + comm_dict: dict | None = None, ) -> tuple[ Array, Array | None, @@ -332,7 +333,9 @@ def call( # mixed_types is True, but descrpt.mixed_types is False assert nl_distinguish_types is not None nl = nl_distinguish_types[:, :, nci] - odescriptor, gr, g2, h2, sw = descrpt(coord_ext, atype_ext, nl, mapping) + odescriptor, gr, g2, h2, sw = descrpt( + coord_ext, atype_ext, nl, mapping, comm_dict=comm_dict + ) out_descriptor.append(odescriptor) if gr is not None: out_gr.append(gr) diff --git a/deepmd/dpmodel/descriptor/repflows.py b/deepmd/dpmodel/descriptor/repflows.py index 30637dc75a..2dd64448b2 100644 --- a/deepmd/dpmodel/descriptor/repflows.py +++ b/deepmd/dpmodel/descriptor/repflows.py @@ -506,6 +506,27 @@ def reinit_exclude( self.exclude_types = exclude_types self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types) + def _exchange_ghosts( + self, + node_ebd: Array, + mapping_tiled: Array | None, + comm_dict: dict | None, + nall: int, + nloc: int, + ) -> Array: + """Build node_ebd_ext (the ghost-aware embedding) for the per-layer loop. + + Default: array-api gather via the pre-tiled `mapping_tiled`, or pass the + local-only `node_ebd` through when ``self.use_loc_mapping`` is set. + ``comm_dict``, ``nall``, ``nloc`` are unused in this default impl; they + exist so the pt_expt subclass can perform the per-layer MPI ghost + exchange (``deepmd_export::border_op``) when ``comm_dict is not None``. 
+ """ + del comm_dict, nall, nloc + if self.use_loc_mapping: + return node_ebd + return xp_take_along_axis(node_ebd, mapping_tiled, axis=1) + def call( self, nlist: Array, @@ -514,6 +535,7 @@ def call( atype_embd_ext: Array | None = None, mapping: Array | None = None, type_embedding: Array | None = None, + comm_dict: dict | None = None, ) -> tuple[Array, Array, Array, Array, Array]: xp = array_api_compat.array_namespace(nlist, coord_ext, atype_ext) nframes, nloc, nnei = nlist.shape @@ -641,15 +663,24 @@ def call( # nf x nloc x a_nnei x a_nnei x a_dim [OR] n_angle x a_dim angle_ebd = self.angle_embd(angle_input) - # nb x nall x n_dim - mapping = xp.tile(xp.expand_dims(mapping, axis=-1), (1, 1, self.n_dim)) + # nb x nall x n_dim (pre-tiled mapping reused across layers when not + # using comm_dict). Skip the tile when mapping is None — pt_expt's + # parallel-mode override consults comm_dict instead. + mapping_tiled = ( + xp.tile(xp.expand_dims(mapping, axis=-1), (1, 1, self.n_dim)) + if mapping is not None + else None + ) for idx, ll in enumerate(self.layers): # node_ebd: nb x nloc x n_dim - # node_ebd_ext: nb x nall x n_dim - node_ebd_ext = ( - node_ebd - if self.use_loc_mapping - else xp_take_along_axis(node_ebd, mapping, axis=1) + # node_ebd_ext: nb x nall x n_dim (or nb x nloc x n_dim when + # use_loc_mapping=True) + node_ebd_ext = self._exchange_ghosts( + node_ebd, + mapping_tiled, + comm_dict, + nall, + nloc, ) node_ebd, edge_ebd, angle_ebd = ll.call( node_ebd_ext, diff --git a/deepmd/dpmodel/descriptor/repformers.py b/deepmd/dpmodel/descriptor/repformers.py index 5881b3a0b3..3891c57c7d 100644 --- a/deepmd/dpmodel/descriptor/repformers.py +++ b/deepmd/dpmodel/descriptor/repformers.py @@ -480,6 +480,26 @@ def reinit_exclude( self.exclude_types = exclude_types self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types) + def _exchange_ghosts( + self, + g1: Array, + mapping_tiled: Array | None, + comm_dict: dict | None, + nall: int, + nloc: int, + ) -> 
Array: + """Build g1_ext (the ghost-aware single-atom embedding) for the + per-layer loop. + + Default: array-api gather via the pre-tiled ``mapping_tiled``. + ``comm_dict``, ``nall``, ``nloc`` are unused in this default impl; + they exist so the pt_expt subclass can perform the per-layer MPI + ghost exchange (``deepmd_export::border_op``) when ``comm_dict is + not None``. + """ + del comm_dict, nall, nloc + return xp_take_along_axis(g1, mapping_tiled, axis=1) + def call( self, nlist: Array, @@ -488,6 +508,7 @@ def call( atype_embd_ext: Array | None = None, mapping: Array | None = None, type_embedding: Array | None = None, + comm_dict: dict | None = None, ) -> Array: xp = array_api_compat.array_namespace(nlist, coord_ext, atype_ext) exclude_mask = self.emask.build_type_exclude_mask(nlist, atype_ext) @@ -524,12 +545,27 @@ def call( # set all padding positions to index of 0 # if a neighbor is real or not is indicated by nlist_mask nlist = xp.where(nlist == -1, xp.zeros_like(nlist), nlist) - # nf x nall x ng1 - mapping = xp.tile(xp.expand_dims(mapping, axis=-1), (1, 1, self.g1_dim)) + # nall computed for the pt_expt parallel-mode override (uses nall to + # size the pad before MPI ghost exchange). dpmodel default ignores it. + nall = xp.reshape(coord_ext, (nf, -1)).shape[1] // 3 + # nf x nall x ng1 (pre-tiled mapping reused across layers when not + # using comm_dict). Skip the tile when mapping is None — pt_expt's + # parallel-mode override consults comm_dict instead. 
+ mapping_tiled = ( + xp.tile(xp.expand_dims(mapping, axis=-1), (1, 1, self.g1_dim)) + if mapping is not None + else None + ) for idx, ll in enumerate(self.layers): # g1: nf x nloc x ng1 # g1_ext: nf x nall x ng1 - g1_ext = xp_take_along_axis(g1, mapping, axis=1) + g1_ext = self._exchange_ghosts( + g1, + mapping_tiled, + comm_dict, + nall, + nloc, + ) g1, g2, h2 = ll.call( g1_ext, g2, diff --git a/deepmd/dpmodel/descriptor/se_e2_a.py b/deepmd/dpmodel/descriptor/se_e2_a.py index 8997412325..6c20699c23 100644 --- a/deepmd/dpmodel/descriptor/se_e2_a.py +++ b/deepmd/dpmodel/descriptor/se_e2_a.py @@ -399,6 +399,7 @@ def call( nlist: Array, mapping: Array | None = None, fparam: Array | None = None, + comm_dict: dict | None = None, ) -> Array: """Compute the descriptor. diff --git a/deepmd/dpmodel/descriptor/se_r.py b/deepmd/dpmodel/descriptor/se_r.py index b5ba7a282f..55a774bb71 100644 --- a/deepmd/dpmodel/descriptor/se_r.py +++ b/deepmd/dpmodel/descriptor/se_r.py @@ -371,6 +371,7 @@ def call( nlist: Array, mapping: Array | None = None, fparam: Array | None = None, + comm_dict: dict | None = None, ) -> Array: """Compute the descriptor. diff --git a/deepmd/dpmodel/descriptor/se_t.py b/deepmd/dpmodel/descriptor/se_t.py index e599669068..38eb7cc16c 100644 --- a/deepmd/dpmodel/descriptor/se_t.py +++ b/deepmd/dpmodel/descriptor/se_t.py @@ -346,6 +346,7 @@ def call( nlist: Array, mapping: Array | None = None, fparam: Array | None = None, + comm_dict: dict | None = None, ) -> tuple[Array, Array]: """Compute the descriptor. diff --git a/deepmd/dpmodel/descriptor/se_t_tebd.py b/deepmd/dpmodel/descriptor/se_t_tebd.py index 2d36994d61..445260b861 100644 --- a/deepmd/dpmodel/descriptor/se_t_tebd.py +++ b/deepmd/dpmodel/descriptor/se_t_tebd.py @@ -354,6 +354,7 @@ def call( nlist: Array, mapping: Array | None = None, fparam: Array | None = None, + comm_dict: dict | None = None, ) -> tuple[Array, Array]: """Compute the descriptor. 
diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py index 597f8ea006..d57b0b3790 100644 --- a/deepmd/dpmodel/model/make_model.py +++ b/deepmd/dpmodel/model/make_model.py @@ -326,6 +326,7 @@ def call_common_lower( aparam: Array | None = None, do_atomic_virial: bool = False, extended_coord_corr: Array | None = None, + comm_dict: dict | None = None, ) -> dict[str, Array]: """Return model prediction. Lower interface that takes extended atomic coordinates and types, nlist, and mapping @@ -351,6 +352,11 @@ def call_common_lower( extended_coord_corr coordinates correction for virial in extended region. nf x (nall x 3) + comm_dict + MPI communication metadata for parallel inference (e.g. + LAMMPS multi-rank). Carries send/recv lists, processor IDs, + the MPI communicator handle, and per-rank nlocal/nghost. + ``None`` for non-parallel inference (default). Returns ------- @@ -379,6 +385,7 @@ def call_common_lower( aparam=ap, do_atomic_virial=do_atomic_virial, extended_coord_corr=extended_coord_corr, + comm_dict=comm_dict, ) model_predict = self._output_type_cast(model_predict, input_prec) return model_predict @@ -393,6 +400,7 @@ def forward_common_atomic( aparam: Array | None = None, do_atomic_virial: bool = False, extended_coord_corr: Array | None = None, + comm_dict: dict | None = None, ) -> dict[str, Array]: atomic_ret = self.atomic_model.forward_common_atomic( extended_coord, @@ -401,6 +409,7 @@ def forward_common_atomic( mapping=mapping, fparam=fparam, aparam=aparam, + comm_dict=comm_dict, ) return fit_output_to_model_output( atomic_ret, From bfe650f70e821ee4f2b38ca5f9c0c8501d42b33e Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 26 Apr 2026 00:03:39 +0800 Subject: [PATCH 02/34] feat(op): expose deepmd::border_op_backward as a standalone op Refactors Border::backward into a free function take/return interface (positional comm tensors + grad_g1, returns grad_in) and registers it as ``torch.ops.deepmd.border_op_backward``. 
The autograd Function's backward delegates to the new symbol so existing pt-backend behaviour is unchanged; the new symbol is what pt_expt's opaque op wrapper (``deepmd_export::border_op``) dispatches to from its ``register_autograd`` callback. The standalone op is needed because the ``custom_op`` API requires the backward to be expressible as a registered op (it cannot reference the autograd Function directly), and AOTInductor must serialise the call into the compiled .pt2. --- source/op/pt/comm.cc | 128 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 108 insertions(+), 20 deletions(-) diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc index 97466a4833..9dd9b50c3b 100644 --- a/source/op/pt/comm.cc +++ b/source/op/pt/comm.cc @@ -174,17 +174,6 @@ class Border : public torch::autograd::Function { static torch::autograd::variable_list backward( torch::autograd::AutogradContext* ctx, torch::autograd::variable_list grad_output) { - bool type_flag = (grad_output[0].dtype() == torch::kDouble) ? 
true : false; - if (type_flag) { - return backward_t(ctx, grad_output); - } else { - return backward_t(ctx, grad_output); - } - } - template - static torch::autograd::variable_list backward_t( - torch::autograd::AutogradContext* ctx, - torch::autograd::variable_list grad_output) { torch::autograd::variable_list saved_variables = ctx->get_saved_variables(); torch::Tensor sendlist_tensor = saved_variables[0]; torch::Tensor sendproc_tensor = saved_variables[1]; @@ -194,8 +183,41 @@ class Border : public torch::autograd::Function { torch::Tensor communicator_tensor = saved_variables[5]; torch::Tensor nlocal_tensor = saved_variables[6]; torch::Tensor nghost_tensor = saved_variables[7]; + torch::Tensor d_in = border_op_backward_dispatch( + sendlist_tensor, sendproc_tensor, recvproc_tensor, sendnum_tensor, + recvnum_tensor, grad_output[0], communicator_tensor, nlocal_tensor, + nghost_tensor); + return {torch::Tensor(), torch::Tensor(), torch::Tensor(), + torch::Tensor(), torch::Tensor(), d_in, + torch::Tensor(), torch::Tensor(), torch::Tensor(), + torch::Tensor()}; + } - torch::Tensor d_local_g1_tensor = grad_output[0].contiguous(); + // Forward declaration; defined as a free function below so it can be + // registered as a separate torch op (deepmd::border_op_backward) used by + // the pt_expt opaque-op autograd wrapper. 
+ static torch::Tensor border_op_backward_dispatch( + const torch::Tensor& sendlist_tensor, + const torch::Tensor& sendproc_tensor, + const torch::Tensor& recvproc_tensor, + const torch::Tensor& sendnum_tensor, + const torch::Tensor& recvnum_tensor, + const torch::Tensor& grad_g1, + const torch::Tensor& communicator_tensor, + const torch::Tensor& nlocal_tensor, + const torch::Tensor& nghost_tensor); + + template + static torch::Tensor backward_t(const torch::Tensor& sendlist_tensor, + const torch::Tensor& sendproc_tensor, + const torch::Tensor& recvproc_tensor, + const torch::Tensor& sendnum_tensor, + const torch::Tensor& recvnum_tensor, + const torch::Tensor& grad_g1, + const torch::Tensor& communicator_tensor, + const torch::Tensor& nlocal_tensor, + const torch::Tensor& nghost_tensor) { + torch::Tensor d_local_g1_tensor = grad_g1.contiguous(); #ifdef USE_MPI int mpi_init = 0; MPI_Initialized(&mpi_init); @@ -216,8 +238,8 @@ class Border : public torch::autograd::Function { cuda_aware = MPIX_Query_cuda_support(); #endif if (cuda_aware == 0) { - d_local_g1_tensor = torch::empty_like(grad_output[0]).to(torch::kCPU); - d_local_g1_tensor.copy_(grad_output[0]); + d_local_g1_tensor = torch::empty_like(grad_g1).to(torch::kCPU); + d_local_g1_tensor.copy_(grad_g1); } } #endif @@ -312,15 +334,15 @@ class Border : public torch::autograd::Function { #ifdef USE_MPI #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) if (cuda_aware == 0) { - grad_output[0].copy_(d_local_g1_tensor); + // Move result back to the device of the input grad. This replaces + // the original in-place copy_ into grad_output[0]. 
+ d_local_g1_tensor = d_local_g1_tensor.to(grad_g1.device()); } #endif #endif - - return {torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor(), - torch::Tensor(), grad_output[0], torch::Tensor(), torch::Tensor(), - torch::Tensor(), torch::Tensor()}; + return d_local_g1_tensor; } + #ifdef USE_MPI static void unpack_communicator(const torch::Tensor& communicator_tensor, MPI_Comm& mpi_comm) { @@ -363,4 +385,70 @@ std::vector border_op(const torch::Tensor& sendlist_tensor, communicator_tensor, nlocal_tensor, nghost_tensor); } -TORCH_LIBRARY_FRAGMENT(deepmd, m) { m.def("border_op", border_op); } +// Define Border::border_op_backward_dispatch out-of-line so the type-flag +// dispatch can refer to the templated backward_t members declared in the +// class. +torch::Tensor Border::border_op_backward_dispatch( + const torch::Tensor& sendlist_tensor, + const torch::Tensor& sendproc_tensor, + const torch::Tensor& recvproc_tensor, + const torch::Tensor& sendnum_tensor, + const torch::Tensor& recvnum_tensor, + const torch::Tensor& grad_g1, + const torch::Tensor& communicator_tensor, + const torch::Tensor& nlocal_tensor, + const torch::Tensor& nghost_tensor) { + bool type_flag = (grad_g1.dtype() == torch::kDouble); + if (type_flag) { + return backward_t(sendlist_tensor, sendproc_tensor, recvproc_tensor, + sendnum_tensor, recvnum_tensor, grad_g1, + communicator_tensor, nlocal_tensor, + nghost_tensor); + } else { + return backward_t(sendlist_tensor, sendproc_tensor, recvproc_tensor, + sendnum_tensor, recvnum_tensor, grad_g1, + communicator_tensor, nlocal_tensor, nghost_tensor); + } +} + +/** + * @brief Standalone backward of border_op for use by pt_expt's opaque-op + * autograd wrapper. Performs the symmetric MPI exchange that the autograd + * Border::backward applies, but without an autograd context — comm tensors + * are passed directly so the op can be registered as a torch op and + * embedded in an AOTInductor graph. 
+ * + * The comm topology is symmetric: the same sendlist/sendnum/recvnum buffers + * encode the forward exchange; backward simply swaps send <-> recv and + * accumulates gradients into the local atom slots. + * + * @param[in] sendlist_tensor send-list pointer-array (forward direction) + * @param[in] sendproc_tensor send-proc IDs (forward direction) + * @param[in] recvproc_tensor recv-proc IDs (forward direction) + * @param[in] sendnum_tensor atoms sent per swap (forward direction) + * @param[in] recvnum_tensor atoms received per swap (forward direction) + * @param[in] grad_g1 upstream gradient w.r.t. g1 of forward + * @param[in] communicator_tensor MPI communicator handle as int64 + * @param[in] nlocal_tensor number of local atoms (per rank) + * @param[in] nghost_tensor number of ghost atoms (per rank) + * @return d_in (gradient w.r.t. forward g1 input), same shape as grad_g1. + */ +torch::Tensor border_op_backward(const torch::Tensor& sendlist_tensor, + const torch::Tensor& sendproc_tensor, + const torch::Tensor& recvproc_tensor, + const torch::Tensor& sendnum_tensor, + const torch::Tensor& recvnum_tensor, + const torch::Tensor& grad_g1, + const torch::Tensor& communicator_tensor, + const torch::Tensor& nlocal_tensor, + const torch::Tensor& nghost_tensor) { + return Border::border_op_backward_dispatch( + sendlist_tensor, sendproc_tensor, recvproc_tensor, sendnum_tensor, + recvnum_tensor, grad_g1, communicator_tensor, nlocal_tensor, + nghost_tensor); +} + +TORCH_LIBRARY_FRAGMENT(deepmd, m) { + m.def("border_op", border_op); + m.def("border_op_backward", border_op_backward); +} From 3af514aedb50d7710839841f3787e8460b74a9ef Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 26 Apr 2026 00:18:58 +0800 Subject: [PATCH 03/34] feat(pt_expt): add deepmd_export::border_op opaque wrapper + block overrides MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three changes that together let GNN models drive MPI ghost-atom exchange through 
the pt_expt forward pass: 1. ``deepmd/pt_expt/utils/comm.py`` registers a NEW torch op ``deepmd_export::border_op`` via ``torch.library.custom_op``. The wrapper: - Forwards to the existing ``torch.ops.deepmd.border_op`` (clones the in-place output to satisfy custom_op aliasing rules). - Has a ``register_fake`` impl returning ``empty_like(g1)`` so ``torch.export`` / ``make_fx`` can trace through it. - Has a ``register_autograd`` callback that dispatches to ``torch.ops.deepmd.border_op_backward`` (the standalone op added in the previous commit). The existing ``deepmd::border_op`` is registered as ``CompositeImplicitAutograd`` and therefore tries to decompose into primitive aten ops during export — which fails because the C++ kernel calls ``data_ptr()`` on FakeTensors. The new opaque wrapper sidesteps this by being registered as an opaque op that ``torch.export`` records as a single black-box call. 2. ``deepmd/pt_expt/descriptor/{repflows,repformers}.py`` add pt_expt subclasses of ``DescrptBlockRepflows`` / ``DescrptBlockRepformers`` that override ``_exchange_ghosts``. When ``comm_dict is None`` the override defers to the dpmodel default; otherwise it pads ``node_ebd`` to nall and calls the opaque wrapper. Includes the spin-aware ``has_spin`` path (split real/virtual + concat_switch_virtual) ported from pt's repflows. 3. ``forward_common_lower_exportable_with_comm`` is added on the pt_expt CM (and SpinModel) classes. Same as the existing ``forward_common_lower_exportable`` but accepts the 8 comm tensors as additional positional inputs and reconstructs ``comm_dict`` inside the traced function (spin variant injects ``has_spin`` so the override takes the spin branch). This becomes the new traced entry point for the with-comm AOTI artifact (next commit). 
Existing pt_expt descriptor wrappers (dpa1, dpa2, se_*) and the ``CM.forward_common_atomic`` override get an extra ``comm_dict`` kwarg that is plumbed straight through to the dpmodel call — no behavioural change for ``comm_dict is None``. Phase 0 de-risk experiment (scratch/derisk_border_op.py) verified that the opaque wrapper survives ``torch.export.export`` + ``aoti_compile_and_package`` + ``aoti_load_package`` round-trips for both forward and backward. --- deepmd/pt_expt/descriptor/__init__.py | 6 +- deepmd/pt_expt/descriptor/dpa1.py | 1 + deepmd/pt_expt/descriptor/dpa2.py | 10 +- deepmd/pt_expt/descriptor/repflows.py | 103 ++++++++++++++++ deepmd/pt_expt/descriptor/repformers.py | 88 ++++++++++++++ deepmd/pt_expt/descriptor/se_e2_a.py | 1 + deepmd/pt_expt/descriptor/se_r.py | 1 + deepmd/pt_expt/descriptor/se_t.py | 1 + deepmd/pt_expt/descriptor/se_t_tebd.py | 1 + deepmd/pt_expt/model/make_model.py | 93 +++++++++++++++ deepmd/pt_expt/model/spin_model.py | 94 +++++++++++++++ deepmd/pt_expt/utils/__init__.py | 3 + deepmd/pt_expt/utils/comm.py | 149 ++++++++++++++++++++++++ 13 files changed, 549 insertions(+), 2 deletions(-) create mode 100644 deepmd/pt_expt/descriptor/repflows.py create mode 100644 deepmd/pt_expt/descriptor/repformers.py create mode 100644 deepmd/pt_expt/utils/comm.py diff --git a/deepmd/pt_expt/descriptor/__init__.py b/deepmd/pt_expt/descriptor/__init__.py index 1667182d84..8253ed6338 100644 --- a/deepmd/pt_expt/descriptor/__init__.py +++ b/deepmd/pt_expt/descriptor/__init__.py @@ -1,6 +1,10 @@ # SPDX-License-Identifier: LGPL-3.0-or-later # Import to register converters -from . import se_t_tebd_block # noqa: F401 +from . 
import ( # noqa: F401 + repflows, + repformers, + se_t_tebd_block, +) from .base_descriptor import ( BaseDescriptor, ) diff --git a/deepmd/pt_expt/descriptor/dpa1.py b/deepmd/pt_expt/descriptor/dpa1.py index 01df91abd6..c43b07f9c2 100644 --- a/deepmd/pt_expt/descriptor/dpa1.py +++ b/deepmd/pt_expt/descriptor/dpa1.py @@ -183,6 +183,7 @@ def call( nlist: torch.Tensor, mapping: torch.Tensor | None = None, fparam: torch.Tensor | None = None, + comm_dict: dict | None = None, ) -> Any: if not self.compress: return DescrptDPA1DP.call.__wrapped__( diff --git a/deepmd/pt_expt/descriptor/dpa2.py b/deepmd/pt_expt/descriptor/dpa2.py index 1723df5a30..21c392cd3c 100644 --- a/deepmd/pt_expt/descriptor/dpa2.py +++ b/deepmd/pt_expt/descriptor/dpa2.py @@ -233,11 +233,19 @@ def call( nlist: torch.Tensor, mapping: torch.Tensor | None = None, fparam: torch.Tensor | None = None, + comm_dict: dict | None = None, ) -> Any: if not self.compress: return DescrptDPA2DP.call.__wrapped__( - self, coord_ext, atype_ext, nlist, mapping + self, + coord_ext, + atype_ext, + nlist, + mapping, + fparam, + comm_dict=comm_dict, ) + # Compressed path is local-only (no message passing during compress). return self._call_compressed(coord_ext, atype_ext, nlist, mapping) def _call_compressed( diff --git a/deepmd/pt_expt/descriptor/repflows.py b/deepmd/pt_expt/descriptor/repflows.py new file mode 100644 index 0000000000..2f680703bf --- /dev/null +++ b/deepmd/pt_expt/descriptor/repflows.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""pt_expt wrapper around dpmodel ``DescrptBlockRepflows``. + +The wrapper overrides ``_exchange_ghosts`` so that, when running under +LAMMPS multi-rank with a non-None ``comm_dict``, each layer of the +RepFlow message-passing block exchanges ghost-atom embeddings via the +opaque ``deepmd_export::border_op`` wrapper (registered in +``deepmd/pt_expt/utils/comm.py``). This survives ``torch.export`` and +AOTInductor packaging. 
+ +When ``comm_dict is None`` (single-rank inference / training), the +default array-api ``_exchange_ghosts`` from the dpmodel block is used — +zero behavioural change. +""" + +from __future__ import ( + annotations, +) + +import torch + +from deepmd.dpmodel.descriptor.repflows import ( + DescrptBlockRepflows as DescrptBlockRepflowsDP, +) +from deepmd.pt.utils.spin import ( + concat_switch_virtual, +) +from deepmd.pt_expt.common import ( + register_dpmodel_mapping, + torch_module, +) + + +@torch_module +class DescrptBlockRepflows(DescrptBlockRepflowsDP): + """pt_expt wrapper for the RepFlow descriptor block.""" + + def _exchange_ghosts( + self, + node_ebd: torch.Tensor, + mapping_tiled: torch.Tensor | None, + comm_dict: dict | None, + nall: int, + nloc: int, + ) -> torch.Tensor: + if comm_dict is None: + return super()._exchange_ghosts( + node_ebd, + mapping_tiled, + comm_dict, + nall, + nloc, + ) + + has_spin = "has_spin" in comm_dict + if has_spin: + real_nloc, real_nall = nloc // 2, nall // 2 + real_pad = real_nall - real_nloc + node_real, node_virt = torch.split( + node_ebd, + [real_nloc, real_nloc], + dim=1, + ) + # combine real + virtual along feature dim, then pad to nall. 
+ mix = torch.cat([node_real, node_virt], dim=2) + padded = torch.nn.functional.pad( + mix.squeeze(0), + (0, 0, 0, real_pad), + value=0.0, + ) + else: + padded = torch.nn.functional.pad( + node_ebd.squeeze(0), + (0, 0, 0, nall - nloc), + value=0.0, + ) + + exchanged = torch.ops.deepmd_export.border_op( + comm_dict["send_list"], + comm_dict["send_proc"], + comm_dict["recv_proc"], + comm_dict["send_num"], + comm_dict["recv_num"], + padded, + comm_dict["communicator"], + comm_dict["nlocal"], + comm_dict["nghost"], + ).unsqueeze(0) + + if has_spin: + n_dim = node_ebd.shape[-1] + real_ext, virt_ext = torch.split(exchanged, [n_dim, n_dim], dim=2) + return concat_switch_virtual(real_ext, virt_ext, real_nloc) + return exchanged + + +# Register the converter so dpmodel's auto-wrap path picks up our pt_expt +# subclass instead of the generic _auto_wrap_native_op fallback. Without +# this, the override above would never fire. +register_dpmodel_mapping( + DescrptBlockRepflowsDP, + lambda v: DescrptBlockRepflows.deserialize(v.serialize()), +) diff --git a/deepmd/pt_expt/descriptor/repformers.py b/deepmd/pt_expt/descriptor/repformers.py new file mode 100644 index 0000000000..f106a7a240 --- /dev/null +++ b/deepmd/pt_expt/descriptor/repformers.py @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""pt_expt wrapper around dpmodel ``DescrptBlockRepformers``. + +Mirrors ``deepmd/pt_expt/descriptor/repflows.py``: overrides +``_exchange_ghosts`` so the per-layer ghost exchange uses the opaque +``deepmd_export::border_op`` when a ``comm_dict`` is provided. 
+""" + +from __future__ import ( + annotations, +) + +import torch + +from deepmd.dpmodel.descriptor.repformers import ( + DescrptBlockRepformers as DescrptBlockRepformersDP, +) +from deepmd.pt.utils.spin import ( + concat_switch_virtual, +) +from deepmd.pt_expt.common import ( + register_dpmodel_mapping, + torch_module, +) + + +@torch_module +class DescrptBlockRepformers(DescrptBlockRepformersDP): + """pt_expt wrapper for the Repformers descriptor block.""" + + def _exchange_ghosts( + self, + g1: torch.Tensor, + mapping_tiled: torch.Tensor | None, + comm_dict: dict | None, + nall: int, + nloc: int, + ) -> torch.Tensor: + if comm_dict is None: + return super()._exchange_ghosts( + g1, + mapping_tiled, + comm_dict, + nall, + nloc, + ) + + has_spin = "has_spin" in comm_dict + if has_spin: + real_nloc, real_nall = nloc // 2, nall // 2 + real_pad = real_nall - real_nloc + g1_real, g1_virt = torch.split(g1, [real_nloc, real_nloc], dim=1) + mix = torch.cat([g1_real, g1_virt], dim=2) + padded = torch.nn.functional.pad( + mix.squeeze(0), + (0, 0, 0, real_pad), + value=0.0, + ) + else: + padded = torch.nn.functional.pad( + g1.squeeze(0), + (0, 0, 0, nall - nloc), + value=0.0, + ) + + exchanged = torch.ops.deepmd_export.border_op( + comm_dict["send_list"], + comm_dict["send_proc"], + comm_dict["recv_proc"], + comm_dict["send_num"], + comm_dict["recv_num"], + padded, + comm_dict["communicator"], + comm_dict["nlocal"], + comm_dict["nghost"], + ).unsqueeze(0) + + if has_spin: + ng1 = g1.shape[-1] + real_ext, virt_ext = torch.split(exchanged, [ng1, ng1], dim=2) + return concat_switch_virtual(real_ext, virt_ext, real_nloc) + return exchanged + + +register_dpmodel_mapping( + DescrptBlockRepformersDP, + lambda v: DescrptBlockRepformers.deserialize(v.serialize()), +) diff --git a/deepmd/pt_expt/descriptor/se_e2_a.py b/deepmd/pt_expt/descriptor/se_e2_a.py index 61d611036e..45120c6d5d 100644 --- a/deepmd/pt_expt/descriptor/se_e2_a.py +++ b/deepmd/pt_expt/descriptor/se_e2_a.py @@ -139,6 
+139,7 @@ def call( nlist: torch.Tensor, mapping: torch.Tensor | None = None, fparam: torch.Tensor | None = None, + comm_dict: dict | None = None, ) -> Any: if not self.compress: return DescrptSeADP.call.__wrapped__( diff --git a/deepmd/pt_expt/descriptor/se_r.py b/deepmd/pt_expt/descriptor/se_r.py index 22302f54e6..ab32be1131 100644 --- a/deepmd/pt_expt/descriptor/se_r.py +++ b/deepmd/pt_expt/descriptor/se_r.py @@ -128,6 +128,7 @@ def call( nlist: torch.Tensor, mapping: torch.Tensor | None = None, fparam: torch.Tensor | None = None, + comm_dict: dict | None = None, ) -> Any: if not self.compress: return DescrptSeRDP.call.__wrapped__( diff --git a/deepmd/pt_expt/descriptor/se_t.py b/deepmd/pt_expt/descriptor/se_t.py index 061306f281..69d6183642 100644 --- a/deepmd/pt_expt/descriptor/se_t.py +++ b/deepmd/pt_expt/descriptor/se_t.py @@ -139,6 +139,7 @@ def call( nlist: torch.Tensor, mapping: torch.Tensor | None = None, fparam: torch.Tensor | None = None, + comm_dict: dict | None = None, ) -> Any: if not self.compress: return DescrptSeTDP.call.__wrapped__( diff --git a/deepmd/pt_expt/descriptor/se_t_tebd.py b/deepmd/pt_expt/descriptor/se_t_tebd.py index c0ae308971..cbcaf3822c 100644 --- a/deepmd/pt_expt/descriptor/se_t_tebd.py +++ b/deepmd/pt_expt/descriptor/se_t_tebd.py @@ -166,6 +166,7 @@ def call( nlist: torch.Tensor, mapping: torch.Tensor | None = None, fparam: torch.Tensor | None = None, + comm_dict: dict | None = None, ) -> Any: if not self.compress: return DescrptSeTTebdDP.call.__wrapped__( diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py index 4bd9792420..45a8cb10ea 100644 --- a/deepmd/pt_expt/model/make_model.py +++ b/deepmd/pt_expt/model/make_model.py @@ -257,6 +257,7 @@ def forward_common_atomic( aparam: torch.Tensor | None = None, do_atomic_virial: bool = False, extended_coord_corr: torch.Tensor | None = None, + comm_dict: dict | None = None, ) -> dict[str, torch.Tensor]: atomic_ret = 
self.atomic_model.forward_common_atomic( extended_coord, @@ -265,6 +266,7 @@ def forward_common_atomic( mapping=mapping, fparam=fparam, aparam=aparam, + comm_dict=comm_dict, ) model_ret = fit_output_to_model_output( atomic_ret, @@ -365,4 +367,95 @@ def fn( aparam, ) + def forward_common_lower_exportable_with_comm( + self, + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + nlist: torch.Tensor, + mapping: torch.Tensor | None, + fparam: torch.Tensor | None, + aparam: torch.Tensor | None, + send_list: torch.Tensor, + send_proc: torch.Tensor, + recv_proc: torch.Tensor, + send_num: torch.Tensor, + recv_num: torch.Tensor, + communicator: torch.Tensor, + nlocal: torch.Tensor, + nghost: torch.Tensor, + do_atomic_virial: bool = False, + **make_fx_kwargs: Any, + ) -> torch.nn.Module: + """Trace forward_common_lower with comm_dict tensors as positional inputs. + + Used to compile a parallel-inference variant of the model + (.pt2 with-comm artifact) that drives MPI ghost-atom exchange + for GNN descriptors via the opaque + ``deepmd_export::border_op`` wrapper. The comm tensors enter + the exported program as 8 additional positional inputs after + the usual (coord, atype, nlist, mapping, fparam, aparam) — + this fixes the C++ ABI for ``DeepPotPTExpt`` (Phase 4). + + Tracing requires ``nswap >= 1`` (Phase 0 finding); with + ``nswap == 0`` the dim specializes and the artifact would + only run for that exact value. The C++ caller must always + provide ``nswap >= 1``. 
+ """ + model = self + + def fn( + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + nlist: torch.Tensor, + mapping: torch.Tensor | None, + fparam: torch.Tensor | None, + aparam: torch.Tensor | None, + send_list: torch.Tensor, + send_proc: torch.Tensor, + recv_proc: torch.Tensor, + send_num: torch.Tensor, + recv_num: torch.Tensor, + communicator: torch.Tensor, + nlocal: torch.Tensor, + nghost: torch.Tensor, + ) -> dict[str, torch.Tensor]: + extended_coord = extended_coord.detach().requires_grad_(True) + comm_dict = { + "send_list": send_list, + "send_proc": send_proc, + "recv_proc": recv_proc, + "send_num": send_num, + "recv_num": recv_num, + "communicator": communicator, + "nlocal": nlocal, + "nghost": nghost, + } + return model.forward_common_lower( + extended_coord, + extended_atype, + nlist, + mapping, + fparam=fparam, + aparam=aparam, + do_atomic_virial=do_atomic_virial, + comm_dict=comm_dict, + ) + + return make_fx(fn, **make_fx_kwargs)( + extended_coord, + extended_atype, + nlist, + mapping, + fparam, + aparam, + send_list, + send_proc, + recv_proc, + send_num, + recv_num, + communicator, + nlocal, + nghost, + ) + return CM diff --git a/deepmd/pt_expt/model/spin_model.py b/deepmd/pt_expt/model/spin_model.py index 70f41f0701..e361999b17 100644 --- a/deepmd/pt_expt/model/spin_model.py +++ b/deepmd/pt_expt/model/spin_model.py @@ -117,6 +117,100 @@ def fn( aparam, ) + def forward_common_lower_exportable_with_comm( + self, + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + extended_spin: torch.Tensor, + nlist: torch.Tensor, + mapping: torch.Tensor | None, + fparam: torch.Tensor | None, + aparam: torch.Tensor | None, + send_list: torch.Tensor, + send_proc: torch.Tensor, + recv_proc: torch.Tensor, + send_num: torch.Tensor, + recv_num: torch.Tensor, + communicator: torch.Tensor, + nlocal: torch.Tensor, + nghost: torch.Tensor, + do_atomic_virial: bool = False, + **make_fx_kwargs: Any, + ) -> torch.nn.Module: + """Spin variant of 
``forward_common_lower_exportable_with_comm``. + + Mirrors the non-spin version (see ``make_model.py``) but threads + ``extended_spin`` through and injects ``has_spin`` into + ``comm_dict`` so the pt_expt Repflow/Repformer override takes + the spin branch (split real/virtual + concat_switch_virtual). + """ + model = self + + def fn( + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + extended_spin: torch.Tensor, + nlist: torch.Tensor, + mapping: torch.Tensor | None, + fparam: torch.Tensor | None, + aparam: torch.Tensor | None, + send_list: torch.Tensor, + send_proc: torch.Tensor, + recv_proc: torch.Tensor, + send_num: torch.Tensor, + recv_num: torch.Tensor, + communicator: torch.Tensor, + nlocal: torch.Tensor, + nghost: torch.Tensor, + ) -> dict[str, torch.Tensor]: + extended_coord = extended_coord.detach().requires_grad_(True) + comm_dict = { + "send_list": send_list, + "send_proc": send_proc, + "recv_proc": recv_proc, + "send_num": send_num, + "recv_num": recv_num, + "communicator": communicator, + "nlocal": nlocal, + "nghost": nghost, + # Trace-time marker so the override takes the spin path. + # Value is irrelevant — only key presence matters. 
+ "has_spin": torch.tensor( + [1], + dtype=torch.int32, + device=extended_coord.device, + ), + } + return model.forward_common_lower( + extended_coord, + extended_atype, + extended_spin, + nlist, + mapping, + fparam=fparam, + aparam=aparam, + do_atomic_virial=do_atomic_virial, + comm_dict=comm_dict, + ) + + return make_fx(fn, **make_fx_kwargs)( + extended_coord, + extended_atype, + extended_spin, + nlist, + mapping, + fparam, + aparam, + send_list, + send_proc, + recv_proc, + send_num, + recv_num, + communicator, + nlocal, + nghost, + ) + def forward_common_lower( self, *args: Any, **kwargs: Any ) -> dict[str, torch.Tensor]: diff --git a/deepmd/pt_expt/utils/__init__.py b/deepmd/pt_expt/utils/__init__.py index efb026f7f1..99da68fe4f 100644 --- a/deepmd/pt_expt/utils/__init__.py +++ b/deepmd/pt_expt/utils/__init__.py @@ -22,7 +22,10 @@ # as it's a stateless utility class register_dpmodel_mapping(EnvMat, lambda v: v) +# Register opaque deepmd_export::border_op wrapper (used by GNN MPI +# parallel inference; see comm.py module docstring). # Register fake tensor implementations for custom tabulate ops +from deepmd.pt_expt.utils import comm # noqa: F401 from deepmd.pt_expt.utils import tabulate_ops # noqa: F401 __all__ = [ diff --git a/deepmd/pt_expt/utils/comm.py b/deepmd/pt_expt/utils/comm.py new file mode 100644 index 0000000000..cfa92bcf6c --- /dev/null +++ b/deepmd/pt_expt/utils/comm.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Opaque torch.export wrapper around the deepmd MPI border_op. + +The existing ``torch.ops.deepmd.border_op`` (registered by +``libdeepmd_op_pt.so``) is a ``CompositeImplicitAutograd`` op that wraps +``Border::apply`` for the torch.jit (pt) backend. ``torch.export`` / +AOTInductor try to *decompose* such ops into primitive aten ops, which +fails because the C++ kernel calls ``data_ptr()`` on inputs — illegal +during tracing on FakeTensors. 
+ +This module defines a NEW op ``deepmd_export::border_op`` via +``torch.library.custom_op``, marked opaque so ``torch.export`` records it +as a single black-box call. At runtime the loaded ``.pt2`` dispatches +back into ``torch.ops.deepmd.border_op`` (forward) or +``torch.ops.deepmd.border_op_backward`` (backward), preserving the MPI +exchange semantics. + +Constraints discovered during de-risking (scratch/derisk_border_op.py): + 1. ``custom_op`` forbids returning a tensor that aliases an input — + the underlying C++ op returns ``g1`` itself, so we ``.clone()``. + 2. The fake (meta) impl honours ``g1.dtype`` (no float64 hardcoding). + 3. ``register_autograd`` makes the op differentiable; the backward + dispatches to ``deepmd::border_op_backward`` which performs the + symmetric MPI exchange. +""" + +from __future__ import ( + annotations, +) + +import torch + + +@torch.library.custom_op("deepmd_export::border_op", mutates_args=()) +def border_op_export( + sendlist: torch.Tensor, + sendproc: torch.Tensor, + recvproc: torch.Tensor, + sendnum: torch.Tensor, + recvnum: torch.Tensor, + g1: torch.Tensor, + communicator: torch.Tensor, + nlocal: torch.Tensor, + nghost: torch.Tensor, +) -> torch.Tensor: + """Opaque wrapper around ``torch.ops.deepmd.border_op``. + + Performs MPI ghost-atom exchange of the embedding tensor ``g1`` so + GNN message-passing layers can run under multi-rank LAMMPS. Inputs + and outputs match the underlying op exactly except for the aliasing + fix (see module docstring). + """ + out = torch.ops.deepmd.border_op( + sendlist, + sendproc, + recvproc, + sendnum, + recvnum, + g1, + communicator, + nlocal, + nghost, + ) + if isinstance(out, (list, tuple)): + out = out[0] + # custom_op forbids output aliasing inputs; underlying op returns g1. 
+ return out.clone() + + +@border_op_export.register_fake +def _border_op_export_fake( + sendlist: torch.Tensor, + sendproc: torch.Tensor, + recvproc: torch.Tensor, + sendnum: torch.Tensor, + recvnum: torch.Tensor, + g1: torch.Tensor, + communicator: torch.Tensor, + nlocal: torch.Tensor, + nghost: torch.Tensor, +) -> torch.Tensor: + return torch.empty_like(g1) + + +def _border_op_setup_context( + ctx: torch.autograd.function.FunctionCtx, + inputs: tuple, + output: torch.Tensor, +) -> None: + ( + sendlist, + sendproc, + recvproc, + sendnum, + recvnum, + _g1, + communicator, + nlocal, + nghost, + ) = inputs + ctx.save_for_backward( + sendlist, + sendproc, + recvproc, + sendnum, + recvnum, + communicator, + nlocal, + nghost, + ) + + +def _border_op_backward( + ctx: torch.autograd.function.FunctionCtx, + grad_output: torch.Tensor, +) -> tuple: + (sendlist, sendproc, recvproc, sendnum, recvnum, communicator, nlocal, nghost) = ( + ctx.saved_tensors + ) + grad_in = torch.ops.deepmd.border_op_backward( + sendlist, + sendproc, + recvproc, + sendnum, + recvnum, + grad_output, + communicator, + nlocal, + nghost, + ) + # Same aliasing concern as forward: the C++ backward returns the same + # tensor object it modified; clone before handing back to autograd. + return ( + None, + None, + None, + None, + None, # sendlist..recvnum + grad_in.clone(), # g1 + None, + None, + None, # communicator, nlocal, nghost + ) + + +border_op_export.register_autograd( + _border_op_backward, + setup_context=_border_op_setup_context, +) From 2936bd4459cc393bbec4164a23c6586f8c7ea29b Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 26 Apr 2026 00:43:05 +0800 Subject: [PATCH 04/34] fix(pt_expt): plumb comm_dict through SpinModel + guards Three small follow-ups uncovered by the spin export-with-comm test: 1. ``dpmodel/model/spin_model.py::call_common_lower`` was missing the ``comm_dict`` kwarg added by the Phase 1 plumbing. 
Added it and forward to ``backbone_model.call_common_lower`` so spin GNN models can drive parallel inference. 2. ``pt_expt/descriptor/repflows.py`` raises a clear ``RuntimeError`` when ``use_loc_mapping=True`` is combined with a non-None ``comm_dict``. The local-mapping codepath skips per-layer ghost exchange entirely so combining it with ``comm_dict`` would silently drop the parallel behaviour. 3. ``pt_expt/utils/comm.py`` ``_check_underlying_ops_loaded`` is called on first wrapper invocation; surfaces a clearer error when libdeepmd_op_pt.so is unloaded ("rebuild the pt custom-op library") rather than the cryptic "torch.ops.deepmd has no attribute 'border_op'" from torch's dispatcher. --- deepmd/dpmodel/model/spin_model.py | 2 ++ deepmd/pt_expt/descriptor/repflows.py | 13 ++++++++++++ deepmd/pt_expt/utils/comm.py | 29 +++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/deepmd/dpmodel/model/spin_model.py b/deepmd/dpmodel/model/spin_model.py index be6566e303..2de41945f3 100644 --- a/deepmd/dpmodel/model/spin_model.py +++ b/deepmd/dpmodel/model/spin_model.py @@ -748,6 +748,7 @@ def call_common_lower( fparam: Array | None = None, aparam: Array | None = None, do_atomic_virial: bool = False, + comm_dict: dict | None = None, ) -> dict[str, Array]: """Return model prediction with raw internal keys. 
Lower interface that takes extended atomic coordinates, types and spins, nlist, and mapping @@ -800,6 +801,7 @@ def call_common_lower( aparam=aparam, do_atomic_virial=do_atomic_virial, extended_coord_corr=extended_coord_corr, + comm_dict=comm_dict, ) model_output_type = self.backbone_model.model_output_type() if "mask" in model_output_type: diff --git a/deepmd/pt_expt/descriptor/repflows.py b/deepmd/pt_expt/descriptor/repflows.py index 2f680703bf..efd7cba7ba 100644 --- a/deepmd/pt_expt/descriptor/repflows.py +++ b/deepmd/pt_expt/descriptor/repflows.py @@ -51,6 +51,19 @@ def _exchange_ghosts( nall, nloc, ) + # Pt's parallel branch (repflows.py:580-587) requires the + # extended-region pathway (use_loc_mapping=False). The + # local-mapping codepath skips the per-layer ghost exchange + # entirely, so combining it with comm_dict is contradictory. + # Surface this as a clear error rather than producing silently + # wrong results. + if getattr(self, "use_loc_mapping", False): + raise RuntimeError( + "DescrptBlockRepflows._exchange_ghosts: comm_dict is " + "set but use_loc_mapping=True. Multi-rank parallel " + "inference requires use_loc_mapping=False so per-layer " + "ghost exchange is meaningful." + ) has_spin = "has_spin" in comm_dict if has_spin: diff --git a/deepmd/pt_expt/utils/comm.py b/deepmd/pt_expt/utils/comm.py index cfa92bcf6c..442a232a6f 100644 --- a/deepmd/pt_expt/utils/comm.py +++ b/deepmd/pt_expt/utils/comm.py @@ -31,6 +31,34 @@ import torch +def _check_underlying_ops_loaded() -> None: + """Surface a clearer error when libdeepmd_op_pt.so isn't loaded. + + pt_expt depends on libdeepmd_op_pt.so for the underlying + ``deepmd::border_op`` and ``deepmd::border_op_backward`` C++ ops. + Without them, callers get cryptic + ``AttributeError: '_OpNamespace' object has no attribute 'border_op'`` + errors. We translate that into actionable advice. 
+ + Called once on first wrapper invocation (not at import time, since + pt_expt may legitimately be imported on systems where the .so is + not built — e.g. eager-only smoke tests of dpmodel-side code). + """ + if not ( + hasattr(torch.ops, "deepmd") + and hasattr(torch.ops.deepmd, "border_op") + and hasattr(torch.ops.deepmd, "border_op_backward") + ): + raise RuntimeError( + "deepmd_export::border_op wrapper requires " + "torch.ops.deepmd.border_op and " + "torch.ops.deepmd.border_op_backward (from " + "libdeepmd_op_pt.so) to be loaded. Build the pt custom-op " + "library and ensure deepmd.pt is imported before the " + "first call to this wrapper." + ) + + @torch.library.custom_op("deepmd_export::border_op", mutates_args=()) def border_op_export( sendlist: torch.Tensor, @@ -50,6 +78,7 @@ def border_op_export( and outputs match the underlying op exactly except for the aliasing fix (see module docstring). """ + _check_underlying_ops_loaded() out = torch.ops.deepmd.border_op( sendlist, sendproc, From b22feb792de3990f791c5cadb08cdcb8679dbfdd Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 26 Apr 2026 00:54:49 +0800 Subject: [PATCH 05/34] feat(pt_expt): two-mode AOTInductor export with comm_dict MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a ``with_comm_dict: bool`` flag to ``_trace_and_export`` and ``_make_sample_inputs``/``_build_dynamic_shapes``. When True, the trace runs through ``forward_common_lower_exportable_with_comm`` (which threads 8 comm tensors as positional inputs and reconstructs ``comm_dict`` inside the traced function), and the resulting export accepts comm tensors as additional positional inputs. Constraints enforced for the with-comm trace: * ``nframes=1`` static (the pt-parity override uses squeeze(0)/unsqueeze(0) which only works for nb=1; LAMMPS always drives one frame anyway). 
Avoids the regular variant's ``nframes=2`` collision-avoidance bumping (irrelevant when nframes is static — duck-sizing only unifies dynamic dims). * ``nswap`` static at the trace value. ``nswap`` is fixed once at LAMMPS init (depends on the processor grid which doesn't change at runtime), so the dim doesn't need to be dynamic. For GNN models, ``_deserialize_to_file_pt2`` now compiles BOTH the regular and with-comm artifacts and packs the latter inside the .pt2 ZIP at ``extra/forward_lower_with_comm.pt2``. Metadata gains: * ``has_message_passing`` (true if the descriptor has a GNN block). * ``has_comm_artifact`` (true iff a with-comm artifact was packed). Old .pt2 files lack these keys; the C++ loader (Phase 4) must default to False when the field is missing. The non-GNN path is unchanged: a single regular artifact + the existing metadata layout, so existing .pt2 readers keep working. --- deepmd/pt_expt/utils/serialization.py | 349 ++++++++++++++++++++++---- 1 file changed, 301 insertions(+), 48 deletions(-) diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index f59c397525..74fbe67111 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import ctypes import json import numpy as np @@ -75,6 +76,87 @@ def _json_to_numpy(model_obj: dict) -> dict: ) +def _has_message_passing(model: torch.nn.Module) -> bool: + """Detect whether a model's descriptor uses GNN-style message passing. + + GNN descriptors (DPA2 with repformers, DPA3 with repflows) require + a per-layer ghost-atom MPI exchange when running multi-rank LAMMPS, + which means a separate ``with-comm`` AOTInductor artifact must be + compiled. Non-GNN descriptors (se_e2_a, se_r, se_t, se_t_tebd, + DPA1, hybrid-of-non-GNN) need only the regular artifact. + + Returns False if the descriptor's ``has_message_passing()`` query + cannot be answered (e.g.
linear/zbl/frozen models without a single + descriptor) — those are assumed local. + """ + try: + descriptor = model.atomic_model.descriptor + except AttributeError: + return False + if hasattr(descriptor, "has_message_passing"): + try: + return bool(descriptor.has_message_passing()) + except (AttributeError, NotImplementedError): + return False + return False + + +# Module-level cache for the trace-time sendlist buffer. The pointer +# value embedded in ``send_list_tensor`` references this numpy array's +# data; the array must outlive the trace + export call. Caching here +# (rather than per-call) is fine because the contents are never read by +# the exported graph at runtime — only by the eager call inside +# ``make_fx`` when extracting output keys, and by ``torch.export`` when +# materializing example inputs. +_TRACE_SENDLIST_KEEPALIVE: list[np.ndarray] = [] + + +def _make_comm_sample_inputs( + nloc: int, + nghost: int, + device: torch.device, +) -> tuple[torch.Tensor, ...]: + """Build trivial-but-valid comm tensors for tracing the with-comm variant. + + Phase 0 finding: tracing with ``nswap == 0`` causes the dim to + specialize, so we must use ``nswap >= 1``. We use ``nswap == 1`` + with a single self-send swap whose sendlist points to ``nghost`` + local atoms (the actual indices don't matter for the trace — only + the validity of the pointer matters; ``border_op`` is opaque to + ``torch.export`` via the ``deepmd_export::border_op`` wrapper). + + Returns ``(send_list, send_proc, recv_proc, send_num, recv_num, + communicator, nlocal_ts, nghost_ts)`` — 8 tensors, matching the + canonical positional order of + ``forward_common_lower_exportable_with_comm``. + """ + nswap = 1 + send_count = max(1, nghost) + # The trace-time sendlist must be a real ``int**``: a tensor of + # int64 values, each value the address of a contiguous int32 array. 
+ indices = np.zeros(send_count, dtype=np.int32) + _TRACE_SENDLIST_KEEPALIVE.append(indices) + addr = indices.ctypes.data_as(ctypes.c_void_p).value + send_list = torch.tensor([addr], dtype=torch.int64, device=device) + send_proc = torch.zeros(nswap, dtype=torch.int32, device=device) + recv_proc = torch.zeros(nswap, dtype=torch.int32, device=device) + send_num = torch.tensor([send_count], dtype=torch.int32, device=device) + recv_num = torch.tensor([send_count], dtype=torch.int32, device=device) + communicator = torch.zeros(1, dtype=torch.int64, device=device) + nlocal_ts = torch.tensor(nloc, dtype=torch.int32, device=device) + nghost_ts = torch.tensor(nghost, dtype=torch.int32, device=device) + return ( + send_list, + send_proc, + recv_proc, + send_num, + recv_num, + communicator, + nlocal_ts, + nghost_ts, + ) + + def _make_sample_inputs( model: torch.nn.Module, nframes: int = 1, @@ -178,22 +260,42 @@ def _make_sample_inputs( def _build_dynamic_shapes( *sample_inputs: torch.Tensor | None, has_spin: bool = False, + with_comm_dict: bool = False, ) -> tuple: """Build dynamic shape specifications for torch.export. Marks nframes, nloc and nall as dynamic dimensions so the exported program handles arbitrary frame and atom counts. + When ``with_comm_dict`` is True, 8 additional comm tensors are + appended to the returned tuple — matching the positional order of + ``forward_common_lower_exportable_with_comm``. ``nswap`` is the + only dynamic dim among them; the rest are scalar or fixed-size. + Parameters ---------- *sample_inputs : torch.Tensor | None - Sample inputs: either 6 tensors (non-spin) or 7 tensors (spin). + Sample inputs: 6 tensors (non-spin) or 7 (spin), optionally + followed by 8 comm tensors when ``with_comm_dict``. has_spin : bool Whether the inputs include an extended_spin tensor. + with_comm_dict : bool + Whether the inputs include the 8 comm tensors. 
Returns a tuple (not dict) to match positional args of the make_fx traced module, whose arg names may have suffixes like ``_1``. """ - nframes_dim = torch.export.Dim("nframes", min=1) + # When tracing the with-comm variant, nframes is static at 1. + # Rationale: pt_expt's Repflow/Repformer parallel-mode override + # mirrors pt's repflows.py:593 ``node_ebd.squeeze(0)`` / + # ``…unsqueeze(0)`` pattern, which only works for nb=1. LAMMPS + # always drives inference with one frame so this matches reality. + # Marking nframes static (not dynamic) means it does not + # participate in duck-sizing — so the nframes==2 collision-avoidance + # chosen for the regular variant is *not* needed here, and the + # static value (1) is safe regardless of other tensors' sizes. + nframes_dim: torch.export.Dim | int = ( + 1 if with_comm_dict else torch.export.Dim("nframes", min=1) + ) nall_dim = torch.export.Dim("nall", min=1) nloc_dim = torch.export.Dim("nloc", min=1) @@ -201,7 +303,7 @@ def _build_dynamic_shapes( # (ext_coord, ext_atype, ext_spin, nlist, mapping, fparam, aparam) fparam = sample_inputs[5] aparam = sample_inputs[6] - return ( + base = ( {0: nframes_dim, 1: nall_dim}, # extended_coord: (nframes, nall, 3) {0: nframes_dim, 1: nall_dim}, # extended_atype: (nframes, nall) {0: nframes_dim, 1: nall_dim}, # extended_spin: (nframes, nall, 3) @@ -214,7 +316,7 @@ def _build_dynamic_shapes( # (ext_coord, ext_atype, nlist, mapping, fparam, aparam) fparam = sample_inputs[4] aparam = sample_inputs[5] - return ( + base = ( {0: nframes_dim, 1: nall_dim}, # extended_coord: (nframes, nall, 3) {0: nframes_dim, 1: nall_dim}, # extended_atype: (nframes, nall) {0: nframes_dim, 1: nloc_dim}, # nlist: (nframes, nloc, nnei) @@ -223,6 +325,21 @@ def _build_dynamic_shapes( {0: nframes_dim, 1: nloc_dim} if aparam is not None else None, # aparam ) + if not with_comm_dict: + return base + + # All 8 comm tensors have static shapes: + # send_list, send_proc, recv_proc, send_num, recv_num: (nswap,) + # 
communicator: (1,) + # nlocal, nghost: scalar + # nswap is fixed once at LAMMPS init (it depends on the processor + # grid which doesn't change at runtime), so it's safe to bake it + # in as static at the trace value. Marking nswap dynamic instead + # raises a Constraints-violated error because the trace specialises + # it to the sample value (1) downstream of border_op anyway — + # there is no graph variation across nswap values. + return base + (None, None, None, None, None, None, None, None) + def _collect_metadata(model: torch.nn.Module, is_spin: bool = False) -> dict: """Collect metadata from the model for C++ inference. @@ -268,6 +385,11 @@ def _collect_metadata(model: torch.nn.Module, is_spin: bool = False) -> dict: if is_spin: meta["ntypes_spin"] = model.spin.get_ntypes_spin() meta["use_spin"] = [bool(v) for v in model.spin.use_spin] + # Record whether the model uses GNN-style message passing. When + # True, .pt2 deserialization compiles a second ``with-comm`` artifact + # so multi-rank LAMMPS can drive ghost-atom MPI exchange through + # the model. C++ DeepPotPTExpt branches on this flag at load time. + meta["has_message_passing"] = _has_message_passing(model) return meta @@ -366,9 +488,27 @@ def deserialize_to_file( def _trace_and_export( data: dict, model_json_override: dict | None = None, + with_comm_dict: bool = False, ) -> tuple: """Common logic: build model, trace, export. + Parameters + ---------- + data + Serialized model dict (with "model" and optionally + "model_def_script" keys). + model_json_override + Optional alternate dict to embed as model.json (used by + ``dp compress`` to store the compressed model dict while + tracing the uncompressed one). + with_comm_dict + If True, trace ``forward_common_lower_exportable_with_comm`` + instead of the regular variant. 
The resulting exported program + accepts 8 additional positional comm tensors (``send_list``, + ``send_proc``, ``recv_proc``, ``send_num``, ``recv_num``, + ``communicator``, ``nlocal``, ``nghost``) used by the pt_expt + Repflow/Repformer override to drive MPI ghost-atom exchange. + Only valid for GNN models (see ``_has_message_passing``). Returns (exported, metadata, data_for_json, output_keys). """ from copy import ( @@ -412,19 +552,37 @@ def _trace_and_export( _orig_device = _env.DEVICE _env.DEVICE = torch.device("cpu") try: - nframes = 2 - sample_inputs = _make_sample_inputs(model, nframes=nframes, has_spin=is_spin) - # Collect all dimension sizes except dim-0 (nframes) from every tensor - other_dims: set[int] = set() - for t in sample_inputs: - if t is not None: - other_dims.update(t.shape[1:]) - while nframes in other_dims: - nframes += 1 - if nframes != 2: + if with_comm_dict: + # The pt_expt parallel-mode override (in pt's repflows.py + # line 593 too) uses ``squeeze(0)`` / ``unsqueeze(0)`` on + # ``node_ebd`` and so requires ``nframes == 1``. LAMMPS + # always drives inference with one frame, so this is the + # only realistic shape — and we mark dim 0 static in + # ``_build_dynamic_shapes`` to match. + nframes = 1 + sample_inputs = _make_sample_inputs( + model, + nframes=nframes, + has_spin=is_spin, + ) + else: + nframes = 2 sample_inputs = _make_sample_inputs( - model, nframes=nframes, has_spin=is_spin + model, + nframes=nframes, + has_spin=is_spin, ) + # Collect all dimension sizes except dim-0 (nframes) from every tensor + other_dims: set[int] = set() + for t in sample_inputs: + if t is not None: + other_dims.update(t.shape[1:]) + while nframes in other_dims: + nframes += 1 + if nframes != 2: + sample_inputs = _make_sample_inputs( + model, nframes=nframes, has_spin=is_spin + ) finally: _env.DEVICE = _orig_device @@ -435,40 +593,87 @@ def _trace_and_export( else: ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam = sample_inputs + # 3b. 
Build comm-tensor sample inputs when tracing the with-comm + # variant (only valid for GNN models). The actual values don't + # matter for tracing — only that they're valid tensors of the right + # shape and dtype. See ``_make_comm_sample_inputs``. + if with_comm_dict: + if not metadata.get("has_message_passing"): + raise ValueError( + "with_comm_dict=True requested but model has no GNN " + "message-passing descriptor — there's nothing to compile." + ) + nloc_sample = nlist_t.shape[1] + nall_sample = ext_atype.shape[1] + nghost_sample = nall_sample - nloc_sample + comm_inputs = _make_comm_sample_inputs( + nloc=nloc_sample, + nghost=nghost_sample, + device=torch.device("cpu"), + ) + sample_inputs = sample_inputs + comm_inputs + # 4. Trace via make_fx on CPU. # This decomposes torch.autograd.grad into aten ops so the resulting # GraphModule no longer contains autograd calls. if is_spin: - traced = model.forward_common_lower_exportable( - ext_coord, - ext_atype, - ext_spin, - nlist_t, - mapping_t, - fparam=fparam, - aparam=aparam, - do_atomic_virial=True, - tracing_mode="symbolic", - _allow_non_fake_inputs=True, - ) + if with_comm_dict: + traced = model.forward_common_lower_exportable_with_comm( + ext_coord, + ext_atype, + ext_spin, + nlist_t, + mapping_t, + fparam, + aparam, + *comm_inputs, + do_atomic_virial=True, + tracing_mode="symbolic", + _allow_non_fake_inputs=True, + ) + else: + traced = model.forward_common_lower_exportable( + ext_coord, + ext_atype, + ext_spin, + nlist_t, + mapping_t, + fparam=fparam, + aparam=aparam, + do_atomic_virial=True, + tracing_mode="symbolic", + _allow_non_fake_inputs=True, + ) # 5. Extract output keys from the CPU-traced module. 
- sample_out = traced( - ext_coord, ext_atype, ext_spin, nlist_t, mapping_t, fparam, aparam - ) + sample_out = traced(*sample_inputs) else: - traced = model.forward_common_lower_exportable( - ext_coord, - ext_atype, - nlist_t, - mapping_t, - fparam=fparam, - aparam=aparam, - do_atomic_virial=True, - tracing_mode="symbolic", - _allow_non_fake_inputs=True, - ) + if with_comm_dict: + traced = model.forward_common_lower_exportable_with_comm( + ext_coord, + ext_atype, + nlist_t, + mapping_t, + fparam, + aparam, + *comm_inputs, + do_atomic_virial=True, + tracing_mode="symbolic", + _allow_non_fake_inputs=True, + ) + else: + traced = model.forward_common_lower_exportable( + ext_coord, + ext_atype, + nlist_t, + mapping_t, + fparam=fparam, + aparam=aparam, + do_atomic_virial=True, + tracing_mode="symbolic", + _allow_non_fake_inputs=True, + ) # 5. Extract output keys from the CPU-traced module. - sample_out = traced(ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam) + sample_out = traced(*sample_inputs) output_keys = list(sample_out.keys()) @@ -477,7 +682,11 @@ def _trace_and_export( # graph. Exporting on CPU keeps devices consistent; we move the # ExportedProgram to the target device afterwards via the official # move_to_device_pass (avoids FakeTensor device-propagation errors). - dynamic_shapes = _build_dynamic_shapes(*sample_inputs, has_spin=is_spin) + dynamic_shapes = _build_dynamic_shapes( + *sample_inputs, + has_spin=is_spin, + with_comm_dict=with_comm_dict, + ) exported = torch.export.export( traced, sample_inputs, @@ -543,27 +752,71 @@ def _deserialize_to_file_pt2( Uses torch._inductor.aoti_compile_and_package to compile the exported program into a .pt2 package (ZIP archive with compiled shared libraries), then embeds metadata into the archive. + + For GNN models (descriptor.has_message_passing() is True), compiles + a SECOND ``with-comm`` artifact and packs it alongside the regular + one. 
The ``with-comm`` variant accepts comm-dict tensors as + additional positional inputs and drives MPI ghost-atom exchange via + ``deepmd_export::border_op``. The C++ ``DeepPotPTExpt`` loader picks + the artifact based on the LAMMPS rank count at runtime. + + Layout inside the .pt2 ZIP: + regular → artifact at the top of the archive (existing layout) + with-comm → ``extra/forward_lower_with_comm.pt2`` (nested ZIP) + metadata → ``extra/metadata.json`` with ``has_message_passing`` + and ``has_comm_artifact`` flags. + + Old .pt2 files (pre-this-change) lack ``has_comm_artifact`` so the + C++ loader must default to ``False`` when the field is missing. """ + import os + import tempfile import zipfile from torch._inductor import ( aoti_compile_and_package, ) + # First artifact: regular (no comm). Always produced. exported, metadata, data_for_json, output_keys = _trace_and_export( data, model_json_override ) - - # Compile via AOTInductor into a .pt2 package aoti_compile_and_package(exported, package_path=model_file) + metadata["output_keys"] = output_keys - # Embed metadata into the .pt2 ZIP archive + # Second artifact: with-comm. Only for GNN models. + has_comm_artifact = bool(metadata.get("has_message_passing")) + metadata["has_comm_artifact"] = has_comm_artifact + with_comm_bytes: bytes | None = None + with_comm_output_keys: list[str] | None = None + if has_comm_artifact: + exported_wc, _meta_wc, _data_wc, with_comm_output_keys = _trace_and_export( + data, + model_json_override, + with_comm_dict=True, + ) + with tempfile.TemporaryDirectory() as td: + wc_path = os.path.join(td, "forward_lower_with_comm.pt2") + aoti_compile_and_package(exported_wc, package_path=wc_path) + with open(wc_path, "rb") as f: + with_comm_bytes = f.read() + # The output keys are identical between the two artifacts (same + # forward_lower output dict); record only one set in metadata. + # If they ever diverge we'll surface a hard error here. 
+ if with_comm_output_keys != output_keys: + raise RuntimeError( + "with-comm artifact output keys diverge from regular: " + f"regular={output_keys} vs with_comm={with_comm_output_keys}" + ) + + # Embed metadata + supplementary files into the .pt2 ZIP archive model_def_script = data.get("model_def_script") or {} - metadata["output_keys"] = output_keys - with zipfile.ZipFile(model_file, "a") as zf: + with zipfile.ZipFile(model_file, "a", zipfile.ZIP_STORED) as zf: zf.writestr("extra/metadata.json", json.dumps(metadata)) zf.writestr("extra/model_def_script.json", json.dumps(model_def_script)) zf.writestr( "extra/model.json", json.dumps(data_for_json, separators=(",", ":")), ) + if with_comm_bytes is not None: + zf.writestr("extra/forward_lower_with_comm.pt2", with_comm_bytes) From 4b707a761655114a21b48e5818402c4ccbbd7ae0 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 26 Apr 2026 10:04:51 +0800 Subject: [PATCH 06/34] test(pt_expt): add comm_dict eager parity + export round-trip suite Five new test files covering the GNN MPI plumbing: * test_repflow_parallel.py / test_repformer_parallel.py Eager parity for DescrptBlockRepflows / DescrptBlockRepformers override. Single-rank self-exchange via ctypes pointer-array sendlist; verifies override output equals dpmodel default for both with-mapping and none-mapping variants. Includes a structural test for the spin branch and a guard test that use_loc_mapping=True + comm_dict raises RuntimeError. * test_border_op_backward.py Direct unit tests for torch.ops.deepmd.border_op_backward (float32 + float64) and the autograd path through deepmd_export::border_op. * test_export_with_comm.py Phase 3 round-trip for the dual-artifact .pt2 layout: GNN models produce both regular and forward_lower_with_comm artifacts; both load via aoti_load_package; outputs match for self-exchange. 
Plus three coverage tests for previously-untested branches: zero-nghost clamp in _make_comm_sample_inputs, hybrid-with-GNN detection in _has_message_passing, .pte with-comm trace round-trip. * test_spin_export_with_comm.py Spin model trace machinery (smoke test on se_e2_a) and end-to-end eager value parity for spin DPA3 models running through SpinModel.call_common_lower with comm_dict. --- .../descriptor/test_repflow_parallel.py | 411 ++++++++++++++++++ .../descriptor/test_repformer_parallel.py | 207 +++++++++ .../pt_expt/model/test_export_with_comm.py | 342 +++++++++++++++ .../model/test_spin_export_with_comm.py | 316 ++++++++++++++ .../pt_expt/utils/test_border_op_backward.py | 248 +++++++++++ 5 files changed, 1524 insertions(+) create mode 100644 source/tests/pt_expt/descriptor/test_repflow_parallel.py create mode 100644 source/tests/pt_expt/descriptor/test_repformer_parallel.py create mode 100644 source/tests/pt_expt/model/test_export_with_comm.py create mode 100644 source/tests/pt_expt/model/test_spin_export_with_comm.py create mode 100644 source/tests/pt_expt/utils/test_border_op_backward.py diff --git a/source/tests/pt_expt/descriptor/test_repflow_parallel.py b/source/tests/pt_expt/descriptor/test_repflow_parallel.py new file mode 100644 index 0000000000..61b84fe5af --- /dev/null +++ b/source/tests/pt_expt/descriptor/test_repflow_parallel.py @@ -0,0 +1,411 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Eager parity test for the pt_expt RepFlow parallel-mode override. + +Verifies that ``DescrptBlockRepflows._exchange_ghosts`` (the pt_expt +override) produces output identical to the dpmodel default +``_exchange_ghosts`` when the supplied ``comm_dict`` describes a +single-rank, self-only MPI exchange whose effect equals the per-layer +gather that the default does via ``mapping``. + +This is a Phase 2.5 gate: it exercises the override code path *eagerly* +(no torch.export, no AOTInductor) before we attempt the export round +trip in Phase 3. 
End-to-end multi-rank validation is deferred to the +Phase 5 LAMMPS test (``test_lammps_dpa3_pt2_mpi``). + +Implementation note: the underlying ``torch.ops.deepmd.border_op`` +treats ``sendlist_tensor`` as a packed pointer-array (``int**``). We +build that pointer array using numpy contiguous int32 arrays and pack +their addresses into an int64 tensor. In single-rank mode (no MPI +init) the C++ op enters the ``sendproc == me`` self-send branch and +performs an in-process memcpy from the sendlist-indexed rows into the +ghost slots — no MPI runtime needed. +""" + +from __future__ import ( + annotations, +) + +import ctypes + +import numpy as np +import pytest +import torch + +# Trigger registration of the deepmd_export::border_op opaque wrapper. +import deepmd.pt_expt.utils.comm # noqa: F401 +from deepmd.dpmodel.descriptor.dpa3 import ( + RepFlowArgs, +) +from deepmd.pt_expt.descriptor.dpa3 import ( + DescrptDPA3, +) +from deepmd.pt_expt.utils import ( + env, +) +from deepmd.pt_expt.utils.env import ( + PRECISION_DICT, +) + +from ...common.test_mixins import ( + TestCaseSingleFrameWithNlist, + get_tols, +) +from ...seed import ( + GLOBAL_SEED, +) + +# --------------------------------------------------------------------------- +# Helpers for building the comm_dict tensors + + +def _addr_of(np_arr: np.ndarray) -> int: + """Return the raw int address of a numpy array's data buffer.""" + return np_arr.ctypes.data_as(ctypes.c_void_p).value + + +def _build_self_comm_dict( + *, + nloc: int, + nghost: int, + sendlist_indices: np.ndarray, + device: torch.device, + keepalive: list, +) -> dict: + """Build a comm_dict for a single-rank self-exchange. + + Parameters + ---------- + nloc, nghost + Atom counts; ``nall = nloc + nghost``. + sendlist_indices + int32 array of length ``nghost`` giving local indices to copy + into successive ghost slots [nloc, nloc+1, ...]. + device + Target torch device for tensors. 
+ keepalive + List into which we store numpy buffers that must outlive the + forward pass (their addresses are referenced by sendlist_tensor). + """ + sendlist_indices = np.ascontiguousarray(sendlist_indices, dtype=np.int32) + keepalive.append(sendlist_indices) + nswap = 1 + addr = _addr_of(sendlist_indices) + # int** packed as one int64 entry per swap. + sendlist_tensor = torch.tensor([addr], dtype=torch.int64, device=device) + sendproc = torch.zeros(nswap, dtype=torch.int32, device=device) + recvproc = torch.zeros(nswap, dtype=torch.int32, device=device) + sendnum = torch.tensor([nghost], dtype=torch.int32, device=device) + recvnum = torch.tensor([nghost], dtype=torch.int32, device=device) + communicator = torch.zeros(1, dtype=torch.int64, device=device) + nlocal_ts = torch.tensor(nloc, dtype=torch.int32, device=device) + nghost_ts = torch.tensor(nghost, dtype=torch.int32, device=device) + return { + "send_list": sendlist_tensor, + "send_proc": sendproc, + "recv_proc": recvproc, + "send_num": sendnum, + "recv_num": recvnum, + "communicator": communicator, + "nlocal": nlocal_ts, + "nghost": nghost_ts, + } + + +# --------------------------------------------------------------------------- + + +class TestRepflowParallel(TestCaseSingleFrameWithNlist): + def setup_method(self) -> None: + TestCaseSingleFrameWithNlist.setUp(self) + self.device = env.DEVICE + + # ``mapping_at_parallel`` toggles between two scenarios: + # - "with-mapping": parallel call still receives the mapping tensor + # (matches what pt's DeepPotPT.cc does in production). + # - "none-mapping": parallel call receives ``mapping=None`` so the + # dpmodel branches that gate on ``mapping is not None`` are + # exercised (the regular code path still uses mapping for the + # reference, which proves the comm_dict path's correctness + # does not depend on mapping when override consumes comm_dict). 
+ @pytest.mark.parametrize("mapping_at_parallel", ["with-mapping", "none-mapping"]) + @pytest.mark.parametrize( + "prec", ["float64"] + ) # precision (one precision value is enough for the parity check) + def test_parallel_matches_default( + self, + prec: str, + mapping_at_parallel: str, + ) -> None: + """Override with comm_dict matching mapping must match default path.""" + rng = np.random.default_rng(GLOBAL_SEED) + nf, nloc, nnei = self.nlist.shape + davg = rng.normal(size=(self.nt, nnei, 4)) + dstd = rng.normal(size=(self.nt, nnei, 4)) + dstd = 0.1 + np.abs(dstd) + dtype = PRECISION_DICT[prec] + rtol, atol = get_tols(prec) + + repflow = RepFlowArgs( + n_dim=8, + e_dim=6, + a_dim=4, + nlayers=2, + e_rcut=self.rcut, + e_rcut_smth=self.rcut_smth, + e_sel=nnei, + a_rcut=self.rcut - 0.1, + a_rcut_smth=self.rcut_smth, + a_sel=nnei - 1, + axis_neuron=4, + update_angle=False, + update_style="res_residual", + update_residual_init="const", + smooth_edge_update=True, + ) + + dd = DescrptDPA3( + self.nt, + repflow=repflow, + exclude_types=[], + precision=prec, + use_econf_tebd=False, + type_map=None, + seed=GLOBAL_SEED, + use_loc_mapping=False, # need extended-region indexing for parity + ).to(self.device) + dd.repflows.mean = torch.tensor(davg, dtype=dtype, device=self.device) + dd.repflows.stddev = torch.tensor(dstd, dtype=dtype, device=self.device) + + # use only the first frame to keep the test simple — single rank, + # one frame, simple mapping ([0, 1, 2, 0]: ghost atom 3 mirrors local 0). + coord_ext = torch.tensor( + self.coord_ext[:1], + dtype=dtype, + device=self.device, + ) + atype_ext = torch.tensor( + self.atype_ext[:1], + dtype=torch.int64, + device=self.device, + ) + nlist = torch.tensor(self.nlist[:1], dtype=torch.int64, device=self.device) + mapping = torch.tensor( + self.mapping[:1], + dtype=torch.int64, + device=self.device, + ) + nall = self.nall + + # Default path (comm_dict=None) — uses gather via mapping. 
+ rd_default, _, _, _, _ = dd(coord_ext, atype_ext, nlist, mapping) + + # Parallel path: build a comm_dict whose sendlist mirrors the + # extended portion of mapping. For each ghost slot ii in + # [nloc, nall), border_op writes node_ebd[sendlist[ii - nloc]], + # so sendlist must match mapping[nloc:nall]. + keepalive: list = [] + ghost_sources = self.mapping[0, nloc:].astype(np.int32) + comm_dict = _build_self_comm_dict( + nloc=nloc, + nghost=nall - nloc, + sendlist_indices=ghost_sources, + device=self.device, + keepalive=keepalive, + ) + + mapping_for_parallel = ( + mapping if mapping_at_parallel == "with-mapping" else None + ) + rd_parallel, _, _, _, _ = dd( + coord_ext, + atype_ext, + nlist, + mapping_for_parallel, + comm_dict=comm_dict, + ) + + np.testing.assert_allclose( + rd_parallel.detach().cpu().numpy(), + rd_default.detach().cpu().numpy(), + rtol=rtol, + atol=atol, + ) + + def test_use_loc_mapping_with_comm_dict_raises(self) -> None: + """``use_loc_mapping=True`` + ``comm_dict`` is contradictory. + + The local-mapping codepath skips per-layer ghost exchange + entirely, so combining it with ``comm_dict`` would silently + drop the parallel behaviour. Verify the override raises a + clear error rather than producing wrong output. 
+ """ + rng = np.random.default_rng(GLOBAL_SEED) + nf, nloc, nnei = self.nlist.shape + davg = rng.normal(size=(self.nt, nnei, 4)) + dstd = 0.1 + np.abs(rng.normal(size=(self.nt, nnei, 4))) + + repflow = RepFlowArgs( + n_dim=8, + e_dim=6, + a_dim=4, + nlayers=1, + e_rcut=self.rcut, + e_rcut_smth=self.rcut_smth, + e_sel=nnei, + a_rcut=self.rcut - 0.1, + a_rcut_smth=self.rcut_smth, + a_sel=nnei - 1, + axis_neuron=4, + update_angle=False, + update_style="res_residual", + update_residual_init="const", + smooth_edge_update=True, + ) + dd = DescrptDPA3( + self.nt, + repflow=repflow, + exclude_types=[], + precision="float64", + use_econf_tebd=False, + type_map=None, + seed=GLOBAL_SEED, + use_loc_mapping=True, # contradictory with comm_dict + ).to(self.device) + dd.repflows.mean = torch.tensor(davg, dtype=torch.float64, device=self.device) + dd.repflows.stddev = torch.tensor(dstd, dtype=torch.float64, device=self.device) + + coord_ext = torch.tensor( + self.coord_ext[:1], + dtype=torch.float64, + device=self.device, + ) + atype_ext = torch.tensor( + self.atype_ext[:1], + dtype=torch.int64, + device=self.device, + ) + nlist = torch.tensor(self.nlist[:1], dtype=torch.int64, device=self.device) + mapping = torch.tensor( + self.mapping[:1], + dtype=torch.int64, + device=self.device, + ) + + keepalive: list = [] + ghost_sources = self.mapping[0, nloc:].astype(np.int32) + comm_dict = _build_self_comm_dict( + nloc=nloc, + nghost=self.nall - nloc, + sendlist_indices=ghost_sources, + device=self.device, + keepalive=keepalive, + ) + + with pytest.raises(RuntimeError, match="use_loc_mapping=True"): + dd(coord_ext, atype_ext, nlist, mapping, comm_dict=comm_dict) + + def test_spin_branch_runs(self) -> None: + """Structural test for the ``has_spin`` branch of _exchange_ghosts. 
+ + Builds a synthetic input that satisfies the spin path's atom- + doubling invariant (``nloc`` and ``nall`` even), invokes the + override directly with ``comm_dict["has_spin"]`` set, and + verifies the output shape matches the input. This catches + regressions in the split-real-virtual + concat_switch_virtual + code path without requiring a full spin model. + """ + from deepmd.pt_expt.descriptor.repflows import ( + DescrptBlockRepflows, + ) + + # Build a minimally-initialised block instance via deserialize + # of a tiny dpmodel block. We just need an instance to call + # the method on; method behaviour is independent of weights. + rng = np.random.default_rng(GLOBAL_SEED) + nf, nloc, nnei = self.nlist.shape + davg = rng.normal(size=(self.nt, nnei, 4)) + dstd = 0.1 + np.abs(rng.normal(size=(self.nt, nnei, 4))) + + repflow = RepFlowArgs( + n_dim=8, + e_dim=6, + a_dim=4, + nlayers=1, + e_rcut=self.rcut, + e_rcut_smth=self.rcut_smth, + e_sel=nnei, + a_rcut=self.rcut - 0.1, + a_rcut_smth=self.rcut_smth, + a_sel=nnei - 1, + axis_neuron=4, + update_angle=False, + update_style="res_residual", + update_residual_init="const", + smooth_edge_update=True, + ) + dd = DescrptDPA3( + self.nt, + repflow=repflow, + exclude_types=[], + precision="float64", + use_econf_tebd=False, + type_map=None, + seed=GLOBAL_SEED, + use_loc_mapping=False, + ).to(self.device) + dd.repflows.mean = torch.tensor(davg, dtype=torch.float64, device=self.device) + dd.repflows.stddev = torch.tensor(dstd, dtype=torch.float64, device=self.device) + block = dd.repflows + assert isinstance(block, DescrptBlockRepflows) + + # Pseudo-spin shapes: nloc and nall are even; n_dim from the + # model. The spin path splits along dim 1 into real/virtual + # halves and concats along dim 2. 
+ n_dim = block.n_dim + nloc_spin, nghost_spin = 4, 2 + nall_spin = nloc_spin + nghost_spin + # node_ebd: (1, nloc_spin, n_dim) + node_ebd = torch.randn( + 1, + nloc_spin, + n_dim, + dtype=torch.float64, + device=self.device, + ) + + keepalive: list = [] + # sendlist mirrors local-to-ghost slot for one ghost rank. + # Real ghost slots are real_nall-real_nloc = 1 atom -> sendlist + # has 1 entry. Self-send branch will copy local index 0. + sendlist_indices = np.array([0], dtype=np.int32) + comm_dict = _build_self_comm_dict( + nloc=nloc_spin // 2, + nghost=nghost_spin // 2, + sendlist_indices=sendlist_indices, + device=self.device, + keepalive=keepalive, + ) + comm_dict["has_spin"] = torch.tensor( + [1], + dtype=torch.int32, + device=self.device, + ) + + # Direct invocation of _exchange_ghosts on the block. + out = block._exchange_ghosts( + node_ebd, + mapping_tiled=None, + comm_dict=comm_dict, + nall=nall_spin, + nloc=nloc_spin, + ) + # concat_switch_virtual produces a tensor of shape + # (1, nall_spin, n_dim) — 2 real + 2 virtual + 1 ghost-real + + # 1 ghost-virtual interleaved per the helper's contract. + # The exact structure is: dim 1 of out is doubled relative to the + # real_nall (real_nloc + real_nghost = 3); for nloc_spin=4, + # nall_spin=6, the helper outputs 2*real_nall = 6 rows. + assert out.shape[0] == 1 + assert out.shape[2] == n_dim + # Spin path returns shape (1, 2*real_nall, n_dim) = (1, nall_spin, n_dim). + assert out.shape[1] == nall_spin diff --git a/source/tests/pt_expt/descriptor/test_repformer_parallel.py b/source/tests/pt_expt/descriptor/test_repformer_parallel.py new file mode 100644 index 0000000000..ca0bd035e7 --- /dev/null +++ b/source/tests/pt_expt/descriptor/test_repformer_parallel.py @@ -0,0 +1,207 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Eager parity test for the pt_expt Repformer parallel-mode override. + +Mirror of ``test_repflow_parallel.py`` but for DPA2 (which uses +``DescrptBlockRepformers``). 
Same single-rank self-exchange trick: +``sendlist`` mirrors ``mapping[nloc:]`` so the C++ ``border_op``'s +self-send branch reproduces the gather that the dpmodel default does. +""" + +from __future__ import ( + annotations, +) + +import ctypes + +import numpy as np +import pytest +import torch + +# Trigger registration of the deepmd_export::border_op opaque wrapper. +import deepmd.pt_expt.utils.comm # noqa: F401 +from deepmd.dpmodel.descriptor.dpa2 import ( + RepformerArgs, + RepinitArgs, +) +from deepmd.pt_expt.descriptor.dpa2 import ( + DescrptDPA2, +) +from deepmd.pt_expt.utils import ( + env, +) +from deepmd.pt_expt.utils.env import ( + PRECISION_DICT, +) + +from ...common.test_mixins import ( + TestCaseSingleFrameWithNlist, + get_tols, +) +from ...seed import ( + GLOBAL_SEED, +) + + +def _addr_of(np_arr: np.ndarray) -> int: + return np_arr.ctypes.data_as(ctypes.c_void_p).value + + +def _build_self_comm_dict( + *, + nloc: int, + nghost: int, + sendlist_indices: np.ndarray, + device: torch.device, + keepalive: list, +) -> dict: + sendlist_indices = np.ascontiguousarray(sendlist_indices, dtype=np.int32) + keepalive.append(sendlist_indices) + nswap = 1 + addr = _addr_of(sendlist_indices) + sendlist_tensor = torch.tensor([addr], dtype=torch.int64, device=device) + sendproc = torch.zeros(nswap, dtype=torch.int32, device=device) + recvproc = torch.zeros(nswap, dtype=torch.int32, device=device) + sendnum = torch.tensor([nghost], dtype=torch.int32, device=device) + recvnum = torch.tensor([nghost], dtype=torch.int32, device=device) + communicator = torch.zeros(1, dtype=torch.int64, device=device) + nlocal_ts = torch.tensor(nloc, dtype=torch.int32, device=device) + nghost_ts = torch.tensor(nghost, dtype=torch.int32, device=device) + return { + "send_list": sendlist_tensor, + "send_proc": sendproc, + "recv_proc": recvproc, + "send_num": sendnum, + "recv_num": recvnum, + "communicator": communicator, + "nlocal": nlocal_ts, + "nghost": nghost_ts, + } + + +class 
TestRepformerParallel(TestCaseSingleFrameWithNlist): + def setup_method(self) -> None: + TestCaseSingleFrameWithNlist.setUp(self) + self.device = env.DEVICE + + # See test_repflow_parallel.py for rationale on the "none-mapping" + # variant — exercises dpa2's "skip pre-block gather" branch with + # mapping=None, which is the realistic LAMMPS multi-rank shape. + @pytest.mark.parametrize("mapping_at_parallel", ["with-mapping", "none-mapping"]) + @pytest.mark.parametrize("prec", ["float64"]) # precision + def test_parallel_matches_default( + self, + prec: str, + mapping_at_parallel: str, + ) -> None: + rng = np.random.default_rng(GLOBAL_SEED) + nf, nloc, nnei = self.nlist.shape + davg = rng.normal(size=(self.nt, nnei, 4)) + dstd = rng.normal(size=(self.nt, nnei, 4)) + dstd = 0.1 + np.abs(dstd) + davg_2 = rng.normal(size=(self.nt, nnei // 2, 4)) + dstd_2 = rng.normal(size=(self.nt, nnei // 2, 4)) + dstd_2 = 0.1 + np.abs(dstd_2) + + dtype = PRECISION_DICT[prec] + rtol, atol = get_tols(prec) + if prec == "float64": + atol = 1e-8 + + repinit = RepinitArgs( + rcut=self.rcut, + rcut_smth=self.rcut_smth, + nsel=self.sel_mix, + tebd_input_mode="concat", + set_davg_zero=True, + ) + repformer = RepformerArgs( + rcut=self.rcut / 2, + rcut_smth=self.rcut_smth, + nsel=nnei // 2, + nlayers=2, + g1_dim=12, + g2_dim=8, + axis_neuron=4, + update_g1_has_conv=True, + update_g1_has_drrd=True, + update_g1_has_grrg=True, + update_g1_has_attn=True, + update_g2_has_g1g1=True, + update_g2_has_attn=True, + update_h2=False, + attn1_hidden=12, + attn1_nhead=2, + attn2_hidden=8, + attn2_nhead=2, + attn2_has_gate=False, + update_style="res_avg", + set_davg_zero=True, + use_sqrt_nnei=False, + g1_out_conv=False, + g1_out_mlp=False, + ) + + dd = DescrptDPA2( + self.nt, + repinit=repinit, + repformer=repformer, + smooth=True, + exclude_types=[], + add_tebd_to_repinit_out=False, + precision=prec, + use_econf_tebd=False, + type_map=None, + seed=GLOBAL_SEED, + ).to(self.device) + dd.repinit.mean = 
torch.tensor(davg, dtype=dtype, device=self.device) + dd.repinit.stddev = torch.tensor(dstd, dtype=dtype, device=self.device) + dd.repformers.mean = torch.tensor(davg_2, dtype=dtype, device=self.device) + dd.repformers.stddev = torch.tensor(dstd_2, dtype=dtype, device=self.device) + + coord_ext = torch.tensor( + self.coord_ext[:1], + dtype=dtype, + device=self.device, + ) + atype_ext = torch.tensor( + self.atype_ext[:1], + dtype=torch.int64, + device=self.device, + ) + nlist = torch.tensor(self.nlist[:1], dtype=torch.int64, device=self.device) + mapping = torch.tensor( + self.mapping[:1], + dtype=torch.int64, + device=self.device, + ) + nall = self.nall + + rd_default, _, _, _, _ = dd(coord_ext, atype_ext, nlist, mapping) + + keepalive: list = [] + ghost_sources = self.mapping[0, nloc:].astype(np.int32) + comm_dict = _build_self_comm_dict( + nloc=nloc, + nghost=nall - nloc, + sendlist_indices=ghost_sources, + device=self.device, + keepalive=keepalive, + ) + + mapping_for_parallel = ( + mapping if mapping_at_parallel == "with-mapping" else None + ) + rd_parallel, _, _, _, _ = dd( + coord_ext, + atype_ext, + nlist, + mapping_for_parallel, + comm_dict=comm_dict, + ) + + np.testing.assert_allclose( + rd_parallel.detach().cpu().numpy(), + rd_default.detach().cpu().numpy(), + rtol=rtol, + atol=atol, + ) diff --git a/source/tests/pt_expt/model/test_export_with_comm.py b/source/tests/pt_expt/model/test_export_with_comm.py new file mode 100644 index 0000000000..24c27310ee --- /dev/null +++ b/source/tests/pt_expt/model/test_export_with_comm.py @@ -0,0 +1,342 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Phase 3 round-trip test for the with-comm AOTInductor artifact. + +For a GNN model (DPA3 here), ``deserialize_to_file`` produces a .pt2 +archive containing TWO compiled artifacts: + * the regular forward_lower (no comm), packed at the top of the ZIP. + * a ``forward_lower_with_comm`` variant nested at + ``extra/forward_lower_with_comm.pt2``. + +This test verifies: + 1. 
Both artifacts are present in the archive. + 2. ``metadata.json`` carries the new ``has_message_passing`` and + ``has_comm_artifact`` flags. + 3. The with-comm artifact loads via ``aoti_load_package`` and runs + when fed valid comm-dict tensors built via the ctypes pointer + trick (see ``test_repflow_parallel.py``). + 4. The with-comm artifact's output matches the regular artifact's + output for a single-rank self-exchange whose effect is identity + (sendlist mirrors the extended-region mapping, which is what the + gather in the regular path produces). +""" + +from __future__ import ( + annotations, +) + +import ctypes +import json +import os +import tempfile +import zipfile + +import numpy as np +import pytest +import torch + +# Trigger registration of the deepmd_export::border_op opaque wrapper +# (needed by the with-comm artifact at runtime). +import deepmd.pt_expt.utils.comm # noqa: F401 +from deepmd.pt_expt.model.get_model import ( + get_model, +) +from deepmd.pt_expt.utils.serialization import ( + _make_sample_inputs, + deserialize_to_file, +) + +_DPA3_CONFIG = { + "type_map": ["O", "H"], + "descriptor": { + "type": "dpa3", + "repflow": { + "n_dim": 8, + "e_dim": 6, + "a_dim": 4, + "nlayers": 1, + "e_rcut": 4.0, + "e_rcut_smth": 0.5, + "e_sel": 12, + "a_rcut": 3.5, + "a_rcut_smth": 0.5, + "a_sel": 8, + "axis_neuron": 4, + "update_angle": False, + }, + "use_loc_mapping": False, + }, + "fitting_net": {"neuron": [16, 16], "seed": 1}, +} + + +def _addr_of(np_arr: np.ndarray) -> int: + return np_arr.ctypes.data_as(ctypes.c_void_p).value + + +def _build_self_comm_inputs( + nloc: int, + nghost: int, + sendlist_indices: np.ndarray, + keepalive: list, +) -> tuple[torch.Tensor, ...]: + """Build runtime comm tensors for a single-rank self-send.""" + sendlist_indices = np.ascontiguousarray(sendlist_indices, dtype=np.int32) + keepalive.append(sendlist_indices) + nswap = 1 + addr = _addr_of(sendlist_indices) + send_list = torch.tensor([addr], dtype=torch.int64) + send_proc 
= torch.zeros(nswap, dtype=torch.int32) + recv_proc = torch.zeros(nswap, dtype=torch.int32) + send_num = torch.tensor([nghost], dtype=torch.int32) + recv_num = torch.tensor([nghost], dtype=torch.int32) + communicator = torch.zeros(1, dtype=torch.int64) + nlocal_ts = torch.tensor(nloc, dtype=torch.int32) + nghost_ts = torch.tensor(nghost, dtype=torch.int32) + return ( + send_list, + send_proc, + recv_proc, + send_num, + recv_num, + communicator, + nlocal_ts, + nghost_ts, + ) + + +@pytest.mark.skipif( + os.environ.get("CI") == "true", + reason="AOTInductor compile is slow (~30s); run locally only by default.", +) +def test_pt2_dual_artifact_for_gnn(tmp_path) -> None: + """End-to-end: GNN model produces dual-artifact .pt2; both load.""" + model = get_model(_DPA3_CONFIG) + model.to("cpu") + model.eval() + + # Serialize → deserialize_to_file (compiles and packs both artifacts) + pt2_path = str(tmp_path / "test_dpa3.pt2") + data = {"model": model.serialize()} + deserialize_to_file(pt2_path, data) + assert os.path.exists(pt2_path) + + # 1. ZIP layout sanity + with zipfile.ZipFile(pt2_path, "r") as zf: + names = set(zf.namelist()) + meta = json.loads(zf.read("extra/metadata.json").decode("utf-8")) + assert "extra/forward_lower_with_comm.pt2" in names, ( + f"with-comm artifact missing; names={sorted(names)}" + ) + assert meta["has_message_passing"] is True + assert meta["has_comm_artifact"] is True + + # 2. Both artifacts load. + from torch._inductor import ( + aoti_load_package, + ) + + regular = aoti_load_package(pt2_path) + + with tempfile.TemporaryDirectory() as td: + wc_path = os.path.join(td, "fl_wc.pt2") + with zipfile.ZipFile(pt2_path, "r") as zf: + with open(wc_path, "wb") as f: + f.write(zf.read("extra/forward_lower_with_comm.pt2")) + with_comm = aoti_load_package(wc_path) + + # 3. Run both artifacts with nframes=1 (matches what the with-comm + # artifact requires; LAMMPS always passes one frame anyway). 
+ sample = _make_sample_inputs(model, nframes=1, has_spin=False) + ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam = sample + nloc = nlist_t.shape[1] + nall = ext_atype.shape[1] + nghost = nall - nloc + + out_regular = regular(ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam) + + # 4. Build runtime comm tensors mirroring the mapping (single-rank + # self-send: ghost slot ii receives node[mapping[ii]], identical to + # the gather in the regular path). + keepalive: list = [] + ghost_sources = mapping_t[0, nloc:].cpu().numpy().astype(np.int32) + comm_inputs = _build_self_comm_inputs( + nloc=nloc, + nghost=nghost, + sendlist_indices=ghost_sources, + keepalive=keepalive, + ) + + out_with_comm = with_comm( + ext_coord, + ext_atype, + nlist_t, + mapping_t, + fparam, + aparam, + *comm_inputs, + ) + + # 5. Outputs must match (parity gate, eager-mode equivalent). + for key in out_regular: + np.testing.assert_allclose( + out_with_comm[key].detach().cpu().numpy(), + out_regular[key].detach().cpu().numpy(), + rtol=0, + atol=1e-10, + err_msg=f"output[{key}] differs between regular and with-comm", + ) + + +# --------------------------------------------------------------------------- +# Coverage for previously-untested branches +# --------------------------------------------------------------------------- + + +def test_make_comm_sample_inputs_clamps_zero_nghost() -> None: + """``_make_comm_sample_inputs(nghost=0)`` must produce valid tensors. + + The clamp ``send_count = max(1, nghost)`` ensures we never pass an + empty pointer-array to border_op. This test exercises the + ``nghost == 0`` branch (a model exported on a system whose entire + domain fits in one rank with no ghosts) — the trace must still + produce well-formed comm tensors of shape (1,). 
+ """ + from deepmd.pt_expt.utils.serialization import ( + _make_comm_sample_inputs, + ) + + comm_inputs = _make_comm_sample_inputs( + nloc=4, + nghost=0, + device=torch.device("cpu"), + ) + assert len(comm_inputs) == 8 + ( + send_list, + send_proc, + recv_proc, + send_num, + recv_num, + communicator, + nlocal, + nghost_t, + ) = comm_inputs + # nswap stays at 1 (Phase 0: nswap=0 specializes during export). + assert send_list.shape == (1,) + assert send_proc.shape == (1,) + assert recv_proc.shape == (1,) + assert send_num.shape == (1,) + assert recv_num.shape == (1,) + # send_count is clamped to >=1, so send_num is also clamped. + assert send_num.item() == 1 + assert recv_num.item() == 1 + # Scalar metadata reports the original (un-clamped) values. + assert nlocal.item() == 4 + assert nghost_t.item() == 0 + + +def test_has_message_passing_for_hybrid_with_gnn() -> None: + """``_has_message_passing`` correctly reports True for hybrid + descriptors whose children include a GNN block. + + The hybrid descriptor delegates ``has_message_passing()`` to its + children — if any child has message passing, the hybrid does too. + Our metadata flag (``has_message_passing``) is what + ``_deserialize_to_file_pt2`` uses to decide whether to compile + the with-comm artifact, so the hybrid case must route correctly. + """ + from deepmd.pt_expt.model.get_model import get_model as get_pt_expt_model + from deepmd.pt_expt.utils.serialization import ( + _has_message_passing, + ) + + config = { + "type_map": ["O", "H"], + "descriptor": { + "type": "hybrid", + "list": [ + # Non-GNN child. + { + "type": "se_e2_a", + "sel": [12, 12], + "rcut": 4.0, + "rcut_smth": 0.5, + "neuron": [4, 8], + "axis_neuron": 4, + "seed": 1, + }, + # GNN child (DPA3). 
+ { + "type": "dpa3", + "repflow": { + "n_dim": 4, + "e_dim": 4, + "a_dim": 4, + "nlayers": 1, + "e_rcut": 4.0, + "e_rcut_smth": 0.5, + "e_sel": 8, + "a_rcut": 3.5, + "a_rcut_smth": 0.5, + "a_sel": 4, + "axis_neuron": 4, + "update_angle": False, + }, + "use_loc_mapping": False, + }, + ], + }, + "fitting_net": {"neuron": [8, 8], "seed": 1}, + } + model = get_pt_expt_model(config) + model.to("cpu") + model.eval() + assert _has_message_passing(model) is True, ( + "hybrid model with a GNN child must report has_message_passing=True" + ) + + +def test_pte_with_comm_dict_traces_and_loads(tmp_path) -> None: + """``_trace_and_export(with_comm_dict=True)`` produces a valid + ExportedProgram that can be saved as .pte and loaded back. + + .pte is Python-only (the multi-rank consumer is C++/LAMMPS via + .pt2), so production has no business calling this path. But the + trace machinery is the same as the .pt2 path, so .pte serves as + a cheap (no AOTI compile) round-trip test for the with-comm + export pipeline. + """ + from deepmd.pt_expt.utils.serialization import ( + _trace_and_export, + ) + + model = get_model(_DPA3_CONFIG) + model.to("cpu") + model.eval() + data = {"model": model.serialize()} + + exported, metadata, _data_for_json, output_keys = _trace_and_export( + data, + model_json_override=None, + with_comm_dict=True, + ) + assert metadata["has_message_passing"] is True + # output_keys mirrors what the regular trace would produce; at + # least one energy-related key must be present. + assert any(k.startswith("energy") for k in output_keys), ( + f"expected an 'energy*' output key; got {output_keys}" + ) + + # Save as .pte and reload — verifies the ExportedProgram is + # structurally valid (no broken graph or missing constants). 
+ pte_path = str(tmp_path / "fl_with_comm.pte") + torch.export.save(exported, pte_path) + assert os.path.exists(pte_path) + loaded = torch.export.load(pte_path) + # Sanity: the loaded program has the expected number of inputs + # (6 base + 8 comm = 14). + spec = loaded.module().graph.find_nodes(op="placeholder") + assert len(spec) == 14, ( + f"with-comm exported program must accept 14 positional inputs " + f"(6 base + 8 comm); got {len(spec)}" + ) diff --git a/source/tests/pt_expt/model/test_spin_export_with_comm.py b/source/tests/pt_expt/model/test_spin_export_with_comm.py new file mode 100644 index 0000000000..93b22bf864 --- /dev/null +++ b/source/tests/pt_expt/model/test_spin_export_with_comm.py @@ -0,0 +1,316 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Tests for SpinModel + comm_dict end-to-end. + +Two coverage levels: + +1. ``test_spin_forward_common_lower_exportable_with_comm_traces``: + verifies the trace machinery (positional comm-tensor plumbing, + has_spin injection, make_fx symbolic mode) on a spin model with a + non-GNN descriptor (se_e2_a). The non-GNN case is the cheapest + smoke test since se_e2_a's `call` accepts and drops comm_dict — + exercising the wrapper/spin model layers without paying for GNN + compile cost. + +2. ``test_spin_dpa3_eager_parity``: end-to-end value-correctness for + a spin DPA3 model running through ``call_common_lower`` in eager + mode, with a comm_dict whose self-exchange mirrors the mapping. + Asserts the result matches the no-comm reference. This proves + ``SpinModel.call_common_lower`` correctly forwards comm_dict + through to the GNN repflow, AND that the spin branch of + ``_exchange_ghosts`` (real/virtual split + concat_switch_virtual) + reproduces the regular gather path on real values. 
+""" + +from __future__ import ( + annotations, +) + +import ctypes + +import numpy as np +import torch + +import deepmd.pt_expt.utils.comm # noqa: F401 - opaque op registration +from deepmd.dpmodel.model.model import get_model as get_model_dp +from deepmd.pt_expt.model.spin_ener_model import ( + SpinEnergyModel, +) + +SPIN_GNN_DATA = { + "type_map": ["O", "H", "B"], + "descriptor": { + "type": "se_e2_a", + "sel": [20, 20, 20], + "rcut_smth": 0.50, + "rcut": 4.00, + "neuron": [3, 6], + "resnet_dt": False, + "axis_neuron": 2, + "precision": "float64", + "type_one_side": True, + "seed": 1, + }, + "fitting_net": { + "neuron": [5, 5], + "resnet_dt": True, + "precision": "float64", + "seed": 1, + }, + "spin": { + "use_spin": [True, False, False], + "virtual_scale": [0.3140], + }, +} + + +def _addr_of(np_arr: np.ndarray) -> int: + return np_arr.ctypes.data_as(ctypes.c_void_p).value + + +def _build_self_comm_inputs(nloc: int, nghost: int): + """Build trivial-but-valid comm tensors for tracing.""" + keepalive: list[np.ndarray] = [] + indices = np.zeros(max(1, nghost), dtype=np.int32) + keepalive.append(indices) + addr = _addr_of(indices) + nswap = 1 + return ( + torch.tensor([addr], dtype=torch.int64), # send_list + torch.zeros(nswap, dtype=torch.int32), # send_proc + torch.zeros(nswap, dtype=torch.int32), # recv_proc + torch.tensor([max(1, nghost)], dtype=torch.int32), # send_num + torch.tensor([max(1, nghost)], dtype=torch.int32), # recv_num + torch.zeros(1, dtype=torch.int64), # communicator + torch.tensor(nloc, dtype=torch.int32), # nlocal + torch.tensor(nghost, dtype=torch.int32), # nghost + ), keepalive + + +def test_spin_forward_common_lower_exportable_with_comm_traces() -> None: + """The spin variant of forward_common_lower_exportable_with_comm + produces a callable traced GraphModule. 
+ """ + dp_model = get_model_dp(SPIN_GNN_DATA) + model = SpinEnergyModel.deserialize(dp_model.serialize()).to("cpu") + model.eval() + + # Build sample inputs (nframes=1 to match the override's nb=1 + # constraint; spin doubles natoms). + nloc = 6 # 3 real + 3 virtual + nall = 8 # 1 ghost on each side + n_dim_coord = 3 + ext_coord = torch.zeros(1, nall, n_dim_coord, dtype=torch.float64) + ext_atype = torch.zeros(1, nall, dtype=torch.int64) + ext_spin = torch.zeros(1, nall, n_dim_coord, dtype=torch.float64) + nlist = torch.zeros(1, nloc, 6, dtype=torch.int64) # nnei from sel + mapping = torch.zeros(1, nall, dtype=torch.int64) + fparam = None + aparam = None + + comm_inputs, _keepalive = _build_self_comm_inputs(nloc=nloc, nghost=nall - nloc) + + # The trace should succeed without raising. We do NOT verify + # numerical correctness here — that would require a real spin GNN + # model + live MPI (deferred to Phase 5 LAMMPS). This test only + # checks the trace-time machinery: positional arg plumbing, + # has_spin injection, and that make_fx symbolic mode produces a + # valid GraphModule. + traced = model.forward_common_lower_exportable_with_comm( + ext_coord, + ext_atype, + ext_spin, + nlist, + mapping, + fparam, + aparam, + *comm_inputs, + do_atomic_virial=True, + tracing_mode="symbolic", + _allow_non_fake_inputs=True, + ) + # The traced module must be a torch.nn.Module that can be invoked. + assert isinstance(traced, torch.nn.Module) + # And calling it with the same inputs returns a dict with the + # expected keys. + out = traced( + ext_coord, + ext_atype, + ext_spin, + nlist, + mapping, + fparam, + aparam, + *comm_inputs, + ) + assert isinstance(out, dict) + # forward_common_lower internal output names; specifics depend on + # the model's output def, just check at least one is present. 
+ assert any(k.startswith("energy") for k in out), ( + f"expected an 'energy*' key in trace output; got {list(out.keys())}" + ) + + +# --------------------------------------------------------------------------- +# 2. End-to-end value parity for spin DPA3 in eager mode +# --------------------------------------------------------------------------- + + +SPIN_DPA3_DATA = { + "type_map": ["O", "H"], + "descriptor": { + "type": "dpa3", + "repflow": { + "n_dim": 8, + "e_dim": 6, + "a_dim": 4, + "nlayers": 1, + "e_rcut": 4.0, + "e_rcut_smth": 0.5, + "e_sel": 8, + "a_rcut": 3.5, + "a_rcut_smth": 0.5, + "a_sel": 4, + "axis_neuron": 4, + "update_angle": False, + }, + "use_loc_mapping": False, + }, + "fitting_net": {"neuron": [16, 16], "seed": 1}, + "spin": {"use_spin": [True, False], "virtual_scale": [0.314]}, +} + + +def test_spin_dpa3_eager_parity() -> None: + """SpinModel.call_common_lower with comm_dict (self-exchange) must + match the no-comm reference for a spin DPA3 model. + + Setup mirrors the per-block parity tests but at the SpinModel + level so it exercises the full plumbing chain: + ``SpinModel.call_common_lower(comm_dict=...)`` + → process_spin_input_lower (atom-doubling) + → backbone EnergyModel.call_common_lower(comm_dict=...) + → atomic_model.forward_common_atomic(comm_dict=...) + → DescrptDPA3.call(comm_dict=...) + → DescrptBlockRepflows.call(comm_dict=...) + → DescrptBlockRepflows._exchange_ghosts (pt_expt override, + spin branch via has_spin in comm_dict) + + The comm_dict has has_spin=tensor([1]) and a sendlist that + mirrors the real-atom portion of the mapping. The override's + spin branch splits node_ebd into real/virtual halves, stacks + along feature dim, exchanges, then de-interleaves with + concat_switch_virtual. When the exchange produces the same + result as the gather (which it should for a self-mirror + sendlist), the spin model output must equal the no-comm output + bit-for-bit (atol 1e-12 for float64). 
+ """ + dp_model = get_model_dp(SPIN_DPA3_DATA) + model = SpinEnergyModel.deserialize(dp_model.serialize()).to("cpu") + model.eval() + + # Build a 2-atom test system: 1 real + 1 ghost real for type 0, + # plus the same in spin (use_spin=[True, False] means type 0 is + # spin-doubled, type 1 is not). After atom-doubling the model + # processes 2 real + 2 virtual = 4 atoms locally and 4 ghost + # slots. We use minimal nloc to keep the test fast. + nframes = 1 + nloc_real = 2 # 2 real atoms (both type 0 to keep simple) + nghost_real = 2 # 2 ghost real atoms + nall_real = nloc_real + nghost_real + rng = np.random.default_rng(42) + + # Coordinates and types (real only — spin model doubles internally). + coord_real = rng.uniform(0, 4.0, size=(nframes, nall_real, 3)).astype(np.float64) + atype_real = np.zeros((nframes, nall_real), dtype=np.int64) # all type 0 + spin_real = rng.uniform(-0.1, 0.1, size=(nframes, nall_real, 3)).astype(np.float64) + # mapping: ghost atoms mirror local atoms (ghost 0 → local 0, ghost 1 → local 1) + mapping_real = np.array( + [[0, 1, 0, 1]], + dtype=np.int64, + ) # nframes=1, nall_real=4 + + # Build extended-region nlist for the real atoms. Each real atom's + # neighbour list points to the other 3 atoms (within rcut by + # construction of small box). We don't need physically meaningful + # values — just well-formed nlist so the model runs. + nnei = 8 # matches e_sel + nlist_real = np.full((nframes, nloc_real, nnei), -1, dtype=np.int64) + for ii in range(nloc_real): + # neighbours = all other atoms (real + ghost) up to nnei + others = [j for j in range(nall_real) if j != ii][:nnei] + nlist_real[0, ii, : len(others)] = others + + # ``call_common_lower`` runs through ``transform_output`` which + # calls ``torch.autograd.grad`` on coord, so coord must require + # grad in eager mode. 
+ ext_coord = torch.tensor(coord_real, dtype=torch.float64, requires_grad=True) + ext_atype = torch.tensor(atype_real, dtype=torch.int64) + ext_spin = torch.tensor(spin_real, dtype=torch.float64) + nlist_t = torch.tensor(nlist_real, dtype=torch.int64) + mapping_t = torch.tensor(mapping_real, dtype=torch.int64) + + # 1. No-comm reference. + out_ref = model.call_common_lower( + ext_coord, + ext_atype, + ext_spin, + nlist_t, + mapping_t, + fparam=None, + aparam=None, + do_atomic_virial=False, + ) + + # 2. With comm_dict. The SpinModel internally doubles atoms to + # nloc=2*nloc_real=4 and nall=2*nall_real=8. The override's spin + # branch peels back to real_nloc=nloc_real and real_nall=nall_real. + # Sendlist must point to REAL local indices for each real ghost + # slot (mapping_real[nloc_real:nall_real]). + keepalive: list = [] + sendlist_indices = mapping_real[0, nloc_real:].astype(np.int32) + keepalive.append(sendlist_indices) + addr = sendlist_indices.ctypes.data_as(ctypes.c_void_p).value + nswap = 1 + nghost_real_count = nall_real - nloc_real + comm_dict = { + "send_list": torch.tensor([addr], dtype=torch.int64), + "send_proc": torch.zeros(nswap, dtype=torch.int32), + "recv_proc": torch.zeros(nswap, dtype=torch.int32), + "send_num": torch.tensor([nghost_real_count], dtype=torch.int32), + "recv_num": torch.tensor([nghost_real_count], dtype=torch.int32), + "communicator": torch.zeros(1, dtype=torch.int64), + # nlocal/nghost are the REAL counts (the override's spin branch + # halves nloc/nall internally). In production C++ side passes + # real counts here too — see DeepSpinPT.cc. + "nlocal": torch.tensor(nloc_real, dtype=torch.int32), + "nghost": torch.tensor(nghost_real_count, dtype=torch.int32), + # Triggers spin branch in the override. + "has_spin": torch.tensor([1], dtype=torch.int32), + } + + # Fresh coord tensor (the first call's backward graph would otherwise + # be reused / cause double-backward errors). 
+ ext_coord_2 = torch.tensor(coord_real, dtype=torch.float64, requires_grad=True) + out_parallel = model.call_common_lower( + ext_coord_2, + ext_atype, + ext_spin, + nlist_t, + mapping_t, + fparam=None, + aparam=None, + do_atomic_virial=False, + comm_dict=comm_dict, + ) + + # 3. Compare every output key. + for key in out_ref: + ref = out_ref[key].detach().cpu().numpy() + par = out_parallel[key].detach().cpu().numpy() + np.testing.assert_allclose( + par, + ref, + atol=1e-10, + rtol=0, + err_msg=f"output[{key}] mismatch between no-comm and comm_dict path", + ) diff --git a/source/tests/pt_expt/utils/test_border_op_backward.py b/source/tests/pt_expt/utils/test_border_op_backward.py new file mode 100644 index 0000000000..c46705ad8a --- /dev/null +++ b/source/tests/pt_expt/utils/test_border_op_backward.py @@ -0,0 +1,248 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Unit tests for the new C++ symbol ``deepmd::border_op_backward`` and +the pt_expt autograd path that dispatches to it. + +Tests two distinct surfaces: + +1. **Direct op call** — invokes ``torch.ops.deepmd.border_op_backward`` + with hand-built comm tensors (single-rank self-exchange via ctypes + pointer trick). Verifies the symbol is registered, accepts the + expected positional args, and produces a correctly-shaped output + for both ``float32`` and ``float64`` (covers the ``backward_t`` + template's two specializations). + +2. **Through the opaque wrapper** — exercises + ``torch.ops.deepmd_export.border_op``'s ``register_autograd`` + pathway. Calls the wrapper inside an autograd context, asks for + ``grad`` w.r.t. the ``g1`` input, and verifies the gradient flows + through (matches the gradient produced by an equivalent + ``index_select`` + ``index_add_`` Python implementation, which is + the reference for the symmetric MPI exchange in single-rank). 
+""" + +from __future__ import ( + annotations, +) + +import ctypes + +import numpy as np +import pytest +import torch + +# Ensure the new C++ symbol is loaded. pt_expt imports deepmd.pt for +# the custom-op .so. +import deepmd.pt +import deepmd.pt_expt.utils.comm # noqa: F401 - registers deepmd_export::border_op + + +def _addr_of(np_arr: np.ndarray) -> int: + return np_arr.ctypes.data_as(ctypes.c_void_p).value + + +def _build_self_swap( + nloc: int, + nghost: int, + sendlist_indices: np.ndarray, + keepalive: list, + dtype: torch.dtype, +): + """Build comm tensors for a single self-exchange swap.""" + sendlist_indices = np.ascontiguousarray(sendlist_indices, dtype=np.int32) + keepalive.append(sendlist_indices) + nswap = 1 + addr = _addr_of(sendlist_indices) + sendlist = torch.tensor([addr], dtype=torch.int64) + sendproc = torch.zeros(nswap, dtype=torch.int32) + recvproc = torch.zeros(nswap, dtype=torch.int32) + sendnum = torch.tensor([nghost], dtype=torch.int32) + recvnum = torch.tensor([nghost], dtype=torch.int32) + communicator = torch.zeros(1, dtype=torch.int64) + nlocal_ts = torch.tensor(nloc, dtype=torch.int32) + nghost_ts = torch.tensor(nghost, dtype=torch.int32) + return ( + sendlist, + sendproc, + recvproc, + sendnum, + recvnum, + communicator, + nlocal_ts, + nghost_ts, + ) + + +# --------------------------------------------------------------------------- +# 1. Direct op call: border_op_backward as a standalone op +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.float64]) +def test_border_op_backward_direct(dtype: torch.dtype) -> None: + """``torch.ops.deepmd.border_op_backward`` is callable for both + float32 and float64 inputs and returns a tensor of the expected + shape on the input's device. + """ + assert hasattr(torch.ops.deepmd, "border_op_backward"), ( + "Symbol not registered; rebuild libdeepmd_op_pt.so." 
+ ) + nloc, nghost = 5, 3 + nall = nloc + nghost + n_dim = 4 + + keepalive: list = [] + sendlist_indices = np.array([0, 1, 2], dtype=np.int32) + comm = _build_self_swap(nloc, nghost, sendlist_indices, keepalive, dtype) + + grad_g1 = torch.ones(nall, n_dim, dtype=dtype) + + grad_in = torch.ops.deepmd.border_op_backward( + comm[0], + comm[1], + comm[2], + comm[3], + comm[4], + grad_g1, + comm[5], + comm[6], + comm[7], + ) + + # backward must preserve dtype and shape, and run on the same device. + assert grad_in.dtype == grad_g1.dtype + assert tuple(grad_in.shape) == tuple(grad_g1.shape) + assert grad_in.device == grad_g1.device + + +def test_border_op_backward_accumulation_semantics() -> None: + """Single-rank self-exchange backward: each ghost slot's grad is + accumulated into the local atom whose index sendlist points to. + + Reference: for forward ``g_ext[nloc + i] = g[sendlist[i]]``, the + reverse is ``grad_g[sendlist[i]] += grad_g_ext[nloc + i]``. + """ + nloc, nghost = 4, 4 + nall = nloc + nghost + n_dim = 3 + + # Each ghost slot maps back to a local atom: ghost 0->local 0, ghost + # 1->local 1, etc. So backward should add grad_g_ext[nloc+i] into + # grad_g[i] for i in [0, nghost). + keepalive: list = [] + sendlist_indices = np.array([0, 1, 2, 3], dtype=np.int32) + comm = _build_self_swap( + nloc, + nghost, + sendlist_indices, + keepalive, + torch.float64, + ) + + # Distinct values per ghost slot so we can identify the routing. + grad_g1 = torch.zeros(nall, n_dim, dtype=torch.float64) + grad_g1[nloc + 0, 0] = 7.0 + grad_g1[nloc + 1, 1] = 11.0 + grad_g1[nloc + 2, 2] = 13.0 + grad_g1[nloc + 3, 0] = 17.0 + # Local part has its own grad too — must pass through unchanged. + grad_g1[0, 1] = 1.0 + grad_g1[2, 2] = 2.0 + # Capture the input BEFORE the call: the C++ op writes + # ``index_add_`` into the same tensor and returns it, so once + # we've called the op the ``grad_g1`` reference points to the + # modified buffer. Snapshot first. 
+ grad_g1_orig = grad_g1.clone() + grad_in = torch.ops.deepmd.border_op_backward( + comm[0], + comm[1], + comm[2], + comm[3], + comm[4], + grad_g1, + comm[5], + comm[6], + comm[7], + ) + + # Expected: grad_g_local += grad_g_ext[nloc:] indexed by sendlist. + # Ghost rows pass through unchanged (the C++ backward does not + # zero them; the wrapper's autograd consumer is F.pad whose + # backward drops them anyway). + expected = grad_g1_orig.clone() + for i, src_local_idx in enumerate(sendlist_indices.tolist()): + expected[src_local_idx] += grad_g1_orig[nloc + i] + np.testing.assert_allclose( + grad_in.numpy(), + expected.numpy(), + atol=1e-12, + rtol=0, + ) + + +# --------------------------------------------------------------------------- +# 2. Autograd path through the deepmd_export::border_op opaque wrapper +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.float64]) +def test_border_op_export_autograd(dtype: torch.dtype) -> None: + """End-to-end autograd through the opaque wrapper. + + Builds an inputs tensor with ``requires_grad=True``, calls the + wrapper, sums the output, and asks for ``grad`` w.r.t. the input. + The reported gradient must match a hand-computed reference based + on the same self-exchange routing. + """ + nloc, nghost = 3, 2 + nall = nloc + nghost + n_dim = 4 + + keepalive: list = [] + sendlist_indices = np.array([0, 1], dtype=np.int32) # ghosts mirror locals 0,1 + comm = _build_self_swap(nloc, nghost, sendlist_indices, keepalive, dtype) + + # g1 is full nall-shape pre-padded; ghosts initialised to zero + # (mirroring how repflows.forward feeds the wrapper). 
+ rng = np.random.default_rng(123) + g1_np = rng.normal(size=(nall, n_dim)).astype( + np.float32 if dtype == torch.float32 else np.float64, + ) + g1_np[nloc:] = 0.0 + g1 = torch.tensor(g1_np, dtype=dtype, requires_grad=True) + + out = torch.ops.deepmd_export.border_op( + comm[0], + comm[1], + comm[2], + comm[3], + comm[4], + g1, + comm[5], + comm[6], + comm[7], + ) + # Sum so the upstream grad is all-ones at every position. + loss = out.sum() + (grad_in,) = torch.autograd.grad(loss, g1, create_graph=False) + + # Reference for LOCAL rows only: forward sets + # ``out[nloc + i] = g1[sendlist[i]]`` for each ghost slot i and + # passes local rows through. With ``loss = out.sum()`` the + # upstream grad is ones everywhere, so each local row k receives + # 1 (from ``out[k] = g1[k]``) plus 1 for every ghost slot that + # references k via ``sendlist``. + expected_local = torch.ones(nloc, n_dim, dtype=dtype) + for s in sendlist_indices: + expected_local[int(s)] += 1.0 + rtol, atol = (0.0, 1e-5) if dtype == torch.float32 else (0.0, 1e-12) + np.testing.assert_allclose( + grad_in[:nloc].numpy(), + expected_local.numpy(), + atol=atol, + rtol=rtol, + ) + # Ghost rows of grad_in are not semantically meaningful: in + # production the wrapper's input is ``F.pad(node_ebd, value=0)`` + # so the ghost-row gradient is consumed by ``F.pad``'s backward + # (which drops it). The C++ backward leaves them as the upstream + # grad (here, ones), but we don't assert on it. From 0bd131ad79be47bcf65434bca66c79a390ea3c21 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 26 Apr 2026 14:27:27 +0800 Subject: [PATCH 07/34] fix(cc): link TORCH_LIBRARIES in api_cc tests so pt_expt tests run Without TORCH_LIBRARIES on the test binary, the ``__has_include()`` check in DeepPotPTExpt.h evaluates to false and the test files compile with BUILD_PT_EXPT=0, causing every pt_expt test case to silently GTEST_SKIP("PyTorch support is not enabled"). 
The bug was masked by ctest reporting a green run with all skips counted as success. Adding ``target_link_libraries(runUnitTests_cc "${TORCH_LIBRARIES}")`` under the existing ``ENABLE_PYTORCH`` branch makes the AOTI header visible to the test compilation. After this fix, the 148 pt_expt tests actually run instead of being silently skipped. --- source/api_cc/tests/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt index a3e7d067f7..a570747f29 100644 --- a/source/api_cc/tests/CMakeLists.txt +++ b/source/api_cc/tests/CMakeLists.txt @@ -11,6 +11,10 @@ if(ENABLE_TENSORFLOW) endif() if(ENABLE_PYTORCH) target_compile_definitions(runUnitTests_cc PRIVATE BUILD_PYTORCH) + # Link torch so __has_include() succeeds and + # BUILD_PT_EXPT is set for the test binary; otherwise pt_expt tests all + # GTEST_SKIP() with "PyTorch support is not enabled". + target_link_libraries(runUnitTests_cc "${TORCH_LIBRARIES}") endif() if(ENABLE_JAX) target_compile_definitions(runUnitTests_cc PRIVATE BUILD_JAX) From cdef9d5214e48b78c8a93064ef391ed54008968a Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 26 Apr 2026 14:33:28 +0800 Subject: [PATCH 08/34] feat(cc): wire DeepPotPTExpt and DeepSpinPTExpt for multi-rank GNN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 4 of the GNN MPI plumbing. When a .pt2 archive carries a nested forward_lower_with_comm.pt2 (added by Phase 3 for GNN models), the C++ inference path now optionally extracts and loads it as a second AOTInductor module. Each compute() call dispatches between the regular and with-comm artifacts based on lmp_list.nswap: LAMMPS sets nswap=0 in single-rank mode and >0 in multi-rank, so single-rank inference keeps using the regular artifact (mapping-tensor gather) and multi-rank routes to the with-comm artifact (MPI ghost exchange). Three additions: 1. 
commonPTExpt.h adds: - TempFile RAII handle for the extracted nested artifact (mkstemp, unlinked at destruction). - TempFile::from_zip_entry reads a ZIP entry from the outer .pt2 and writes it to a temp file (atomic, 0600). - build_comm_tensors_positional packs the 8 comm tensors in canonical positional order (send_list, send_proc, recv_proc, send_num, recv_num, communicator, nlocal, nghost) for the with-comm AOTI module input vector. 2. DeepPotPTExpt: - Reads has_comm_artifact from metadata.json (defaults false for old .pt2 files lacking the field). - When true, extracts extra/forward_lower_with_comm.pt2 to a TempFile and loads it as with_comm_loader. - run_model_with_comm appends the 8 comm tensors to the base inputs and dispatches to with_comm_loader->run. - compute() chooses regular vs with-comm based on nswap. 3. DeepSpinPTExpt: - Same pattern; the Phase 3 export injects has_spin=1 into the traced graph comm_dict, so the C++ side passes the same 8 comm tensors as the non-spin case. nlocal/nghost carry the real-atom counts (the spin override halves its atom-doubled nloc/nall internally to recover these real counts). All 148 existing pt_expt C++ tests pass — the with-comm path is gated behind nswap > 0 so single-rank tests don't exercise it (that coverage is the Phase 5 multi-rank LAMMPS test). --- source/api_cc/include/DeepPotPTExpt.h | 32 +++++ source/api_cc/include/DeepSpinPTExpt.h | 23 ++++ source/api_cc/src/DeepPotPTExpt.cc | 75 ++++++++++- source/api_cc/src/DeepSpinPTExpt.cc | 72 +++++++++- source/api_cc/src/commonPTExpt.h | 177 ++++++++++++++++++++++++- 5 files changed, 371 insertions(+), 8 deletions(-) diff --git a/source/api_cc/include/DeepPotPTExpt.h b/source/api_cc/include/DeepPotPTExpt.h index 0d42324d24..10135b58db 100644 --- a/source/api_cc/include/DeepPotPTExpt.h +++ b/source/api_cc/include/DeepPotPTExpt.h @@ -16,6 +16,12 @@ #include "DeepPot.h" +// Forward-declare to keep TempFile out of public header. Defined in +// commonPTExpt.h. 
+namespace deepmd::ptexpt { +class TempFile; +} + namespace torch::inductor { class AOTIModelPackageLoader; } @@ -212,6 +218,14 @@ class DeepPotPTExpt : public DeepPotBackend { std::vector sel; NeighborListData nlist_data; std::unique_ptr loader; + // Optional second AOTInductor artifact for the multi-rank GNN code + // path (Phase 4). Loaded only if the .pt2 metadata reports + // ``has_comm_artifact == true`` AND the model has GNN message + // passing. ``with_comm_tempfile_`` owns the extracted nested .pt2 + // for the lifetime of ``with_comm_loader``. + bool has_comm_artifact_ = false; + std::unique_ptr with_comm_tempfile_; + std::unique_ptr with_comm_loader; /** * @brief Multi-frame loop for standalone compute (no nlist). @@ -264,6 +278,24 @@ class DeepPotPTExpt : public DeepPotBackend { const torch::Tensor& fparam, const torch::Tensor& aparam); + /** + * @brief Run the with-comm .pt2 artifact with comm tensors appended. + * + * @param[in] base 4-6 base inputs (coord, atype, nlist, mapping, + * fparam?, aparam?) — same as ``run_model``. + * @param[in] comm_tensors 8 comm tensors in canonical positional + * order: send_list, send_proc, recv_proc, send_num, + * recv_num, communicator, nlocal, nghost. + */ + std::vector run_model_with_comm( + const torch::Tensor& coord, + const torch::Tensor& atype, + const torch::Tensor& nlist, + const torch::Tensor& mapping, + const torch::Tensor& fparam, + const torch::Tensor& aparam, + const std::vector& comm_tensors); + /** * @brief Extract outputs from flat tensor list using output_keys. */ diff --git a/source/api_cc/include/DeepSpinPTExpt.h b/source/api_cc/include/DeepSpinPTExpt.h index af108c7690..08ca4e8ccb 100644 --- a/source/api_cc/include/DeepSpinPTExpt.h +++ b/source/api_cc/include/DeepSpinPTExpt.h @@ -14,6 +14,11 @@ #include "DeepSpin.h" +// Forward-declare the temp-file helper from commonPTExpt.h. 
+namespace deepmd::ptexpt { +class TempFile; +} + namespace torch::inductor { class AOTIModelPackageLoader; } @@ -187,6 +192,10 @@ class DeepSpinPTExpt : public DeepSpinBackend { std::vector sel; NeighborListData nlist_data; std::unique_ptr loader; + // Optional with-comm artifact for multi-rank GNN spin inference. + bool has_comm_artifact_ = false; + std::unique_ptr with_comm_tempfile_; + std::unique_ptr with_comm_loader; std::vector run_model(const torch::Tensor& coord, const torch::Tensor& atype, @@ -196,6 +205,20 @@ class DeepSpinPTExpt : public DeepSpinBackend { const torch::Tensor& fparam, const torch::Tensor& aparam); + /** + * @brief Run with-comm spin artifact: 5-7 base inputs (incl. + * extended_spin) + 8 comm tensors. + */ + std::vector run_model_with_comm( + const torch::Tensor& coord, + const torch::Tensor& atype, + const torch::Tensor& spin, + const torch::Tensor& nlist, + const torch::Tensor& mapping, + const torch::Tensor& fparam, + const torch::Tensor& aparam, + const std::vector& comm_tensors); + void extract_outputs(std::map& output_map, const std::vector& flat_outputs); diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index c1f3d9d674..dbcfe0e163 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -142,6 +142,25 @@ void DeepPotPTExpt::init(const std::string& model, gpu_enabled ? static_cast(gpu_id) : static_cast(-1)); + // Phase 4: load the optional with-comm artifact for multi-rank GNN + // inference. Pre-Phase-3 .pt2 files lack ``has_comm_artifact``; + // default to false so old artifacts keep working. + has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") && + metadata["has_comm_artifact"].as_bool(); + if (has_comm_artifact_) { + // Extract the nested ``extra/forward_lower_with_comm.pt2`` into a + // temp file and load it as a second AOTI module. The TempFile + // unlinks the temp file on destruction. 
+ with_comm_tempfile_ = std::make_unique( + deepmd::ptexpt::TempFile::from_zip_entry( + model, "extra/forward_lower_with_comm.pt2")); + with_comm_loader = + std::make_unique( + with_comm_tempfile_->path(), "model", false, 1, + gpu_enabled ? static_cast(gpu_id) + : static_cast(-1)); + } + int num_intra_nthreads, num_inter_nthreads; get_env_nthreads(num_intra_nthreads, num_inter_nthreads); if (num_inter_nthreads) { @@ -182,6 +201,40 @@ std::vector DeepPotPTExpt::run_model( return loader->run(inputs); } +std::vector DeepPotPTExpt::run_model_with_comm( + const torch::Tensor& coord, + const torch::Tensor& atype, + const torch::Tensor& nlist, + const torch::Tensor& mapping, + const torch::Tensor& fparam, + const torch::Tensor& aparam, + const std::vector& comm_tensors) { + if (!with_comm_loader) { + throw deepmd::deepmd_exception( + "run_model_with_comm called but the .pt2 file has no with-comm " + "artifact. This is a programming error: the caller should check " + "has_comm_artifact_ before invoking this path."); + } + if (comm_tensors.size() != 8) { + throw deepmd::deepmd_exception( + "run_model_with_comm: comm_tensors must contain exactly 8 tensors " + "(send_list, send_proc, recv_proc, send_num, recv_num, " + "communicator, nlocal, nghost). 
Got " + + std::to_string(comm_tensors.size()) + "."); + } + std::vector inputs = {coord, atype, nlist, mapping}; + if (dfparam > 0) { + inputs.push_back(fparam); + } + if (daparam > 0) { + inputs.push_back(aparam); + } + for (const auto& t : comm_tensors) { + inputs.push_back(t); + } + return with_comm_loader->run(inputs); +} + void DeepPotPTExpt::extract_outputs( std::map& output_map, const std::vector& flat_outputs) { @@ -328,9 +381,25 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, aparam_tensor = torch::zeros({0}, options).to(device); } - // Run the .pt2 model - auto flat_outputs = run_model(coord_Tensor, atype_Tensor, firstneigh_tensor, - mapping_tensor, fparam_tensor, aparam_tensor); + // Phase 4 dispatch: use the with-comm artifact when LAMMPS is + // running multi-rank. ``lmp_list.nswap > 0`` is the proxy for + // "multi-rank with cross-domain communication"; in single-rank + // mode LAMMPS sets nswap=0. Falling back to the regular artifact + // for nswap=0 is correct because that artifact uses the mapping + // tensor to gather ghost embeddings from local atoms. 
+ std::vector flat_outputs; + bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0; + if (use_with_comm) { + auto comm_tensors = deepmd::ptexpt::build_comm_tensors_positional( + lmp_list, lmp_list.sendlist, lmp_list.sendnum, lmp_list.recvnum, nloc, + nghost_real); + flat_outputs = run_model_with_comm( + coord_Tensor, atype_Tensor, firstneigh_tensor, mapping_tensor, + fparam_tensor, aparam_tensor, comm_tensors); + } else { + flat_outputs = run_model(coord_Tensor, atype_Tensor, firstneigh_tensor, + mapping_tensor, fparam_tensor, aparam_tensor); + } // Map flat outputs to internal keys std::map output_map; diff --git a/source/api_cc/src/DeepSpinPTExpt.cc b/source/api_cc/src/DeepSpinPTExpt.cc index ae4ef423ed..ed95018e4c 100644 --- a/source/api_cc/src/DeepSpinPTExpt.cc +++ b/source/api_cc/src/DeepSpinPTExpt.cc @@ -154,6 +154,21 @@ void DeepSpinPTExpt::init(const std::string& model, gpu_enabled ? static_cast(gpu_id) : static_cast(-1)); + // Phase 4: load the optional with-comm artifact for multi-rank GNN + // spin inference. Mirrors DeepPotPTExpt; see its init() comment. + has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") && + metadata["has_comm_artifact"].as_bool(); + if (has_comm_artifact_) { + with_comm_tempfile_ = std::make_unique( + deepmd::ptexpt::TempFile::from_zip_entry( + model, "extra/forward_lower_with_comm.pt2")); + with_comm_loader = + std::make_unique( + with_comm_tempfile_->path(), "model", false, 1, + gpu_enabled ? 
static_cast(gpu_id) + : static_cast(-1)); + } + int num_intra_nthreads, num_inter_nthreads; get_env_nthreads(num_intra_nthreads, num_inter_nthreads); if (num_inter_nthreads) { @@ -195,6 +210,39 @@ std::vector DeepSpinPTExpt::run_model( return loader->run(inputs); } +std::vector DeepSpinPTExpt::run_model_with_comm( + const torch::Tensor& coord, + const torch::Tensor& atype, + const torch::Tensor& spin, + const torch::Tensor& nlist, + const torch::Tensor& mapping, + const torch::Tensor& fparam, + const torch::Tensor& aparam, + const std::vector& comm_tensors) { + if (!with_comm_loader) { + throw deepmd::deepmd_exception( + "DeepSpinPTExpt::run_model_with_comm called but the .pt2 has no " + "with-comm artifact."); + } + if (comm_tensors.size() != 8) { + throw deepmd::deepmd_exception( + "DeepSpinPTExpt::run_model_with_comm: comm_tensors must contain " + "exactly 8 tensors. Got " + + std::to_string(comm_tensors.size()) + "."); + } + std::vector inputs = {coord, atype, spin, nlist, mapping}; + if (dfparam > 0) { + inputs.push_back(fparam); + } + if (daparam > 0) { + inputs.push_back(aparam); + } + for (const auto& t : comm_tensors) { + inputs.push_back(t); + } + return with_comm_loader->run(inputs); +} + void DeepSpinPTExpt::extract_outputs( std::map& output_map, const std::vector& flat_outputs) { @@ -353,10 +401,26 @@ void DeepSpinPTExpt::compute(ENERGYVTYPE& ener, aparam_tensor = torch::zeros({0}, options).to(device); } - // Run the .pt2 model (7 args for spin) - auto flat_outputs = - run_model(coord_Tensor, atype_Tensor, spin_Tensor, firstneigh_tensor, - mapping_tensor, fparam_tensor, aparam_tensor); + // Phase 4 dispatch: route to with-comm artifact in multi-rank mode. + // ``has_spin=tensor([1])`` is baked into the with-comm graph at + // trace time (Phase 3, spin_model.forward_common_lower_exportable + // _with_comm), so C++ supplies the same 8 comm tensors as the + // non-spin path. 
``nlocal``/``nghost`` carry the real-atom counts + // (pre atom-doubling); the spin override halves them internally. + std::vector flat_outputs; + bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0; + if (use_with_comm) { + auto comm_tensors = deepmd::ptexpt::build_comm_tensors_positional( + lmp_list, lmp_list.sendlist, lmp_list.sendnum, lmp_list.recvnum, nloc, + nghost_real); + flat_outputs = run_model_with_comm( + coord_Tensor, atype_Tensor, spin_Tensor, firstneigh_tensor, + mapping_tensor, fparam_tensor, aparam_tensor, comm_tensors); + } else { + flat_outputs = + run_model(coord_Tensor, atype_Tensor, spin_Tensor, firstneigh_tensor, + mapping_tensor, fparam_tensor, aparam_tensor); + } std::map output_map; extract_outputs(output_map, flat_outputs); diff --git a/source/api_cc/src/commonPTExpt.h b/source/api_cc/src/commonPTExpt.h index 7dd02d09a9..dcaaddd6ea 100644 --- a/source/api_cc/src/commonPTExpt.h +++ b/source/api_cc/src/commonPTExpt.h @@ -1,10 +1,17 @@ // SPDX-License-Identifier: LGPL-3.0-or-later // Shared utilities for pt_expt (.pt2 / AOTInductor) backend classes. -// Provides: JSON parser, ZIP archive reader, and type-sorted nlist builder. +// Provides: JSON parser, ZIP archive reader, type-sorted nlist builder, +// and helpers for the with-comm dual-artifact layout (Phase 4 of the +// GNN MPI plumbing). #pragma once +#include +#include + #include #include +#include +#include #include #include #include @@ -12,6 +19,7 @@ #include #include "errors.h" +#include "neighbor_list.h" namespace deepmd { namespace ptexpt { @@ -534,5 +542,172 @@ inline torch::Tensor buildTypeSortedNlist( return tensor; } +// ============================================================================ +// With-comm artifact extraction (Phase 4) +// +// GNN .pt2 archives carry a nested ``extra/forward_lower_with_comm.pt2`` +// alongside the regular forward_lower artifact. 
AOTInductor's +// ``ModelPackageLoader`` reads .pt2 files from disk, so to load the +// nested artifact we extract it to a temp file. +// ============================================================================ + +/** + * @brief RAII handle for a temp file on disk. + * + * Used to hold the extracted with-comm .pt2 artifact for the lifetime + * of the loader. Destructor unlinks the file. + */ +class TempFile { + public: + TempFile() = default; + TempFile(const TempFile&) = delete; + TempFile& operator=(const TempFile&) = delete; + TempFile(TempFile&& other) noexcept : path_(std::move(other.path_)) { + other.path_.clear(); + } + TempFile& operator=(TempFile&& other) noexcept { + if (this != &other) { + cleanup(); + path_ = std::move(other.path_); + other.path_.clear(); + } + return *this; + } + ~TempFile() { cleanup(); } + + const std::string& path() const { return path_; } + bool empty() const { return path_.empty(); } + + /** + * @brief Write the content of an existing .pt2 ZIP entry to a temp + * file and return a TempFile owning that path. + * + * The temp file is created via ``mkstemp(3)`` (atomic, unique, + * 0600 permissions) under the system tempdir (TMPDIR or /tmp). + */ + static TempFile from_zip_entry(const std::string& outer_pt2_path, + const std::string& entry_name) { + std::string content = read_zip_entry(outer_pt2_path, entry_name); + const char* tmpdir = std::getenv("TMPDIR"); + std::string tmpl = + std::string(tmpdir ? tmpdir : "/tmp") + "/dp_pt2_with_comm_XXXXXX"; + std::vector buf(tmpl.begin(), tmpl.end()); + buf.push_back('\0'); + int fd = mkstemp(buf.data()); + if (fd < 0) { + throw deepmd::deepmd_exception( + "Failed to create temp file for nested .pt2 artifact: " + tmpl); + } + std::string path(buf.data()); + // Write content to the fd so we don't race with another process + // opening the same path. 
+ ssize_t written = 0; + const char* p = content.data(); + ssize_t remain = static_cast(content.size()); + while (remain > 0) { + ssize_t n = ::write(fd, p + written, static_cast(remain)); + if (n < 0) { + ::close(fd); + ::unlink(path.c_str()); + throw deepmd::deepmd_exception( + "Failed to write nested .pt2 artifact to temp file: " + path); + } + written += n; + remain -= n; + } + ::close(fd); + TempFile tf; + tf.path_ = std::move(path); + return tf; + } + + private: + void cleanup() { + if (!path_.empty()) { + ::unlink(path_.c_str()); + path_.clear(); + } + } + std::string path_; +}; + +// ============================================================================ +// comm_dict tensor packing for the with-comm artifact (Phase 4) +// +// The with-comm AOTInductor artifact accepts comm tensors as 8 additional +// positional inputs (after the regular 4-6 inputs) in this canonical order: +// send_list (nswap, int64 ptr-array packed as int64 tensor) +// send_proc (nswap, int32) +// recv_proc (nswap, int32) +// send_num (nswap, int32) +// recv_num (nswap, int32) +// communicator (1, int64 — MPI handle as opaque int) +// nlocal (scalar int32) +// nghost (scalar int32) +// This mirrors deepmd_export::border_op's argument order in +// deepmd/pt_expt/utils/comm.py. +// ============================================================================ + +/** + * @brief Build the 8 comm-tensor positional inputs from LAMMPS data. + * + * Tensors share storage with the LAMMPS-owned buffers (no copy); + * the caller must keep ``lmp_list``, ``sendlist``, ``sendnum``, and + * ``recvnum`` alive until ``loader->run`` returns. ``nlocal`` / + * ``nghost`` are produced via ``torch::tensor`` (small allocation). + * + * @param lmp_list LAMMPS neighbor list (provides nswap, sendproc, + * recvproc, world). + * @param sendlist int** pointer-array (already remapped if needed). + * @param sendnum int* per-swap send counts (already remapped). 
+ * @param recvnum int* per-swap recv counts (already remapped). + * @param nlocal Number of local atoms (per-rank). + * @param nghost Number of ghost atoms (per-rank). + * @return Vector of 8 tensors in canonical positional order. + */ +inline std::vector build_comm_tensors_positional( + const InputNlist& lmp_list, + int** sendlist, + int* sendnum, + int* recvnum, + int nlocal, + int nghost) { + int nswap = lmp_list.nswap; + auto int32_option = + torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt32); + auto int64_option = + torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64); + + // sendlist is int**: nswap entries each holding an int* pointer. + // Reinterpret as int64 for tensor packaging (matches what pt's + // build_comm_dict does and what border_op expects). + at::Tensor sendlist_tensor = + torch::from_blob(static_cast(sendlist), {nswap}, int64_option); + at::Tensor sendproc_tensor = + torch::from_blob(lmp_list.sendproc, {nswap}, int32_option); + at::Tensor recvproc_tensor = + torch::from_blob(lmp_list.recvproc, {nswap}, int32_option); + at::Tensor sendnum_tensor = torch::from_blob(sendnum, {nswap}, int32_option); + at::Tensor recvnum_tensor = torch::from_blob(recvnum, {nswap}, int32_option); + + // MPI communicator handle as a 1-element int64 tensor. + static std::int64_t null_communicator = 0; + at::Tensor communicator_tensor; + if (lmp_list.world == nullptr) { + communicator_tensor = + torch::from_blob(&null_communicator, {1}, int64_option); + } else { + communicator_tensor = + torch::from_blob(const_cast(lmp_list.world), {1}, int64_option); + } + + // Scalar nlocal / nghost — int32 to match Python-side tracing. 
+ at::Tensor nlocal_tensor = torch::tensor(nlocal, int32_option); + at::Tensor nghost_tensor = torch::tensor(nghost, int32_option); + + return {sendlist_tensor, sendproc_tensor, recvproc_tensor, sendnum_tensor, + recvnum_tensor, communicator_tensor, nlocal_tensor, nghost_tensor}; +} + } // namespace ptexpt } // namespace deepmd From 1ad6103549c6123b45032a8c7fab99ba52bd9189 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 26 Apr 2026 15:14:27 +0800 Subject: [PATCH 09/34] feat(gnn-mpi): wire up multi-rank LAMMPS path end-to-end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 5 — final integration after Phases 1-4 land the dpmodel plumbing, opaque op wrappers, two-mode AOTI export and C++ dispatch. Three pieces had to fall into place to make multi-rank LAMMPS actually run a GNN .pt2: 1. Move deepmd_export op schema declarations to C++. torch.library.custom_op only registers the op in the Python process, but a LAMMPS run loads the .pt2 in pure C++ (no Python interpreter). Add TORCH_LIBRARY_FRAGMENT(deepmd_export, m) + TORCH_LIBRARY_IMPL blocks under explicit CPU/CUDA dispatch keys in source/op/pt/comm.cc; the C++ impls clone the underlying deepmd::* op outputs to satisfy AOTI no-aliasing. Python comm.py now layers register_fake + register_autograd on top of the C++-defined ops instead of defining new ones. 2. Call deepmd::load_op_library at DeepPot/SpinPTExpt init so libdeepmd_op_pt.so loads before AOTIModelPackageLoader; the LAMMPS plugin doesn't pre-load it. Without this, a multi-rank GNN .pt2 aborts at pair_style time with a missing-schema error. 3. Gate dual-artifact production on use_loc_mapping=False. _has_message_passing now walks into the GNN block to inspect use_loc_mapping; if True, only the regular artifact is produced (the override would raise on parallel mode anyway). gen_dpa3.py produces a second deeppot_dpa3_mpi.pt2 with use_loc_mapping=False so the new mpirun test has a real dual-artifact .pt2 to load. 
Plus the multi-rank test itself: - run_mpi_pair_deepmd_dpa3_pt2.py: subprocess driver. Uses PyLammps + processors 2 1 1 so nswap > 0 on every rank, forcing the C++ side to dispatch to the with-comm artifact. Forces are gathered via lammps.lmp.gather_atoms (rank-local atoms[i] doesn't see other ranks); pe via lammps.eval on rank 0. - test_pair_deepmd_mpi_dpa3 in test_lammps_dpa3_pt2.py: invokes the driver under mpirun -n 2, asserts energy + per-atom forces match the single-rank reference within atol=1e-8. Also: register_fake for the backward op too. Without it, make_fx tracing autograd.grad inside forward_common_lower_exportable hits the same FakeTensor data_ptr error we solved for forward in Phase 0. All 31 pt_expt LAMMPS tests pass. --- deepmd/pt_expt/utils/comm.py | 135 ++++++++---------- deepmd/pt_expt/utils/serialization.py | 26 +++- source/api_cc/src/DeepPotPTExpt.cc | 7 + source/api_cc/src/DeepSpinPTExpt.cc | 5 + .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py | 72 ++++++++++ source/lmp/tests/test_lammps_dpa3_pt2.py | 77 ++++++++++ source/op/pt/comm.cc | 82 +++++++++++ source/tests/infer/gen_dpa3.py | 18 +++ 8 files changed, 344 insertions(+), 78 deletions(-) create mode 100644 source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py diff --git a/deepmd/pt_expt/utils/comm.py b/deepmd/pt_expt/utils/comm.py index 442a232a6f..b985c57fe6 100644 --- a/deepmd/pt_expt/utils/comm.py +++ b/deepmd/pt_expt/utils/comm.py @@ -1,27 +1,32 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -"""Opaque torch.export wrapper around the deepmd MPI border_op. - -The existing ``torch.ops.deepmd.border_op`` (registered by -``libdeepmd_op_pt.so``) is a ``CompositeImplicitAutograd`` op that wraps -``Border::apply`` for the torch.jit (pt) backend. ``torch.export`` / -AOTInductor try to *decompose* such ops into primitive aten ops, which -fails because the C++ kernel calls ``data_ptr()`` on inputs — illegal -during tracing on FakeTensors. 
- -This module defines a NEW op ``deepmd_export::border_op`` via -``torch.library.custom_op``, marked opaque so ``torch.export`` records it -as a single black-box call. At runtime the loaded ``.pt2`` dispatches -back into ``torch.ops.deepmd.border_op`` (forward) or -``torch.ops.deepmd.border_op_backward`` (backward), preserving the MPI -exchange semantics. +"""Python-side fake / autograd registration for the C++-defined opaque +``deepmd_export::border_op`` and ``deepmd_export::border_op_backward``. + +The op schemas and concrete CPU/CUDA implementations are defined in +``source/op/pt/comm.cc`` (registered under explicit dispatch keys so +``torch.export`` records them as opaque external calls instead of +decomposing into the C++ kernel — which would hit ``data_ptr()`` on +FakeTensors and fail). Defining the schema in C++ also means a +``.pt2`` archive loaded by a pure-C++ process (LAMMPS via +``DeepPotPTExpt``) can dispatch through the registered op without +needing a Python interpreter. + +This module adds the Python-only metadata that the ops still need: + * ``register_fake`` so ``make_fx`` / ``torch.export`` can trace + through them with FakeTensor inputs. + * ``register_autograd`` so ``torch.autograd.grad`` (used inside + ``forward_common_lower_exportable_with_comm``) flows gradients + through the forward op back to its inputs. Constraints discovered during de-risking (scratch/derisk_border_op.py): - 1. ``custom_op`` forbids returning a tensor that aliases an input — - the underlying C++ op returns ``g1`` itself, so we ``.clone()``. - 2. The fake (meta) impl honours ``g1.dtype`` (no float64 hardcoding). - 3. ``register_autograd`` makes the op differentiable; the backward - dispatches to ``deepmd::border_op_backward`` which performs the - symmetric MPI exchange. + 1. Both forward and backward outputs must NOT alias their inputs + (the C++ kernels return the same tensor they modified) — the + C++ wrapper layer in ``comm.cc`` clones them before exposing. + 2. 
The fake impls honour ``g1.dtype`` (no float64 hardcoding). + 3. ``register_autograd`` makes the forward op differentiable; the + backward callback dispatches to the opaque + ``deepmd_export::border_op_backward`` op so ``make_fx`` tracing + through ``autograd.grad`` also sees a black box. """ from __future__ import ( @@ -34,33 +39,33 @@ def _check_underlying_ops_loaded() -> None: """Surface a clearer error when libdeepmd_op_pt.so isn't loaded. - pt_expt depends on libdeepmd_op_pt.so for the underlying - ``deepmd::border_op`` and ``deepmd::border_op_backward`` C++ ops. - Without them, callers get cryptic - ``AttributeError: '_OpNamespace' object has no attribute 'border_op'`` - errors. We translate that into actionable advice. - - Called once on first wrapper invocation (not at import time, since - pt_expt may legitimately be imported on systems where the .so is - not built — e.g. eager-only smoke tests of dpmodel-side code). + pt_expt depends on libdeepmd_op_pt.so for the ``deepmd_export::*`` + op schemas + impls. Without it, the ops can't be registered for + fake/autograd metadata and callers get a cryptic AttributeError + on ``torch.ops.deepmd_export.border_op``. """ if not ( - hasattr(torch.ops, "deepmd") - and hasattr(torch.ops.deepmd, "border_op") - and hasattr(torch.ops.deepmd, "border_op_backward") + hasattr(torch.ops, "deepmd_export") + and hasattr(torch.ops.deepmd_export, "border_op") + and hasattr(torch.ops.deepmd_export, "border_op_backward") ): raise RuntimeError( - "deepmd_export::border_op wrapper requires " - "torch.ops.deepmd.border_op and " - "torch.ops.deepmd.border_op_backward (from " - "libdeepmd_op_pt.so) to be loaded. Build the pt custom-op " - "library and ensure deepmd.pt is imported before the " - "first call to this wrapper." + "torch.ops.deepmd_export.{border_op,border_op_backward} " + "are not registered. Build libdeepmd_op_pt.so and ensure " + "deepmd.pt is imported before this module." 
) -@torch.library.custom_op("deepmd_export::border_op", mutates_args=()) -def border_op_export( +_check_underlying_ops_loaded() + + +# --------------------------------------------------------------------------- +# Fake (meta) impls — let make_fx / torch.export trace through. +# --------------------------------------------------------------------------- + + +@torch.library.register_fake("deepmd_export::border_op") +def _border_op_fake( sendlist: torch.Tensor, sendproc: torch.Tensor, recvproc: torch.Tensor, @@ -71,44 +76,29 @@ def border_op_export( nlocal: torch.Tensor, nghost: torch.Tensor, ) -> torch.Tensor: - """Opaque wrapper around ``torch.ops.deepmd.border_op``. - - Performs MPI ghost-atom exchange of the embedding tensor ``g1`` so - GNN message-passing layers can run under multi-rank LAMMPS. Inputs - and outputs match the underlying op exactly except for the aliasing - fix (see module docstring). - """ - _check_underlying_ops_loaded() - out = torch.ops.deepmd.border_op( - sendlist, - sendproc, - recvproc, - sendnum, - recvnum, - g1, - communicator, - nlocal, - nghost, - ) - if isinstance(out, (list, tuple)): - out = out[0] - # custom_op forbids output aliasing inputs; underlying op returns g1. - return out.clone() + return torch.empty_like(g1) -@border_op_export.register_fake -def _border_op_export_fake( +@torch.library.register_fake("deepmd_export::border_op_backward") +def _border_op_backward_fake( sendlist: torch.Tensor, sendproc: torch.Tensor, recvproc: torch.Tensor, sendnum: torch.Tensor, recvnum: torch.Tensor, - g1: torch.Tensor, + grad_g1: torch.Tensor, communicator: torch.Tensor, nlocal: torch.Tensor, nghost: torch.Tensor, ) -> torch.Tensor: - return torch.empty_like(g1) + return torch.empty_like(grad_g1) + + +# --------------------------------------------------------------------------- +# Autograd: route the forward op's backward through the backward op so +# ``make_fx`` tracing through ``torch.autograd.grad`` records both as +# opaque external calls. 
+# --------------------------------------------------------------------------- def _border_op_setup_context( @@ -146,7 +136,7 @@ def _border_op_backward( (sendlist, sendproc, recvproc, sendnum, recvnum, communicator, nlocal, nghost) = ( ctx.saved_tensors ) - grad_in = torch.ops.deepmd.border_op_backward( + grad_in = torch.ops.deepmd_export.border_op_backward( sendlist, sendproc, recvproc, @@ -157,22 +147,21 @@ def _border_op_backward( nlocal, nghost, ) - # Same aliasing concern as forward: the C++ backward returns the same - # tensor object it modified; clone before handing back to autograd. return ( None, None, None, None, None, # sendlist..recvnum - grad_in.clone(), # g1 + grad_in, # g1 None, None, None, # communicator, nlocal, nghost ) -border_op_export.register_autograd( +torch.library.register_autograd( + "deepmd_export::border_op", _border_op_backward, setup_context=_border_op_setup_context, ) diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index 74fbe67111..fe5fe7f318 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -85,6 +85,11 @@ def _has_message_passing(model: torch.nn.Module) -> bool: compiled. Non-GNN descriptors (se_e2_a, se_r, se_t, se_t_tebd, DPA1, hybrid-of-non-GNN) need only the regular artifact. + Additional gate: ``use_loc_mapping=True`` GNN models (the default + for DPA3) keep nlist in local-only indexing, so per-layer ghost + exchange is meaningless — these get only the regular artifact. + Multi-rank LAMMPS for GNN requires use_loc_mapping=False. + Returns False if the descriptor's ``has_message_passing()`` query cannot be answered (e.g. linear/zbl/frozen models without a single descriptor) — those are assumed local. 
@@ -93,12 +98,23 @@ def _has_message_passing(model: torch.nn.Module) -> bool: descriptor = model.atomic_model.descriptor except AttributeError: return False - if hasattr(descriptor, "has_message_passing"): - try: - return bool(descriptor.has_message_passing()) - except (AttributeError, NotImplementedError): + if not hasattr(descriptor, "has_message_passing"): + return False + try: + if not descriptor.has_message_passing(): + return False + except (AttributeError, NotImplementedError): + return False + # Walk into the GNN block (repflows / repformers) to inspect + # ``use_loc_mapping``. The attribute lives on the block, not on the + # top-level descriptor wrapper. + for attr in ("repflows", "repformers"): + block = getattr(descriptor, attr, None) + if block is None: + continue + if getattr(block, "use_loc_mapping", False): return False - return False + return True # Module-level cache for the trace-time sendlist buffer. The pointer diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index dbcfe0e163..020566de23 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -62,6 +62,13 @@ void DeepPotPTExpt::init(const std::string& model, return; } + // Load libdeepmd_op_pt.so so its TORCH_LIBRARY_FRAGMENT entries + // (deepmd::*, deepmd_export::*) are visible to torch's dispatcher + // before the AOTI module loads. Without this, multi-rank GNN .pt2 + // archives fail at pair_style time with + // ``Could not find schema for deepmd_export::border_op``. + deepmd::load_op_library(); + if (!file_content.empty()) { throw deepmd::deepmd_exception( "In-memory file_content loading is not supported for .pt2 models. 
" diff --git a/source/api_cc/src/DeepSpinPTExpt.cc b/source/api_cc/src/DeepSpinPTExpt.cc index ed95018e4c..e16991d884 100644 --- a/source/api_cc/src/DeepSpinPTExpt.cc +++ b/source/api_cc/src/DeepSpinPTExpt.cc @@ -62,6 +62,11 @@ void DeepSpinPTExpt::init(const std::string& model, return; } + // Load libdeepmd_op_pt.so so deepmd_export::* schemas are visible + // to torch's dispatcher before the AOTI module loads. See + // DeepPotPTExpt::init for the full rationale. + deepmd::load_op_library(); + if (!file_content.empty()) { throw deepmd::deepmd_exception( "In-memory file_content loading is not supported for .pt2 models. " diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py new file mode 100644 index 0000000000..29b103cf01 --- /dev/null +++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Multi-rank LAMMPS driver for DPA3 .pt2 (Phase 5 of GNN MPI). + +Run via ``mpirun -n N python run_mpi_pair_deepmd_dpa3_pt2.py DATAFILE PB_FILE OUTPUT``. +Mirrors ``run_mpi_pair_deepmd.py`` but targets a GNN model whose .pt2 archive +carries the with-comm artifact (Phase 3 dual-artifact layout). The C++ +``DeepPotPTExpt`` (Phase 4) routes to the with-comm artifact when LAMMPS +reports nswap > 0 (multi-rank), driving MPI ghost-atom exchange via +``deepmd_export::border_op`` per layer. + +Rank 0 writes potential energy + per-atom forces to ``OUTPUT`` so the parent +pytest process can compare against the single-rank reference. 
+""" + +from __future__ import ( + annotations, +) + +import argparse + +import numpy as np +from lammps import ( + PyLammps, +) +from mpi4py import ( + MPI, +) + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() + +parser = argparse.ArgumentParser() +parser.add_argument("DATAFILE", type=str, help="LAMMPS data file (atom positions)") +parser.add_argument("PB_FILE", type=str, help=".pt2 model file") +parser.add_argument("OUTPUT", type=str, help="Output file for energies + forces") +args = parser.parse_args() + +lammps = PyLammps() +# Force a non-trivial domain decomposition: 2 x 1 x 1 across ranks. +# Combined with the simulation box this guarantees nswap > 0 on the C++ +# side, so DeepPotPTExpt routes to the with-comm AOTI artifact. +lammps.processors("2 1 1") +lammps.units("metal") +lammps.boundary("p p p") +lammps.atom_style("atomic") +lammps.neighbor("2.0 bin") +lammps.neigh_modify("every 10 delay 0 check no") +lammps.read_data(args.DATAFILE) +lammps.mass("1 16") +lammps.mass("2 2") +lammps.timestep(0.0005) +lammps.fix("1 all nve") + +lammps.pair_style(f"deepmd {args.PB_FILE}") +lammps.pair_coeff("* *") +lammps.run(0) + +# Forces need to be gathered across ranks. PyLammps's ``atoms[i]`` +# only exposes rank-local atoms; ``gather_atoms`` returns the global, +# id-ordered array on every rank. +forces_global = lammps.lmp.gather_atoms("f", 1, 3) +# ``PyLammps.eval`` is rank-0-only. 
+if rank == 0: + pe_global = lammps.eval("pe") + natoms = lammps.atoms.natoms + forces = np.array(forces_global, dtype=np.float64).reshape(natoms, 3) + with open(args.OUTPUT, "w") as f: + f.write(f"{pe_global:.16e}\n") + for row in forces: + f.write(" ".join(f"{v:.16e}" for v in row) + "\n") + +MPI.Finalize() diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py index 7ce05f9a2d..73b3ea1442 100644 --- a/source/lmp/tests/test_lammps_dpa3_pt2.py +++ b/source/lmp/tests/test_lammps_dpa3_pt2.py @@ -5,7 +5,12 @@ Reference values from source/tests/infer/gen_dpa3.py / C++ test. """ +import importlib.util import os +import shutil +import subprocess as sp +import sys +import tempfile from pathlib import ( Path, ) @@ -21,6 +26,12 @@ ) pb_file = Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot_dpa3.pt2" +# Multi-rank-capable variant (use_loc_mapping=False; carries the +# nested forward_lower_with_comm.pt2 artifact). Produced alongside +# deeppot_dpa3.pt2 by source/tests/infer/gen_dpa3.py. +pb_file_mpi = ( + Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot_dpa3_mpi.pt2" +) data_file = Path(__file__).parent / "data_dpa3_pt2.lmp" data_file_si = Path(__file__).parent / "data_dpa3_pt2.si" data_type_map_file = Path(__file__).parent / "data_type_map_dpa3_pt2.lmp" @@ -315,3 +326,69 @@ def test_pair_deepmd_si(lammps_si) -> None: expected_f[lammps_si.atoms[ii].id - 1] * constants.force_metal2si ) lammps_si.run(1) + + +# --------------------------------------------------------------------------- +# Multi-rank test (Phase 5 of GNN MPI) +# +# Drives the .pt2 model under ``mpirun -n 2`` so the C++ ``DeepPotPTExpt`` +# routes to the with-comm AOTI artifact (Phase 4) and ``border_op`` does +# real MPI ghost exchange between two ranks. 
The expected energy/forces +# are the same as the single-rank reference (single-rank LAMMPS would +# need ``atom_modify map yes`` to use the regular artifact; multi-rank +# uses the with-comm artifact whose graph reproduces the gather via +# MPI exchange). +# --------------------------------------------------------------------------- + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa3() -> None: + """Multi-rank LAMMPS run for DPA3 .pt2 must match the single-rank + reference within numerical tolerance. + + Requires the .pt2 archive to carry a with-comm artifact (Phase 3 + output for GNN models). If the archive lacks it, the C++ falls + back to the regular artifact and produces wrong cross-rank values + — which the assertion would catch (loud test failure, not silent). + """ + with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f: + out_path = f.name + try: + sp.check_call( + [ + "mpirun", + "-n", + "2", + sys.executable, + str(Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"), + str(data_file.resolve()), + str(pb_file_mpi.resolve()), + out_path, + ] + ) + with open(out_path) as fh: + lines = fh.read().strip().splitlines() + pe_mpi = float(lines[0]) + forces_mpi = np.array( + [list(map(float, line.split())) for line in lines[1:]], + dtype=np.float64, + ) + # Energy matches single-rank reference. + assert pe_mpi == pytest.approx(expected_e) + # Per-atom forces match (atoms in id-sorted order from the + # subprocess script). 
+ for ii in range(6): + np.testing.assert_allclose( + forces_mpi[ii], + expected_f[ii], + atol=1e-8, + rtol=0, + ) + finally: + if os.path.exists(out_path): + os.remove(out_path) diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc index 9dd9b50c3b..3bb7516155 100644 --- a/source/op/pt/comm.cc +++ b/source/op/pt/comm.cc @@ -452,3 +452,85 @@ TORCH_LIBRARY_FRAGMENT(deepmd, m) { m.def("border_op", border_op); m.def("border_op_backward", border_op_backward); } + +// ============================================================================ +// Opaque wrappers for the pt_expt (.pt2 / AOTInductor) export path. +// +// ``deepmd::border_op`` and ``deepmd::border_op_backward`` are registered +// without an explicit dispatch key, which makes them +// ``CompositeImplicitAutograd`` ops. ``torch.export`` decomposes such ops +// during tracing — i.e., it tries to inline the C++ kernel — and that +// fails because the kernel calls ``data_ptr()`` on FakeTensors. +// +// These ``deepmd_export::*`` wrappers are registered with explicit +// ``CPU`` and ``CUDA`` dispatch keys so ``torch.export`` records them as +// opaque external calls in the graph. The .pt2 archive embeds the call +// sites; at runtime the dispatcher routes back to the underlying +// ``deepmd::*`` op. Both clones because ``deepmd::border_op`` returns +// the same tensor it modified in place, which violates AOTInductor's +// no-aliasing rule for graph outputs. +// +// Python (``deepmd/pt_expt/utils/comm.py``) layers ``register_fake`` and +// ``register_autograd`` on top of these C++-defined ops so traced graphs +// can run their fake/backward. 
+// ============================================================================ + +namespace { +torch::Tensor border_op_export(const torch::Tensor& sendlist_tensor, + const torch::Tensor& sendproc_tensor, + const torch::Tensor& recvproc_tensor, + const torch::Tensor& sendnum_tensor, + const torch::Tensor& recvnum_tensor, + const torch::Tensor& g1_tensor, + const torch::Tensor& communicator_tensor, + const torch::Tensor& nlocal_tensor, + const torch::Tensor& nghost_tensor) { + auto out = border_op(sendlist_tensor, sendproc_tensor, recvproc_tensor, + sendnum_tensor, recvnum_tensor, g1_tensor, + communicator_tensor, nlocal_tensor, nghost_tensor); + // border_op returns {g1_tensor} — a list whose first element aliases + // g1_tensor. Clone for AOTI graph-output correctness. + return out.empty() ? torch::empty_like(g1_tensor) : out[0].clone(); +} + +torch::Tensor border_op_backward_export( + const torch::Tensor& sendlist_tensor, + const torch::Tensor& sendproc_tensor, + const torch::Tensor& recvproc_tensor, + const torch::Tensor& sendnum_tensor, + const torch::Tensor& recvnum_tensor, + const torch::Tensor& grad_g1, + const torch::Tensor& communicator_tensor, + const torch::Tensor& nlocal_tensor, + const torch::Tensor& nghost_tensor) { + return border_op_backward(sendlist_tensor, sendproc_tensor, recvproc_tensor, + sendnum_tensor, recvnum_tensor, grad_g1, + communicator_tensor, nlocal_tensor, nghost_tensor) + .clone(); +} +} // namespace + +TORCH_LIBRARY_FRAGMENT(deepmd_export, m) { + m.def( + "border_op(Tensor sendlist, Tensor sendproc, Tensor recvproc, " + "Tensor sendnum, Tensor recvnum, Tensor g1, Tensor communicator, " + "Tensor nlocal, Tensor nghost) -> Tensor"); + m.def( + "border_op_backward(Tensor sendlist, Tensor sendproc, Tensor recvproc, " + "Tensor sendnum, Tensor recvnum, Tensor grad_g1, Tensor communicator, " + "Tensor nlocal, Tensor nghost) -> Tensor"); +} + +// Register CPU + CUDA implementations under explicit dispatch keys so +// torch.export sees 
opaque external calls (vs CompositeImplicitAutograd +// which gets decomposed during trace). +TORCH_LIBRARY_IMPL(deepmd_export, CPU, m) { + m.impl("border_op", border_op_export); + m.impl("border_op_backward", border_op_backward_export); +} +#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) +TORCH_LIBRARY_IMPL(deepmd_export, CUDA, m) { + m.impl("border_op", border_op_export); + m.impl("border_op_backward", border_op_backward_export); +} +#endif diff --git a/source/tests/infer/gen_dpa3.py b/source/tests/infer/gen_dpa3.py index 322163462d..69a6757d0e 100644 --- a/source/tests/infer/gen_dpa3.py +++ b/source/tests/infer/gen_dpa3.py @@ -88,6 +88,24 @@ def main(): print(f"Exporting to {pt2_path} ...") # noqa: T201 pt_expt_deserialize_to_file(pt2_path, copy.deepcopy(data)) + # Multi-rank LAMMPS variant (use_loc_mapping=False) — produces a + # dual-artifact .pt2 with the with-comm AOTI module nested inside + # so the C++ DeepPotPTExpt routes to it under mpirun. See + # source/lmp/tests/test_lammps_dpa3_pt2.py::test_pair_deepmd_mpi_dpa3. + config_mpi = copy.deepcopy(config) + config_mpi["descriptor"]["use_loc_mapping"] = False + model_mpi = get_model(config_mpi) + data_mpi = { + "model": model_mpi.serialize(), + "model_def_script": config_mpi, + "backend": "dpmodel", + "software": "deepmd-kit", + "version": "3.0.0", + } + pt2_mpi_path = os.path.join(base_dir, "deeppot_dpa3_mpi.pt2") + print(f"Exporting to {pt2_mpi_path} ...") # noqa: T201 + pt_expt_deserialize_to_file(pt2_mpi_path, copy.deepcopy(data_mpi)) + pth_path = os.path.join(base_dir, "deeppot_dpa3.pth") print(f"Exporting to {pth_path} ...") # noqa: T201 try: From 8b2501dbb1e8db7dd8f9fa5c965293c74c9e0e71 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 26 Apr 2026 17:43:03 +0800 Subject: [PATCH 10/34] test(gnn-mpi): expand multi-rank coverage; address Phase 5 follow-up gaps Three fixes targeting the limitations from the previous Phase 5 commit: 1. 
NULL-type atoms (build_comm_dict_with_virtual_atoms equivalent). When ``select_real_atoms_coord`` filters atoms with atype < 0, the LAMMPS-supplied sendlist still indexes the original atom array. ``DeepPotPTExpt::compute`` (and Spin) now check ``has_null_atoms = (nall_real < nall)`` and route to the new ``build_comm_tensors_positional_with_virtual_atoms`` helper in commonPTExpt.h, which calls ``remap_comm_sendlist`` to translate indices through ``fwd_map`` (mirrors what ``commonPT.h::build_comm_dict_with_virtual_atoms`` does for the torch.jit pt-backend). Untested numerically (no test fixture produces NULL-type atoms in multi-rank); code path is structurally identical to the validated pt-backend equivalent. 2. nlist-rebuild test (test_pair_deepmd_mpi_dpa3_nlist_rebuild). Runs 50 MD steps under mpirun -n 2 with neigh_modify every=10, forcing >=5 neighbor-list rebuilds. Validates the with-comm dispatch path stays consistent across rebuilds (the comm tensors are reconstructed when ``ago == 0`` triggers). Asserts forces stay finite and bounded; no exact-value comparison since round-off accumulates over the trajectory and cross-rank ordering can shift the LSBs. 3. Spin multi-rank dispatch wiring (DeepSpinPTExpt::compute). Same has_null_atoms branch as DeepPotPTExpt. Code path structurally identical to the validated DeepPotPTExpt path; no spin-specific multi-rank test yet (would need a spin DPA3 .pt2 with use_loc_mapping=False to exercise it end-to-end). Note: virial check via LAMMPS ``compute pressure NULL virial`` caused PyLammps multi-rank deadlock; deferred to a follow-up. Forces ARE the autograd output of energy through the with-comm graph, so force parity already validates the with-comm backward path. All 26 pt_expt LAMMPS tests pass (including the new multi-rank ones); 9 model_devi_pt2 tests confirm DeepPotModelDevi delegates correctly through the dispatch.
--- source/api_cc/src/DeepPotPTExpt.cc | 21 +++- source/api_cc/src/DeepSpinPTExpt.cc | 19 ++- source/api_cc/src/commonPTExpt.h | 53 +++++--- .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py | 15 +++ source/lmp/tests/test_lammps_dpa3_pt2.py | 116 ++++++++++++------ 5 files changed, 162 insertions(+), 62 deletions(-) diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index 020566de23..711061f6bf 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -396,10 +397,24 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, // tensor to gather ghost embeddings from local atoms. std::vector flat_outputs; bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0; + // When NULL-type atoms exist, remapped storage must outlive comm + // tensors (the int** pointer-array tensor references it). + std::vector> remapped_sendlist; + std::vector remapped_sendlist_ptrs; + std::vector remapped_sendnum, remapped_recvnum; if (use_with_comm) { - auto comm_tensors = deepmd::ptexpt::build_comm_tensors_positional( - lmp_list, lmp_list.sendlist, lmp_list.sendnum, lmp_list.recvnum, nloc, - nghost_real); + bool has_null_atoms = (nall_real < nall); + std::vector comm_tensors; + if (has_null_atoms) { + comm_tensors = + deepmd::ptexpt::build_comm_tensors_positional_with_virtual_atoms( + lmp_list, fwd_map, nloc, nghost_real, remapped_sendlist, + remapped_sendlist_ptrs, remapped_sendnum, remapped_recvnum); + } else { + comm_tensors = deepmd::ptexpt::build_comm_tensors_positional( + lmp_list, lmp_list.sendlist, lmp_list.sendnum, lmp_list.recvnum, + nloc, nghost_real); + } flat_outputs = run_model_with_comm( coord_Tensor, atype_Tensor, firstneigh_tensor, mapping_tensor, fparam_tensor, aparam_tensor, comm_tensors); diff --git a/source/api_cc/src/DeepSpinPTExpt.cc b/source/api_cc/src/DeepSpinPTExpt.cc index e16991d884..cf714ea79e 100644 --- 
a/source/api_cc/src/DeepSpinPTExpt.cc +++ b/source/api_cc/src/DeepSpinPTExpt.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -414,10 +415,22 @@ void DeepSpinPTExpt::compute(ENERGYVTYPE& ener, // (pre atom-doubling); the spin override halves them internally. std::vector flat_outputs; bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0; + std::vector> remapped_sendlist; + std::vector remapped_sendlist_ptrs; + std::vector remapped_sendnum, remapped_recvnum; if (use_with_comm) { - auto comm_tensors = deepmd::ptexpt::build_comm_tensors_positional( - lmp_list, lmp_list.sendlist, lmp_list.sendnum, lmp_list.recvnum, nloc, - nghost_real); + bool has_null_atoms = (nall_real < nall); + std::vector comm_tensors; + if (has_null_atoms) { + comm_tensors = + deepmd::ptexpt::build_comm_tensors_positional_with_virtual_atoms( + lmp_list, fwd_map, nloc, nghost_real, remapped_sendlist, + remapped_sendlist_ptrs, remapped_sendnum, remapped_recvnum); + } else { + comm_tensors = deepmd::ptexpt::build_comm_tensors_positional( + lmp_list, lmp_list.sendlist, lmp_list.sendnum, lmp_list.recvnum, + nloc, nghost_real); + } flat_outputs = run_model_with_comm( coord_Tensor, atype_Tensor, spin_Tensor, firstneigh_tensor, mapping_tensor, fparam_tensor, aparam_tensor, comm_tensors); diff --git a/source/api_cc/src/commonPTExpt.h b/source/api_cc/src/commonPTExpt.h index dcaaddd6ea..20ffe10781 100644 --- a/source/api_cc/src/commonPTExpt.h +++ b/source/api_cc/src/commonPTExpt.h @@ -18,6 +18,7 @@ #include #include +#include "common.h" // for remap_comm_sendlist #include "errors.h" #include "neighbor_list.h" @@ -649,21 +650,9 @@ class TempFile { // ============================================================================ /** - * @brief Build the 8 comm-tensor positional inputs from LAMMPS data. 
- * - * Tensors share storage with the LAMMPS-owned buffers (no copy); - * the caller must keep ``lmp_list``, ``sendlist``, ``sendnum``, and - * ``recvnum`` alive until ``loader->run`` returns. ``nlocal`` / - * ``nghost`` are produced via ``torch::tensor`` (small allocation). - * - * @param lmp_list LAMMPS neighbor list (provides nswap, sendproc, - * recvproc, world). - * @param sendlist int** pointer-array (already remapped if needed). - * @param sendnum int* per-swap send counts (already remapped). - * @param recvnum int* per-swap recv counts (already remapped). - * @param nlocal Number of local atoms (per-rank). - * @param nghost Number of ghost atoms (per-rank). - * @return Vector of 8 tensors in canonical positional order. + * @brief Build the 8 comm-tensor positional inputs from LAMMPS data + * (Phase 5 working signature, restored after the consolidation + * attempt regressed). */ inline std::vector build_comm_tensors_positional( const InputNlist& lmp_list, @@ -678,9 +667,6 @@ inline std::vector build_comm_tensors_positional( auto int64_option = torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64); - // sendlist is int**: nswap entries each holding an int* pointer. - // Reinterpret as int64 for tensor packaging (matches what pt's - // build_comm_dict does and what border_op expects). at::Tensor sendlist_tensor = torch::from_blob(static_cast(sendlist), {nswap}, int64_option); at::Tensor sendproc_tensor = @@ -690,7 +676,6 @@ inline std::vector build_comm_tensors_positional( at::Tensor sendnum_tensor = torch::from_blob(sendnum, {nswap}, int32_option); at::Tensor recvnum_tensor = torch::from_blob(recvnum, {nswap}, int32_option); - // MPI communicator handle as a 1-element int64 tensor. 
static std::int64_t null_communicator = 0; at::Tensor communicator_tensor; if (lmp_list.world == nullptr) { @@ -701,7 +686,6 @@ inline std::vector build_comm_tensors_positional( torch::from_blob(const_cast(lmp_list.world), {1}, int64_option); } - // Scalar nlocal / nghost — int32 to match Python-side tracing. at::Tensor nlocal_tensor = torch::tensor(nlocal, int32_option); at::Tensor nghost_tensor = torch::tensor(nghost, int32_option); @@ -709,5 +693,34 @@ inline std::vector build_comm_tensors_positional( recvnum_tensor, communicator_tensor, nlocal_tensor, nghost_tensor}; } +/** + * @brief Build the 8 comm-tensor positional inputs with NULL-type-atom + * remapping. When ``select_real_atoms_coord`` filters atoms (atype < + * 0), ``fwd_map`` translates original sendlist indices into real-atom + * indices (with ``-1`` for filtered). Mirrors + * ``commonPT.h::build_comm_dict_with_virtual_atoms``. The remapped + * storage must outlive the returned tensors. + */ +inline std::vector build_comm_tensors_positional_with_virtual_atoms( + const InputNlist& lmp_list, + const std::vector& fwd_map, + int nlocal, + int nghost, + std::vector>& remapped_sendlist, + std::vector& remapped_sendlist_ptrs, + std::vector& remapped_sendnum, + std::vector& remapped_recvnum) { + remap_comm_sendlist(remapped_sendlist, remapped_sendnum, remapped_recvnum, + lmp_list, fwd_map); + int nswap = lmp_list.nswap; + remapped_sendlist_ptrs.resize(nswap); + for (int s = 0; s < nswap; ++s) { + remapped_sendlist_ptrs[s] = remapped_sendlist[s].data(); + } + return build_comm_tensors_positional( + lmp_list, remapped_sendlist_ptrs.data(), remapped_sendnum.data(), + remapped_recvnum.data(), nlocal, nghost); +} + } // namespace ptexpt } // namespace deepmd diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py index 29b103cf01..1d593882bd 100644 --- a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py +++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py 
@@ -33,6 +33,14 @@ parser.add_argument("DATAFILE", type=str, help="LAMMPS data file (atom positions)") parser.add_argument("PB_FILE", type=str, help=".pt2 model file") parser.add_argument("OUTPUT", type=str, help="Output file for energies + forces") +parser.add_argument( + "--nsteps", + type=int, + default=0, + help="Number of MD steps to run after the initial force evaluation; " + "with --nsteps > 10 (LAMMPS neigh_modify every=10) the dispatch path " + "is exercised across at least one neighbor-list rebuild.", +) args = parser.parse_args() lammps = PyLammps() @@ -55,6 +63,13 @@ lammps.pair_coeff("* *") lammps.run(0) +# Optional: run additional MD steps to exercise the with-comm +# dispatch across neighbor-list rebuilds (LAMMPS rebuilds every +# 10 steps with our neigh_modify config, so any nsteps >= 10 +# triggers at least one rebuild). +if args.nsteps > 0: + lammps.run(args.nsteps) + # Forces need to be gathered across ranks. PyLammps's ``atoms[i]`` # only exposes rank-local atoms; ``gather_atoms`` returns the global, # id-ordered array on every rank. 
diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py index 73b3ea1442..7b33c64b75 100644 --- a/source/lmp/tests/test_lammps_dpa3_pt2.py +++ b/source/lmp/tests/test_lammps_dpa3_pt2.py @@ -341,6 +341,38 @@ def test_pair_deepmd_si(lammps_si) -> None: # --------------------------------------------------------------------------- +def _run_mpi_subprocess(extra_args: list[str] | None = None) -> dict: + """Helper: invoke run_mpi_pair_deepmd_dpa3_pt2.py under mpirun -n 2, + return ``{"pe": float, "forces": (n, 3) array}``.""" + with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f: + out_path = f.name + try: + argv = [ + "mpirun", + "-n", + "2", + sys.executable, + str(Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"), + str(data_file.resolve()), + str(pb_file_mpi.resolve()), + out_path, + ] + if extra_args: + argv.extend(extra_args) + sp.check_call(argv) + with open(out_path) as fh: + lines = fh.read().strip().splitlines() + pe = float(lines[0]) + forces = np.array( + [list(map(float, line.split())) for line in lines[1:]], + dtype=np.float64, + ) + return {"pe": pe, "forces": forces} + finally: + if os.path.exists(out_path): + os.remove(out_path) + + @pytest.mark.skipif( shutil.which("mpirun") is None, reason="MPI is not installed on this system" ) @@ -349,46 +381,58 @@ def test_pair_deepmd_si(lammps_si) -> None: ) def test_pair_deepmd_mpi_dpa3() -> None: """Multi-rank LAMMPS run for DPA3 .pt2 must match the single-rank - reference within numerical tolerance. + reference within numerical tolerance for energy and forces. + + Forces are the autograd output of energy through the with-comm + graph, so they implicitly validate the backward path of + ``deepmd_export::border_op``. Virial requires a separate + ``compute pressure NULL virial`` which interacts poorly with + PyLammps multi-rank (hangs); deferred to a follow-up. 
Requires the .pt2 archive to carry a with-comm artifact (Phase 3 output for GNN models). If the archive lacks it, the C++ falls back to the regular artifact and produces wrong cross-rank values — which the assertion would catch (loud test failure, not silent). """ - with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f: - out_path = f.name - try: - sp.check_call( - [ - "mpirun", - "-n", - "2", - sys.executable, - str(Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"), - str(data_file.resolve()), - str(pb_file_mpi.resolve()), - out_path, - ] - ) - with open(out_path) as fh: - lines = fh.read().strip().splitlines() - pe_mpi = float(lines[0]) - forces_mpi = np.array( - [list(map(float, line.split())) for line in lines[1:]], - dtype=np.float64, + out = _run_mpi_subprocess() + # Energy matches single-rank reference. + assert out["pe"] == pytest.approx(expected_e) + # Per-atom forces match (atoms in id-sorted order from the + # subprocess script). + for ii in range(6): + np.testing.assert_allclose( + out["forces"][ii], + expected_f[ii], + atol=1e-8, + rtol=0, ) - # Energy matches single-rank reference. - assert pe_mpi == pytest.approx(expected_e) - # Per-atom forces match (atoms in id-sorted order from the - # subprocess script). - for ii in range(6): - np.testing.assert_allclose( - forces_mpi[ii], - expected_f[ii], - atol=1e-8, - rtol=0, - ) - finally: - if os.path.exists(out_path): - os.remove(out_path) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa3_nlist_rebuild() -> None: + """Multi-rank with neighbor-list rebuilds. + + Runs ~50 MD steps with ``neigh_modify every 10 delay 0 check no``, + forcing at least 5 nlist rebuilds during the trajectory. 
The + purpose is NOT to validate exact final-state values (round-off + accumulates over MD steps and cross-rank ordering can shift the + LSBs) but to verify the with-comm dispatch path stays consistent + across rebuilds — i.e. ``DeepPotPTExpt::compute`` correctly + reconstructs the comm tensors when ``ago == 0`` triggers and the + AOTI graph keeps producing finite values. + """ + out = _run_mpi_subprocess(extra_args=["--nsteps", "50"]) + # Trajectory advanced; final state will differ from the run-0 + # reference. Just sanity-check finite values + reasonable forces. + assert np.all(np.isfinite(out["forces"])) + assert np.isfinite(out["pe"]) + # Force magnitudes shouldn't blow up; pick a generous bound for + # the small-box water-like 6-atom system. + assert np.max(np.abs(out["forces"])) < 100.0, ( + f"forces exploded after 50 steps: max|f|={np.max(np.abs(out['forces']))}" + ) From c43bd8bc8b022ab182d13a7899795cb8c5ed068f Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 26 Apr 2026 21:22:35 +0800 Subject: [PATCH 11/34] test(gnn-mpi): tighten multi-rank LAMMPS test assertions - run_mpi_pair_deepmd_dpa3_pt2.py: gather atom ids alongside forces and sort by id explicitly. Output ordering is now robust to subdomain layout, empty subdomains, or future LAMMPS gather_atoms changes. Add atom_modify map yes so single-rank dispatch on the dual-artifact .pt2 (uses mapping) works; expose --processors so the runner can produce a same-archive single-rank reference. - test_pair_deepmd_mpi_dpa3_nlist_rebuild: replace the finite/bounded sanity check with a value comparison against a single-rank reference of the same trajectory (mpirun -n 1, processors "1 1 1"). 25 MD steps cross two nlist rebuilds, atol=1e-6 forces / rel=1e-8 energy. This catches a wrong-but-finite force from a dispatch bug that the previous assertion would have missed. 
--- .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py | 33 ++++++++-- source/lmp/tests/test_lammps_dpa3_pt2.py | 61 ++++++++++++------- 2 files changed, 66 insertions(+), 28 deletions(-) diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py index 1d593882bd..fa536b5c6f 100644 --- a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py +++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py @@ -41,16 +41,31 @@ "with --nsteps > 10 (LAMMPS neigh_modify every=10) the dispatch path " "is exercised across at least one neighbor-list rebuild.", ) +parser.add_argument( + "--processors", + type=str, + default="2 1 1", + help="LAMMPS processors grid. Default '2 1 1' forces multi-rank " + "domain decomposition (nswap>0). Pass '1 1 1' for a single-rank " + "reference run on the same archive (single-artifact dispatch).", +) args = parser.parse_args() lammps = PyLammps() -# Force a non-trivial domain decomposition: 2 x 1 x 1 across ranks. -# Combined with the simulation box this guarantees nswap > 0 on the C++ -# side, so DeepPotPTExpt routes to the with-comm AOTI artifact. -lammps.processors("2 1 1") +# Force the requested domain decomposition. The default "2 1 1" +# combined with the simulation box guarantees nswap > 0 on the C++ +# side, so DeepPotPTExpt routes to the with-comm AOTI artifact. Pass +# "1 1 1" to obtain a single-rank reference using the same archive +# (the regular artifact handles nswap==0). +lammps.processors(args.processors) lammps.units("metal") lammps.boundary("p p p") lammps.atom_style("atomic") +# ``atom_modify map yes`` is required when single-rank dispatch goes +# through the regular artifact of a use_loc_mapping=False .pt2: the +# C++ side needs the LAMMPS global-id->local-index map to build the +# ``mapping`` tensor. It is harmless under multi-rank. 
+lammps.atom_modify("map yes") lammps.neighbor("2.0 bin") lammps.neigh_modify("every 10 delay 0 check no") lammps.read_data(args.DATAFILE) @@ -72,13 +87,21 @@ # Forces need to be gathered across ranks. PyLammps's ``atoms[i]`` # only exposes rank-local atoms; ``gather_atoms`` returns the global, -# id-ordered array on every rank. +# id-ordered array on every rank. We also gather ``id`` and reorder +# explicitly by id rather than trusting an implicit ordering — this +# is robust against subdomain layout, empty subdomains, and any +# future LAMMPS change in gather ordering. forces_global = lammps.lmp.gather_atoms("f", 1, 3) +ids_global = lammps.lmp.gather_atoms("id", 0, 1) # ``PyLammps.eval`` is rank-0-only. if rank == 0: pe_global = lammps.eval("pe") natoms = lammps.atoms.natoms forces = np.array(forces_global, dtype=np.float64).reshape(natoms, 3) + ids = np.array(ids_global, dtype=np.int64).reshape(natoms) + # Sort by atom id so output is unambiguously id-ordered (id 1 first). + order = np.argsort(ids) + forces = forces[order] with open(args.OUTPUT, "w") as f: f.write(f"{pe_global:.16e}\n") for row in forces: diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py index 7b33c64b75..d14a46cea0 100644 --- a/source/lmp/tests/test_lammps_dpa3_pt2.py +++ b/source/lmp/tests/test_lammps_dpa3_pt2.py @@ -341,22 +341,33 @@ def test_pair_deepmd_si(lammps_si) -> None: # --------------------------------------------------------------------------- -def _run_mpi_subprocess(extra_args: list[str] | None = None) -> dict: - """Helper: invoke run_mpi_pair_deepmd_dpa3_pt2.py under mpirun -n 2, - return ``{"pe": float, "forces": (n, 3) array}``.""" +def _run_mpi_subprocess( + extra_args: list[str] | None = None, + nprocs: int = 2, +) -> dict: + """Helper: invoke run_mpi_pair_deepmd_dpa3_pt2.py under + ``mpirun -n `` and return ``{"pe": float, "forces": (n, 3) array}``. 
+ + With ``nprocs == 1`` the runner is invoked with ``--processors 1 1 1`` + so the C++ side sees ``nswap == 0`` and routes to the regular + (single-rank) artifact of the dual-artifact .pt2 — useful as a + same-archive reference for multi-rank comparisons. + """ with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f: out_path = f.name try: argv = [ "mpirun", "-n", - "2", + str(nprocs), sys.executable, str(Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"), str(data_file.resolve()), str(pb_file_mpi.resolve()), out_path, ] + if nprocs == 1: + argv.extend(["--processors", "1 1 1"]) if extra_args: argv.extend(extra_args) sp.check_call(argv) @@ -415,24 +426,28 @@ def test_pair_deepmd_mpi_dpa3() -> None: importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" ) def test_pair_deepmd_mpi_dpa3_nlist_rebuild() -> None: - """Multi-rank with neighbor-list rebuilds. - - Runs ~50 MD steps with ``neigh_modify every 10 delay 0 check no``, - forcing at least 5 nlist rebuilds during the trajectory. The - purpose is NOT to validate exact final-state values (round-off - accumulates over MD steps and cross-rank ordering can shift the - LSBs) but to verify the with-comm dispatch path stays consistent - across rebuilds — i.e. ``DeepPotPTExpt::compute`` correctly - reconstructs the comm tensors when ``ago == 0`` triggers and the - AOTI graph keeps producing finite values. + """Multi-rank with neighbor-list rebuilds, validated against a + single-rank reference of the same archive and trajectory. + + Runs 25 MD steps with ``neigh_modify every 10 delay 0 check no``, + so the multi-rank trajectory crosses two nlist rebuilds (at steps + 10 and 20) before the final force evaluation. The same trajectory + is then run under ``mpirun -n 1`` (regular-artifact dispatch on + the same dual-artifact .pt2) to obtain a reference; comparing the + two catches a wrong-but-finite force from a dispatch bug that the + previous finite/bounded check would miss. 
+ + NVE is deterministic up to floating-point summation order, so the + cross-rank divergence after 25 steps is bounded by accumulated + round-off — small for a 6-atom system but non-zero, hence the + relaxed (but still tight) tolerances. """ - out = _run_mpi_subprocess(extra_args=["--nsteps", "50"]) - # Trajectory advanced; final state will differ from the run-0 - # reference. Just sanity-check finite values + reasonable forces. - assert np.all(np.isfinite(out["forces"])) - assert np.isfinite(out["pe"]) - # Force magnitudes shouldn't blow up; pick a generous bound for - # the small-box water-like 6-atom system. - assert np.max(np.abs(out["forces"])) < 100.0, ( - f"forces exploded after 50 steps: max|f|={np.max(np.abs(out['forces']))}" + out_mpi = _run_mpi_subprocess(extra_args=["--nsteps", "25"], nprocs=2) + out_ref = _run_mpi_subprocess(extra_args=["--nsteps", "25"], nprocs=1) + np.testing.assert_allclose( + out_mpi["forces"], + out_ref["forces"], + atol=1e-6, + rtol=1e-6, ) + assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10) From 17064354511b6ab56e12d6f02dd97eecf7f8f83d Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 26 Apr 2026 22:17:27 +0800 Subject: [PATCH 12/34] fix(cc): handle empty subdomain in copy_from_nlist; expand MPI tests with virial - common.cc: NeighborListData::copy_from_nlist used &ilist[0] / &jlist[ii][0] for the memcpy destination, which is OOB on an empty vector (libstdc++ debug-mode assertion) and undefined behaviour in general. Switch to .data() and skip the copy when the count is zero. Surfaced by the new empty-subdomain MPI test where rank 1 owns nloc=0 atoms; the same latent bug also applied to atoms with no neighbours. - run_mpi_pair_deepmd_dpa3_pt2.py: also gather the per-atom virial via ``compute centroid/stress/atom NULL pair`` and ``lmp.gather("c_virial", 1, 9)``. Output rows are now (3 force) + (9 virial) per atom, id-sorted. 
- test_lammps_dpa3_pt2.py: * test_pair_deepmd_mpi_dpa3 now asserts virial against expected_v (with the same column permutation as test_pair_deepmd_virial), closing the previous "virial multi-rank deferred" gap. * test_pair_deepmd_mpi_dpa3_nlist_rebuild now also compares virial between the multi-rank and single-rank reference runs. * New test_pair_deepmd_mpi_dpa3_empty_subdomain: 30 x 13 x 13 box with all atoms in x in [0.25, 12.83]; under processors "2 1 1" rank 1 owns zero local atoms. Compares forces + virial + energy against a same-archive single-rank reference. --- source/api_cc/src/common.cc | 15 ++- .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py | 22 ++++- source/lmp/tests/test_lammps_dpa3_pt2.py | 94 +++++++++++++++++-- 3 files changed, 114 insertions(+), 17 deletions(-) diff --git a/source/api_cc/src/common.cc b/source/api_cc/src/common.cc index 1ad1a5c97b..7154992892 100644 --- a/source/api_cc/src/common.cc +++ b/source/api_cc/src/common.cc @@ -276,13 +276,20 @@ void deepmd::NeighborListData::copy_from_nlist(const InputNlist& inlist, int inum = natoms >= 0 ? natoms : inlist.inum; ilist.resize(inum); jlist.resize(inum); - memcpy(&ilist[0], inlist.ilist, inum * sizeof(int)); + // Guard against an empty subdomain (inum == 0): &ilist[0] on an + // empty vector is OOB under libstdc++ debug-mode and undefined + // behaviour in general. Use data() and skip the copy when empty. 
+ if (inum > 0) { + memcpy(ilist.data(), inlist.ilist, inum * sizeof(int)); + } for (int ii = 0; ii < inum; ++ii) { int jnum = inlist.numneigh[ii]; jlist[ii].resize(jnum); - memcpy(&jlist[ii][0], inlist.firstneigh[ii], jnum * sizeof(int)); - for (int jj = 0; jj < jnum; ++jj) { - jlist[ii][jj] &= inlist.mask; + if (jnum > 0) { + memcpy(jlist[ii].data(), inlist.firstneigh[ii], jnum * sizeof(int)); + for (int jj = 0; jj < jnum; ++jj) { + jlist[ii][jj] &= inlist.mask; + } } } } diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py index fa536b5c6f..d07af7a158 100644 --- a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py +++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py @@ -8,8 +8,10 @@ reports nswap > 0 (multi-rank), driving MPI ghost-atom exchange via ``deepmd_export::border_op`` per layer. -Rank 0 writes potential energy + per-atom forces to ``OUTPUT`` so the parent -pytest process can compare against the single-rank reference. +Rank 0 writes potential energy + per-atom forces (3 cols) + per-atom +virial (9 cols, from ``compute centroid/stress/atom NULL pair`` in +LAMMPS internal units) to ``OUTPUT`` so the parent pytest process can +compare against the single-rank reference. """ from __future__ import ( @@ -76,6 +78,12 @@ lammps.pair_style(f"deepmd {args.PB_FILE}") lammps.pair_coeff("* *") +# Per-atom virial from the pair contribution. ``centroid/stress/atom`` +# is parallel-safe (rank-local data, gathered below). LAMMPS computes +# stress*volume per atom in internal units; the parent test reverses +# the unit conversion (divide by ``constants.nktv2p``) before comparing +# against the reference virial. +lammps.compute("virial all centroid/stress/atom NULL pair") lammps.run(0) # Optional: run additional MD steps to exercise the with-comm @@ -93,18 +101,26 @@ # future LAMMPS change in gather ordering. 
forces_global = lammps.lmp.gather_atoms("f", 1, 3) ids_global = lammps.lmp.gather_atoms("id", 0, 1) +# Gather the per-atom virial across ranks. ``lmp.gather`` accepts +# named per-atom computes (``c_``) and returns the global, +# id-ordered array on every rank. +virial_global = lammps.lmp.gather("c_virial", 1, 9) # ``PyLammps.eval`` is rank-0-only. if rank == 0: pe_global = lammps.eval("pe") natoms = lammps.atoms.natoms forces = np.array(forces_global, dtype=np.float64).reshape(natoms, 3) + virials = np.array(virial_global, dtype=np.float64).reshape(natoms, 9) ids = np.array(ids_global, dtype=np.int64).reshape(natoms) # Sort by atom id so output is unambiguously id-ordered (id 1 first). order = np.argsort(ids) forces = forces[order] + virials = virials[order] with open(args.OUTPUT, "w") as f: f.write(f"{pe_global:.16e}\n") - for row in forces: + # Each row: 3 force components followed by 9 virial components. + for fi, vi in zip(forces, virials, strict=True): + row = np.concatenate([fi, vi]) f.write(" ".join(f"{v:.16e}" for v in row) + "\n") MPI.Finalize() diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py index d14a46cea0..84127246d6 100644 --- a/source/lmp/tests/test_lammps_dpa3_pt2.py +++ b/source/lmp/tests/test_lammps_dpa3_pt2.py @@ -35,6 +35,12 @@ data_file = Path(__file__).parent / "data_dpa3_pt2.lmp" data_file_si = Path(__file__).parent / "data_dpa3_pt2.si" data_type_map_file = Path(__file__).parent / "data_type_map_dpa3_pt2.lmp" +# Elongated-box variant for the empty-subdomain MPI test: x is +# extended to 30 Å while atoms remain in x ∈ [0.25, 12.83]. Combined +# with ``processors 2 1 1`` this leaves rank 1 (x ≥ 15) with zero +# local atoms — a corner case the comm-dispatch path must handle +# without crashing or producing wrong forces. 
+data_file_empty_subdomain = Path(__file__).parent / "data_dpa3_pt2_empty_subdomain.lmp" # Reference values from gen_dpa3.py / test_deeppot_dpa3_ptexpt.cc (PBC) expected_ae = np.array( @@ -158,10 +164,19 @@ def setup_module() -> None: type_OH, data_file_si, ) + # Elongated x-axis; atoms unchanged. With ``processors 2 1 1`` the + # split is at x = 15 Å and rank 1 owns x ≥ 15, which is empty. + box_empty_subdomain = np.array([0, 30, 0, 13, 0, 13, 0, 0, 0]) + write_lmp_data(box_empty_subdomain, coord, type_OH, data_file_empty_subdomain) def teardown_module() -> None: - for f in [data_file, data_type_map_file, data_file_si]: + for f in [ + data_file, + data_type_map_file, + data_file_si, + data_file_empty_subdomain, + ]: if f.exists(): os.remove(f) @@ -344,15 +359,22 @@ def test_pair_deepmd_si(lammps_si) -> None: def _run_mpi_subprocess( extra_args: list[str] | None = None, nprocs: int = 2, + data_path: Path | None = None, ) -> dict: """Helper: invoke run_mpi_pair_deepmd_dpa3_pt2.py under - ``mpirun -n <nprocs>`` and return ``{"pe": float, "forces": (n, 3) array}``. + ``mpirun -n <nprocs>`` and return + ``{"pe": float, "forces": (n, 3) array, "virials": (n, 9) array}``. With ``nprocs == 1`` the runner is invoked with ``--processors 1 1 1`` so the C++ side sees ``nswap == 0`` and routes to the regular (single-rank) artifact of the dual-artifact .pt2 — useful as a same-archive reference for multi-rank comparisons. + + ``data_path`` (default ``data_file``) selects the LAMMPS data file — + the empty-subdomain test points at a non-default elongated-box file. 
""" + if data_path is None: + data_path = data_file with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f: out_path = f.name try: @@ -362,7 +384,7 @@ def _run_mpi_subprocess( str(nprocs), sys.executable, str(Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"), - str(data_file.resolve()), + str(data_path.resolve()), str(pb_file_mpi.resolve()), out_path, ] @@ -374,11 +396,14 @@ def _run_mpi_subprocess( with open(out_path) as fh: lines = fh.read().strip().splitlines() pe = float(lines[0]) - forces = np.array( + rows = np.array( [list(map(float, line.split())) for line in lines[1:]], dtype=np.float64, ) - return {"pe": pe, "forces": forces} + # Each row is (3 force) + (9 virial); see runner script. + forces = rows[:, :3] + virials = rows[:, 3:] + return {"pe": pe, "forces": forces, "virials": virials} finally: if os.path.exists(out_path): os.remove(out_path) @@ -392,16 +417,17 @@ def _run_mpi_subprocess( ) def test_pair_deepmd_mpi_dpa3() -> None: """Multi-rank LAMMPS run for DPA3 .pt2 must match the single-rank - reference within numerical tolerance for energy and forces. + reference within numerical tolerance for energy, forces, and virial. Forces are the autograd output of energy through the with-comm graph, so they implicitly validate the backward path of - ``deepmd_export::border_op``. Virial requires a separate - ``compute pressure NULL virial`` which interacts poorly with - PyLammps multi-rank (hangs); deferred to a follow-up. + ``deepmd_export::border_op``. Per-atom virial is gathered from + ``compute centroid/stress/atom NULL pair`` (parallel-safe) — the + earlier deadlock comment was specific to ``compute pressure NULL + virial`` + ``lammps.eval(...)``, which we sidestep entirely. Requires the .pt2 archive to carry a with-comm artifact (Phase 3 - output for GNN models). If the archive lacks it, the C++ falls + output for GNN models). 
If the archive lacks it, the C++ falls back to the regular artifact and produces wrong cross-rank values — which the assertion would catch (loud test failure, not silent). """ @@ -417,6 +443,20 @@ def test_pair_deepmd_mpi_dpa3() -> None: atol=1e-8, rtol=0, ) + # Per-atom virial matches the gen_dpa3.py reference. LAMMPS + # centroid/stress/atom returns components in [xx, yy, zz, xy, xz, + # yz, yx, zx, zy] order; ``expected_v`` columns follow the same + # column-major flattening as the single-rank ``test_pair_deepmd_virial`` + # (which uses idx_map [0, 4, 8, 3, 6, 7, 1, 2, 5] from c_virial[1..9] + # to expected_v columns). The inverse permutation maps + # ``out["virials"]`` columns back to ``expected_v`` columns. + expected_v_to_lammps = [0, 6, 7, 3, 1, 8, 4, 5, 2] + np.testing.assert_allclose( + out["virials"][:, expected_v_to_lammps] / constants.nktv2p, + expected_v, + atol=1e-8, + rtol=0, + ) @pytest.mark.skipif( @@ -450,4 +490,38 @@ def test_pair_deepmd_mpi_dpa3_nlist_rebuild() -> None: atol=1e-6, rtol=1e-6, ) + np.testing.assert_allclose( + out_mpi["virials"], + out_ref["virials"], + atol=1e-6, + rtol=1e-6, + ) assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa3_empty_subdomain() -> None: + """Multi-rank DPA3 with one rank owning zero local atoms. + + Uses a 30 x 13 x 13 box with all six atoms clustered in x in + [0.25, 12.83]. Under ``processors 2 1 1`` the split is at x = 15 + so rank 1 owns an empty subdomain. The comm-dispatch path must + still produce correct forces and virial (compared against a + same-archive single-rank reference of the same configuration). 
+ + This catches: zero-length send/recv lists in the comm tensors, + division-by-zero in nlocal-dependent reshapes, and any silent + drop of a rank's contribution when it has no atoms to evaluate. + """ + out_mpi = _run_mpi_subprocess(nprocs=2, data_path=data_file_empty_subdomain) + out_ref = _run_mpi_subprocess(nprocs=1, data_path=data_file_empty_subdomain) + np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0) + np.testing.assert_allclose( + out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0 + ) + assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-12, abs=1e-12) From a81fc10bd6c6b48cbd868987417a32d82c5bd252 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 27 Apr 2026 08:50:52 +0800 Subject: [PATCH 13/34] test: cover DPA2 multi-rank dispatch + fix opaque-op import order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - source/lmp/tests/test_lammps_dpa2_pt2.py (NEW): runs DPA2 .pt2 under mpirun -n 2 with the with-comm artifact and asserts pe + per-atom forces + per-atom virial match a same-archive single-rank reference. Closes the recorded gap "DPA2 multi-rank dispatch never exercised end-to-end" (gnn_mpi_untested_paths.md). The runner script (run_mpi_pair_deepmd_dpa3_pt2.py) is descriptor-agnostic so no new driver is needed. - source/tests/infer/gen_dpa2.py: drop dead config_mpi block accidentally added during planning. DPA2's repformer has no use_loc_mapping knob (unlike DPA3), so the single deeppot_dpa2.pt2 already carries the dual-artifact layout — _has_message_passing returns True for any DPA2 model. - source/tests/pt_expt/conftest.py: ``import deepmd.pt`` at conftest evaluation time so libdeepmd_op_pt.so is loaded and ``deepmd_export::{border_op, border_op_backward}`` are registered before any pt_expt test module imports ``deepmd.pt_expt.utils`` (which transitively imports ``comm.py`` and its ``_check_underlying_ops_loaded()`` runtime check). 
Previously this worked only when the test was collected alongside earlier modules that happened to import deepmd.pt first; running the spin/export tests in isolation crashed at collection. - source/tests/pt_expt/model/test_spin_export_with_comm.py: fix pre-existing test data bug — model has sel=[20,20,20] (sum=60) but the trace test was passing nlist with width 6, tripping the _format_nlist post-condition assertion. Now uses the correct sum(sel) width. Surfaced once the conftest fix above made the test reliably runnable in isolation. --- source/lmp/tests/test_lammps_dpa2_pt2.py | 145 ++++++++++++++++++ source/tests/infer/gen_dpa2.py | 4 + source/tests/pt_expt/conftest.py | 9 ++ .../model/test_spin_export_with_comm.py | 6 +- 4 files changed, 162 insertions(+), 2 deletions(-) create mode 100644 source/lmp/tests/test_lammps_dpa2_pt2.py diff --git a/source/lmp/tests/test_lammps_dpa2_pt2.py b/source/lmp/tests/test_lammps_dpa2_pt2.py new file mode 100644 index 0000000000..48ed966605 --- /dev/null +++ b/source/lmp/tests/test_lammps_dpa2_pt2.py @@ -0,0 +1,145 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Multi-rank LAMMPS test for DPA2 .pt2 (extends GNN MPI Phase 5 to DPA2). + +DPA2's repformer block participates in the per-layer ghost-atom MPI +exchange just like DPA3's repflows; the with-comm AOTInductor artifact +is produced automatically by ``deepmd/pt_expt/utils/serialization.py`` +because ``_has_message_passing`` returns True for any DPA2 model. + +Unlike DPA3 (which has ``use_loc_mapping``), DPA2's repformer always +takes a ``mapping`` tensor, so a single ``deeppot_dpa2.pt2`` already +carries the dual-artifact layout — no separate ``_mpi.pt2`` needed. + +This file targets the gap "DPA2 multi-rank dispatch never tested +end-to-end" recorded in +``memory/gnn_mpi_untested_paths.md::Dispatch wired, no test fixture``. 
+The reference is a same-archive single-rank run (``mpirun -n 1`` +through the same dual-artifact ``.pt2``); no hardcoded reference +values are needed. +""" + +from __future__ import ( + annotations, +) + +import importlib.util +import os +import shutil +import subprocess as sp +import sys +import tempfile +from pathlib import ( + Path, +) + +import numpy as np +import pytest +from write_lmp_data import ( + write_lmp_data, +) + +# Reuses the same generic mpirun driver as the DPA3 multi-rank tests — +# the script is descriptor-agnostic (just LAMMPS + pair_style deepmd). +RUNNER_PATH = Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py" + +pb_file = Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot_dpa2.pt2" +data_file = Path(__file__).parent / "data_dpa2_pt2.lmp" + +box = np.array([0, 13, 0, 13, 0, 13, 0, 0, 0]) +coord = np.array( + [ + [12.83, 2.56, 2.18], + [12.09, 2.87, 2.74], + [0.25, 3.32, 1.68], + [3.36, 3.00, 1.81], + [3.51, 2.51, 2.60], + [4.27, 3.22, 1.56], + ] +) +type_OH = np.array([1, 2, 2, 1, 2, 2]) + + +def setup_module() -> None: + if os.environ.get("ENABLE_PYTORCH", "1") != "1": + pytest.skip( + "Skip test because PyTorch support is not enabled.", + ) + write_lmp_data(box, coord, type_OH, data_file) + + +def teardown_module() -> None: + if data_file.exists(): + os.remove(data_file) + + +def _run_mpi_subprocess(nprocs: int = 2) -> dict: + """Invoke the generic mpirun driver and parse the output. + + With ``nprocs == 2`` (default) the runner forces ``processors 2 1 1`` + so ``DeepPotPTExpt`` routes to the with-comm artifact. With + ``nprocs == 1`` the runner uses ``processors 1 1 1`` and the C++ + side falls back to the regular artifact — useful as a same-archive + reference for value comparison. 
+ """ + with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f: + out_path = f.name + try: + argv = [ + "mpirun", + "-n", + str(nprocs), + sys.executable, + str(RUNNER_PATH), + str(data_file.resolve()), + str(pb_file.resolve()), + out_path, + ] + if nprocs == 1: + argv.extend(["--processors", "1 1 1"]) + sp.check_call(argv) + with open(out_path) as fh: + lines = fh.read().strip().splitlines() + pe = float(lines[0]) + rows = np.array( + [list(map(float, line.split())) for line in lines[1:]], + dtype=np.float64, + ) + forces = rows[:, :3] + virials = rows[:, 3:] + return {"pe": pe, "forces": forces, "virials": virials} + finally: + if os.path.exists(out_path): + os.remove(out_path) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa2() -> None: + """Multi-rank DPA2 .pt2 dispatch must match the same-archive + single-rank reference for energy, forces, and virial. + + Verifies that: + - ``DeepPotPTExpt::compute`` correctly routes to the with-comm + artifact for DPA2 (descriptor-agnostic dispatch). + - The pt_expt ``DescrptBlockRepformers._exchange_ghosts`` override + drives ``deepmd_export::border_op`` for repformer's per-layer + ghost exchange (the path equivalent to DPA3's repflows). + - Different ``model_nnei`` from DPA3 (DPA2 repformer has nsel=15 + vs DPA3's e_sel=30) — exercises the dynamic-nnei with-comm + trace at a different baked-in value. + + No hardcoded reference; compares against a same-archive single-rank + run (``mpirun -n 1`` + ``processors 1 1 1`` falls back to the + regular artifact). 
+ """ + out_mpi = _run_mpi_subprocess(nprocs=2) + out_ref = _run_mpi_subprocess(nprocs=1) + assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-12, abs=1e-12) + np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0) + np.testing.assert_allclose( + out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0 + ) diff --git a/source/tests/infer/gen_dpa2.py b/source/tests/infer/gen_dpa2.py index 8ce277fcf5..5aff706aab 100644 --- a/source/tests/infer/gen_dpa2.py +++ b/source/tests/infer/gen_dpa2.py @@ -108,6 +108,10 @@ def main(): pt2_path = os.path.join(base_dir, "deeppot_dpa2.pt2") print(f"Exporting to {pt2_path} ...") # noqa: T201 + # DPA2's repformer block has no ``use_loc_mapping`` knob (unlike + # DPA3), so a single .pt2 already carries the dual-artifact layout + # (regular + with-comm) — _has_message_passing returns True and the + # serializer produces both. No separate _mpi.pt2 needed. pt_expt_deserialize_to_file(pt2_path, copy.deepcopy(data), do_atomic_virial=True) pth_path = os.path.join(base_dir, "deeppot_dpa2.pth") diff --git a/source/tests/pt_expt/conftest.py b/source/tests/pt_expt/conftest.py index f2a9b07a6a..06bca2fec5 100644 --- a/source/tests/pt_expt/conftest.py +++ b/source/tests/pt_expt/conftest.py @@ -17,6 +17,15 @@ _get_current_function_mode_stack, ) +# Import ``deepmd.pt`` at conftest evaluation time so libdeepmd_op_pt.so +# is loaded and ``deepmd_export::{border_op, border_op_backward}`` are +# registered before any pt_expt test module imports +# ``deepmd.pt_expt.utils`` (which transitively imports ``comm.py`` and +# its ``_check_underlying_ops_loaded()`` runtime check). Previously this +# worked only when collected alongside earlier tests that happened to +# import deepmd.pt first. 
+import deepmd.pt # noqa: F401 - side-effect: register custom ops + def _pop_device_contexts() -> list: """Pop all stale DeviceContext modes from the torch function mode stack.""" diff --git a/source/tests/pt_expt/model/test_spin_export_with_comm.py b/source/tests/pt_expt/model/test_spin_export_with_comm.py index 93b22bf864..f77c9fe415 100644 --- a/source/tests/pt_expt/model/test_spin_export_with_comm.py +++ b/source/tests/pt_expt/model/test_spin_export_with_comm.py @@ -95,14 +95,16 @@ def test_spin_forward_common_lower_exportable_with_comm_traces() -> None: model.eval() # Build sample inputs (nframes=1 to match the override's nb=1 - # constraint; spin doubles natoms). + # constraint; spin doubles natoms). nlist width must match the + # model's sum(sel); the descriptor's _format_nlist asserts this. nloc = 6 # 3 real + 3 virtual nall = 8 # 1 ghost on each side n_dim_coord = 3 + nnei = sum(SPIN_GNN_DATA["descriptor"]["sel"]) ext_coord = torch.zeros(1, nall, n_dim_coord, dtype=torch.float64) ext_atype = torch.zeros(1, nall, dtype=torch.int64) ext_spin = torch.zeros(1, nall, n_dim_coord, dtype=torch.float64) - nlist = torch.zeros(1, nloc, 6, dtype=torch.int64) # nnei from sel + nlist = torch.zeros(1, nloc, nnei, dtype=torch.int64) mapping = torch.zeros(1, nall, dtype=torch.int64) fparam = None aparam = None From ece5c3daa8bcc4b9574e3fe584f3472204e89c3e Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 30 Apr 2026 17:35:31 +0800 Subject: [PATCH 14/34] test: extend MPI coverage with N>2 decompositions and schema-drift unit test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes two gaps from the GNN-MPI untested-paths catalog: - ``test_pair_deepmd_mpi_dpa3_decomposition`` (parametrized): runs DPA3 .pt2 under three additional processor grids — ``4@2x2x1`` (2D), ``4@4x1x1`` (1D-deep chain), and ``8@2x2x2`` (3D). All three must match the gen_dpa3.py reference for energy / per-atom force / per-atom virial within atol=1e-8. 
The 2x2x2 split puts several subdomains empty, so this also exercises the ``copy_from_nlist`` empty-subdomain guard in a 3D layout. - ``source/tests/pt_expt/utils/test_has_message_passing.py``: pins ``_has_message_passing`` against schema drift. The detection chain (``model.atomic_model.descriptor`` -> ``descriptor.has_message_passing()`` -> ``block.use_loc_mapping``) is brittle to attribute renames in the dpmodel descriptor layer; a silent regression would disable the with-comm artifact and break multi-rank LAMMPS for GNN users with no test failure to flag it. The test asserts the documented value for 5 baseline configs (se_e2_a, dpa1, dpa3 use_loc_mapping=True/False, dpa2) plus two stub-model defensive cases. The runner helper ``_run_mpi_subprocess`` gains an optional ``processors`` arg so the new parametrized test can dictate the LAMMPS ``processors`` grid; existing tests keep their previous defaults (``2 1 1`` for nprocs=2, ``1 1 1`` for nprocs=1). --- source/lmp/tests/test_lammps_dpa3_pt2.py | 55 ++++- .../pt_expt/utils/test_has_message_passing.py | 229 ++++++++++++++++++ 2 files changed, 283 insertions(+), 1 deletion(-) create mode 100644 source/tests/pt_expt/utils/test_has_message_passing.py diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py index 84127246d6..61058b770e 100644 --- a/source/lmp/tests/test_lammps_dpa3_pt2.py +++ b/source/lmp/tests/test_lammps_dpa3_pt2.py @@ -360,6 +360,7 @@ def _run_mpi_subprocess( extra_args: list[str] | None = None, nprocs: int = 2, data_path: Path | None = None, + processors: str | None = None, ) -> dict: """Helper: invoke run_mpi_pair_deepmd_dpa3_pt2.py under ``mpirun -n <nprocs>`` and return @@ -372,6 +373,10 @@ def _run_mpi_subprocess( ``data_path`` (default ``data_file``) selects the LAMMPS data file — the empty-subdomain test points at a non-default elongated-box file. 
+ + ``processors`` overrides the runner's default decomposition string + (``"2 1 1"``); used by the ``test_*_decomposition`` variants to + exercise 2D / 3D processor grids (Px*Py*Pz must equal nprocs). """ if data_path is None: data_path = data_file @@ -388,7 +393,9 @@ def _run_mpi_subprocess( str(pb_file_mpi.resolve()), out_path, ] - if nprocs == 1: + if processors is not None: + argv.extend(["--processors", processors]) + elif nprocs == 1: argv.extend(["--processors", "1 1 1"]) if extra_args: argv.extend(extra_args) @@ -525,3 +532,49 @@ def test_pair_deepmd_mpi_dpa3_empty_subdomain() -> None: out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0 ) assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-12, abs=1e-12) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +@pytest.mark.parametrize( + "nprocs,processors", + [ + (4, "2 2 1"), # 2D decomposition; nswap > 2, two-direction borders + (4, "4 1 1"), # 1D-deep chain; sendlist depth = 3 (each pair is 1+2 swaps) + (8, "2 2 2"), # 3D decomposition; full xyz border exchange + ], +) +def test_pair_deepmd_mpi_dpa3_decomposition(nprocs, processors) -> None: + """Multi-rank DPA3 .pt2 must match the single-rank reference under + deeper / 3D processor grids beyond the canonical 2x1x1 (N=2) layout. + + Production MD typically runs with 8/16/32+ ranks and 2D/3D + decompositions. Bugs that don't fire at N=2 (deeper sendlist + chains, 3D border swaps, asymmetric subdomains, multiple empty + cells in the 2x2x2 split of a small fixture) have zero coverage + without this test. + + The 6-atom 13x13x13 fixture is intentionally small relative to + the rank count: in the 2x2x2 split each subdomain is + ~6.5x6.5x6.5 A, so several subdomains are empty — exercising the + empty-subdomain ``copy_from_nlist`` guard fix in 3D. 
+ """ + out_mpi = _run_mpi_subprocess(nprocs=nprocs, processors=processors) + # Step-0 evaluation; must match the gen_dpa3.py-derived + # reference within atol=1e-8 (not bit-exact). + assert out_mpi["pe"] == pytest.approx(expected_e, rel=0, abs=1e-8) + for ii in range(6): + np.testing.assert_allclose( + out_mpi["forces"][ii], expected_f[ii], atol=1e-8, rtol=0 + ) + expected_v_to_lammps = [0, 6, 7, 3, 1, 8, 4, 5, 2] + np.testing.assert_allclose( + out_mpi["virials"][:, expected_v_to_lammps] / constants.nktv2p, + expected_v, + atol=1e-8, + rtol=0, + ) diff --git a/source/tests/pt_expt/utils/test_has_message_passing.py b/source/tests/pt_expt/utils/test_has_message_passing.py new file mode 100644 index 0000000000..673e4d8bd0 --- /dev/null +++ b/source/tests/pt_expt/utils/test_has_message_passing.py @@ -0,0 +1,229 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Schema-drift regression test for ``_has_message_passing``. + +``_has_message_passing`` (in ``deepmd/pt_expt/utils/serialization.py``) +gates whether the dual-artifact ``.pt2`` is produced for GNN models — +specifically, whether the with-comm AOTInductor module is compiled and +nested inside the archive. The detection relies on a chain of attribute +lookups: + +* ``model.atomic_model.descriptor`` +* ``descriptor.has_message_passing()`` +* For repflows/repformers: ``block.use_loc_mapping`` + +A rename of any of these (refactor in the dpmodel descriptor layer, a +new GNN block name, etc.) silently disables the with-comm artifact and +multi-rank LAMMPS users get a single-artifact .pt2 that crashes on the +first ghost exchange — with no test failure to flag the breakage. + +This test pins the contract: assert ``_has_message_passing`` returns +the documented value for each baseline configuration. 
+""" + +from __future__ import ( + annotations, +) + +import copy + +import pytest + +from deepmd.dpmodel.model.model import ( + get_model, +) +from deepmd.pt_expt.utils.serialization import ( + _has_message_passing, +) + + +def _se_e2_a_config() -> dict: + """Non-GNN descriptor — must report False.""" + return { + "type_map": ["O", "H"], + "descriptor": { + "type": "se_e2_a", + "rcut": 6.0, + "rcut_smth": 0.5, + "sel": [20, 20], + "neuron": [2, 4], + "axis_neuron": 2, + "type_one_side": True, + "precision": "float64", + "seed": 1, + }, + "fitting_net": { + "neuron": [4, 4], + "resnet_dt": True, + "precision": "float64", + "seed": 1, + }, + } + + +def _dpa1_config() -> dict: + """DPA1 (se_atten) — non-GNN; must report False.""" + return { + "type_map": ["O", "H"], + "descriptor": { + "type": "se_atten", + "rcut": 6.0, + "rcut_smth": 0.5, + "sel": 20, + "neuron": [2, 4], + "axis_neuron": 2, + "attn": 5, + "attn_layer": 1, + "type_one_side": True, + "precision": "float64", + "seed": 1, + }, + "fitting_net": { + "neuron": [4, 4], + "resnet_dt": True, + "precision": "float64", + "seed": 1, + }, + } + + +def _dpa3_config(use_loc_mapping: bool) -> dict: + """DPA3 (repflows). use_loc_mapping=False -> True, True -> False.""" + return { + "type_map": ["O", "H"], + "descriptor": { + "type": "dpa3", + "repflow": { + "n_dim": 8, + "e_dim": 6, + "a_dim": 4, + "nlayers": 1, + "e_rcut": 4.0, + "e_rcut_smth": 0.5, + "e_sel": 8, + "a_rcut": 3.5, + "a_rcut_smth": 0.5, + "a_sel": 4, + "axis_neuron": 4, + "update_angle": False, + }, + "use_loc_mapping": use_loc_mapping, + }, + "fitting_net": {"neuron": [16, 16], "seed": 1}, + } + + +def _dpa2_config() -> dict: + """DPA2 (repformer) — GNN; repformer has no use_loc_mapping knob, + so always reports True. 
+ """ + return { + "type_map": ["O", "H"], + "descriptor": { + "type": "dpa2", + "repinit": { + "rcut": 6.0, + "rcut_smth": 2.0, + "nsel": 20, + "neuron": [2, 4], + "axis_neuron": 4, + "tebd_dim": 8, + "tebd_input_mode": "concat", + "set_davg_zero": True, + "type_one_side": True, + "use_three_body": False, + }, + "repformer": { + "rcut": 3.0, + "rcut_smth": 1.5, + "nsel": 10, + "nlayers": 1, + "g1_dim": 8, + "g2_dim": 5, + "axis_neuron": 4, + "update_g1_has_conv": True, + "update_g1_has_drrd": True, + "update_g1_has_grrg": True, + "update_g2_has_attn": True, + "attn1_hidden": 8, + "attn1_nhead": 2, + "attn2_hidden": 5, + "attn2_nhead": 1, + "update_style": "res_avg", + "set_davg_zero": True, + }, + "concat_output_tebd": True, + "precision": "float64", + "seed": 1, + }, + "fitting_net": { + "neuron": [4, 4], + "resnet_dt": True, + "seed": 1, + }, + } + + +@pytest.mark.parametrize( + "config_factory,expected", + [ + (_se_e2_a_config, False), + (_dpa1_config, False), + (lambda: _dpa3_config(use_loc_mapping=True), False), + (lambda: _dpa3_config(use_loc_mapping=False), True), + (_dpa2_config, True), + ], + ids=[ + "se_e2_a-non-gnn", + "dpa1-non-gnn", + "dpa3-use-loc-mapping-true", + "dpa3-use-loc-mapping-false", + "dpa2-repformer", + ], +) +def test_has_message_passing_matches_descriptor_kind(config_factory, expected) -> None: + """``_has_message_passing`` must report the documented value for + each baseline descriptor configuration. + + A False positive (non-GNN reported as GNN) wastes compile time on + a useless with-comm artifact. A False negative (GNN with + use_loc_mapping=False reported as non-GNN) is worse: multi-rank + LAMMPS gets a single-artifact .pt2 and crashes on the first ghost + exchange. This test pins both directions. 
+ """ + config = config_factory() + model = get_model(copy.deepcopy(config)) + assert _has_message_passing(model) is expected + + +def test_has_message_passing_no_descriptor_returns_false() -> None: + """Models without a single ``atomic_model.descriptor`` (e.g. linear + / ZBL / frozen) must report False — the function defends against + AttributeError and treats the model as local. + """ + + class _StubAtomicModel: + # Intentionally no ``descriptor`` attribute. + pass + + class _StubModel: + atomic_model = _StubAtomicModel() + + assert _has_message_passing(_StubModel()) is False + + +def test_has_message_passing_descriptor_without_query_returns_false() -> None: + """If the descriptor exists but lacks ``has_message_passing``, the + function must report False rather than raise. + """ + + class _StubDescriptor: + # Intentionally no ``has_message_passing`` method. + pass + + class _StubAtomicModel: + descriptor = _StubDescriptor() + + class _StubModel: + atomic_model = _StubAtomicModel() + + assert _has_message_passing(_StubModel()) is False From 0ef1bfc8bd184de357cf2052df966913ab347c1a Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 30 Apr 2026 18:40:33 +0800 Subject: [PATCH 15/34] test: cover NULL-type atoms (atype<0) under mpirun MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the recorded gap "NULL-type atoms under mpirun" — until now ``build_comm_tensors_positional_with_virtual_atoms`` and the ``fwd_map``-based comm-tensor remap had never been exercised in a multi-rank LAMMPS run despite being reachable any time a user runs a model on a system with atom types outside its ``type_map``. Fixture (``data_dpa3_pt2_null_type.lmp``): the canonical 6 real atoms (types 1, 2) plus 2 LAMMPS type-3 atoms placed at (5.5, 6, 6) and (7.5, 7, 7) — straddling the x=6.5 rank boundary under ``processors 2 1 1`` and within rcut (=6) of multiple real atoms. 
The pair_coeff ``* * O H NULL`` maps LAMMPS type 3 to deepmd atype=-1, so ``select_real_atoms_coord`` filters them and ``DeepPotPTExpt::compute`` takes the ``build_comm_tensors_positional_with_virtual_atoms`` branch. The NULL atoms appear in cross-rank sendlists because both sit in the boundary's rcut window, so the remap must: - drop the -1 fwd_map slots from each swap's sendlist; - decrement sendnum/recvnum by the number dropped; - translate surviving indices into real-atom space. Test asserts: - forces on the 6 real atoms match the no-NULL baseline ``expected_f`` (atol 1e-8); - NULL atom forces are zero (atol 1e-12) — deepmd is the only pair_style and skips them; - total potential energy matches ``expected_e``; - per-atom virial on real atoms matches ``expected_v``. Runner script (``run_mpi_pair_deepmd_dpa3_pt2.py``) gains two optional flags: ``--pair-coeff`` (override the default ``"* *"``) and ``--mass3`` (mass for a third LAMMPS atom type). Existing tests keep their previous defaults unchanged. The ``_run_mpi_subprocess`` helper gains a ``runner_args`` kwarg to forward arbitrary flags to the runner; existing call sites are unaffected. --- .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py | 25 +++++- source/lmp/tests/test_lammps_dpa3_pt2.py | 89 +++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py index d07af7a158..4180ffac47 100644 --- a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py +++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py @@ -51,6 +51,23 @@ "domain decomposition (nswap>0). Pass '1 1 1' for a single-rank " "reference run on the same archive (single-artifact dispatch).", ) +parser.add_argument( + "--pair-coeff", + type=str, + default="* *", + help="pair_coeff arguments (after 'pair_coeff'). 
Default '* *' " + "uses identity LAMMPS-type-to-deepmd-atype mapping (assumes the " + "data file's types match the model's type_map order). For NULL-type " + "tests pass e.g. '* * O H NULL' so the third LAMMPS type becomes " + "deepmd atype=-1 (filtered before model evaluation).", +) +parser.add_argument( + "--mass3", + type=float, + default=None, + help="Optional mass for LAMMPS atom type 3 (and any higher types). " + "Used by the NULL-type fixture; ignored when only 2 types exist.", +) args = parser.parse_args() lammps = PyLammps() @@ -73,11 +90,17 @@ lammps.read_data(args.DATAFILE) lammps.mass("1 16") lammps.mass("2 2") +if args.mass3 is not None: + # Used by the NULL-type test where the data file has a 3rd LAMMPS + # type that maps to a NULL deepmd atype (filtered before model + # evaluation). The mass value is physically irrelevant — these + # atoms get zero force from the deepmd model. + lammps.mass(f"3 {args.mass3}") lammps.timestep(0.0005) lammps.fix("1 all nve") lammps.pair_style(f"deepmd {args.PB_FILE}") -lammps.pair_coeff("* *") +lammps.pair_coeff(args.pair_coeff) # Per-atom virial from the pair contribution. ``centroid/stress/atom`` # is parallel-safe (rank-local data, gathered below). LAMMPS computes # stress*volume per atom in internal units; the parent test reverses diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py index 61058b770e..1d4c77fa9e 100644 --- a/source/lmp/tests/test_lammps_dpa3_pt2.py +++ b/source/lmp/tests/test_lammps_dpa3_pt2.py @@ -41,6 +41,14 @@ # local atoms — a corner case the comm-dispatch path must handle # without crashing or producing wrong forces. data_file_empty_subdomain = Path(__file__).parent / "data_dpa3_pt2_empty_subdomain.lmp" +# NULL-type variant: 6 real atoms (types 1,2) + 2 type-3 atoms straddling +# the x=6.5 rank boundary. 
With ``pair_coeff * * O H NULL`` LAMMPS type 3 +# maps to deepmd atype=-1, so those atoms are filtered by +# ``select_real_atoms_coord`` and the comm tensors must be remapped via +# ``fwd_map`` before being handed to the with-comm artifact. Forces on +# the 6 real atoms must match the no-NULL baseline; NULL atoms get zero +# force from the deepmd model. +data_file_null_type = Path(__file__).parent / "data_dpa3_pt2_null_type.lmp" # Reference values from gen_dpa3.py / test_deeppot_dpa3_ptexpt.cc (PBC) expected_ae = np.array( @@ -168,6 +176,24 @@ def setup_module() -> None: # split is at x = 15 Å and rank 1 owns x ≥ 15, which is empty. box_empty_subdomain = np.array([0, 30, 0, 13, 0, 13, 0, 0, 0]) write_lmp_data(box_empty_subdomain, coord, type_OH, data_file_empty_subdomain) + # NULL-type fixture: original 6 real atoms (types 1,2) plus 2 LAMMPS + # type-3 atoms placed within rcut (~6 Å) of real atoms on BOTH sides + # of the x=6.5 rank boundary. The NULL atoms appear in real atoms' + # neighbour lists and in the cross-rank sendlists, so the comm-tensor + # remap (``fwd_map``-based) is genuinely exercised — not trivial. 
+ coord_null_type = np.concatenate( + [ + coord, + np.array( + [ + [5.5, 6.0, 6.0], # rank 0 side, near boundary + [7.5, 7.0, 7.0], # rank 1 side, near boundary + ] + ), + ] + ) + type_null = np.concatenate([type_OH, np.array([3, 3])]) + write_lmp_data(box, coord_null_type, type_null, data_file_null_type) def teardown_module() -> None: @@ -176,6 +202,7 @@ def teardown_module() -> None: data_type_map_file, data_file_si, data_file_empty_subdomain, + data_file_null_type, ]: if f.exists(): os.remove(f) @@ -361,6 +388,7 @@ def _run_mpi_subprocess( nprocs: int = 2, data_path: Path | None = None, processors: str | None = None, + runner_args: list[str] | None = None, ) -> dict: """Helper: invoke run_mpi_pair_deepmd_dpa3_pt2.py under ``mpirun -n `` and return @@ -399,6 +427,8 @@ def _run_mpi_subprocess( argv.extend(["--processors", "1 1 1"]) if extra_args: argv.extend(extra_args) + if runner_args: + argv.extend(runner_args) sp.check_call(argv) with open(out_path) as fh: lines = fh.read().strip().splitlines() @@ -578,3 +608,62 @@ def test_pair_deepmd_mpi_dpa3_decomposition(nprocs, processors) -> None: atol=1e-8, rtol=0, ) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa3_null_type() -> None: + """Multi-rank DPA3 .pt2 with NULL-type atoms. + + Exercises ``select_real_atoms_coord`` filtering AND + ``build_comm_tensors_positional_with_virtual_atoms`` remapping + under multi-rank dispatch — neither path was reachable in any + previous test fixture. + + Setup: 6 real atoms (types 1,2) at the canonical positions plus + 2 LAMMPS type-3 atoms straddling the x=6.5 rank boundary. With + ``pair_coeff * * O H NULL`` the type-3 atoms map to deepmd + atype=-1 and are filtered before model evaluation. 
Because the + NULL atoms sit within rcut of real atoms on BOTH sides of the + boundary, they appear in cross-rank sendlists — forcing the + ``fwd_map``-based remap (which translates unfiltered LAMMPS + indices into filtered real-atom indices, dropping ``-1`` slots). + + Assertions: + - Forces on the 6 real atoms (ids 1..6, id-sorted output) match + the no-NULL baseline ``expected_f`` exactly. NULL atoms don't + contribute to the deepmd model so real-atom forces are + identical to the 6-atom baseline. + - NULL-atom forces (ids 7,8) are zero — the deepmd model is the + only pair_style and skips them entirely. + - Total energy matches ``expected_e``. + - Per-atom virial on real atoms matches ``expected_v``. + """ + out_mpi = _run_mpi_subprocess( + nprocs=2, + data_path=data_file_null_type, + runner_args=["--pair-coeff", "* * O H NULL", "--mass3", "5.0"], + ) + # Forces on real atoms (ids 1..6) match the no-NULL baseline. + real_forces = out_mpi["forces"][:6] + for ii in range(6): + np.testing.assert_allclose(real_forces[ii], expected_f[ii], atol=1e-8, rtol=0) + # NULL atoms (ids 7,8) get zero force from the deepmd model. + null_forces = out_mpi["forces"][6:] + np.testing.assert_allclose(null_forces, 0.0, atol=1e-12, rtol=0) + # Total potential energy unchanged (NULL atoms contribute 0). + assert out_mpi["pe"] == pytest.approx(expected_e, rel=0, abs=1e-8) + # Per-atom virial on real atoms matches expected_v with the same + # column permutation as test_pair_deepmd_mpi_dpa3. 
+ expected_v_to_lammps = [0, 6, 7, 3, 1, 8, 4, 5, 2] + real_virials = out_mpi["virials"][:6] + np.testing.assert_allclose( + real_virials[:, expected_v_to_lammps] / constants.nktv2p, + expected_v, + atol=1e-8, + rtol=0, + ) From 0c95b3af6b4e5f5bf59dede637553c0af1c073a8 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 30 Apr 2026 22:01:00 +0800 Subject: [PATCH 16/34] test: cover three NULL-type edge cases (isolated / all-null-rank / nlist-rebuild) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes three more entries from the GNN-MPI untested-paths catalog, all variations on the multi-rank NULL-type filter path: - ``test_pair_deepmd_mpi_dpa3_null_isolated``: large box (30 x 13 x 13) puts a NULL atom at x=7.5, in rank 0's subdomain interior. With rcut=6 the boundary rcut-windows on rank 0 are x in [0, 6] (PBC of the right wall via x=30) and [9, 15] (rank 1's left wall); atoms in (6, 9) are local but never appear in any sendlist. Exercises ``has_null_atoms == True`` with a no-op remap (sendlists contain no NULL entries to drop) — complementary to ``test_pair_deepmd_mpi_dpa3_null_type`` which exercises the remap-with-NULLs case. - ``test_pair_deepmd_mpi_dpa3_all_null_rank``: rank 1 owns ONLY NULL atoms (intersection of empty-subdomain and NULL-type paths). After ``select_real_atoms_coord`` rank 1 has nloc_real=0, so the ``copy_from_nlist`` empty-subdomain guard must fire AND the ``_with_virtual_atoms`` remap must handle a sendlist whose entire local section was NULL. - ``test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild``: rebuilds the nlist 3 times in 3 MD steps using ``neigh_modify every 1``. NULL atoms drift across the boundary so sendlist composition changes per rebuild — validates that the remap re-runs correctly on every ago=0 trigger and stays consistent with the cached ``mapping_tensor`` / ``firstneigh_tensor`` in ``DeepPotPTExpt::compute``. 
Also speeds up ``test_pair_deepmd_mpi_dpa3_nlist_rebuild`` (existing non-NULL test) by switching from ``every 10`` + 25 steps to ``every 1`` + 3 steps — same 3 rebuilds, ~1/3 the wall time. Runner script gains a ``--neigh-every`` flag (default 10). All three new tests compare mpirun -n 2 against an mpirun -n 1 reference on the same fixture. --- .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py | 11 +- source/lmp/tests/test_lammps_dpa3_pt2.py | 210 +++++++++++++++++- 2 files changed, 209 insertions(+), 12 deletions(-) diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py index 4180ffac47..6691bdd220 100644 --- a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py +++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py @@ -68,6 +68,15 @@ help="Optional mass for LAMMPS atom type 3 (and any higher types). " "Used by the NULL-type fixture; ignored when only 2 types exist.", ) +parser.add_argument( + "--neigh-every", + type=int, + default=10, + help="LAMMPS ``neigh_modify every`` value. Default 10 mirrors the " + "production-realistic interval. Pass 1 for tests that want to " + "trigger nlist rebuilds on every step (and run a small ``--nsteps`` " + "to keep wall time low while still exercising the rebuild path).", +) args = parser.parse_args() lammps = PyLammps() @@ -86,7 +95,7 @@ # ``mapping`` tensor. It is harmless under multi-rank. lammps.atom_modify("map yes") lammps.neighbor("2.0 bin") -lammps.neigh_modify("every 10 delay 0 check no") +lammps.neigh_modify(f"every {args.neigh_every} delay 0 check no") lammps.read_data(args.DATAFILE) lammps.mass("1 16") lammps.mass("2 2") diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py index 1d4c77fa9e..17bc1c2892 100644 --- a/source/lmp/tests/test_lammps_dpa3_pt2.py +++ b/source/lmp/tests/test_lammps_dpa3_pt2.py @@ -49,6 +49,20 @@ # the 6 real atoms must match the no-NULL baseline; NULL atoms get zero # force from the deepmd model. 
data_file_null_type = Path(__file__).parent / "data_dpa3_pt2_null_type.lmp" +# Isolated-NULL fixture: box=30 Å in x so rank 0 (x ∈ [0, 15]) has a +# subdomain interior that is NOT within rcut of any boundary. With +# rcut=6, boundary-adjacent regions are [0, 6] (PBC of right wall) +# and [9, 15] (left wall of rank 1) — atoms in x in (6, 9) are LOCAL +# but not in any sendlist. Place 1 NULL atom at x=7.5 (in this gap) +# so the remap branch is reached but the sendlists contain no NULL +# entries — exercises ``has_null_atoms=true`` with no-op remap. +data_file_null_isolated = Path(__file__).parent / "data_dpa3_pt2_null_isolated.lmp" +# All-NULL-rank fixture: box=30 Å in x. 6 real atoms in rank 0 +# (x < 13). 2 NULL atoms in rank 1 (x ∈ {20, 25}). Under +# ``processors 2 1 1`` rank 1 owns ONLY NULL atoms, so after +# ``select_real_atoms_coord`` rank 1 has nloc_real=0 (intersection +# of empty-subdomain and NULL-type paths). +data_file_all_null_rank = Path(__file__).parent / "data_dpa3_pt2_all_null_rank.lmp" # Reference values from gen_dpa3.py / test_deeppot_dpa3_ptexpt.cc (PBC) expected_ae = np.array( @@ -194,6 +208,37 @@ def setup_module() -> None: ) type_null = np.concatenate([type_OH, np.array([3, 3])]) write_lmp_data(box, coord_null_type, type_null, data_file_null_type) + # Isolated-NULL fixture: same elongated box as empty-subdomain + # plus one NULL atom in rank 0's subdomain interior (x ∈ (6, 9)). + coord_null_isolated = np.concatenate([coord, np.array([[7.5, 6.5, 6.5]])]) + type_null_isolated = np.concatenate([type_OH, np.array([3])]) + write_lmp_data( + box_empty_subdomain, + coord_null_isolated, + type_null_isolated, + data_file_null_isolated, + ) + # All-NULL-rank fixture: box=30 in x. Real atoms in rank 0 + # (their original coords; all x < 13). NULL atoms placed in + # rank 1 (x ∈ {20, 25}). Rank 1 owns ONLY NULL atoms. 
+ coord_all_null_rank = np.concatenate( + [ + coord, + np.array( + [ + [20.0, 6.5, 6.5], + [25.0, 6.5, 6.5], + ] + ), + ] + ) + type_all_null_rank = np.concatenate([type_OH, np.array([3, 3])]) + write_lmp_data( + box_empty_subdomain, + coord_all_null_rank, + type_all_null_rank, + data_file_all_null_rank, + ) def teardown_module() -> None: @@ -203,6 +248,8 @@ def teardown_module() -> None: data_file_si, data_file_empty_subdomain, data_file_null_type, + data_file_null_isolated, + data_file_all_null_rank, ]: if f.exists(): os.remove(f) @@ -506,21 +553,25 @@ def test_pair_deepmd_mpi_dpa3_nlist_rebuild() -> None: """Multi-rank with neighbor-list rebuilds, validated against a single-rank reference of the same archive and trajectory. - Runs 25 MD steps with ``neigh_modify every 10 delay 0 check no``, - so the multi-rank trajectory crosses two nlist rebuilds (at steps - 10 and 20) before the final force evaluation. The same trajectory - is then run under ``mpirun -n 1`` (regular-artifact dispatch on - the same dual-artifact .pt2) to obtain a reference; comparing the - two catches a wrong-but-finite force from a dispatch bug that the - previous finite/bounded check would miss. + Uses ``neigh_modify every 1`` so a rebuild happens on every step, + then runs 3 steps — yields 3 rebuilds in roughly 1/8 the wall + time of a 25-step ``every 10`` run. The same trajectory is then + run under ``mpirun -n 1`` (regular-artifact dispatch on the same + dual-artifact .pt2) to obtain a reference; comparing the two + catches a wrong-but-finite force from a dispatch bug. - NVE is deterministic up to floating-point summation order, so the - cross-rank divergence after 25 steps is bounded by accumulated + NVE is deterministic up to floating-point summation order, so + the cross-rank divergence after 3 steps is bounded by accumulated round-off — small for a 6-atom system but non-zero, hence the relaxed (but still tight) tolerances. 
""" - out_mpi = _run_mpi_subprocess(extra_args=["--nsteps", "25"], nprocs=2) - out_ref = _run_mpi_subprocess(extra_args=["--nsteps", "25"], nprocs=1) + runner_args = ["--neigh-every", "1"] + out_mpi = _run_mpi_subprocess( + extra_args=["--nsteps", "3"], nprocs=2, runner_args=runner_args + ) + out_ref = _run_mpi_subprocess( + extra_args=["--nsteps", "3"], nprocs=1, runner_args=runner_args + ) np.testing.assert_allclose( out_mpi["forces"], out_ref["forces"], @@ -667,3 +718,140 @@ def test_pair_deepmd_mpi_dpa3_null_type() -> None: atol=1e-8, rtol=0, ) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa3_null_isolated() -> None: + """NULL atom local on a rank but absent from every sendlist. + + Box is 30x13x13 with split at x=15. With rcut=6 the boundary + rcut-windows on rank 0 are x ∈ [0, 6] (PBC of right wall via + x=30) and x ∈ [9, 15] (left wall of rank 1). Atoms in + x ∈ (6, 9) are LOCAL on rank 0 but never appear in any + cross-rank sendlist. Placing a NULL atom at x=7.5 puts it in + that gap. + + Coverage: ``has_null_atoms == True`` triggers the + ``_with_virtual_atoms`` branch, but the remap encounters NO + NULL entries in any sendlist (no-op remap). The + ``test_pair_deepmd_mpi_dpa3_null_type`` test exercises the + remap-with-NULLs case; this one pins the + remap-with-no-NULLs-in-sendlist case. + + Comparison is mpi-vs-single-rank on the same fixture (no hardcoded + reference because the box differs from the canonical 13x13x13). 
+ """ + out_mpi = _run_mpi_subprocess( + nprocs=2, + data_path=data_file_null_isolated, + runner_args=["--pair-coeff", "* * O H NULL", "--mass3", "5.0"], + ) + out_ref = _run_mpi_subprocess( + nprocs=1, + data_path=data_file_null_isolated, + runner_args=["--pair-coeff", "* * O H NULL", "--mass3", "5.0"], + ) + np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0) + np.testing.assert_allclose( + out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0 + ) + assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=0, abs=1e-8) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa3_all_null_rank() -> None: + """Rank that owns ONLY NULL atoms (intersection of empty-subdomain + and NULL-type paths). + + Box=30x13x13, split at x=15. Real atoms (types 1,2) are all in + rank 0 (x < 13). NULL atoms (type 3) are at x ∈ {20, 25}, + both in rank 1. After ``select_real_atoms_coord``: + + - Rank 0: nloc_real=6 (all real local), receives NULL atoms as + ghosts via PBC -> filtered -> nall_real ≤ nall. + - Rank 1: nloc_real=0 (all local atoms filtered out — empty + subdomain after filter), receives real atoms as ghosts. + + Tests that the comm-dispatch path handles a rank with zero real + locals correctly. The empty-subdomain ``copy_from_nlist`` guard + must fire on rank 1, AND the ``_with_virtual_atoms`` remap must + handle the case where the local section of the sendlist is + entirely NULL. 
+ """ + out_mpi = _run_mpi_subprocess( + nprocs=2, + data_path=data_file_all_null_rank, + runner_args=["--pair-coeff", "* * O H NULL", "--mass3", "5.0"], + ) + out_ref = _run_mpi_subprocess( + nprocs=1, + data_path=data_file_all_null_rank, + runner_args=["--pair-coeff", "* * O H NULL", "--mass3", "5.0"], + ) + np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0) + np.testing.assert_allclose( + out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0 + ) + assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=0, abs=1e-8) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild() -> None: + """NULL-type atoms across nlist rebuilds during MD. + + Uses ``neigh_modify every 1`` so the nlist rebuilds on every MD + step, then runs 3 steps — yielding 3 rebuilds in roughly 1/8 the + wall time of a 25-step ``every 10`` run. Atoms still move (NVE + integration), so the comm-tensor composition (which atoms appear + in each swap's sendlist, where NULL atoms map under + ``fwd_map``) genuinely changes between rebuilds. + + Coverage: validates that the ``_with_virtual_atoms`` remap + re-runs correctly on every ago=0 trigger and that the cached + state in ``DeepPotPTExpt::compute`` (mapping_tensor, + firstneigh_tensor) plus the per-step rebuilt comm tensors stay + consistent under NULL filtering. Compares mpi-2-rank vs + mpi-1-rank trajectories. 
+ """ + runner_args = [ + "--pair-coeff", + "* * O H NULL", + "--mass3", + "5.0", + "--neigh-every", + "1", + ] + out_mpi = _run_mpi_subprocess( + nprocs=2, + data_path=data_file_null_type, + extra_args=["--nsteps", "3"], + runner_args=runner_args, + ) + out_ref = _run_mpi_subprocess( + nprocs=1, + data_path=data_file_null_type, + extra_args=["--nsteps", "3"], + runner_args=runner_args, + ) + np.testing.assert_allclose( + out_mpi["forces"], out_ref["forces"], atol=1e-6, rtol=1e-6 + ) + np.testing.assert_allclose( + out_mpi["virials"], out_ref["virials"], atol=1e-6, rtol=1e-6 + ) + assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10) From ad7761cba20d6b5db6fb1733788ee5adfc299112 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 30 Apr 2026 22:16:29 +0800 Subject: [PATCH 17/34] test: NULL atoms cross rank boundary; prune redundant decomposition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related changes that strengthen the NULL-type rebuild test and trim the decomposition variant set: - ``test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild`` now sets a high initial velocity (v_x = 2000 A/ps) on LAMMPS type-3 atoms via the runner's new ``--null-vx`` flag and a per-type ``velocity`` command. With timestep 0.0005 ps each NULL atom moves 1.0 A per step — enough to physically cross the x=6.5 rank boundary in step 1 (NULL @ 5.5 -> 6.5 -> 7.5 -> 8.5). NULL atoms therefore migrate ranks across rebuilds, exercising the case where a NULL's fwd_map index moves between the local-section and ghost-section of per-rank sendlists. Real atoms keep v=0 so their dynamics are stable; the deepmd model never sees NULL atoms (filtered by ``select_real_atoms_coord``) so unphysical NULL velocity is harmless. mpi-2 vs mpi-1 reference match within atol=1e-6 / rel=1e-8. - ``test_pair_deepmd_mpi_dpa3_decomposition``: drop the ``[4-2 2 1]`` variant. 
Its 2D coverage is fully subsumed by ``[8-2 2 2]`` (which is 3D, so 2D face exchange is a strict subset). The two remaining variants — ``[4-4 1 1]`` for 1D-deep sendlist chains and ``[8-2 2 2]`` for 3D borders — are complementary and not subsumable. Saves ~5.5s of suite wall time. Runner script gains a ``--null-vx`` flag (no-op when not passed, so existing tests are unaffected). --- .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py | 20 +++++++++ source/lmp/tests/test_lammps_dpa3_pt2.py | 45 ++++++++++++------- 2 files changed, 50 insertions(+), 15 deletions(-) diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py index 6691bdd220..b35a2a38d4 100644 --- a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py +++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py @@ -77,6 +77,18 @@ "trigger nlist rebuilds on every step (and run a small ``--nsteps`` " "to keep wall time low while still exercising the rebuild path).", ) +parser.add_argument( + "--null-vx", + type=float, + default=None, + help="Optional initial x-velocity (units: Angstrom/ps in metal " + "units) for LAMMPS atom type 3 atoms. Real atoms stay at v=0. " + "Used by the NULL-type rebuild test to make NULL atoms cross the " + "rank boundary in a few MD steps without destabilising real-atom " + "dynamics — the deepmd model never sees NULL atoms (filtered by " + "``select_real_atoms_coord``) so their unphysical velocity is " + "harmless.", +) args = parser.parse_args() lammps = PyLammps() @@ -107,6 +119,14 @@ lammps.mass(f"3 {args.mass3}") lammps.timestep(0.0005) lammps.fix("1 all nve") +if args.null_vx is not None: + # Restrict initial velocity to LAMMPS type 3 atoms (NULL-type + # in the deepmd plugin's pair_coeff mapping). Real atoms stay + # at v=0; only the NULL atoms get the high vx, so the deepmd + # model's force outputs on real atoms remain bounded and the + # NVE integrator stays stable. 
+ lammps.group("nullgroup type 3") + lammps.velocity(f"nullgroup set {args.null_vx:.6f} 0.0 0.0 units box") lammps.pair_style(f"deepmd {args.PB_FILE}") lammps.pair_coeff(args.pair_coeff) diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py index 17bc1c2892..e7b2b525b8 100644 --- a/source/lmp/tests/test_lammps_dpa3_pt2.py +++ b/source/lmp/tests/test_lammps_dpa3_pt2.py @@ -624,7 +624,10 @@ def test_pair_deepmd_mpi_dpa3_empty_subdomain() -> None: @pytest.mark.parametrize( "nprocs,processors", [ - (4, "2 2 1"), # 2D decomposition; nswap > 2, two-direction borders + # 2D ``2 2 1`` is omitted: ``8 @ 2 2 2`` already exercises 2D + # face exchange (it's a superset, in 3D), so the 2D-only case + # is redundant. The two kept variants give complementary + # coverage: 1D-deep sendlist chains vs 3D border exchange. (4, "4 1 1"), # 1D-deep chain; sendlist depth = 3 (each pair is 1+2 swaps) (8, "2 2 2"), # 3D decomposition; full xyz border exchange ], @@ -812,20 +815,30 @@ def test_pair_deepmd_mpi_dpa3_all_null_rank() -> None: importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" ) def test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild() -> None: - """NULL-type atoms across nlist rebuilds during MD. - - Uses ``neigh_modify every 1`` so the nlist rebuilds on every MD - step, then runs 3 steps — yielding 3 rebuilds in roughly 1/8 the - wall time of a 25-step ``every 10`` run. Atoms still move (NVE - integration), so the comm-tensor composition (which atoms appear - in each swap's sendlist, where NULL atoms map under - ``fwd_map``) genuinely changes between rebuilds. - - Coverage: validates that the ``_with_virtual_atoms`` remap - re-runs correctly on every ago=0 trigger and that the cached - state in ``DeepPotPTExpt::compute`` (mapping_tensor, - firstneigh_tensor) plus the per-step rebuilt comm tensors stay - consistent under NULL filtering. 
Compares mpi-2-rank vs + """NULL-type atoms physically crossing the rank boundary during MD. + + NULL atoms get a high initial v_x = 2000 A/ps via + ``--null-vx 2000`` so that with timestep 0.0005 ps each NULL atom + moves 1.0 A per step. Initial NULL positions are x=5.5 (rank 0, + moving right) and x=7.5 (rank 1, also moving right — wraps via + PBC). After 3 steps with ``neigh_modify every 1``: + + - NULL @ x=5.5 -> 6.5 -> 7.5 -> 8.5 : crosses x=6.5 boundary + between steps 0 and 1 (moves from rank 0 to rank 1). + - NULL @ x=7.5 -> 8.5 -> 9.5 -> 10.5 : stays in rank 1 but + drifts deeper into the rcut window of rank 0. + + Real atoms stay at v=0 so their dynamics are stable; the deepmd + model never sees the NULL atoms (filtered by + ``select_real_atoms_coord``) so unphysical NULL velocity is + harmless. The boundary crossing changes which rank owns each + NULL atom across rebuilds — exercising the case where a NULL's + fwd_map index moves between the local-section and ghost-section + of the per-rank sendlists. + + Coverage: ``has_null_atoms`` must remain True across rebuilds; + the ``_with_virtual_atoms`` remap must produce correct outputs + even as NULL atoms migrate ranks. Compares mpi-2-rank vs mpi-1-rank trajectories. """ runner_args = [ @@ -835,6 +848,8 @@ def test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild() -> None: "5.0", "--neigh-every", "1", + "--null-vx", + "2000.0", ] out_mpi = _run_mpi_subprocess( nprocs=2, From b25e00c971d8cb8c62ecc484b0e2eea0e6e694ed Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 30 Apr 2026 22:27:32 +0800 Subject: [PATCH 18/34] test: mixed-direction NULL velocities + real-atom thermal motion Strengthens ``test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild`` so the rebuilds see non-trivial sendlist composition changes: - NULL atoms now move in OPPOSITE directions via the new ``--null-vx-split`` flag. NULL id=7 (at x=5.5) gets v_x=-2000 A/ps -> drifts left (and via PBC into rank 1's far ghost region). 
NULL id=8 (at x=7.5) gets v_x=+2000 A/ps -> drifts right (deeper into rank 1's domain). The +/- split means each rebuild sees one NULL entering rank 0's sendlist while the other leaves it. - Real atoms get thermal velocities at T=10000 K via the new ``--real-temp`` flag (LAMMPS ``velocity realgroup create T seed``). Each real atom gets a different random direction, so the sendlist composition is also perturbed by real-atom motion (small but detectable under ``every 1`` rebuilds). NULL atoms still don't contribute to the deepmd model (filtered by ``select_real_atoms_coord``), so their unphysical velocity is harmless. Real-atom thermal motion at T=10000 K corresponds to RMS speed ~3-9 A/ps (mass-weighted) -> ~0.005-0.015 A motion over 3 steps; small enough that NVE stays stable but enough to perturb sendlists. Both new flags are no-ops when not passed; existing tests are unaffected. --- .../lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py | 44 ++++++++++++-- source/lmp/tests/test_lammps_dpa3_pt2.py | 58 +++++++++++-------- 2 files changed, 73 insertions(+), 29 deletions(-) diff --git a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py index b35a2a38d4..042f47c56c 100644 --- a/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py +++ b/source/lmp/tests/run_mpi_pair_deepmd_dpa3_pt2.py @@ -89,6 +89,26 @@ "``select_real_atoms_coord``) so their unphysical velocity is " "harmless.", ) +parser.add_argument( + "--null-vx-split", + action="store_true", + help="With ``--null-vx X``, split type-3 atoms into two groups by " + "their LAMMPS atom-id parity: even ids get +X, odd ids get -X. 
" + "Used by the NULL-type rebuild test to send different NULLs in " + "opposite directions, so the cross-rank sendlist composition " + "changes in BOTH directions per rebuild (rank 0 loses one NULL, " + "gains another simultaneously).", +) +parser.add_argument( + "--real-temp", + type=float, + default=None, + help="Optional initial thermal temperature (Kelvin) for non-NULL " + "atoms via ``velocity realgroup create T seed``. Each real atom " + "gets a random thermal velocity in a different direction — used " + "to exercise sendlist composition changes from real-atom motion " + "rather than only from NULL motion.", +) args = parser.parse_args() lammps = PyLammps() @@ -119,14 +139,30 @@ lammps.mass(f"3 {args.mass3}") lammps.timestep(0.0005) lammps.fix("1 all nve") +# Initial velocities. Order matters: thermalize real atoms first +# (``velocity all create`` would also affect type 3, so we restrict +# it to a real-atom group), then set the NULL bias on type 3. +if args.real_temp is not None: + lammps.group("realgroup type 1 2") + lammps.velocity(f"realgroup create {args.real_temp:.6f} 12345 mom yes rot yes") if args.null_vx is not None: # Restrict initial velocity to LAMMPS type 3 atoms (NULL-type # in the deepmd plugin's pair_coeff mapping). Real atoms stay - # at v=0; only the NULL atoms get the high vx, so the deepmd - # model's force outputs on real atoms remain bounded and the - # NVE integrator stays stable. + # at v=0 (or thermal); only the NULL atoms get the high vx, so + # the deepmd model's force outputs on real atoms remain bounded + # and the NVE integrator stays stable. lammps.group("nullgroup type 3") - lammps.velocity(f"nullgroup set {args.null_vx:.6f} 0.0 0.0 units box") + if args.null_vx_split: + # Send NULL atoms with even/odd LAMMPS atom-id in opposite + # directions. Hardcoded to the null_type fixture's NULL ids + # (7, 8); sufficient because the runner is only used by this + # branch's tests, not as a general utility. 
+ lammps.group("null_id7 id 7") + lammps.group("null_id8 id 8") + lammps.velocity(f"null_id7 set {-args.null_vx:.6f} 0.0 0.0 units box") + lammps.velocity(f"null_id8 set {args.null_vx:.6f} 0.0 0.0 units box") + else: + lammps.velocity(f"nullgroup set {args.null_vx:.6f} 0.0 0.0 units box") lammps.pair_style(f"deepmd {args.PB_FILE}") lammps.pair_coeff(args.pair_coeff) diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py index e7b2b525b8..c3dafa48e8 100644 --- a/source/lmp/tests/test_lammps_dpa3_pt2.py +++ b/source/lmp/tests/test_lammps_dpa3_pt2.py @@ -815,31 +815,36 @@ def test_pair_deepmd_mpi_dpa3_all_null_rank() -> None: importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" ) def test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild() -> None: - """NULL-type atoms physically crossing the rank boundary during MD. - - NULL atoms get a high initial v_x = 2000 A/ps via - ``--null-vx 2000`` so that with timestep 0.0005 ps each NULL atom - moves 1.0 A per step. Initial NULL positions are x=5.5 (rank 0, - moving right) and x=7.5 (rank 1, also moving right — wraps via - PBC). After 3 steps with ``neigh_modify every 1``: - - - NULL @ x=5.5 -> 6.5 -> 7.5 -> 8.5 : crosses x=6.5 boundary - between steps 0 and 1 (moves from rank 0 to rank 1). - - NULL @ x=7.5 -> 8.5 -> 9.5 -> 10.5 : stays in rank 1 but - drifts deeper into the rcut window of rank 0. - - Real atoms stay at v=0 so their dynamics are stable; the deepmd - model never sees the NULL atoms (filtered by - ``select_real_atoms_coord``) so unphysical NULL velocity is - harmless. The boundary crossing changes which rank owns each - NULL atom across rebuilds — exercising the case where a NULL's - fwd_map index moves between the local-section and ghost-section - of the per-rank sendlists. - - Coverage: ``has_null_atoms`` must remain True across rebuilds; - the ``_with_virtual_atoms`` remap must produce correct outputs - even as NULL atoms migrate ranks. 
Compares mpi-2-rank vs - mpi-1-rank trajectories. + """NULL atoms drift away from the x=6.5 rank boundary in OPPOSITE directions while + real atoms move randomly via thermal motion — sendlist + composition changes both ways per rebuild. + + Initial conditions: + - Real atoms (types 1, 2): thermal velocities at T=10000 K + (``--real-temp 10000``). Each real atom gets a different + random direction; mass-weighted RMS speed is roughly + 3 - 9 A/ps so motion in 3 steps is ~0.005 - 0.015 A. Tiny + but enough to perturb sendlist composition under + ``every 1`` rebuilds. + - NULL atom 7 (id=7) at x=5.5: gets ``v_x = -2000 A/ps`` via + ``--null-vx 2000 --null-vx-split`` (odd id -> negative). + No PBC wrap within 3 steps: x = 5.5 -> 4.5 -> 3.5 -> 2.5 (stays in rank 0 + but drifts deeper into the PBC ghost region of rank 1). + - NULL atom 8 (id=8) at x=7.5: gets ``v_x = +2000 A/ps`` + (even id -> positive). x = 7.5 -> 8.5 -> 9.5 -> 10.5 (stays + in rank 1 but drifts deeper). + + The +x/-x split means each rebuild sees NULL atoms entering + different sendlists (rank 0's right-edge sendlist gains NULL 7 + even as it loses NULL 8 deeper into rank 1's domain, and vice + versa). Real-atom thermal motion provides additional sendlist + perturbation per atom. + + Coverage: ``has_null_atoms`` must remain True; the + ``_with_virtual_atoms`` remap must produce correct outputs as + NULL atoms migrate in mixed directions and real-atom positions + shift. Compares mpi-2-rank vs mpi-1-rank trajectories + deterministically (both use the same velocity seed 12345). 
""" runner_args = [ "--pair-coeff", @@ -850,6 +855,9 @@ def test_pair_deepmd_mpi_dpa3_null_type_nlist_rebuild() -> None: "1", "--null-vx", "2000.0", + "--null-vx-split", + "--real-temp", + "10000.0", ] out_mpi = _run_mpi_subprocess( nprocs=2, From 124dc5e85a1913d3505b4d8f47dac5cecc621387 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 30 Apr 2026 22:35:53 +0800 Subject: [PATCH 19/34] test: empty-subdomain test exercises cached mapping_tensor path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends ``test_pair_deepmd_mpi_dpa3_empty_subdomain`` to run 5 MD steps with ``neigh_modify every 100`` instead of a single ``lammps.run(0)``. This forces: step 0 -> ago=0 (full rebuild; mapping_tensor + firstneigh_tensor populated for the first time on the empty-subdomain rank) step 1 -> ago=1 (cache HIT — mapping_tensor and firstneigh_tensor reused) step 2 -> ago=2 (cache hit) step 3 -> ago=3 (cache hit) step 4 -> ago=4 (cache hit) step 5 -> ago=5 (cache hit) Closes the catalog gap "Empty subdomain under PR 5407's mapping_tensor cache". Previously only step 0 was tested, which is always ago=0; the cache-hit branch in DeepPotPTExpt::compute on a rank with nloc=0 was unexercised. Compares mpi-2 vs mpi-1 trajectory with the same atol=1e-6 / rel=1e-8 tolerances as the other rebuild tests. --- source/lmp/tests/test_lammps_dpa3_pt2.py | 39 +++++++++++++++++++----- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/source/lmp/tests/test_lammps_dpa3_pt2.py b/source/lmp/tests/test_lammps_dpa3_pt2.py index c3dafa48e8..a7a58b9f49 100644 --- a/source/lmp/tests/test_lammps_dpa3_pt2.py +++ b/source/lmp/tests/test_lammps_dpa3_pt2.py @@ -596,23 +596,46 @@ def test_pair_deepmd_mpi_dpa3_nlist_rebuild() -> None: def test_pair_deepmd_mpi_dpa3_empty_subdomain() -> None: """Multi-rank DPA3 with one rank owning zero local atoms. 
+ Runs 5 MD steps with ``neigh_modify every 100`` so the nlist is + rebuilt only once (at step 0, ago=0) and the next 5 force + evaluations exercise the cached ``mapping_tensor`` / + ``firstneigh_tensor`` path (PR 5407 caching) under empty + subdomain. Atoms move ~0 (v=0 default) so positions only differ + by tiny round-off, but the C++ dispatch path with cached state + on rank 1 (which has nloc=0) must still produce correct + cross-rank forces. + Uses a 30 x 13 x 13 box with all six atoms clustered in x in [0.25, 12.83]. Under ``processors 2 1 1`` the split is at x = 15 so rank 1 owns an empty subdomain. The comm-dispatch path must still produce correct forces and virial (compared against a - same-archive single-rank reference of the same configuration). + same-archive single-rank reference of the same trajectory). This catches: zero-length send/recv lists in the comm tensors, - division-by-zero in nlocal-dependent reshapes, and any silent - drop of a rank's contribution when it has no atoms to evaluate. + division-by-zero in nlocal-dependent reshapes, silent drop of a + rank's contribution when it has no atoms to evaluate, AND + cache-hit (ago>0) bugs specific to the empty-subdomain rank. 
""" - out_mpi = _run_mpi_subprocess(nprocs=2, data_path=data_file_empty_subdomain) - out_ref = _run_mpi_subprocess(nprocs=1, data_path=data_file_empty_subdomain) - np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0) + runner_args = ["--neigh-every", "100"] + out_mpi = _run_mpi_subprocess( + nprocs=2, + data_path=data_file_empty_subdomain, + extra_args=["--nsteps", "5"], + runner_args=runner_args, + ) + out_ref = _run_mpi_subprocess( + nprocs=1, + data_path=data_file_empty_subdomain, + extra_args=["--nsteps", "5"], + runner_args=runner_args, + ) np.testing.assert_allclose( - out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0 + out_mpi["forces"], out_ref["forces"], atol=1e-6, rtol=1e-6 ) - assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-12, abs=1e-12) + np.testing.assert_allclose( + out_mpi["virials"], out_ref["virials"], atol=1e-6, rtol=1e-6 + ) + assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10) @pytest.mark.skipif( From 5fef5c6b55752452240e063fcecc115c6087eefe Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 30 Apr 2026 23:17:10 +0800 Subject: [PATCH 20/34] test(spin-mpi): cover spin GNN multi-rank end-to-end Adds the smallest reachable test that exercises the full spin GNN multi-rank dispatch chain (Tier-1 #2 in gnn_mpi_untested_paths). - gen_spin.py: also produce deeppot_dpa3_spin_mpi.pt2 (DPA3 + use_loc_mapping=False + use_spin=[True, False]) so a dual-artifact spin GNN .pt2 exists for testing. - run_mpi_pair_deepmd_spin_dpa3_pt2.py: MPI runner that drives the spin pair_style and gathers force / force_mag / virial across ranks. fm goes via 'compute property/atom fmx fmy fmz' since the legacy extract/gather_atoms registry doesn't expose 'fm'. - test_lammps_spin_dpa3_pt2.py: mpirun -n 2 vs same-archive mpirun -n 1 reference for energy / force / force_mag / virial (atol 1e-8). 
A divergence is necessarily a problem in DeepSpinPTExpt multi-rank dispatch, the spin branch of _exchange_ghosts, the C++ deepmd_export::border_op invocation, or the comm-tensor builder. - _build_dynamic_shapes: bump nall_dim min from 1 to 4 when has_spin. Without this, torch.export raises CONSTRAINT_VIOLATION on the pre-doubling nall axis when tracing GNN spin with with_comm_dict (the suggested fix in the error matches min=4). Eager parity (test_spin_dpa3_eager_parity) and trace-only validation already existed; this PR closes the gap by adding AOTI compile + LAMMPS load + real MPI exchange. Known limitations: - Single configuration tested (4 atoms, 2 ranks, type_map ["Ni", "O"], use_spin=[True, False]). No NULL-type, empty-subdomain, nlist-rebuild variants for spin yet -- the non-spin DPA3 path covers those code branches and the spin override differs only in the real/virtual split, which the new test exercises. - do_atomic_virial=True only (matches all current multi-rank tests; Tier-1 #3 still open). - N=2 only; no decomposition/N>2 spin variant. - CPU only. 
--- deepmd/pt_expt/utils/serialization.py | 8 +- .../run_mpi_pair_deepmd_spin_dpa3_pt2.py | 117 ++ source/lmp/tests/test_lammps_spin_dpa3_pt2.py | 178 ++ source/tests/infer/deeppot_dpa3_spin_mpi.yaml | 1863 +++++++++++++++++ source/tests/infer/gen_spin.py | 76 +- 5 files changed, 2240 insertions(+), 2 deletions(-) create mode 100644 source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py create mode 100644 source/lmp/tests/test_lammps_spin_dpa3_pt2.py create mode 100644 source/tests/infer/deeppot_dpa3_spin_mpi.yaml diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index acaa58dbeb..cdf1937fe5 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -318,7 +318,13 @@ def _build_dynamic_shapes( nframes_dim: torch.export.Dim | int = ( 1 if with_comm_dict else torch.export.Dim("nframes", min=1) ) - nall_dim = torch.export.Dim("nall", min=1) + # Spin models double atom count internally (real + virtual). Some + # GNN ops in the spin path generate a min=4 constraint on the + # *pre-doubling* nall axis (matches "Suggested fixes" from + # torch.export's CONSTRAINT_VIOLATION error). Bump the min for spin + # so the export does not error on the inferred guard. + nall_min = 4 if has_spin else 1 + nall_dim = torch.export.Dim("nall", min=nall_min) nloc_dim = torch.export.Dim("nloc", min=1) nnei_dim = torch.export.Dim("nnei", min=max(1, model_nnei)) diff --git a/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py new file mode 100644 index 0000000000..a8d7fe71a6 --- /dev/null +++ b/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py @@ -0,0 +1,117 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Multi-rank LAMMPS driver for the DPA3 spin GNN .pt2 fixture. 
+ +Mirrors ``run_mpi_pair_deepmd_dpa3_pt2.py`` but for spin models: +``atom_style spin`` / ``pair_style deepspin`` and gathers the +per-atom magnetic force ``fm`` in addition to the normal force and +per-atom virial. The DPA3 spin .pt2 with ``use_loc_mapping=False`` +carries a with-comm AOTI artifact (Phase 3 dual-artifact layout); the +C++ ``DeepSpinPTExpt`` (Phase 4c) routes to it when LAMMPS reports +nswap > 0 (multi-rank), driving MPI ghost-atom exchange via +``deepmd_export::border_op``. + +Rank 0 writes potential energy + per-atom forces (3 cols) + +per-atom force_mag (3 cols) + per-atom virial (9 cols, from +``compute centroid/stress/atom NULL pair`` in LAMMPS internal units) +to ``OUTPUT`` so the parent pytest process can compare against the +single-rank reference. +""" + +from __future__ import ( + annotations, +) + +import argparse + +import numpy as np +from lammps import ( + PyLammps, +) +from mpi4py import ( + MPI, +) + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() + +parser = argparse.ArgumentParser() +parser.add_argument( + "DATAFILE", type=str, help="LAMMPS data file (atom positions + spin)" +) +parser.add_argument("PB_FILE", type=str, help=".pt2 model file (spin GNN)") +parser.add_argument( + "OUTPUT", type=str, help="Output file for energies + forces + force_mag + virial" +) +parser.add_argument( + "--nsteps", + type=int, + default=0, + help="Number of MD steps to run after the initial force evaluation. " + "Note: integrating spin requires fix nve/spin which is outside the " + "scope of this multi-rank correctness test; we only run static " + "force/energy evaluations and an optional run > 0 to exercise the " + "with-comm dispatch across neighbour-list rebuilds.", +) +parser.add_argument( + "--processors", + type=str, + default="2 1 1", + help="LAMMPS processors grid. Default '2 1 1' forces multi-rank " + "domain decomposition (nswap>0). 
Pass '1 1 1' for a single-rank " + "reference run on the same archive.", +) +args = parser.parse_args() + +lammps = PyLammps() +lammps.processors(args.processors) +lammps.units("metal") +lammps.boundary("p p p") +lammps.atom_style("spin") +lammps.atom_modify("map yes") +lammps.neighbor("2.0 bin") +lammps.neigh_modify("every 10 delay 0 check no") +lammps.read_data(args.DATAFILE) +lammps.mass("1 58") +lammps.mass("2 16") +lammps.timestep(0.0005) +lammps.fix("1 all nve") + +lammps.pair_style(f"deepspin {args.PB_FILE}") +lammps.pair_coeff("* *") +lammps.compute("virial all centroid/stress/atom NULL pair") +# Per-atom magnetic force components. LAMMPS does not expose ``fm`` +# through the legacy ``extract``/``gather_atoms`` registry, so we go +# via ``compute property/atom fmx fmy fmz`` + ``gather`` to obtain a +# global, id-ordered (nlocal+nghost reduced) array on every rank. +lammps.compute("fmprop all property/atom fmx fmy fmz") +lammps.run(0) + +if args.nsteps > 0: + lammps.run(args.nsteps) + +# All per-atom data goes through the LAMMPS global gather API. +# ``c_fmprop`` is the compute defined above (fmx/fmy/fmz columns). +forces_global = lammps.lmp.gather_atoms("f", 1, 3) +ids_global = lammps.lmp.gather_atoms("id", 0, 1) +virial_global = lammps.lmp.gather("c_virial", 1, 9) +fm_global = lammps.lmp.gather("c_fmprop", 1, 3) + +if rank == 0: + pe_global = lammps.eval("pe") + natoms = lammps.atoms.natoms + forces = np.array(forces_global, dtype=np.float64).reshape(natoms, 3) + fm = np.array(fm_global, dtype=np.float64).reshape(natoms, 3) + virials = np.array(virial_global, dtype=np.float64).reshape(natoms, 9) + ids = np.array(ids_global, dtype=np.int64).reshape(natoms) + order = np.argsort(ids) + forces = forces[order] + fm = fm[order] + virials = virials[order] + with open(args.OUTPUT, "w") as f: + f.write(f"{pe_global:.16e}\n") + # Each row: 3 force + 3 force_mag + 9 virial = 15 columns. 
+ for fi, fmi, vi in zip(forces, fm, virials, strict=True): + row = np.concatenate([fi, fmi, vi]) + f.write(" ".join(f"{v:.16e}" for v in row) + "\n") + +MPI.Finalize() diff --git a/source/lmp/tests/test_lammps_spin_dpa3_pt2.py b/source/lmp/tests/test_lammps_spin_dpa3_pt2.py new file mode 100644 index 0000000000..fd4ee0a7cb --- /dev/null +++ b/source/lmp/tests/test_lammps_spin_dpa3_pt2.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Multi-rank LAMMPS test for the DPA3 spin GNN .pt2 fixture. + +The DPA3 spin .pt2 (``deeppot_dpa3_spin_mpi.pt2``) is generated by +``source/tests/infer/gen_spin.py`` with ``use_loc_mapping=False``, +producing a dual-artifact archive whose nested +``forward_lower_with_comm.pt2`` is selected by ``DeepSpinPTExpt`` when +LAMMPS reports ``nswap > 0`` (multi-rank). This test exercises the +spin GNN multi-rank dispatch end-to-end: + +1. Eager parity is already covered by + ``source/tests/pt_expt/model/test_spin_export_with_comm.py + ::test_spin_dpa3_eager_parity`` (Python override only). +2. AOTI compile of the with-comm artifact is verified at fixture + generation time (``gen_spin.py`` calls ``convert_backend`` which + triggers the compile). +3. **This test** wires the loaded artifact through ``DeepSpinPTExpt``, + ``commPTExpt::build_comm_tensors_positional``, the C++ + ``deepmd_export::border_op`` registration, and real MPI ghost + exchange between two LAMMPS subdomains. A passing test means the + full chain (Python override + AOTI export + C++ load + comm-tensor + build + custom op invocation + MPI exchange) produces forces / + force_mag / virial identical to a same-archive single-rank + reference within numerical tolerance. + +Compares mpi-2 vs same-archive mpi-1 to avoid hardcoding numerical +references (the same approach used for the DPA3 / DPA2 multi-rank +tests). 
Same-archive means the regular and with-comm artifacts come +from the same trace, so any divergence is purely the multi-rank +dispatch path's responsibility. +""" + +from __future__ import ( + annotations, +) + +import importlib.util +import os +import shutil +import subprocess as sp +import sys +import tempfile +from pathlib import ( + Path, +) + +import numpy as np +import pytest +from write_lmp_data import ( + write_lmp_data_spin, +) + +pb_file_mpi = ( + Path(__file__).parent.parent.parent + / "tests" + / "infer" + / "deeppot_dpa3_spin_mpi.pt2" +) +data_file = Path(__file__).parent / "data_dpa3_spin_pt2.lmp" + +# 4-atom Ni-O system; same layout as ``test_lammps_spin_pt2.py``. With +# ``processors 2 1 1`` the split sits at x=6.5 -> 2 atoms per rank. +box = np.array([0, 13, 0, 13, 0, 13, 0, 0, 0]) +coord = np.array( + [ + [12.83, 2.56, 2.18], + [12.09, 2.87, 2.74], + [3.51, 2.51, 2.60], + [4.27, 3.22, 1.56], + ] +) +spin = np.array( + [ + [0, 0, 1.2737], + [0, 0, 1.2737], + [0, 0, 0], + [0, 0, 0], + ] +) +type_NiO = np.array([1, 1, 2, 2]) + + +def setup_module() -> None: + if os.environ.get("ENABLE_PYTORCH", "1") != "1": + pytest.skip( + "Skip test because PyTorch support is not enabled.", + ) + write_lmp_data_spin(box, coord, spin, type_NiO, data_file) + + +def teardown_module() -> None: + if data_file.exists(): + os.remove(data_file) + + +def _run_mpi_subprocess( + nprocs: int, + extra_args: list[str] | None = None, + processors: str | None = None, +) -> dict: + """Run ``run_mpi_pair_deepmd_spin_dpa3_pt2.py`` under + ``mpirun -n <nprocs>`` and return + ``{"pe", "forces", "force_mag", "virials"}``. 
+ """ + with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f: + out_path = f.name + try: + argv = [ + "mpirun", + "-n", + str(nprocs), + sys.executable, + str(Path(__file__).parent / "run_mpi_pair_deepmd_spin_dpa3_pt2.py"), + str(data_file.resolve()), + str(pb_file_mpi.resolve()), + out_path, + ] + if processors is not None: + argv.extend(["--processors", processors]) + elif nprocs == 1: + argv.extend(["--processors", "1 1 1"]) + if extra_args: + argv.extend(extra_args) + sp.check_call(argv) + with open(out_path) as fh: + lines = fh.read().strip().splitlines() + pe = float(lines[0]) + rows = np.array( + [list(map(float, line.split())) for line in lines[1:]], + dtype=np.float64, + ) + # Each row: 3 force + 3 force_mag + 9 virial = 15 cols (see runner). + forces = rows[:, :3] + force_mag = rows[:, 3:6] + virials = rows[:, 6:] + return { + "pe": pe, + "forces": forces, + "force_mag": force_mag, + "virials": virials, + } + finally: + if os.path.exists(out_path): + os.remove(out_path) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa3_spin() -> None: + """Multi-rank LAMMPS run for spin DPA3 .pt2 must match the + same-archive single-rank reference within numerical tolerance for + energy, forces, force_mag, and per-atom virial. + + Going via mpi-1 (rather than a hardcoded reference array) means we + are validating the multi-rank dispatch path itself, isolated from + any tracing / AOTI precision drift that might appear at fixture + generation time. Single-rank uses the regular artifact (nswap=0); + multi-rank uses the with-comm artifact — so a divergence here is + necessarily a problem in either ``DeepSpinPTExpt`` multi-rank + dispatch, the spin branch of ``_exchange_ghosts``, the C++ + ``deepmd_export::border_op`` invocation, or the comm-tensor + builder. 
+ """ + out_mpi = _run_mpi_subprocess(nprocs=2) + out_ref = _run_mpi_subprocess(nprocs=1) + + assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-10, abs=1e-12) + np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0) + np.testing.assert_allclose( + out_mpi["force_mag"], out_ref["force_mag"], atol=1e-8, rtol=0 + ) + np.testing.assert_allclose( + out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0 + ) diff --git a/source/tests/infer/deeppot_dpa3_spin_mpi.yaml b/source/tests/infer/deeppot_dpa3_spin_mpi.yaml new file mode 100644 index 0000000000..6fce85245a --- /dev/null +++ b/source/tests/infer/deeppot_dpa3_spin_mpi.yaml @@ -0,0 +1,1863 @@ +backend: dpmodel +model: + backbone_model: + "@class": Model + "@variables": + out_bias: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - - - 0.0 + - - 0.0 + - - 0.0 + - - 0.0 + out_std: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - - - 1.0 + - - 1.0 + - - 1.0 + - - 1.0 + "@version": 2 + atom_exclude_types: &id002 + - 2 + - 3 + descriptor: + "@class": Descriptor + "@version": 2 + activation_function: silu + add_chg_spin_ebd: false + concat_output_tebd: false + env_protection: 1.0e-06 + exclude_types: &id003 + - - 3 + - 0 + - - 3 + - 1 + - - 3 + - 2 + - - 3 + - 3 + ntypes: 4 + precision: float64 + repflow_args: + a_compress_e_rate: 1 + a_compress_rate: 0 + a_compress_use_split: false + a_dim: 4 + a_rcut: 3.5 + a_rcut_smth: 0.5 + a_sel: 4 + axis_neuron: 4 + e_dim: 6 + e_rcut: 4.0 + e_rcut_smth: 0.5 + e_sel: 8 + edge_init_use_dist: false + fix_stat_std: 0.3 + n_dim: 8 + n_multi_edge_message: 1 + nlayers: 1 + optim_update: true + sel_reduce_factor: 10.0 + smooth_edge_update: false + update_angle: false + update_residual: 0.1 + update_residual_init: const + update_style: res_residual + use_dynamic_sel: false + use_exp_switch: false + repflow_variable: + "@variables": + davg: + "@class": np.ndarray + 
"@is_variable": true + "@version": 1 + dtype: float64 + value: + - - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + - - 0.0 + - 0.0 + - 0.0 + - 0.0 + dstd: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 
0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + - - 0.3 + - 0.3 + - 0.3 + - 0.3 + angle_embd: + "@class": Layer + "@variables": + b: null + idt: null + w: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - - 0.17649720801051316 + - 0.26111987625920485 + - -0.5130082451185703 + - -0.473906411761865 + "@version": 2 + activation_function: none + bias: false + precision: float64 + resnet: false + trainable: true + use_timestep: false + edge_embd: + "@class": Layer + "@variables": + b: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - -0.34071690817734457 + - -0.10233679058558394 + - -0.042158524863509905 + - 0.1901865304247668 + - 0.5454968471423458 + - -0.15466206519031384 + idt: null + w: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - - -0.41394371690540416 + - -0.020029235227998383 + - 0.7548439568852728 + - 0.18561320525199423 + - -0.1235585931191982 + - -0.6320668874586287 + "@version": 2 + activation_function: none + bias: true + precision: float64 + resnet: false + trainable: true + use_timestep: false + env_mat: + protection: 0.0 + rcut: 4.0 + rcut_smth: 0.5 + use_exp_switch: false + repflow_layers: + - "@class": RepFlowLayer + "@variables": + a_residual: [] + e_residual: + - "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + n_residual: + - "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - 
0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + "@version": 2 + a_compress_e_rate: 1 + a_compress_rate: 0 + a_compress_use_split: false + a_dim: 4 + a_rcut: 3.5 + a_rcut_smth: 0.5 + a_sel: 4 + activation_function: silu + axis_neuron: 4 + e_dim: 6 + e_rcut: 4.0 + e_rcut_smth: 0.5 + e_sel: + - 8 + edge_self_linear: + "@class": Layer + "@variables": + b: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - 0.023029640256287606 + - -0.1553057617002839 + - 0.288173154238969 + - 0.05107253540449686 + - 0.2614964936530096 + - -0.46218537330158843 + idt: null + w: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - - -0.05677878533841125 + - 0.08257140520321203 + - -0.17682541236689134 + - -0.06780335156677138 + - 0.2613225810769312 + - -0.2528499571680411 + - - -0.1060811611888437 + - 0.2834597688459181 + - 0.17021435817066793 + - -0.2119118384053945 + - 0.044257126661162875 + - -0.20348530980582044 + - - -0.17206345113221683 + - 0.19628652842871394 + - -0.03853877890775931 + - 0.0064469870425646406 + - -0.2035021228657865 + - 0.33893151231499635 + - - -0.1603541649096622 + - -0.07431170557593793 + - -0.1285051383598977 + - -0.09531516630926942 + - -0.10774430641710406 + - 0.10558546868439946 + - - -0.027408677366010784 + - 0.03171038951939523 + - -0.26649612755080526 + - 0.0749559135333121 + - 0.12753219377780048 + - -0.12375862279261161 + - - -0.3561807917324476 + - -0.028580689473013433 + - -0.2740045204725747 + - 0.12423725221263406 + - -0.0746927118825747 + - -0.16583458613892285 + - - -0.22497394079088218 + - -0.10329264538957945 + - -0.06015496745765555 + - -0.24047390264558702 + - -0.2470728805254821 + - -0.03091482236168157 + - - 0.313786674711663 + - -0.014345848137540798 + - -0.1446657411476756 + - -0.11134433415995286 
+ - -0.10957716367503506 + - 0.25318230359455945 + - - -0.11353216449019704 + - -0.24278855525542462 + - -0.0657328669264313 + - -0.08357873620530161 + - -0.19969579432068596 + - 0.0217399962565733 + - - -0.017346111123240478 + - -0.20460540022763518 + - 0.19580548714183002 + - 0.26081320850512657 + - -0.01937612111130992 + - 0.26782602217325135 + - - 0.169450738702664 + - 0.0007586409725729347 + - 0.2217946757639642 + - -0.08785618340632881 + - 0.08754553673729902 + - 0.0459550075486224 + - - -0.10347442434861592 + - -0.05665265742992996 + - -0.15657294594958857 + - 0.07518451488260593 + - -0.200469822163535 + - 0.008552407309104103 + - - -0.13418495292451688 + - -0.15007855071339413 + - -0.47561245640659827 + - 0.05519145026405931 + - 0.034426127944687676 + - -0.19833628864440492 + - - -0.061539077996517144 + - 0.140236735963936 + - 0.44907007382747016 + - -0.17502514002597466 + - -0.13141545313528988 + - -0.10225960767785013 + - - 0.15849623153238157 + - 0.14969793438965767 + - 0.05020396887825857 + - -0.42237393212574514 + - -0.43560848414739306 + - -0.34368434587411545 + - - -0.090558268807612 + - -0.10586479947976848 + - -0.17654116465986686 + - -0.17464251661717356 + - -0.17707748016637653 + - 0.4728011907076426 + - - 0.06839467741547146 + - 0.20172332403056187 + - -0.20761658659357723 + - -0.3179201458113386 + - 0.1570191398976539 + - 0.30829366728408747 + - - -0.07346831672915768 + - -0.01603422028135059 + - -0.2343121216044693 + - -0.09228986456390967 + - -0.12259985802096685 + - 0.13925704477109332 + - - 0.03112673892006412 + - -0.12259170091097643 + - -0.01873720650800844 + - -0.02825905483134531 + - -0.07410620360262994 + - -0.13890487670689447 + - - 0.2599426512954838 + - -0.030475413023044056 + - 0.04418102981236639 + - 0.14747916053674695 + - 0.11469436259489629 + - -0.12589465767715197 + - - 0.1534348683560527 + - -0.2598559665351654 + - 0.1691188844559884 + - -0.05815067519957393 + - -0.09922406205302857 + - -0.026111067214965193 + - - 
-0.09469687008709152 + - -0.25433614509748265 + - -0.3230603080176275 + - 0.10565697308598668 + - 0.11382397843310456 + - 0.12636033735242963 + "@version": 2 + activation_function: none + bias: true + precision: float64 + resnet: false + trainable: true + use_timestep: false + n_dim: 8 + n_multi_edge_message: 1 + node_edge_linear: + "@class": Layer + "@variables": + b: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - -0.08691917211888521 + - -0.2190238624015504 + - -0.03155197949842874 + - -0.06801377552229042 + - -0.36593263653854924 + - -0.2524793728902088 + - -0.2985692887165394 + - 0.041241949395424804 + idt: null + w: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - - -0.0696849625476975 + - 0.2297992430114777 + - 0.2016390404837622 + - -0.14268332516715398 + - 0.05104561028973491 + - -0.0047524771390660015 + - 0.2007647888532239 + - -0.13892443853282946 + - - -0.06269186432762 + - -0.3620323943560662 + - 0.1429598213093448 + - 0.32251103225335886 + - 0.03297508356302982 + - 0.09813020765990464 + - -0.2128967691985101 + - 0.1446653191521595 + - - 0.3408694676048958 + - 0.2108742996875453 + - 0.3708891734344055 + - 0.03603632207631642 + - -0.021861106604374982 + - 0.05885265981211556 + - -0.3850668694292795 + - -0.02565715855818745 + - - 0.2554067355579947 + - -0.12385934461529113 + - 0.20794804172087122 + - 0.34771760519493133 + - 0.0030775484903255083 + - 0.08033323613153229 + - 0.04547640227535313 + - 0.03256133523805088 + - - -0.20228475171413982 + - -0.40882303462099395 + - -0.13933573248982073 + - -0.09056898648309795 + - 0.06705102826758672 + - -0.10643998751725821 + - 0.3714789434592029 + - -0.15660714896565422 + - - 0.0620445405627027 + - -0.14981233984554174 + - 0.1377612457580642 + - -0.3264453797874674 + - 0.18886992363386892 + - -0.13120999191064697 + - 0.2639396300281778 + - 0.20744058178112204 + - - 0.0902283316504931 + - 0.2642720697422412 + - 
0.11616051352480065 + - 0.33194344559115435 + - -0.07519119975054182 + - -0.05062288710700148 + - -0.0033899752949634763 + - 0.12074780296663348 + - - -0.14625494457636853 + - -0.39187048236106403 + - 0.005863181213654556 + - 0.08058988215606765 + - 0.229684677996952 + - 0.02491096095922189 + - -0.07923462148233366 + - 0.03463149425218323 + - - 0.1459761391053138 + - 0.1826916307305693 + - -0.24330282168960599 + - -0.15404338160080283 + - -0.15732528026070658 + - 0.10082194118502665 + - 0.10780094880007303 + - 0.10439459076027502 + - - 0.2967580322447816 + - 0.19310548831515634 + - 0.13271337197427788 + - -0.003964549207383962 + - -0.3053587625881187 + - 0.12883374510336365 + - 0.045960329737757634 + - 0.19761345822107057 + - - -0.13773367016862742 + - 0.09659775346412201 + - 0.13561552570758134 + - 0.07814681408507194 + - -0.28773064288403055 + - 0.07556744144481048 + - -0.09838713644355178 + - 0.009867107649393275 + - - -0.09655717020123253 + - -0.10871554819348611 + - -0.11670258568304015 + - 0.2177137774640066 + - -0.14817421356773255 + - -0.03606693672811542 + - -0.026029214690369364 + - -0.040666475049662726 + - - -0.0677385423671006 + - -0.12993597893178244 + - 0.1180039874263662 + - 0.1384604584579823 + - -0.024227421664540914 + - 0.1679245814762119 + - -0.19280274838451647 + - 0.0990223630355508 + - - 0.0758415027141385 + - 0.16215196433523008 + - 0.2767732385588474 + - 0.022163750613004355 + - -0.12254120989786124 + - -0.12391951174230557 + - -0.028791741351884195 + - -0.0595519969823867 + - - 0.22247449036902905 + - 0.07567917899966987 + - -0.18221068561029122 + - -0.1496346790319525 + - 0.01739141266484531 + - 0.03295277270138665 + - -0.27927822171693173 + - -0.13558030103477586 + - - 0.1712575942124072 + - 0.13705603104177683 + - 0.290608271870899 + - 0.25077636518593155 + - 0.06723740912894116 + - -0.29077479630216374 + - -0.25998108625190797 + - 0.15096707384533595 + - - -0.011258223444056591 + - 0.07940059884337107 + - 0.14539160696529732 + - 
-0.33401238196882443 + - 0.0359760729699335 + - -0.02226084988227022 + - 0.12276616178918343 + - 0.0439592954772777 + - - 0.07596667366672871 + - -0.11052600964268607 + - -0.13155071841622368 + - -0.07425437999013539 + - 0.00827734508288158 + - 0.07414300482320346 + - 0.052019022231599196 + - 0.16368644986528788 + - - 0.31022863799320216 + - 0.11380817934759249 + - 0.11671054675679823 + - 0.03833224311415518 + - 0.1545146635596559 + - 0.5283089690392868 + - -0.17235747525638992 + - -0.16802245441710034 + - - 0.19547575805994974 + - 0.03442738806627725 + - 0.035134165349037516 + - 0.1685202553837112 + - -0.13706885637245225 + - -0.09105484518308726 + - 0.24401116664356562 + - -0.042463896239058455 + - - 0.18293429344914702 + - -0.0797150153045118 + - 0.2837300628985514 + - -0.03290000697254011 + - 0.07484025269991934 + - 0.4486382833349405 + - 0.18215765586473062 + - 0.14222755521955213 + - - -0.054949228485595726 + - 0.2298266346316468 + - -0.13022437426681047 + - 0.31473958548227127 + - -0.16053599380138361 + - 0.12351036770696595 + - -0.2026640600757936 + - -0.3120452604960154 + "@version": 2 + activation_function: none + bias: true + precision: float64 + resnet: false + trainable: true + use_timestep: false + node_self_mlp: + "@class": Layer + "@variables": + b: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - 0.3339521660758233 + - 0.19617864215638078 + - 0.11685150273643896 + - -0.04301114015831818 + - -0.2646745547826684 + - -0.05874585577443532 + - 0.4130256006886377 + - -0.6003500792716773 + idt: null + w: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - - -0.22061013087519665 + - 0.17161694901625085 + - 0.25079797681294247 + - -0.06984190636344022 + - 0.402412783105689 + - -0.13232509868240386 + - -0.12410592033624109 + - -0.5243896508356666 + - - -0.34531669337816745 + - 0.2590681097532894 + - -0.4170438578433154 + - 0.33209656716128205 + - 0.20907222698506978 + - 
0.21026382825889875 + - -0.04125433055358784 + - -0.3362049950725693 + - - -0.02306669199993831 + - -0.27140136827851236 + - 0.08675906253383281 + - 0.20991982378397447 + - -0.20157467157772102 + - 0.10954533237221269 + - -0.30521247150866015 + - 0.1039196228402914 + - - 0.2927901959232568 + - -0.05686111266739088 + - -0.352867716741099 + - 0.06499009437306054 + - 0.2935084094905296 + - -0.5208455549268021 + - -0.06412894033597939 + - 0.2617524844957687 + - - -0.26859205166611555 + - -0.017740123512057532 + - -0.16973184286647353 + - -0.041497625408519805 + - -0.33848186563738925 + - -0.498133067071094 + - 0.06453515847241846 + - -0.28211046673410256 + - - -0.0031712540783364537 + - 0.14054927501098227 + - -0.16739625499774285 + - 0.02924799819668618 + - 0.19945724852581612 + - -0.07433092972702877 + - 0.33641837410477954 + - -0.1935354318143647 + - - -0.2896583115032089 + - -0.4291374752779325 + - 0.18521131755882006 + - 0.036186935403130116 + - 0.27669775576389155 + - -0.04763160274577408 + - 0.1400908330823242 + - 0.15697986928574623 + - - -0.45902865822845124 + - 0.33250108656046035 + - 0.0306169230429561 + - -0.035381192364331175 + - -0.0510947377580893 + - 0.03972955950151097 + - 0.6129808284962325 + - 0.027297205883797467 + "@version": 2 + activation_function: none + bias: true + precision: float64 + resnet: false + trainable: true + use_timestep: false + node_sym_linear: + "@class": Layer + "@variables": + b: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - 0.04402011262092976 + - 0.19539133788288796 + - 0.02243486288225181 + - -0.15932598464026163 + - -0.1441065175896103 + - -0.20205704607775893 + - 0.007090553889850609 + - -0.20221671762001667 + idt: null + w: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - - 0.07307166266137728 + - 0.06127645860254937 + - -0.18492998679736772 + - 0.04613999102916452 + - 0.0071079000479441 + - -0.03731231022031221 + - 
-0.09483134725409749 + - -0.04779952388125717 + - - -0.06975495248291474 + - -0.19948091645922555 + - -0.19101500000694704 + - -0.0756612239190429 + - 0.18223713459959498 + - 0.004660326879702162 + - -0.07331926290215518 + - -0.11804049351864328 + - - -0.008643603296694117 + - -0.07891692454926386 + - -0.24683520896350286 + - -0.07498319216962253 + - 0.14604675984008694 + - 0.09601184516912262 + - -0.01561740011879576 + - 0.05490167651869453 + - - -0.019884970427089133 + - 0.0007666914047260165 + - -0.16505651916265357 + - -0.16723740821054547 + - 0.1234653183096876 + - -0.04403642108952563 + - -0.03727304005788303 + - -0.18190516409632088 + - - -0.09168767286099873 + - -0.1549419399425698 + - 0.08193144903871091 + - 0.15640675555750194 + - -0.06305034848986912 + - 0.16836512195133213 + - 0.009048302220263893 + - 0.05322075713280992 + - - -0.06468543813846328 + - 0.0948348631241292 + - 0.006867290444906741 + - -0.24931773871448817 + - 0.08788089155489308 + - -0.0739514302480491 + - 0.025288498321181765 + - -0.08305521153831118 + - - -0.07393598220040017 + - 0.07042478981157554 + - -0.07236047649200875 + - -0.04706083253081129 + - 0.011054293351306345 + - -0.08799610585856558 + - 0.1563680796185477 + - 0.04333789772104407 + - - -0.10653678039670528 + - 0.18112426500221723 + - 0.009186401470971654 + - 0.006153194931152504 + - -0.04989535662898608 + - -0.17876067282409114 + - -0.15602193322162777 + - 0.00781917954318876 + - - -0.06699569918753231 + - 0.30735630566871885 + - -0.016096279041795645 + - -0.22956358044083025 + - -0.0065529000816888765 + - -0.06902463180143781 + - 0.06768609922953323 + - 0.1187567871665586 + - - -0.03935798540152214 + - -0.10867329670955546 + - -0.04052094555163571 + - -0.04078630187590839 + - -0.0748763378601901 + - 0.01860182181594992 + - 0.20057959184872112 + - 0.16046549209905156 + - - 0.031478615338666395 + - 0.11567514563874234 + - 0.08294594125898624 + - -0.07089853590674343 + - 0.20101923186451937 + - -0.11766930025015115 + - 
0.21570163379940235 + - 0.14563108587004206 + - - 0.07932986781606469 + - -0.19442968907969105 + - 0.05697454617840562 + - -0.19484656091831729 + - 0.04754566926156801 + - -0.12152155059832441 + - 0.08105546170302243 + - -0.09483406966077029 + - - -0.10943334690817784 + - 0.11702284889224986 + - 0.06551144399385757 + - -0.003108503735325857 + - -0.1466684268106551 + - 0.11582453333312602 + - -0.19609870968779317 + - -0.11809063481420465 + - - -0.11120967058944209 + - -0.07178289284260277 + - -0.07505138171189361 + - -0.17137771295621249 + - -0.012516091428859523 + - 0.056132912587423756 + - -0.011172736867909887 + - 0.0014926969164057145 + - - -0.1652803302650934 + - 0.08452449793427 + - -0.06260662069159101 + - -0.07909718643578055 + - 0.00574135469567161 + - -0.05691391300603163 + - 0.2457179942284785 + - -0.08037694862142311 + - - 0.1761032538671494 + - -0.15524353322856968 + - -0.20338260987738993 + - -0.09738847488694806 + - 0.05960295717975261 + - -0.0268406105291267 + - 0.19154482080963495 + - -0.05557739958347549 + - - -0.23162474155138468 + - 0.005428848189956548 + - 0.14498512403306713 + - 0.015859517797165032 + - 0.13342303538966063 + - 0.07757097608660568 + - 0.061885304048992174 + - -0.02774862502778554 + - - -0.099674682792698 + - 0.1743242267060875 + - 0.0565895993699819 + - -0.1431728246694354 + - -0.04572377374634247 + - 0.1932522842767088 + - -0.13605774184771868 + - -0.079596349847149 + - - 0.015159290423222593 + - 0.0741788473825365 + - -0.025111776236424455 + - 0.11728977172727281 + - -0.05246129405331076 + - -0.3560652693695576 + - 0.22489664505020285 + - -0.11322150427163667 + - - 0.1172685876179488 + - 0.015449206720498673 + - 0.11464505230123948 + - -0.13045379503420262 + - -0.18460226345307634 + - -0.0735660416536509 + - 0.02668836976483192 + - 0.009471901506209893 + - - -0.12415218588856815 + - -0.028427823628392242 + - -0.0726329032188482 + - 0.2205454016716484 + - -0.06981635935553832 + - -0.06914918285976224 + - -0.07547512647684368 + 
- 0.19585301943839276 + - - 0.02068794647278527 + - 0.11434955856950152 + - 0.04733548159377606 + - -0.0940771421180628 + - 0.106950218084799 + - 0.11995323224700441 + - 0.07016105028143815 + - -0.07349788842232614 + - - -0.028316732941958092 + - -0.006316920155388264 + - 0.014323448114816232 + - 0.07909510285143638 + - 0.08089223619428912 + - -0.1285448965066473 + - -0.02731037643994388 + - -0.048232324099890284 + - - -0.04229476466912251 + - -0.10545582133061814 + - -0.1399519987577358 + - -0.24859786794141928 + - 0.04555029533580089 + - -0.06637709714144181 + - -0.11891839416041088 + - -0.05608836594526548 + - - -0.1481671676394082 + - -0.11826472343612228 + - 0.18759449377634982 + - -0.0027813243183313764 + - -0.06187858233767373 + - -0.16870507895423517 + - 0.15432198341660605 + - 0.2442525725033602 + - - 0.11655618965628044 + - 0.16410614799338208 + - -0.15922334755571288 + - 0.05294100944284731 + - -0.042676438943807564 + - 0.05982722192738627 + - 0.08818007330306689 + - 0.08799006862019813 + - - -0.1816952674192488 + - 0.33018315199731113 + - 0.14825048237904745 + - -0.12977688627249692 + - -0.014039894202582361 + - 0.021698570605095405 + - -0.10536292700472008 + - -0.016298405526400214 + - - 0.18891280861168214 + - -0.037066320234429954 + - 0.051989201606798936 + - -0.33236261122879446 + - -0.2233240290736924 + - 0.17632501110044835 + - 0.02791043546786102 + - 0.08058616657592761 + - - -0.12416787825473675 + - -0.0018776550590277605 + - -0.1361594510955972 + - -0.031008628174283 + - -0.1510470016534144 + - -0.1968118582063139 + - 0.05923927005740039 + - 0.10906017525194028 + - - -0.01747528984400593 + - 0.043571037430286425 + - 0.09735765094593854 + - 0.038496104792229716 + - 0.021583898030338507 + - -0.11795161808253331 + - -0.11404406907043374 + - -0.06541831356900717 + - - 0.05781757062086345 + - 0.06545403068342133 + - 0.07182196888387801 + - 0.06571017380833269 + - 0.25549620850343796 + - -0.01712221435859817 + - 0.02746476505848508 + - 
-0.16813933068880024 + - - 0.15811742659496866 + - -0.10097487333290259 + - 0.0007478905750386516 + - 0.15986815657402492 + - 0.0879704571647486 + - 0.051839404360383305 + - 0.04773139180116972 + - -0.1562216704347126 + - - -0.00554177026701311 + - 0.026672084558123862 + - -0.026556168406945337 + - 0.017618135480540704 + - 0.04290442846891425 + - -0.16108845422437917 + - 0.03885069382837762 + - -0.08559226312134341 + - - -0.10984387513362157 + - 0.06020841962256015 + - 0.013439129456291792 + - -0.1211722988008539 + - 0.0321361577334442 + - 0.04742132269747014 + - -0.08371259477093888 + - -0.14250805695920574 + - - 0.04498243399350513 + - -0.03633434279549633 + - 0.17043129619564554 + - 0.13738977779076048 + - 0.03367329367643751 + - 0.13141345496526305 + - 0.14626062464255066 + - -0.087660426894852 + - - 0.13304548046946202 + - -0.02074921039690319 + - -0.19614199925540662 + - 0.09145888259449976 + - -0.16872056060024043 + - -0.057806035869808946 + - -0.012927002228426554 + - -0.18968555494779932 + - - 0.09056415309144267 + - 0.19579647713404205 + - 0.12419307551929215 + - 0.03068324855507999 + - 0.16324257199502792 + - 0.28864177653836015 + - -0.04884842530407823 + - 0.05243039778651716 + - - -0.1354513040660592 + - -0.0032083328727676315 + - 0.035763067000830435 + - -0.10752629467535854 + - -0.004527627068300205 + - -0.26678729966885645 + - 0.16095749546546945 + - -0.0768457166279081 + - - 0.24290534029168284 + - -0.19993818991295886 + - 0.05863500838014017 + - 0.1075745460176732 + - -0.2703641493668329 + - 0.022882752217475207 + - -0.18377439784177813 + - -0.02475991439750886 + - - -0.0970343883793403 + - 0.022190761521183 + - -0.31137609288015433 + - -0.12852583938411438 + - 0.06380585650762231 + - -0.05537350140183391 + - 0.009834307052428782 + - -0.18327381164681603 + - - -0.058720582106338445 + - 0.012207974777885133 + - 0.04906298973398652 + - -0.0252045636071624 + - 0.04064401311527239 + - -0.12030307623147056 + - -0.02607458251331658 + - 
-0.12104904963385374 + - - 0.14380149442345772 + - -0.08586187966457755 + - -0.0562380312253021 + - 0.1183995520092173 + - -0.008618891616010692 + - -0.30556252122213096 + - 0.157107693967395 + - -0.150824446001649 + - - -0.12986340463514554 + - -0.13953775800615473 + - 0.06688782609307184 + - 0.30709990962247197 + - -0.10057794483875744 + - -0.15572836837520085 + - 0.22240522808485344 + - 0.07486567450323982 + - - -0.00026497955681491453 + - -0.462148220797257 + - -0.04683339159019641 + - 0.10954858908660245 + - 0.048155719596331595 + - -0.08404934441388894 + - 0.15848474948089222 + - -0.029754000979091536 + - - -0.008795641657631076 + - -0.021341761230446545 + - -0.10489671109204046 + - 0.03213370243212562 + - -0.021792936100149974 + - -0.018371450392434912 + - 0.0007292277382723748 + - 0.07679112359755517 + - - -0.06130007400378907 + - -0.06581095863692285 + - 0.06501448048047738 + - -0.14197246804370967 + - 0.15983589537290877 + - -0.15693380789472725 + - -0.17963845906090375 + - 0.10204145028546817 + - - -0.07077050429398143 + - 0.1990098057969514 + - -0.2525111691805106 + - -0.22059894251537618 + - -0.27531410890875607 + - -0.0693243961021514 + - 0.03876302523241355 + - 0.12122101629786736 + - - -0.12820657692829063 + - -0.10772035941442479 + - 0.10829696580051636 + - 0.1493715060396245 + - 0.13488833866187872 + - 0.09022524867490032 + - 0.007332743974581279 + - 0.1529338321168549 + - - -0.22245363971842472 + - -0.08917661330105822 + - 0.10304564318043377 + - -0.07026805272160686 + - 0.016625750231852813 + - 0.23074109385732217 + - 0.053971407495566504 + - -0.15089059679319458 + - - 0.1294396068073317 + - -0.038487426453509534 + - 0.09393650831599386 + - 0.09638990927578407 + - 0.17905918157852316 + - 0.06760574587425355 + - 0.0639998107196389 + - -0.1587157815816586 + - - 0.06077231806824999 + - 0.006159909130812671 + - 0.15285274367932117 + - -0.026531120401424045 + - 0.06104797756042876 + - -0.174933801016035 + - 0.25284181425638513 + - 
-0.16931699181750984 + - - -0.09480440252644158 + - -0.11919995631753837 + - 0.1374865485894956 + - 0.03525829583245701 + - 0.055414318086174905 + - 0.039970825479268265 + - -0.028476173719310948 + - 0.007895110382084259 + - - -0.08849522170883828 + - 0.1556903658898126 + - -0.06942905817654972 + - 0.17917871676321492 + - -0.12839965901095401 + - -0.1457242708290995 + - 0.2073632537418445 + - -0.0033056633245595168 + - - -0.14321940581992326 + - 0.016216983383358995 + - -0.05603214608550905 + - 0.034067014410779244 + - -0.004165932252642813 + - 0.03579825379823718 + - 0.2274077472661256 + - 0.12282153328534674 + - - -0.17424677728325255 + - 0.03032450606197887 + - -0.3407467917235723 + - 0.08460871296272927 + - -0.21233509125037692 + - 0.038581785470083826 + - -0.1271081651221865 + - -0.05674635282930029 + - - -0.06365889303105148 + - -0.0346798442701684 + - 0.04178115473238202 + - -0.03570145798701077 + - 0.2255873927499116 + - -0.21936512330368732 + - -0.19469567244011848 + - -0.007461014512643234 + "@version": 2 + activation_function: none + bias: true + precision: float64 + resnet: false + trainable: true + use_timestep: false + ntypes: 4 + optim_update: true + precision: float64 + sel_reduce_factor: 10.0 + smooth_edge_update: false + update_angle: false + update_residual: 0.1 + update_residual_init: const + update_style: res_residual + use_dynamic_sel: false + trainable: true + type: dpa3 + type_embedding: + "@class": TypeEmbedNet + "@version": 2 + activation_function: Linear + embedding: + "@class": EmbeddingNetwork + "@version": 2 + activation_function: Linear + bias: false + in_dim: 4 + layers: + - "@class": Layer + "@variables": + b: null + idt: null + w: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - - 0.06913868355931278 + - -0.3276059448146492 + - -0.22478586008940918 + - -0.03129740042629991 + - -0.2511436154794455 + - -0.4760319710462916 + - 0.183856376649989 + - 0.220680920691283 + - - -0.1331166944050067 
+ - -0.2985446381663858 + - -0.1299144028716818 + - 0.12716526105014014 + - 0.24445281051361242 + - 0.052359417290304015 + - -0.06639194378815659 + - -0.0515428623822807 + - - -0.3302870133986425 + - 0.1177804767091647 + - 0.06915893387117533 + - -0.4204302050492702 + - -0.3161145657939801 + - 0.322920377419993 + - 0.19395457855721343 + - -0.11365337655752422 + - - -0.16993400446851198 + - -0.157416126804567 + - -0.08090448953478106 + - 0.20830555342316676 + - -0.11308079862243182 + - 0.044490575624147384 + - 0.28211395871639494 + - 0.07920112686609734 + "@version": 2 + activation_function: Linear + bias: false + precision: float64 + resnet: true + trainable: true + use_timestep: false + neuron: + - 8 + precision: float64 + resnet_dt: false + neuron: + - 8 + ntypes: 4 + padding: true + precision: float64 + resnet_dt: false + trainable: true + type_map: &id001 + - Ni + - O + - Ni_spin + - O_spin + use_econf_tebd: false + use_tebd_bias: false + type_map: *id001 + use_econf_tebd: false + use_loc_mapping: false + use_tebd_bias: false + fitting: + "@class": Fitting + "@variables": + aparam_avg: null + aparam_inv_std: null + bias_atom_e: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - - 0.0 + - - 0.0 + - - 0.0 + - - 0.0 + case_embd: null + fparam_avg: null + fparam_inv_std: null + "@version": 4 + activation_function: tanh + atom_ener: null + default_fparam: null + dim_case_embd: 0 + dim_descrpt: 8 + dim_out: 1 + exclude_types: *id002 + layer_name: null + mixed_types: true + nets: + "@class": NetworkCollection + "@version": 1 + ndim: 0 + network_type: fitting_network + networks: + - "@class": FittingNetwork + "@version": 1 + activation_function: tanh + bias_out: true + in_dim: 8 + layers: + - "@class": Layer + "@variables": + b: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - -0.17635825349201156 + - -0.3566199551346283 + - -0.4657350300900149 + - 0.49182010702811113 + - 
0.032647600656972545 + idt: null + w: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - - 0.19950263394684065 + - 0.05634765583345527 + - -0.1442129593712478 + - -0.1085774516963511 + - 0.11311331965894553 + - - -0.2775489843491954 + - -0.35666203499239274 + - -0.3389432389106902 + - -0.05632492275434322 + - -0.48859095817655873 + - - -0.0295274439718225 + - -0.1886895411820409 + - 0.53672545544271 + - 0.07574020379061007 + - -0.42704120525642686 + - - -0.00993498946754372 + - 0.3770750367653306 + - -0.4385261113155961 + - 0.0468328088042057 + - 0.012607351815014095 + - - 0.1092056939586687 + - -0.08440204904008866 + - -0.6198116015257329 + - 0.1936974618526528 + - -0.11584195169630225 + - - -0.6395628609700832 + - -0.3937842385131085 + - -0.1370675696847499 + - -0.08281792882082432 + - -0.14269944588470002 + - - 0.003683595092519098 + - -0.1064836461083355 + - 0.1513375212109038 + - -0.3798449359483027 + - -0.27711500793004523 + - - -0.24136291455222364 + - -0.19077785910921263 + - -0.12067289115480624 + - -0.05720709372900689 + - -0.044669501979496415 + "@version": 2 + activation_function: tanh + bias: true + precision: float64 + resnet: true + trainable: true + use_timestep: false + - "@class": Layer + "@variables": + b: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - 0.09501004658257778 + - 0.1663807327224991 + - -0.5185313341630086 + - -0.7740662908662731 + - -0.18752579321547022 + idt: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - -0.2188085499554977 + - -0.4014642473754725 + - 0.032489550654357095 + - 0.06343911616091243 + - -0.00407617112574573 + w: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - - -0.3141891418069332 + - 0.30132598326837057 + - -0.1868614701027005 + - -0.1853536726835805 + - -0.14904917553209618 + - - -0.4993776326714626 + - 0.2929711950476154 + - -0.3300253064210836 
+ - -0.4799775188835898 + - -0.12327559985245252 + - - 0.16627900477763782 + - 0.18281489789715116 + - -0.0796215789550366 + - 0.11637836794519682 + - 0.019126199990905587 + - - 0.47193798042526686 + - 0.3935489978037474 + - 0.1926588188573466 + - 0.11685532990383077 + - -0.3143759410105157 + - - 0.2619509948079511 + - 0.17134734041574828 + - 0.16467987243470003 + - -0.17768942725372738 + - 0.17196893072212313 + "@version": 2 + activation_function: tanh + bias: true + precision: float64 + resnet: true + trainable: true + use_timestep: true + - "@class": Layer + "@variables": + b: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - 0.1498329073500072 + - -0.10390305511196503 + - -0.7262688617464856 + - -0.14980303343140125 + - -0.3578894004618838 + idt: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - 0.3290381873321775 + - 0.23103250534551598 + - -0.6940851206117438 + - -0.19335307745332778 + - -0.9240817753801489 + w: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - - -0.3092290441156226 + - -0.496367611501348 + - -0.052492949379292775 + - 0.06663748312823926 + - 0.027714401468510886 + - - -0.10433141997317527 + - -0.323901631855259 + - -0.24739439873488192 + - 0.3076895568713741 + - 0.1593814472209255 + - - -0.07111829721069259 + - -0.27598680250101504 + - 0.16632764307325093 + - 0.1801382402999823 + - 0.3107523993064097 + - - -0.012140157566561928 + - 0.07469305237763302 + - 0.26428018852282276 + - -0.11500213881655802 + - -0.2731498304335624 + - - 0.29941998505510775 + - 0.39267279762211 + - 0.06586779164332648 + - 0.10010820203885952 + - -0.04143485413490972 + "@version": 2 + activation_function: tanh + bias: true + precision: float64 + resnet: true + trainable: true + use_timestep: true + - "@class": Layer + "@variables": + b: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - 
0.044426160812178636 + idt: null + w: + "@class": np.ndarray + "@is_variable": true + "@version": 1 + dtype: float64 + value: + - - 0.26432565710368733 + - - 0.17264367113482967 + - - -0.04729186377886323 + - - -0.08841444813809296 + - - 0.2969145415081517 + "@version": 2 + activation_function: none + bias: true + precision: float64 + resnet: false + trainable: true + use_timestep: false + neuron: + - 5 + - 5 + - 5 + out_dim: 1 + precision: float64 + resnet_dt: true + ntypes: 4 + neuron: + - 5 + - 5 + - 5 + ntypes: 4 + numb_aparam: 0 + numb_fparam: 0 + precision: float64 + rcond: null + resnet_dt: true + spin: null + tot_ener_zero: false + trainable: + - true + - true + - true + - true + type: ener + type_map: + - Ni + - O + - Ni_spin + - O_spin + use_aparam_as_mask: false + var_name: energy + pair_exclude_types: *id003 + preset_out_bias: null + rcond: null + type: standard + type_map: + - Ni + - O + - Ni_spin + - O_spin + spin: + use_spin: + - true + - false + virtual_scale: + - 0.314 + - 0.0 + type: spin_ener +model_def_script: + descriptor: + precision: float64 + repflow: + a_dim: 4 + a_rcut: 3.5 + a_rcut_smth: 0.5 + a_sel: 4 + axis_neuron: 4 + e_dim: 6 + e_rcut: 4.0 + e_rcut_smth: 0.5 + e_sel: 8 + n_dim: 8 + nlayers: 1 + update_angle: false + seed: 1 + type: dpa3 + use_loc_mapping: false + fitting_net: + neuron: + - 5 + - 5 + - 5 + resnet_dt: true + seed: 1 + spin: + use_spin: + - true + - false + virtual_scale: + - 0.314 + - 0.0 + type_map: + - Ni + - O +software: deepmd-kit +time: "2026-04-30 14:57:42.534472+00:00" +version: 3.0.0 diff --git a/source/tests/infer/gen_spin.py b/source/tests/infer/gen_spin.py index d37e3207ff..789a65971b 100644 --- a/source/tests/infer/gen_spin.py +++ b/source/tests/infer/gen_spin.py @@ -84,6 +84,66 @@ def _build_yaml(yaml_path: str) -> None: save_dp_model(yaml_path, data) +def _build_dpa3_mpi_yaml(yaml_path: str) -> None: + """Build a DPA3 spin model with use_loc_mapping=False (multi-rank). 
+ + The default ``deeppot_dpa_spin.yaml`` uses se_atten (DPA1) which + is non-GNN — single-artifact .pt2, no multi-rank ghost exchange. + This variant uses DPA3 (repflows, GNN) with use_loc_mapping=False + so the dual-artifact .pt2 carries the with-comm AOTI module that + DeepSpinPTExpt routes to under mpirun > 1. + + Type map matches the existing 4-atom Ni-O test data + (``write_lmp_data_spin``): two types, Ni (spin-active), O (no spin). + """ + from deepmd.dpmodel.model.model import ( + get_model, + ) + from deepmd.dpmodel.utils.serialization import ( + save_dp_model, + ) + + config = { + "type_map": ["Ni", "O"], + "descriptor": { + "type": "dpa3", + "repflow": { + "n_dim": 8, + "e_dim": 6, + "a_dim": 4, + "nlayers": 1, + "e_rcut": 4.0, + "e_rcut_smth": 0.5, + "e_sel": 8, + "a_rcut": 3.5, + "a_rcut_smth": 0.5, + "a_sel": 4, + "axis_neuron": 4, + "update_angle": False, + }, + "use_loc_mapping": False, + "precision": "float64", + "seed": 1, + }, + "fitting_net": {"neuron": [5, 5, 5], "resnet_dt": True, "seed": 1}, + "spin": {"use_spin": [True, False], "virtual_scale": [0.3140, 0.0]}, + } + + model = get_model(copy.deepcopy(config)) + model_dict = model.serialize() + + data = { + "model": model_dict, + "model_def_script": config, + "backend": "dpmodel", + "software": "deepmd-kit", + "version": "3.0.0", + } + + print(f"Building DPA3 spin dpmodel and saving to {yaml_path} ...") # noqa: T201 + save_dp_model(yaml_path, data) + + def main(): from deepmd.entrypoints.convert_backend import ( convert_backend, @@ -96,12 +156,23 @@ def main(): pth_path = os.path.join(base_dir, "deeppot_dpa_spin.pth") pt2_path = os.path.join(base_dir, "deeppot_dpa_spin.pt2") - # ---- 1. Build .yaml if it doesn't exist ---- + # Multi-rank GNN spin variant (DPA3 + use_loc_mapping=False). + # Produces a dual-artifact .pt2 that DeepSpinPTExpt routes to + # under mpirun > 1 (Phase 4c spin multi-rank dispatch). 
+ yaml_dpa3_path = os.path.join(base_dir, "deeppot_dpa3_spin_mpi.yaml") + pt2_dpa3_path = os.path.join(base_dir, "deeppot_dpa3_spin_mpi.pt2") + + # ---- 1. Build .yamls if they don't exist ---- if not os.path.exists(yaml_path): _build_yaml(yaml_path) else: print(f"Using existing {yaml_path}") # noqa: T201 + if not os.path.exists(yaml_dpa3_path): + _build_dpa3_mpi_yaml(yaml_dpa3_path) + else: + print(f"Using existing {yaml_dpa3_path}") # noqa: T201 + # ---- 2. Convert .yaml -> .pth and .yaml -> .pt2 ---- # Import deepmd.pt to register the backend (needed for convert_backend) import deepmd.pt # noqa: F401 @@ -114,6 +185,9 @@ def main(): print(f"Converting to {pt2_path} ...") # noqa: T201 convert_backend(INPUT=yaml_path, OUTPUT=pt2_path, atomic_virial=True) + print(f"Converting to {pt2_dpa3_path} ...") # noqa: T201 + convert_backend(INPUT=yaml_dpa3_path, OUTPUT=pt2_dpa3_path, atomic_virial=True) + print("Export done.") # noqa: T201 # ---- 3. Run inference for PBC test ---- From 803b2a4b7ee6d3b46874efb2fceef92933d7eceb Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 1 May 2026 20:50:14 +0800 Subject: [PATCH 21/34] test(spin-mpi): cover empty-subdomain and NULL-type for spin DPA3 Resolves the two spin-specific gaps left open by the previous commit: - test_pair_deepmd_mpi_dpa3_spin_empty_subdomain: elongated 30 A box + processors '2 1 1' leaves rank 1 with nloc=0. Exercises the copy_from_nlist empty-rank guard for the spin path (the with-comm artifact still runs on rank 1 with nloc_real=0). - test_pair_deepmd_mpi_dpa3_spin_null_type: 2 NULL (LAMMPS type-3, deepmd atype=-1) atoms straddling the x=6.5 rank boundary, within rcut of real atoms on both sides. Goes through DeepSpinPTExpt with nall_real < nall, triggering the has_null_atoms branch that calls build_comm_tensors_positional_with_virtual_atoms (fwd_map-based sendlist remap) for spin. Asserts NULL atoms get zero forces from the deepmd model and real-atom values match the mpi-1 reference. 
Both compare mpi-2 vs same-archive mpi-1 (atol 1e-8) so any divergence is necessarily in the multi-rank dispatch, not in tracing precision. Runner generalised with --pair-coeff and --mass3 flags (mirrors the non-spin DPA3 runner). --- .../run_mpi_pair_deepmd_spin_dpa3_pt2.py | 23 ++- source/lmp/tests/test_lammps_spin_dpa3_pt2.py | 132 +++++++++++++++++- 2 files changed, 151 insertions(+), 4 deletions(-) diff --git a/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py b/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py index a8d7fe71a6..3637238968 100644 --- a/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py +++ b/source/lmp/tests/run_mpi_pair_deepmd_spin_dpa3_pt2.py @@ -60,6 +60,22 @@ "domain decomposition (nswap>0). Pass '1 1 1' for a single-rank " "reference run on the same archive.", ) +parser.add_argument( + "--pair-coeff", + type=str, + default="* *", + help="pair_coeff arguments (after 'pair_coeff'). Default '* *' " + "uses identity LAMMPS-type-to-deepmd-atype mapping. For NULL-type " + "tests pass e.g. '* * Ni O NULL' so the third LAMMPS type becomes " + "deepmd atype=-1 (filtered before model evaluation).", +) +parser.add_argument( + "--mass3", + type=float, + default=None, + help="Optional mass for LAMMPS atom type 3 (and any higher types). " + "Used by the NULL-type fixture; ignored when only 2 types exist.", +) args = parser.parse_args() lammps = PyLammps() @@ -73,11 +89,16 @@ lammps.read_data(args.DATAFILE) lammps.mass("1 58") lammps.mass("2 16") +if args.mass3 is not None: + # NULL-type fixture: third LAMMPS type maps to deepmd atype=-1 + # via pair_coeff and is filtered before model evaluation. Mass + # is physically irrelevant. + lammps.mass(f"3 {args.mass3}") lammps.timestep(0.0005) lammps.fix("1 all nve") lammps.pair_style(f"deepspin {args.PB_FILE}") -lammps.pair_coeff("* *") +lammps.pair_coeff(args.pair_coeff) lammps.compute("virial all centroid/stress/atom NULL pair") # Per-atom magnetic force components. 
LAMMPS does not expose ``fm`` # through the legacy ``extract``/``gather_atoms`` registry, so we go diff --git a/source/lmp/tests/test_lammps_spin_dpa3_pt2.py b/source/lmp/tests/test_lammps_spin_dpa3_pt2.py index fd4ee0a7cb..7c7c5787a7 100644 --- a/source/lmp/tests/test_lammps_spin_dpa3_pt2.py +++ b/source/lmp/tests/test_lammps_spin_dpa3_pt2.py @@ -57,6 +57,22 @@ / "deeppot_dpa3_spin_mpi.pt2" ) data_file = Path(__file__).parent / "data_dpa3_spin_pt2.lmp" +# Elongated-box fixture for the spin empty-subdomain MPI test: x is +# extended to 30 A while atoms remain in x in [3, 13]. Combined with +# ``processors 2 1 1`` this leaves rank 1 (x >= 15) with zero local +# atoms, exercising the ``copy_from_nlist`` empty-rank guard for spin. +data_file_empty_subdomain = ( + Path(__file__).parent / "data_dpa3_spin_pt2_empty_subdomain.lmp" +) +# NULL-type fixture: 4 real Ni-O atoms + 2 LAMMPS type-3 atoms +# straddling the x=6.5 rank boundary. With ``pair_coeff * * Ni O NULL`` +# LAMMPS type 3 maps to deepmd atype=-1, so those atoms are filtered +# by ``select_real_atoms_coord`` and the comm tensors must be remapped +# via ``fwd_map`` before being handed to the with-comm artifact. +# Forces / force_mag on the 4 real atoms must match the no-NULL +# baseline (mpi-1 reference run); NULL atoms get zero forces from the +# deepmd model. +data_file_null_type = Path(__file__).parent / "data_dpa3_spin_pt2_null_type.lmp" # 4-atom Ni-O system; same layout as ``test_lammps_spin_pt2.py``. With # ``processors 2 1 1`` the split sits at x=6.5 -> 2 atoms per rank. @@ -86,22 +102,56 @@ def setup_module() -> None: "Skip test because PyTorch support is not enabled.", ) write_lmp_data_spin(box, coord, spin, type_NiO, data_file) + # Elongated x-axis; atoms unchanged. ``processors 2 1 1`` splits + # at x=15 A and rank 1 owns x >= 15, which is empty. 
+ box_empty = np.array([0, 30, 0, 13, 0, 13, 0, 0, 0]) + write_lmp_data_spin(box_empty, coord, spin, type_NiO, data_file_empty_subdomain) + # NULL-type fixture: append 2 LAMMPS type-3 atoms within rcut + # (~4 A) of real atoms on BOTH sides of the x=6.5 rank boundary, + # so they appear in cross-rank sendlists and the fwd_map-based + # comm-tensor remap is genuinely exercised. NULL atoms still need + # spin coordinates (write_lmp_data_spin format); we give them + # zero spin like the type-2 (O) atoms. + coord_null = np.concatenate( + [ + coord, + np.array( + [ + [5.5, 6.0, 6.0], # rank 0 side, near boundary + [7.5, 7.0, 7.0], # rank 1 side, near boundary + ] + ), + ] + ) + spin_null = np.concatenate([spin, np.zeros((2, 3))]) + type_null = np.concatenate([type_NiO, np.array([3, 3])]) + write_lmp_data_spin(box, coord_null, spin_null, type_null, data_file_null_type) def teardown_module() -> None: - if data_file.exists(): - os.remove(data_file) + for f in [data_file, data_file_empty_subdomain, data_file_null_type]: + if f.exists(): + os.remove(f) def _run_mpi_subprocess( nprocs: int, extra_args: list[str] | None = None, processors: str | None = None, + data_path: Path | None = None, + runner_args: list[str] | None = None, ) -> dict: """Run ``run_mpi_pair_deepmd_spin_dpa3_pt2.py`` under ``mpirun -n <nprocs>`` and return ``{"pe", "forces", "force_mag", "virials"}``. + + ``data_path`` (default ``data_file``) selects the LAMMPS data file + -- the empty-subdomain and NULL-type tests point at non-default + fixtures. ``runner_args`` flows additional flags (e.g. + ``--pair-coeff``, ``--mass3``) to the subprocess runner. 
""" + if data_path is None: + data_path = data_file with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f: out_path = f.name try: @@ -111,7 +161,7 @@ def _run_mpi_subprocess( str(nprocs), sys.executable, str(Path(__file__).parent / "run_mpi_pair_deepmd_spin_dpa3_pt2.py"), - str(data_file.resolve()), + str(data_path.resolve()), str(pb_file_mpi.resolve()), out_path, ] @@ -121,6 +171,8 @@ def _run_mpi_subprocess( argv.extend(["--processors", "1 1 1"]) if extra_args: argv.extend(extra_args) + if runner_args: + argv.extend(runner_args) sp.check_call(argv) with open(out_path) as fh: lines = fh.read().strip().splitlines() @@ -176,3 +228,77 @@ def test_pair_deepmd_mpi_dpa3_spin() -> None: np.testing.assert_allclose( out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0 ) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa3_spin_empty_subdomain() -> None: + """Spin DPA3 multi-rank with one empty rank. + + Elongated x box (30 A) + ``processors 2 1 1`` puts all 4 atoms on + rank 0; rank 1 has nloc=0. Exercises the C++ ``copy_from_nlist`` + empty-rank guard for the spin path (the with-comm artifact still + runs on rank 1 with nloc_real=0). Compares against same-archive + mpi-1 reference. 
+ """ + out_mpi = _run_mpi_subprocess(nprocs=2, data_path=data_file_empty_subdomain) + out_ref = _run_mpi_subprocess(nprocs=1, data_path=data_file_empty_subdomain) + + assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-10, abs=1e-12) + np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0) + np.testing.assert_allclose( + out_mpi["force_mag"], out_ref["force_mag"], atol=1e-8, rtol=0 + ) + np.testing.assert_allclose( + out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0 + ) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa3_spin_null_type() -> None: + """Spin DPA3 multi-rank with NULL-type atoms straddling the + rank boundary. + + Two LAMMPS type-3 atoms (mapped to deepmd atype=-1 via + ``pair_coeff * * Ni O NULL``) sit at x=5.5 and x=7.5, just inside + the rcut window of either side of the x=6.5 boundary. They appear + in the cross-rank sendlists and are filtered by + ``select_real_atoms_coord`` -- so the spin path goes through + ``DeepSpinPTExpt::compute`` with ``nall_real < nall``, triggering + the ``has_null_atoms`` branch that calls + ``build_comm_tensors_positional_with_virtual_atoms`` (fwd_map-based + sendlist remap). Compares mpi-2 vs same-archive mpi-1 reference + (nullifying NULL forces and using the same fwd_map remap on rank 0 + too). 
+ """ + runner_args = ["--pair-coeff", "* * Ni O NULL", "--mass3", "1.0"] + out_mpi = _run_mpi_subprocess( + nprocs=2, data_path=data_file_null_type, runner_args=runner_args + ) + out_ref = _run_mpi_subprocess( + nprocs=1, data_path=data_file_null_type, runner_args=runner_args + ) + + assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-10, abs=1e-12) + np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0) + np.testing.assert_allclose( + out_mpi["force_mag"], out_ref["force_mag"], atol=1e-8, rtol=0 + ) + np.testing.assert_allclose( + out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0 + ) + # Sanity: NULL atoms (ids 5, 6) get exactly zero forces from the + # deepmd model. ``write_lmp_data_spin`` writes atoms in the order + # given (id 1..N), so type-3 NULL atoms are ids 5, 6 (after the 4 + # real Ni-O atoms). + np.testing.assert_array_equal(out_mpi["forces"][4:], np.zeros((2, 3))) + np.testing.assert_array_equal(out_mpi["force_mag"][4:], np.zeros((2, 3))) From 47f0c29395026c5019bc3ab45cf3a6e97b0d96e0 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 1 May 2026 21:16:55 +0800 Subject: [PATCH 22/34] test(spin-mpi): drop committed deeppot_dpa3_spin_mpi.yaml gen_spin.py rebuilds the yaml from the deterministic config + seed in _build_dpa3_mpi_yaml when missing, and the multi-rank test compares mpi-2 vs same-archive mpi-1 (no hardcoded numerical references depend on these weights). Unlike deeppot_dpa_spin.yaml -- whose committed weights are pinned because C++ tests hardcode reference values against them -- nothing requires deeppot_dpa3_spin_mpi.yaml to be checkpointed in git. 
--- source/tests/infer/deeppot_dpa3_spin_mpi.yaml | 1863 ----------------- 1 file changed, 1863 deletions(-) delete mode 100644 source/tests/infer/deeppot_dpa3_spin_mpi.yaml diff --git a/source/tests/infer/deeppot_dpa3_spin_mpi.yaml b/source/tests/infer/deeppot_dpa3_spin_mpi.yaml deleted file mode 100644 index 6fce85245a..0000000000 --- a/source/tests/infer/deeppot_dpa3_spin_mpi.yaml +++ /dev/null @@ -1,1863 +0,0 @@ -backend: dpmodel -model: - backbone_model: - "@class": Model - "@variables": - out_bias: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - - 0.0 - - - 0.0 - - - 0.0 - - - 0.0 - out_std: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - - 1.0 - - - 1.0 - - - 1.0 - - - 1.0 - "@version": 2 - atom_exclude_types: &id002 - - 2 - - 3 - descriptor: - "@class": Descriptor - "@version": 2 - activation_function: silu - add_chg_spin_ebd: false - concat_output_tebd: false - env_protection: 1.0e-06 - exclude_types: &id003 - - - 3 - - 0 - - - 3 - - 1 - - - 3 - - 2 - - - 3 - - 3 - ntypes: 4 - precision: float64 - repflow_args: - a_compress_e_rate: 1 - a_compress_rate: 0 - a_compress_use_split: false - a_dim: 4 - a_rcut: 3.5 - a_rcut_smth: 0.5 - a_sel: 4 - axis_neuron: 4 - e_dim: 6 - e_rcut: 4.0 - e_rcut_smth: 0.5 - e_sel: 8 - edge_init_use_dist: false - fix_stat_std: 0.3 - n_dim: 8 - n_multi_edge_message: 1 - nlayers: 1 - optim_update: true - sel_reduce_factor: 10.0 - smooth_edge_update: false - update_angle: false - update_residual: 0.1 - update_residual_init: const - update_style: res_residual - use_dynamic_sel: false - use_exp_switch: false - repflow_variable: - "@variables": - davg: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 
0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - - - 0.0 - - 0.0 - - 0.0 - - 0.0 - dstd: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 
- - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - - - 0.3 - - 0.3 - - 0.3 - - 0.3 - angle_embd: - "@class": Layer - "@variables": - b: null - idt: null - w: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - 0.17649720801051316 - - 0.26111987625920485 - - -0.5130082451185703 - - -0.473906411761865 - "@version": 2 - activation_function: none - bias: false - precision: float64 - resnet: false - trainable: true - use_timestep: false - edge_embd: - "@class": Layer - "@variables": - b: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - -0.34071690817734457 - - -0.10233679058558394 - - -0.042158524863509905 - - 0.1901865304247668 - - 0.5454968471423458 - - -0.15466206519031384 - idt: null - w: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - -0.41394371690540416 - - -0.020029235227998383 - - 0.7548439568852728 - - 0.18561320525199423 - - -0.1235585931191982 - - -0.6320668874586287 - "@version": 2 - activation_function: none - bias: true - precision: float64 - resnet: false - trainable: true - use_timestep: false - env_mat: - protection: 0.0 - rcut: 4.0 - rcut_smth: 0.5 - use_exp_switch: false - repflow_layers: - - "@class": RepFlowLayer - "@variables": - a_residual: [] - e_residual: - - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - 0.1 - n_residual: - - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - 0.1 - - 0.1 - "@version": 2 - a_compress_e_rate: 1 - a_compress_rate: 0 - 
a_compress_use_split: false - a_dim: 4 - a_rcut: 3.5 - a_rcut_smth: 0.5 - a_sel: 4 - activation_function: silu - axis_neuron: 4 - e_dim: 6 - e_rcut: 4.0 - e_rcut_smth: 0.5 - e_sel: - - 8 - edge_self_linear: - "@class": Layer - "@variables": - b: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - 0.023029640256287606 - - -0.1553057617002839 - - 0.288173154238969 - - 0.05107253540449686 - - 0.2614964936530096 - - -0.46218537330158843 - idt: null - w: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - -0.05677878533841125 - - 0.08257140520321203 - - -0.17682541236689134 - - -0.06780335156677138 - - 0.2613225810769312 - - -0.2528499571680411 - - - -0.1060811611888437 - - 0.2834597688459181 - - 0.17021435817066793 - - -0.2119118384053945 - - 0.044257126661162875 - - -0.20348530980582044 - - - -0.17206345113221683 - - 0.19628652842871394 - - -0.03853877890775931 - - 0.0064469870425646406 - - -0.2035021228657865 - - 0.33893151231499635 - - - -0.1603541649096622 - - -0.07431170557593793 - - -0.1285051383598977 - - -0.09531516630926942 - - -0.10774430641710406 - - 0.10558546868439946 - - - -0.027408677366010784 - - 0.03171038951939523 - - -0.26649612755080526 - - 0.0749559135333121 - - 0.12753219377780048 - - -0.12375862279261161 - - - -0.3561807917324476 - - -0.028580689473013433 - - -0.2740045204725747 - - 0.12423725221263406 - - -0.0746927118825747 - - -0.16583458613892285 - - - -0.22497394079088218 - - -0.10329264538957945 - - -0.06015496745765555 - - -0.24047390264558702 - - -0.2470728805254821 - - -0.03091482236168157 - - - 0.313786674711663 - - -0.014345848137540798 - - -0.1446657411476756 - - -0.11134433415995286 - - -0.10957716367503506 - - 0.25318230359455945 - - - -0.11353216449019704 - - -0.24278855525542462 - - -0.0657328669264313 - - -0.08357873620530161 - - -0.19969579432068596 - - 0.0217399962565733 - - - -0.017346111123240478 - - -0.20460540022763518 - - 0.19580548714183002 
- - 0.26081320850512657 - - -0.01937612111130992 - - 0.26782602217325135 - - - 0.169450738702664 - - 0.0007586409725729347 - - 0.2217946757639642 - - -0.08785618340632881 - - 0.08754553673729902 - - 0.0459550075486224 - - - -0.10347442434861592 - - -0.05665265742992996 - - -0.15657294594958857 - - 0.07518451488260593 - - -0.200469822163535 - - 0.008552407309104103 - - - -0.13418495292451688 - - -0.15007855071339413 - - -0.47561245640659827 - - 0.05519145026405931 - - 0.034426127944687676 - - -0.19833628864440492 - - - -0.061539077996517144 - - 0.140236735963936 - - 0.44907007382747016 - - -0.17502514002597466 - - -0.13141545313528988 - - -0.10225960767785013 - - - 0.15849623153238157 - - 0.14969793438965767 - - 0.05020396887825857 - - -0.42237393212574514 - - -0.43560848414739306 - - -0.34368434587411545 - - - -0.090558268807612 - - -0.10586479947976848 - - -0.17654116465986686 - - -0.17464251661717356 - - -0.17707748016637653 - - 0.4728011907076426 - - - 0.06839467741547146 - - 0.20172332403056187 - - -0.20761658659357723 - - -0.3179201458113386 - - 0.1570191398976539 - - 0.30829366728408747 - - - -0.07346831672915768 - - -0.01603422028135059 - - -0.2343121216044693 - - -0.09228986456390967 - - -0.12259985802096685 - - 0.13925704477109332 - - - 0.03112673892006412 - - -0.12259170091097643 - - -0.01873720650800844 - - -0.02825905483134531 - - -0.07410620360262994 - - -0.13890487670689447 - - - 0.2599426512954838 - - -0.030475413023044056 - - 0.04418102981236639 - - 0.14747916053674695 - - 0.11469436259489629 - - -0.12589465767715197 - - - 0.1534348683560527 - - -0.2598559665351654 - - 0.1691188844559884 - - -0.05815067519957393 - - -0.09922406205302857 - - -0.026111067214965193 - - - -0.09469687008709152 - - -0.25433614509748265 - - -0.3230603080176275 - - 0.10565697308598668 - - 0.11382397843310456 - - 0.12636033735242963 - "@version": 2 - activation_function: none - bias: true - precision: float64 - resnet: false - trainable: true - use_timestep: false - n_dim: 8 
- n_multi_edge_message: 1 - node_edge_linear: - "@class": Layer - "@variables": - b: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - -0.08691917211888521 - - -0.2190238624015504 - - -0.03155197949842874 - - -0.06801377552229042 - - -0.36593263653854924 - - -0.2524793728902088 - - -0.2985692887165394 - - 0.041241949395424804 - idt: null - w: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - -0.0696849625476975 - - 0.2297992430114777 - - 0.2016390404837622 - - -0.14268332516715398 - - 0.05104561028973491 - - -0.0047524771390660015 - - 0.2007647888532239 - - -0.13892443853282946 - - - -0.06269186432762 - - -0.3620323943560662 - - 0.1429598213093448 - - 0.32251103225335886 - - 0.03297508356302982 - - 0.09813020765990464 - - -0.2128967691985101 - - 0.1446653191521595 - - - 0.3408694676048958 - - 0.2108742996875453 - - 0.3708891734344055 - - 0.03603632207631642 - - -0.021861106604374982 - - 0.05885265981211556 - - -0.3850668694292795 - - -0.02565715855818745 - - - 0.2554067355579947 - - -0.12385934461529113 - - 0.20794804172087122 - - 0.34771760519493133 - - 0.0030775484903255083 - - 0.08033323613153229 - - 0.04547640227535313 - - 0.03256133523805088 - - - -0.20228475171413982 - - -0.40882303462099395 - - -0.13933573248982073 - - -0.09056898648309795 - - 0.06705102826758672 - - -0.10643998751725821 - - 0.3714789434592029 - - -0.15660714896565422 - - - 0.0620445405627027 - - -0.14981233984554174 - - 0.1377612457580642 - - -0.3264453797874674 - - 0.18886992363386892 - - -0.13120999191064697 - - 0.2639396300281778 - - 0.20744058178112204 - - - 0.0902283316504931 - - 0.2642720697422412 - - 0.11616051352480065 - - 0.33194344559115435 - - -0.07519119975054182 - - -0.05062288710700148 - - -0.0033899752949634763 - - 0.12074780296663348 - - - -0.14625494457636853 - - -0.39187048236106403 - - 0.005863181213654556 - - 0.08058988215606765 - - 0.229684677996952 - - 0.02491096095922189 - - 
-0.07923462148233366 - - 0.03463149425218323 - - - 0.1459761391053138 - - 0.1826916307305693 - - -0.24330282168960599 - - -0.15404338160080283 - - -0.15732528026070658 - - 0.10082194118502665 - - 0.10780094880007303 - - 0.10439459076027502 - - - 0.2967580322447816 - - 0.19310548831515634 - - 0.13271337197427788 - - -0.003964549207383962 - - -0.3053587625881187 - - 0.12883374510336365 - - 0.045960329737757634 - - 0.19761345822107057 - - - -0.13773367016862742 - - 0.09659775346412201 - - 0.13561552570758134 - - 0.07814681408507194 - - -0.28773064288403055 - - 0.07556744144481048 - - -0.09838713644355178 - - 0.009867107649393275 - - - -0.09655717020123253 - - -0.10871554819348611 - - -0.11670258568304015 - - 0.2177137774640066 - - -0.14817421356773255 - - -0.03606693672811542 - - -0.026029214690369364 - - -0.040666475049662726 - - - -0.0677385423671006 - - -0.12993597893178244 - - 0.1180039874263662 - - 0.1384604584579823 - - -0.024227421664540914 - - 0.1679245814762119 - - -0.19280274838451647 - - 0.0990223630355508 - - - 0.0758415027141385 - - 0.16215196433523008 - - 0.2767732385588474 - - 0.022163750613004355 - - -0.12254120989786124 - - -0.12391951174230557 - - -0.028791741351884195 - - -0.0595519969823867 - - - 0.22247449036902905 - - 0.07567917899966987 - - -0.18221068561029122 - - -0.1496346790319525 - - 0.01739141266484531 - - 0.03295277270138665 - - -0.27927822171693173 - - -0.13558030103477586 - - - 0.1712575942124072 - - 0.13705603104177683 - - 0.290608271870899 - - 0.25077636518593155 - - 0.06723740912894116 - - -0.29077479630216374 - - -0.25998108625190797 - - 0.15096707384533595 - - - -0.011258223444056591 - - 0.07940059884337107 - - 0.14539160696529732 - - -0.33401238196882443 - - 0.0359760729699335 - - -0.02226084988227022 - - 0.12276616178918343 - - 0.0439592954772777 - - - 0.07596667366672871 - - -0.11052600964268607 - - -0.13155071841622368 - - -0.07425437999013539 - - 0.00827734508288158 - - 0.07414300482320346 - - 0.052019022231599196 - - 
0.16368644986528788 - - - 0.31022863799320216 - - 0.11380817934759249 - - 0.11671054675679823 - - 0.03833224311415518 - - 0.1545146635596559 - - 0.5283089690392868 - - -0.17235747525638992 - - -0.16802245441710034 - - - 0.19547575805994974 - - 0.03442738806627725 - - 0.035134165349037516 - - 0.1685202553837112 - - -0.13706885637245225 - - -0.09105484518308726 - - 0.24401116664356562 - - -0.042463896239058455 - - - 0.18293429344914702 - - -0.0797150153045118 - - 0.2837300628985514 - - -0.03290000697254011 - - 0.07484025269991934 - - 0.4486382833349405 - - 0.18215765586473062 - - 0.14222755521955213 - - - -0.054949228485595726 - - 0.2298266346316468 - - -0.13022437426681047 - - 0.31473958548227127 - - -0.16053599380138361 - - 0.12351036770696595 - - -0.2026640600757936 - - -0.3120452604960154 - "@version": 2 - activation_function: none - bias: true - precision: float64 - resnet: false - trainable: true - use_timestep: false - node_self_mlp: - "@class": Layer - "@variables": - b: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - 0.3339521660758233 - - 0.19617864215638078 - - 0.11685150273643896 - - -0.04301114015831818 - - -0.2646745547826684 - - -0.05874585577443532 - - 0.4130256006886377 - - -0.6003500792716773 - idt: null - w: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - -0.22061013087519665 - - 0.17161694901625085 - - 0.25079797681294247 - - -0.06984190636344022 - - 0.402412783105689 - - -0.13232509868240386 - - -0.12410592033624109 - - -0.5243896508356666 - - - -0.34531669337816745 - - 0.2590681097532894 - - -0.4170438578433154 - - 0.33209656716128205 - - 0.20907222698506978 - - 0.21026382825889875 - - -0.04125433055358784 - - -0.3362049950725693 - - - -0.02306669199993831 - - -0.27140136827851236 - - 0.08675906253383281 - - 0.20991982378397447 - - -0.20157467157772102 - - 0.10954533237221269 - - -0.30521247150866015 - - 0.1039196228402914 - - - 0.2927901959232568 - - 
-0.05686111266739088 - - -0.352867716741099 - - 0.06499009437306054 - - 0.2935084094905296 - - -0.5208455549268021 - - -0.06412894033597939 - - 0.2617524844957687 - - - -0.26859205166611555 - - -0.017740123512057532 - - -0.16973184286647353 - - -0.041497625408519805 - - -0.33848186563738925 - - -0.498133067071094 - - 0.06453515847241846 - - -0.28211046673410256 - - - -0.0031712540783364537 - - 0.14054927501098227 - - -0.16739625499774285 - - 0.02924799819668618 - - 0.19945724852581612 - - -0.07433092972702877 - - 0.33641837410477954 - - -0.1935354318143647 - - - -0.2896583115032089 - - -0.4291374752779325 - - 0.18521131755882006 - - 0.036186935403130116 - - 0.27669775576389155 - - -0.04763160274577408 - - 0.1400908330823242 - - 0.15697986928574623 - - - -0.45902865822845124 - - 0.33250108656046035 - - 0.0306169230429561 - - -0.035381192364331175 - - -0.0510947377580893 - - 0.03972955950151097 - - 0.6129808284962325 - - 0.027297205883797467 - "@version": 2 - activation_function: none - bias: true - precision: float64 - resnet: false - trainable: true - use_timestep: false - node_sym_linear: - "@class": Layer - "@variables": - b: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - 0.04402011262092976 - - 0.19539133788288796 - - 0.02243486288225181 - - -0.15932598464026163 - - -0.1441065175896103 - - -0.20205704607775893 - - 0.007090553889850609 - - -0.20221671762001667 - idt: null - w: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - 0.07307166266137728 - - 0.06127645860254937 - - -0.18492998679736772 - - 0.04613999102916452 - - 0.0071079000479441 - - -0.03731231022031221 - - -0.09483134725409749 - - -0.04779952388125717 - - - -0.06975495248291474 - - -0.19948091645922555 - - -0.19101500000694704 - - -0.0756612239190429 - - 0.18223713459959498 - - 0.004660326879702162 - - -0.07331926290215518 - - -0.11804049351864328 - - - -0.008643603296694117 - - -0.07891692454926386 - - 
-0.24683520896350286 - - -0.07498319216962253 - - 0.14604675984008694 - - 0.09601184516912262 - - -0.01561740011879576 - - 0.05490167651869453 - - - -0.019884970427089133 - - 0.0007666914047260165 - - -0.16505651916265357 - - -0.16723740821054547 - - 0.1234653183096876 - - -0.04403642108952563 - - -0.03727304005788303 - - -0.18190516409632088 - - - -0.09168767286099873 - - -0.1549419399425698 - - 0.08193144903871091 - - 0.15640675555750194 - - -0.06305034848986912 - - 0.16836512195133213 - - 0.009048302220263893 - - 0.05322075713280992 - - - -0.06468543813846328 - - 0.0948348631241292 - - 0.006867290444906741 - - -0.24931773871448817 - - 0.08788089155489308 - - -0.0739514302480491 - - 0.025288498321181765 - - -0.08305521153831118 - - - -0.07393598220040017 - - 0.07042478981157554 - - -0.07236047649200875 - - -0.04706083253081129 - - 0.011054293351306345 - - -0.08799610585856558 - - 0.1563680796185477 - - 0.04333789772104407 - - - -0.10653678039670528 - - 0.18112426500221723 - - 0.009186401470971654 - - 0.006153194931152504 - - -0.04989535662898608 - - -0.17876067282409114 - - -0.15602193322162777 - - 0.00781917954318876 - - - -0.06699569918753231 - - 0.30735630566871885 - - -0.016096279041795645 - - -0.22956358044083025 - - -0.0065529000816888765 - - -0.06902463180143781 - - 0.06768609922953323 - - 0.1187567871665586 - - - -0.03935798540152214 - - -0.10867329670955546 - - -0.04052094555163571 - - -0.04078630187590839 - - -0.0748763378601901 - - 0.01860182181594992 - - 0.20057959184872112 - - 0.16046549209905156 - - - 0.031478615338666395 - - 0.11567514563874234 - - 0.08294594125898624 - - -0.07089853590674343 - - 0.20101923186451937 - - -0.11766930025015115 - - 0.21570163379940235 - - 0.14563108587004206 - - - 0.07932986781606469 - - -0.19442968907969105 - - 0.05697454617840562 - - -0.19484656091831729 - - 0.04754566926156801 - - -0.12152155059832441 - - 0.08105546170302243 - - -0.09483406966077029 - - - -0.10943334690817784 - - 0.11702284889224986 - - 
0.06551144399385757 - - -0.003108503735325857 - - -0.1466684268106551 - - 0.11582453333312602 - - -0.19609870968779317 - - -0.11809063481420465 - - - -0.11120967058944209 - - -0.07178289284260277 - - -0.07505138171189361 - - -0.17137771295621249 - - -0.012516091428859523 - - 0.056132912587423756 - - -0.011172736867909887 - - 0.0014926969164057145 - - - -0.1652803302650934 - - 0.08452449793427 - - -0.06260662069159101 - - -0.07909718643578055 - - 0.00574135469567161 - - -0.05691391300603163 - - 0.2457179942284785 - - -0.08037694862142311 - - - 0.1761032538671494 - - -0.15524353322856968 - - -0.20338260987738993 - - -0.09738847488694806 - - 0.05960295717975261 - - -0.0268406105291267 - - 0.19154482080963495 - - -0.05557739958347549 - - - -0.23162474155138468 - - 0.005428848189956548 - - 0.14498512403306713 - - 0.015859517797165032 - - 0.13342303538966063 - - 0.07757097608660568 - - 0.061885304048992174 - - -0.02774862502778554 - - - -0.099674682792698 - - 0.1743242267060875 - - 0.0565895993699819 - - -0.1431728246694354 - - -0.04572377374634247 - - 0.1932522842767088 - - -0.13605774184771868 - - -0.079596349847149 - - - 0.015159290423222593 - - 0.0741788473825365 - - -0.025111776236424455 - - 0.11728977172727281 - - -0.05246129405331076 - - -0.3560652693695576 - - 0.22489664505020285 - - -0.11322150427163667 - - - 0.1172685876179488 - - 0.015449206720498673 - - 0.11464505230123948 - - -0.13045379503420262 - - -0.18460226345307634 - - -0.0735660416536509 - - 0.02668836976483192 - - 0.009471901506209893 - - - -0.12415218588856815 - - -0.028427823628392242 - - -0.0726329032188482 - - 0.2205454016716484 - - -0.06981635935553832 - - -0.06914918285976224 - - -0.07547512647684368 - - 0.19585301943839276 - - - 0.02068794647278527 - - 0.11434955856950152 - - 0.04733548159377606 - - -0.0940771421180628 - - 0.106950218084799 - - 0.11995323224700441 - - 0.07016105028143815 - - -0.07349788842232614 - - - -0.028316732941958092 - - -0.006316920155388264 - - 0.014323448114816232 - - 
0.07909510285143638 - - 0.08089223619428912 - - -0.1285448965066473 - - -0.02731037643994388 - - -0.048232324099890284 - - - -0.04229476466912251 - - -0.10545582133061814 - - -0.1399519987577358 - - -0.24859786794141928 - - 0.04555029533580089 - - -0.06637709714144181 - - -0.11891839416041088 - - -0.05608836594526548 - - - -0.1481671676394082 - - -0.11826472343612228 - - 0.18759449377634982 - - -0.0027813243183313764 - - -0.06187858233767373 - - -0.16870507895423517 - - 0.15432198341660605 - - 0.2442525725033602 - - - 0.11655618965628044 - - 0.16410614799338208 - - -0.15922334755571288 - - 0.05294100944284731 - - -0.042676438943807564 - - 0.05982722192738627 - - 0.08818007330306689 - - 0.08799006862019813 - - - -0.1816952674192488 - - 0.33018315199731113 - - 0.14825048237904745 - - -0.12977688627249692 - - -0.014039894202582361 - - 0.021698570605095405 - - -0.10536292700472008 - - -0.016298405526400214 - - - 0.18891280861168214 - - -0.037066320234429954 - - 0.051989201606798936 - - -0.33236261122879446 - - -0.2233240290736924 - - 0.17632501110044835 - - 0.02791043546786102 - - 0.08058616657592761 - - - -0.12416787825473675 - - -0.0018776550590277605 - - -0.1361594510955972 - - -0.031008628174283 - - -0.1510470016534144 - - -0.1968118582063139 - - 0.05923927005740039 - - 0.10906017525194028 - - - -0.01747528984400593 - - 0.043571037430286425 - - 0.09735765094593854 - - 0.038496104792229716 - - 0.021583898030338507 - - -0.11795161808253331 - - -0.11404406907043374 - - -0.06541831356900717 - - - 0.05781757062086345 - - 0.06545403068342133 - - 0.07182196888387801 - - 0.06571017380833269 - - 0.25549620850343796 - - -0.01712221435859817 - - 0.02746476505848508 - - -0.16813933068880024 - - - 0.15811742659496866 - - -0.10097487333290259 - - 0.0007478905750386516 - - 0.15986815657402492 - - 0.0879704571647486 - - 0.051839404360383305 - - 0.04773139180116972 - - -0.1562216704347126 - - - -0.00554177026701311 - - 0.026672084558123862 - - -0.026556168406945337 - - 
0.017618135480540704 - - 0.04290442846891425 - - -0.16108845422437917 - - 0.03885069382837762 - - -0.08559226312134341 - - - -0.10984387513362157 - - 0.06020841962256015 - - 0.013439129456291792 - - -0.1211722988008539 - - 0.0321361577334442 - - 0.04742132269747014 - - -0.08371259477093888 - - -0.14250805695920574 - - - 0.04498243399350513 - - -0.03633434279549633 - - 0.17043129619564554 - - 0.13738977779076048 - - 0.03367329367643751 - - 0.13141345496526305 - - 0.14626062464255066 - - -0.087660426894852 - - - 0.13304548046946202 - - -0.02074921039690319 - - -0.19614199925540662 - - 0.09145888259449976 - - -0.16872056060024043 - - -0.057806035869808946 - - -0.012927002228426554 - - -0.18968555494779932 - - - 0.09056415309144267 - - 0.19579647713404205 - - 0.12419307551929215 - - 0.03068324855507999 - - 0.16324257199502792 - - 0.28864177653836015 - - -0.04884842530407823 - - 0.05243039778651716 - - - -0.1354513040660592 - - -0.0032083328727676315 - - 0.035763067000830435 - - -0.10752629467535854 - - -0.004527627068300205 - - -0.26678729966885645 - - 0.16095749546546945 - - -0.0768457166279081 - - - 0.24290534029168284 - - -0.19993818991295886 - - 0.05863500838014017 - - 0.1075745460176732 - - -0.2703641493668329 - - 0.022882752217475207 - - -0.18377439784177813 - - -0.02475991439750886 - - - -0.0970343883793403 - - 0.022190761521183 - - -0.31137609288015433 - - -0.12852583938411438 - - 0.06380585650762231 - - -0.05537350140183391 - - 0.009834307052428782 - - -0.18327381164681603 - - - -0.058720582106338445 - - 0.012207974777885133 - - 0.04906298973398652 - - -0.0252045636071624 - - 0.04064401311527239 - - -0.12030307623147056 - - -0.02607458251331658 - - -0.12104904963385374 - - - 0.14380149442345772 - - -0.08586187966457755 - - -0.0562380312253021 - - 0.1183995520092173 - - -0.008618891616010692 - - -0.30556252122213096 - - 0.157107693967395 - - -0.150824446001649 - - - -0.12986340463514554 - - -0.13953775800615473 - - 0.06688782609307184 - - 0.30709990962247197 - 
- -0.10057794483875744 - - -0.15572836837520085 - - 0.22240522808485344 - - 0.07486567450323982 - - - -0.00026497955681491453 - - -0.462148220797257 - - -0.04683339159019641 - - 0.10954858908660245 - - 0.048155719596331595 - - -0.08404934441388894 - - 0.15848474948089222 - - -0.029754000979091536 - - - -0.008795641657631076 - - -0.021341761230446545 - - -0.10489671109204046 - - 0.03213370243212562 - - -0.021792936100149974 - - -0.018371450392434912 - - 0.0007292277382723748 - - 0.07679112359755517 - - - -0.06130007400378907 - - -0.06581095863692285 - - 0.06501448048047738 - - -0.14197246804370967 - - 0.15983589537290877 - - -0.15693380789472725 - - -0.17963845906090375 - - 0.10204145028546817 - - - -0.07077050429398143 - - 0.1990098057969514 - - -0.2525111691805106 - - -0.22059894251537618 - - -0.27531410890875607 - - -0.0693243961021514 - - 0.03876302523241355 - - 0.12122101629786736 - - - -0.12820657692829063 - - -0.10772035941442479 - - 0.10829696580051636 - - 0.1493715060396245 - - 0.13488833866187872 - - 0.09022524867490032 - - 0.007332743974581279 - - 0.1529338321168549 - - - -0.22245363971842472 - - -0.08917661330105822 - - 0.10304564318043377 - - -0.07026805272160686 - - 0.016625750231852813 - - 0.23074109385732217 - - 0.053971407495566504 - - -0.15089059679319458 - - - 0.1294396068073317 - - -0.038487426453509534 - - 0.09393650831599386 - - 0.09638990927578407 - - 0.17905918157852316 - - 0.06760574587425355 - - 0.0639998107196389 - - -0.1587157815816586 - - - 0.06077231806824999 - - 0.006159909130812671 - - 0.15285274367932117 - - -0.026531120401424045 - - 0.06104797756042876 - - -0.174933801016035 - - 0.25284181425638513 - - -0.16931699181750984 - - - -0.09480440252644158 - - -0.11919995631753837 - - 0.1374865485894956 - - 0.03525829583245701 - - 0.055414318086174905 - - 0.039970825479268265 - - -0.028476173719310948 - - 0.007895110382084259 - - - -0.08849522170883828 - - 0.1556903658898126 - - -0.06942905817654972 - - 0.17917871676321492 - - 
-0.12839965901095401 - - -0.1457242708290995 - - 0.2073632537418445 - - -0.0033056633245595168 - - - -0.14321940581992326 - - 0.016216983383358995 - - -0.05603214608550905 - - 0.034067014410779244 - - -0.004165932252642813 - - 0.03579825379823718 - - 0.2274077472661256 - - 0.12282153328534674 - - - -0.17424677728325255 - - 0.03032450606197887 - - -0.3407467917235723 - - 0.08460871296272927 - - -0.21233509125037692 - - 0.038581785470083826 - - -0.1271081651221865 - - -0.05674635282930029 - - - -0.06365889303105148 - - -0.0346798442701684 - - 0.04178115473238202 - - -0.03570145798701077 - - 0.2255873927499116 - - -0.21936512330368732 - - -0.19469567244011848 - - -0.007461014512643234 - "@version": 2 - activation_function: none - bias: true - precision: float64 - resnet: false - trainable: true - use_timestep: false - ntypes: 4 - optim_update: true - precision: float64 - sel_reduce_factor: 10.0 - smooth_edge_update: false - update_angle: false - update_residual: 0.1 - update_residual_init: const - update_style: res_residual - use_dynamic_sel: false - trainable: true - type: dpa3 - type_embedding: - "@class": TypeEmbedNet - "@version": 2 - activation_function: Linear - embedding: - "@class": EmbeddingNetwork - "@version": 2 - activation_function: Linear - bias: false - in_dim: 4 - layers: - - "@class": Layer - "@variables": - b: null - idt: null - w: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - 0.06913868355931278 - - -0.3276059448146492 - - -0.22478586008940918 - - -0.03129740042629991 - - -0.2511436154794455 - - -0.4760319710462916 - - 0.183856376649989 - - 0.220680920691283 - - - -0.1331166944050067 - - -0.2985446381663858 - - -0.1299144028716818 - - 0.12716526105014014 - - 0.24445281051361242 - - 0.052359417290304015 - - -0.06639194378815659 - - -0.0515428623822807 - - - -0.3302870133986425 - - 0.1177804767091647 - - 0.06915893387117533 - - -0.4204302050492702 - - -0.3161145657939801 - - 0.322920377419993 - - 
0.19395457855721343 - - -0.11365337655752422 - - - -0.16993400446851198 - - -0.157416126804567 - - -0.08090448953478106 - - 0.20830555342316676 - - -0.11308079862243182 - - 0.044490575624147384 - - 0.28211395871639494 - - 0.07920112686609734 - "@version": 2 - activation_function: Linear - bias: false - precision: float64 - resnet: true - trainable: true - use_timestep: false - neuron: - - 8 - precision: float64 - resnet_dt: false - neuron: - - 8 - ntypes: 4 - padding: true - precision: float64 - resnet_dt: false - trainable: true - type_map: &id001 - - Ni - - O - - Ni_spin - - O_spin - use_econf_tebd: false - use_tebd_bias: false - type_map: *id001 - use_econf_tebd: false - use_loc_mapping: false - use_tebd_bias: false - fitting: - "@class": Fitting - "@variables": - aparam_avg: null - aparam_inv_std: null - bias_atom_e: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - 0.0 - - - 0.0 - - - 0.0 - - - 0.0 - case_embd: null - fparam_avg: null - fparam_inv_std: null - "@version": 4 - activation_function: tanh - atom_ener: null - default_fparam: null - dim_case_embd: 0 - dim_descrpt: 8 - dim_out: 1 - exclude_types: *id002 - layer_name: null - mixed_types: true - nets: - "@class": NetworkCollection - "@version": 1 - ndim: 0 - network_type: fitting_network - networks: - - "@class": FittingNetwork - "@version": 1 - activation_function: tanh - bias_out: true - in_dim: 8 - layers: - - "@class": Layer - "@variables": - b: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - -0.17635825349201156 - - -0.3566199551346283 - - -0.4657350300900149 - - 0.49182010702811113 - - 0.032647600656972545 - idt: null - w: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - 0.19950263394684065 - - 0.05634765583345527 - - -0.1442129593712478 - - -0.1085774516963511 - - 0.11311331965894553 - - - -0.2775489843491954 - - -0.35666203499239274 - - -0.3389432389106902 - - 
-0.05632492275434322 - - -0.48859095817655873 - - - -0.0295274439718225 - - -0.1886895411820409 - - 0.53672545544271 - - 0.07574020379061007 - - -0.42704120525642686 - - - -0.00993498946754372 - - 0.3770750367653306 - - -0.4385261113155961 - - 0.0468328088042057 - - 0.012607351815014095 - - - 0.1092056939586687 - - -0.08440204904008866 - - -0.6198116015257329 - - 0.1936974618526528 - - -0.11584195169630225 - - - -0.6395628609700832 - - -0.3937842385131085 - - -0.1370675696847499 - - -0.08281792882082432 - - -0.14269944588470002 - - - 0.003683595092519098 - - -0.1064836461083355 - - 0.1513375212109038 - - -0.3798449359483027 - - -0.27711500793004523 - - - -0.24136291455222364 - - -0.19077785910921263 - - -0.12067289115480624 - - -0.05720709372900689 - - -0.044669501979496415 - "@version": 2 - activation_function: tanh - bias: true - precision: float64 - resnet: true - trainable: true - use_timestep: false - - "@class": Layer - "@variables": - b: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - 0.09501004658257778 - - 0.1663807327224991 - - -0.5185313341630086 - - -0.7740662908662731 - - -0.18752579321547022 - idt: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - -0.2188085499554977 - - -0.4014642473754725 - - 0.032489550654357095 - - 0.06343911616091243 - - -0.00407617112574573 - w: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - -0.3141891418069332 - - 0.30132598326837057 - - -0.1868614701027005 - - -0.1853536726835805 - - -0.14904917553209618 - - - -0.4993776326714626 - - 0.2929711950476154 - - -0.3300253064210836 - - -0.4799775188835898 - - -0.12327559985245252 - - - 0.16627900477763782 - - 0.18281489789715116 - - -0.0796215789550366 - - 0.11637836794519682 - - 0.019126199990905587 - - - 0.47193798042526686 - - 0.3935489978037474 - - 0.1926588188573466 - - 0.11685532990383077 - - -0.3143759410105157 - - - 0.2619509948079511 - - 
0.17134734041574828 - - 0.16467987243470003 - - -0.17768942725372738 - - 0.17196893072212313 - "@version": 2 - activation_function: tanh - bias: true - precision: float64 - resnet: true - trainable: true - use_timestep: true - - "@class": Layer - "@variables": - b: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - 0.1498329073500072 - - -0.10390305511196503 - - -0.7262688617464856 - - -0.14980303343140125 - - -0.3578894004618838 - idt: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - 0.3290381873321775 - - 0.23103250534551598 - - -0.6940851206117438 - - -0.19335307745332778 - - -0.9240817753801489 - w: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - -0.3092290441156226 - - -0.496367611501348 - - -0.052492949379292775 - - 0.06663748312823926 - - 0.027714401468510886 - - - -0.10433141997317527 - - -0.323901631855259 - - -0.24739439873488192 - - 0.3076895568713741 - - 0.1593814472209255 - - - -0.07111829721069259 - - -0.27598680250101504 - - 0.16632764307325093 - - 0.1801382402999823 - - 0.3107523993064097 - - - -0.012140157566561928 - - 0.07469305237763302 - - 0.26428018852282276 - - -0.11500213881655802 - - -0.2731498304335624 - - - 0.29941998505510775 - - 0.39267279762211 - - 0.06586779164332648 - - 0.10010820203885952 - - -0.04143485413490972 - "@version": 2 - activation_function: tanh - bias: true - precision: float64 - resnet: true - trainable: true - use_timestep: true - - "@class": Layer - "@variables": - b: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - 0.044426160812178636 - idt: null - w: - "@class": np.ndarray - "@is_variable": true - "@version": 1 - dtype: float64 - value: - - - 0.26432565710368733 - - - 0.17264367113482967 - - - -0.04729186377886323 - - - -0.08841444813809296 - - - 0.2969145415081517 - "@version": 2 - activation_function: none - bias: true - precision: float64 - 
resnet: false - trainable: true - use_timestep: false - neuron: - - 5 - - 5 - - 5 - out_dim: 1 - precision: float64 - resnet_dt: true - ntypes: 4 - neuron: - - 5 - - 5 - - 5 - ntypes: 4 - numb_aparam: 0 - numb_fparam: 0 - precision: float64 - rcond: null - resnet_dt: true - spin: null - tot_ener_zero: false - trainable: - - true - - true - - true - - true - type: ener - type_map: - - Ni - - O - - Ni_spin - - O_spin - use_aparam_as_mask: false - var_name: energy - pair_exclude_types: *id003 - preset_out_bias: null - rcond: null - type: standard - type_map: - - Ni - - O - - Ni_spin - - O_spin - spin: - use_spin: - - true - - false - virtual_scale: - - 0.314 - - 0.0 - type: spin_ener -model_def_script: - descriptor: - precision: float64 - repflow: - a_dim: 4 - a_rcut: 3.5 - a_rcut_smth: 0.5 - a_sel: 4 - axis_neuron: 4 - e_dim: 6 - e_rcut: 4.0 - e_rcut_smth: 0.5 - e_sel: 8 - n_dim: 8 - nlayers: 1 - update_angle: false - seed: 1 - type: dpa3 - use_loc_mapping: false - fitting_net: - neuron: - - 5 - - 5 - - 5 - resnet_dt: true - seed: 1 - spin: - use_spin: - - true - - false - virtual_scale: - - 0.314 - - 0.0 - type_map: - - Ni - - O -software: deepmd-kit -time: "2026-04-30 14:57:42.534472+00:00" -version: 3.0.0 From 3c9ee65d8e34e8a5c3552706598cd986d1aecdb3 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sat, 2 May 2026 07:51:02 +0800 Subject: [PATCH 23/34] fix(jax): accept comm_dict kwarg in forward_common_atomic The dpmodel layer threads a new ``comm_dict=None`` kwarg through ``forward_common_atomic`` (model and atomic-model levels) so the pt_expt backend can wire MPI ghost-atom exchange for GNN multi-rank LAMMPS. The JAX backend overrides ``forward_common_atomic`` with explicit kwarg lists; without accepting ``comm_dict``, ``dp convert-backend ... 
savedmodel`` fails at trace time: TypeError: jax_model.forward_common_atomic() got an unexpected keyword argument 'comm_dict' Affected the entire CI matrix on PR #5430 (every Python shard goes through the savedmodel build prep). Fix: add ``comm_dict: dict | None = None`` to each JAX override and ``del comm_dict`` (the JAX path has no MPI ghost exchange). Files touched: dp_atomic_model, linear_atomic_model, pairtab_atomic_model (atomic-model level), plus base_model, dp_model, dp_zbl_model (model level). Paddle's forward_common_atomic already accepts comm_dict and needs no change. --- deepmd/jax/atomic_model/dp_atomic_model.py | 2 ++ deepmd/jax/atomic_model/linear_atomic_model.py | 2 ++ deepmd/jax/atomic_model/pairtab_atomic_model.py | 2 ++ deepmd/jax/model/base_model.py | 2 ++ deepmd/jax/model/dp_model.py | 2 ++ deepmd/jax/model/dp_zbl_model.py | 2 ++ 6 files changed, 12 insertions(+) diff --git a/deepmd/jax/atomic_model/dp_atomic_model.py b/deepmd/jax/atomic_model/dp_atomic_model.py index 7227839f1f..319b8e94a2 100644 --- a/deepmd/jax/atomic_model/dp_atomic_model.py +++ b/deepmd/jax/atomic_model/dp_atomic_model.py @@ -57,7 +57,9 @@ def forward_common_atomic( mapping: jnp.ndarray | None = None, fparam: jnp.ndarray | None = None, aparam: jnp.ndarray | None = None, + comm_dict: dict | None = None, ) -> dict[str, jnp.ndarray]: + del comm_dict # JAX path has no MPI ghost exchange return super().forward_common_atomic( extended_coord, extended_atype, diff --git a/deepmd/jax/atomic_model/linear_atomic_model.py b/deepmd/jax/atomic_model/linear_atomic_model.py index 1c183db7ac..ecfc74cf95 100644 --- a/deepmd/jax/atomic_model/linear_atomic_model.py +++ b/deepmd/jax/atomic_model/linear_atomic_model.py @@ -61,7 +61,9 @@ def forward_common_atomic( mapping: jnp.ndarray | None = None, fparam: jnp.ndarray | None = None, aparam: jnp.ndarray | None = None, + comm_dict: dict | None = None, ) -> dict[str, jnp.ndarray]: + del comm_dict # JAX path has no MPI ghost exchange return 
super().forward_common_atomic( extended_coord, extended_atype, diff --git a/deepmd/jax/atomic_model/pairtab_atomic_model.py b/deepmd/jax/atomic_model/pairtab_atomic_model.py index 7f18a6403c..0117bf1d2c 100644 --- a/deepmd/jax/atomic_model/pairtab_atomic_model.py +++ b/deepmd/jax/atomic_model/pairtab_atomic_model.py @@ -46,7 +46,9 @@ def forward_common_atomic( mapping: jnp.ndarray | None = None, fparam: jnp.ndarray | None = None, aparam: jnp.ndarray | None = None, + comm_dict: dict | None = None, ) -> dict[str, jnp.ndarray]: + del comm_dict # JAX path has no MPI ghost exchange return super().forward_common_atomic( extended_coord, extended_atype, diff --git a/deepmd/jax/model/base_model.py b/deepmd/jax/model/base_model.py index 4522e25586..f99fccd276 100644 --- a/deepmd/jax/model/base_model.py +++ b/deepmd/jax/model/base_model.py @@ -26,7 +26,9 @@ def forward_common_atomic( aparam: jnp.ndarray | None = None, do_atomic_virial: bool = False, extended_coord_corr: jnp.ndarray | None = None, + comm_dict: dict | None = None, ) -> dict[str, jnp.ndarray]: + del comm_dict # JAX path has no MPI ghost exchange atomic_ret = self.atomic_model.forward_common_atomic( extended_coord, extended_atype, diff --git a/deepmd/jax/model/dp_model.py b/deepmd/jax/model/dp_model.py index 3e96eb6689..55239bb608 100644 --- a/deepmd/jax/model/dp_model.py +++ b/deepmd/jax/model/dp_model.py @@ -56,7 +56,9 @@ def forward_common_atomic( aparam: jnp.ndarray | None = None, do_atomic_virial: bool = False, extended_coord_corr: jnp.ndarray | None = None, + comm_dict: dict | None = None, ) -> dict[str, jnp.ndarray]: + del comm_dict # JAX path has no MPI ghost exchange return forward_common_atomic( self, extended_coord, diff --git a/deepmd/jax/model/dp_zbl_model.py b/deepmd/jax/model/dp_zbl_model.py index 7751d22a1f..f2aa68ea1f 100644 --- a/deepmd/jax/model/dp_zbl_model.py +++ b/deepmd/jax/model/dp_zbl_model.py @@ -38,7 +38,9 @@ def forward_common_atomic( aparam: jnp.ndarray | None = None, 
do_atomic_virial: bool = False, extended_coord_corr: jnp.ndarray | None = None, + comm_dict: dict | None = None, ) -> dict[str, jnp.ndarray]: + del comm_dict # JAX path has no MPI ghost exchange return forward_common_atomic( self, extended_coord, From 87c9f3f8e8a65d6c43647b6de040c1770069f114 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sat, 2 May 2026 16:23:41 +0800 Subject: [PATCH 24/34] fix(pt_expt): auto-load underlying ops in comm.py DDP-spawned worker subprocesses re-import modules from scratch and never run the test conftest's ``import deepmd.pt``, so when ``pt_expt.utils.comm`` is imported the underlying ``deepmd_export::{border_op,border_op_backward}`` ops are not yet registered and the import-time guard raises: RuntimeError: torch.ops.deepmd_export.{border_op,border_op_backward} are not registered. Build libdeepmd_op_pt.so and ensure deepmd.pt is imported before this module. Repro: test_training_ddp.py::TestDDPRestart::test_ddp_restart on every Python CI shard. Fix: ``_check_underlying_ops_loaded`` now triggers ``import deepmd.pt`` as a side effect when the ops aren't yet registered. ``deepmd/pt/cxx_op.py`` loads ``libdeepmd_op_pt.so`` which registers the schemas. The original RuntimeError stays as a fallback if ``import deepmd.pt`` itself fails. Verified locally: importing ``deepmd.pt_expt.utils.comm`` in a fresh process (without explicit ``import deepmd.pt`` first) now succeeds and ``torch.ops.deepmd_export.border_op`` is available. --- deepmd/pt_expt/utils/comm.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/deepmd/pt_expt/utils/comm.py b/deepmd/pt_expt/utils/comm.py index b985c57fe6..434d2a97b0 100644 --- a/deepmd/pt_expt/utils/comm.py +++ b/deepmd/pt_expt/utils/comm.py @@ -43,7 +43,27 @@ def _check_underlying_ops_loaded() -> None: op schemas + impls. Without it, the ops can't be registered for fake/autograd metadata and callers get a cryptic AttributeError on ``torch.ops.deepmd_export.border_op``. 
+ + The .so is loaded as a side effect of ``import deepmd.pt`` (via + ``deepmd/pt/cxx_op.py``). We trigger that import here so callers + don't have to remember to do it first — important for environments + like DDP-spawned subprocesses that re-import modules from scratch + and never see the test conftest's ``import deepmd.pt``. """ + if not ( + hasattr(torch.ops, "deepmd_export") + and hasattr(torch.ops.deepmd_export, "border_op") + and hasattr(torch.ops.deepmd_export, "border_op_backward") + ): + # Triggers cxx_op.py which torch.ops.load_library's the .so. + try: + import deepmd.pt # noqa: F401 + except Exception: + # If deepmd.pt itself fails to import, fall through to the + # explicit RuntimeError below — clearer than re-raising a + # potentially-unrelated import error. + pass + if not ( hasattr(torch.ops, "deepmd_export") and hasattr(torch.ops.deepmd_export, "border_op") @@ -52,7 +72,7 @@ def _check_underlying_ops_loaded() -> None: raise RuntimeError( "torch.ops.deepmd_export.{border_op,border_op_backward} " "are not registered. Build libdeepmd_op_pt.so and ensure " - "deepmd.pt is imported before this module." + "deepmd.pt is importable before this module." ) From 4865c4e3c220efe2311f928c21d4a53ab924fda6 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sat, 2 May 2026 16:27:54 +0800 Subject: [PATCH 25/34] chore: drop redundant ``import deepmd.pt`` preloads After 87c9f3f8e ``deepmd.pt_expt.utils.comm`` self-bootstraps ``libdeepmd_op_pt.so`` via ``_check_underlying_ops_loaded()``, so the explicit ``import deepmd.pt`` preloads in conftest.py and test_border_op_backward.py are no longer needed. Closes 2 of the 13 GitHub Advanced Security CodeQL "unused import" alerts on the PR. 
The remaining 5 Python alerts (other tests' ``import deepmd.pt_expt.utils.comm`` for opaque-op registration) and 6 C++ alerts (TORCH_LIBRARY_* / border_op_export reachable only through macro-expanded static initialization) are CodeQL false positives that need to be dismissed in the GitHub Security UI rather than fixed in source. --- source/tests/pt_expt/conftest.py | 11 +++-------- source/tests/pt_expt/utils/test_border_op_backward.py | 6 +++--- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/source/tests/pt_expt/conftest.py b/source/tests/pt_expt/conftest.py index 06bca2fec5..d4d987fe95 100644 --- a/source/tests/pt_expt/conftest.py +++ b/source/tests/pt_expt/conftest.py @@ -17,14 +17,9 @@ _get_current_function_mode_stack, ) -# Import ``deepmd.pt`` at conftest evaluation time so libdeepmd_op_pt.so -# is loaded and ``deepmd_export::{border_op, border_op_backward}`` are -# registered before any pt_expt test module imports -# ``deepmd.pt_expt.utils`` (which transitively imports ``comm.py`` and -# its ``_check_underlying_ops_loaded()`` runtime check). Previously this -# worked only when collected alongside earlier tests that happened to -# import deepmd.pt first. -import deepmd.pt # noqa: F401 - side-effect: register custom ops +# ``deepmd.pt_expt.utils.comm`` self-bootstraps libdeepmd_op_pt.so via +# ``_check_underlying_ops_loaded()``, so we no longer need to preload +# ``deepmd.pt`` here. def _pop_device_contexts() -> list: diff --git a/source/tests/pt_expt/utils/test_border_op_backward.py b/source/tests/pt_expt/utils/test_border_op_backward.py index c46705ad8a..aeaf491cb2 100644 --- a/source/tests/pt_expt/utils/test_border_op_backward.py +++ b/source/tests/pt_expt/utils/test_border_op_backward.py @@ -30,9 +30,9 @@ import pytest import torch -# Ensure the new C++ symbol is loaded. pt_expt imports deepmd.pt for -# the custom-op .so. 
-import deepmd.pt +# comm self-bootstraps the underlying libdeepmd_op_pt.so when needed, so +# this single side-effect import is enough to register both the C++ +# ops (deepmd::border_op_backward) and their fake/autograd metadata. import deepmd.pt_expt.utils.comm # noqa: F401 - registers deepmd_export::border_op From bf1685ffc573d6d071113e9a2f994bcf35dba278 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sat, 2 May 2026 16:52:14 +0800 Subject: [PATCH 26/34] fix: address coderabbitai review on PR 5430 Applies the substantive coderabbitai suggestions from the PR review. Defensive guards (no behavioral change for existing callers): - dpmodel/descriptor/{repflows,repformers}.py: raise ValueError when the default `_exchange_ghosts` is hit with `mapping_tiled=None` and `use_loc_mapping=False` instead of returning a cryptic array-backend error. - pt_expt/descriptor/{repflows,repformers}.py: refuse `comm_dict` path when `nf != 1`. The squeeze(0)/unsqueeze(0) dance only works for a single frame; failing here surfaces the unsupported case loudly instead of producing a malformed border_op tensor. Init robustness: - api_cc/src/{DeepPotPTExpt,DeepSpinPTExpt}.cc: wrap the optional with-comm artifact load in try/catch. If `has_comm_artifact` is set in metadata but the nested artifact fails to extract or compile, log and fall back to single-rank-only dispatch instead of aborting init -- the hard error then surfaces only when multi-rank actually needs the missing artifact. Code hygiene: - dpmodel/descriptor/hybrid.py: rename unused unpacks (`g2/h2/sw` -> `_g2/_h2/_sw`) for ruff RUF059 cleanliness. - tests/infer/gen_dpa3.py: deepcopy `config_mpi` before passing to `get_model()` so `data_mpi["model_def_script"]` retains the intended MPI export config even if the call mutates its argument. 
- tests/pt_expt/model/test_export_with_comm.py: mirror the zero-ghost clamp from `serialization.py::_make_comm_sample_inputs` in the test helper, so no zero-length sendlist pointer is ever materialised. Also update `extra/...` -> `model/extra/...` archive paths to match PT2_EXTRA_PREFIX after the upstream/master merge. Verified locally: pt_expt python (24/24), ctest (3/3, 498 tests including 198 PtExpt), LAMMPS multi-rank GNN (19/19) all green. --- deepmd/dpmodel/descriptor/hybrid.py | 2 +- deepmd/dpmodel/descriptor/repflows.py | 5 +++ deepmd/dpmodel/descriptor/repformers.py | 6 +++ deepmd/pt_expt/descriptor/repflows.py | 11 ++++++ deepmd/pt_expt/descriptor/repformers.py | 11 ++++++ source/api_cc/src/DeepPotPTExpt.cc | 37 +++++++++++++------ source/api_cc/src/DeepSpinPTExpt.cc | 28 +++++++++----- source/tests/infer/gen_dpa3.py | 5 ++- .../pt_expt/model/test_export_with_comm.py | 24 ++++++++---- 9 files changed, 99 insertions(+), 30 deletions(-) diff --git a/deepmd/dpmodel/descriptor/hybrid.py b/deepmd/dpmodel/descriptor/hybrid.py index 512a753d25..8a644885ca 100644 --- a/deepmd/dpmodel/descriptor/hybrid.py +++ b/deepmd/dpmodel/descriptor/hybrid.py @@ -333,7 +333,7 @@ def call( # mixed_types is True, but descrpt.mixed_types is False assert nl_distinguish_types is not None nl = nl_distinguish_types[:, :, nci] - odescriptor, gr, g2, h2, sw = descrpt( + odescriptor, gr, _g2, _h2, _sw = descrpt( coord_ext, atype_ext, nl, mapping, comm_dict=comm_dict ) out_descriptor.append(odescriptor) diff --git a/deepmd/dpmodel/descriptor/repflows.py b/deepmd/dpmodel/descriptor/repflows.py index 2dd64448b2..c3c6713aef 100644 --- a/deepmd/dpmodel/descriptor/repflows.py +++ b/deepmd/dpmodel/descriptor/repflows.py @@ -525,6 +525,11 @@ def _exchange_ghosts( del comm_dict, nall, nloc if self.use_loc_mapping: return node_ebd + if mapping_tiled is None: + raise ValueError( + "`mapping` is required when use_loc_mapping=False unless " + "`_exchange_ghosts` is overridden for parallel comm 
handling." + ) return xp_take_along_axis(node_ebd, mapping_tiled, axis=1) def call( diff --git a/deepmd/dpmodel/descriptor/repformers.py b/deepmd/dpmodel/descriptor/repformers.py index 3891c57c7d..55b4a1a342 100644 --- a/deepmd/dpmodel/descriptor/repformers.py +++ b/deepmd/dpmodel/descriptor/repformers.py @@ -498,6 +498,12 @@ def _exchange_ghosts( not None``. """ del comm_dict, nall, nloc + if mapping_tiled is None: + raise ValueError( + "`mapping` is required by the default `_exchange_ghosts` " + "implementation; pass a valid mapping or override the method " + "for parallel comm handling." + ) return xp_take_along_axis(g1, mapping_tiled, axis=1) def call( diff --git a/deepmd/pt_expt/descriptor/repflows.py b/deepmd/pt_expt/descriptor/repflows.py index efd7cba7ba..dacab9f464 100644 --- a/deepmd/pt_expt/descriptor/repflows.py +++ b/deepmd/pt_expt/descriptor/repflows.py @@ -64,6 +64,17 @@ def _exchange_ghosts( "inference requires use_loc_mapping=False so per-layer " "ghost exchange is meaningful." ) + # The squeeze(0) / unsqueeze(0) dance below assumes a single + # frame. LAMMPS always feeds nb=1 in production; refuse loudly + # if a Python caller batches frames so the mismatch surfaces + # here rather than as a malformed border_op tensor downstream. + if node_ebd.shape[0] != 1: + raise RuntimeError( + "DescrptBlockRepflows._exchange_ghosts: comm_dict path " + "only supports nf=1 (got nf=" + f"{node_ebd.shape[0]}). Multi-frame batching with " + "comm_dict is not supported." + ) has_spin = "has_spin" in comm_dict if has_spin: diff --git a/deepmd/pt_expt/descriptor/repformers.py b/deepmd/pt_expt/descriptor/repformers.py index f106a7a240..9b8ddb4a85 100644 --- a/deepmd/pt_expt/descriptor/repformers.py +++ b/deepmd/pt_expt/descriptor/repformers.py @@ -44,6 +44,17 @@ def _exchange_ghosts( nall, nloc, ) + # The squeeze(0) / unsqueeze(0) dance below assumes a single + # frame. 
LAMMPS always feeds nb=1 in production; refuse loudly + # if a Python caller batches frames so the mismatch surfaces + # here rather than as a malformed border_op tensor downstream. + if g1.shape[0] != 1: + raise RuntimeError( + "DescrptBlockRepformers._exchange_ghosts: comm_dict path " + "only supports nf=1 (got nf=" + f"{g1.shape[0]}). Multi-frame batching with comm_dict is " + "not supported." + ) has_spin = "has_spin" in comm_dict if has_spin: diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index 3f01081d42..287ee3b18f 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -164,21 +164,34 @@ void DeepPotPTExpt::init(const std::string& model, // Phase 4: load the optional with-comm artifact for multi-rank GNN // inference. Pre-Phase-3 .pt2 files lack ``has_comm_artifact``; - // default to false so old artifacts keep working. + // default to false so old artifacts keep working. If the metadata + // flag is set but the nested artifact fails to extract or compile, + // fall back to single-rank mode rather than aborting init -- the + // hard error then surfaces in ``run_model_with_comm()`` only when + // multi-rank actually needs it. has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") && metadata["has_comm_artifact"].as_bool(); if (has_comm_artifact_) { - // Extract the nested ``extra/forward_lower_with_comm.pt2`` into a - // temp file and load it as a second AOTI module. The TempFile - // unlinks the temp file on destruction. - with_comm_tempfile_ = std::make_unique( - deepmd::ptexpt::TempFile::from_zip_entry( - model, "extra/forward_lower_with_comm.pt2")); - with_comm_loader = - std::make_unique( - with_comm_tempfile_->path(), "model", false, 1, - gpu_enabled ? static_cast(gpu_id) - : static_cast(-1)); + try { + // Extract the nested ``extra/forward_lower_with_comm.pt2`` into a + // temp file and load it as a second AOTI module. 
The TempFile + // unlinks the temp file on destruction. + with_comm_tempfile_ = std::make_unique( + deepmd::ptexpt::TempFile::from_zip_entry( + model, "extra/forward_lower_with_comm.pt2")); + with_comm_loader = + std::make_unique( + with_comm_tempfile_->path(), "model", false, 1, + gpu_enabled ? static_cast(gpu_id) + : static_cast(-1)); + } catch (const std::exception& e) { + std::cerr << "DeepPotPTExpt: failed to load with-comm artifact (" + << e.what() << "); falling back to single-rank-only dispatch." + << std::endl; + with_comm_tempfile_.reset(); + with_comm_loader.reset(); + has_comm_artifact_ = false; + } } int num_intra_nthreads, num_inter_nthreads; diff --git a/source/api_cc/src/DeepSpinPTExpt.cc b/source/api_cc/src/DeepSpinPTExpt.cc index 90c518c1a5..9d4f072d2a 100644 --- a/source/api_cc/src/DeepSpinPTExpt.cc +++ b/source/api_cc/src/DeepSpinPTExpt.cc @@ -173,18 +173,28 @@ void DeepSpinPTExpt::init(const std::string& model, : static_cast(-1)); // Phase 4: load the optional with-comm artifact for multi-rank GNN - // spin inference. Mirrors DeepPotPTExpt; see its init() comment. + // spin inference. Mirrors DeepPotPTExpt; see its init() comment for + // the rationale on the try/catch fallback. has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") && metadata["has_comm_artifact"].as_bool(); if (has_comm_artifact_) { - with_comm_tempfile_ = std::make_unique( - deepmd::ptexpt::TempFile::from_zip_entry( - model, "extra/forward_lower_with_comm.pt2")); - with_comm_loader = - std::make_unique( - with_comm_tempfile_->path(), "model", false, 1, - gpu_enabled ? static_cast(gpu_id) - : static_cast(-1)); + try { + with_comm_tempfile_ = std::make_unique( + deepmd::ptexpt::TempFile::from_zip_entry( + model, "extra/forward_lower_with_comm.pt2")); + with_comm_loader = + std::make_unique( + with_comm_tempfile_->path(), "model", false, 1, + gpu_enabled ? 
static_cast(gpu_id) + : static_cast(-1)); + } catch (const std::exception& e) { + std::cerr << "DeepSpinPTExpt: failed to load with-comm artifact (" + << e.what() << "); falling back to single-rank-only dispatch." + << std::endl; + with_comm_tempfile_.reset(); + with_comm_loader.reset(); + has_comm_artifact_ = false; + } } int num_intra_nthreads, num_inter_nthreads; diff --git a/source/tests/infer/gen_dpa3.py b/source/tests/infer/gen_dpa3.py index ffe3126eb2..20304afcf2 100644 --- a/source/tests/infer/gen_dpa3.py +++ b/source/tests/infer/gen_dpa3.py @@ -94,7 +94,10 @@ def main(): # source/lmp/tests/test_lammps_dpa3_pt2.py::test_pair_deepmd_mpi_dpa3. config_mpi = copy.deepcopy(config) config_mpi["descriptor"]["use_loc_mapping"] = False - model_mpi = get_model(config_mpi) + # Defensive deep copy: get_model is allowed to mutate its argument + # in place, and we still need ``config_mpi`` intact below for + # ``model_def_script``. + model_mpi = get_model(copy.deepcopy(config_mpi)) data_mpi = { "model": model_mpi.serialize(), "model_def_script": config_mpi, diff --git a/source/tests/pt_expt/model/test_export_with_comm.py b/source/tests/pt_expt/model/test_export_with_comm.py index 24c27310ee..f905f409bc 100644 --- a/source/tests/pt_expt/model/test_export_with_comm.py +++ b/source/tests/pt_expt/model/test_export_with_comm.py @@ -79,16 +79,25 @@ def _build_self_comm_inputs( sendlist_indices: np.ndarray, keepalive: list, ) -> tuple[torch.Tensor, ...]: - """Build runtime comm tensors for a single-rank self-send.""" + """Build runtime comm tensors for a single-rank self-send. + + Clamps the swap count to ``max(1, nghost)`` to mirror the trace-time + helper in ``serialization.py::_make_comm_sample_inputs``; that + avoids an empty sendlist pointer when a caller happens to construct + a fixture with no ghost atoms. 
+ """ + send_count = max(1, nghost) sendlist_indices = np.ascontiguousarray(sendlist_indices, dtype=np.int32) + if sendlist_indices.size == 0: + sendlist_indices = np.zeros(send_count, dtype=np.int32) keepalive.append(sendlist_indices) nswap = 1 addr = _addr_of(sendlist_indices) send_list = torch.tensor([addr], dtype=torch.int64) send_proc = torch.zeros(nswap, dtype=torch.int32) recv_proc = torch.zeros(nswap, dtype=torch.int32) - send_num = torch.tensor([nghost], dtype=torch.int32) - recv_num = torch.tensor([nghost], dtype=torch.int32) + send_num = torch.tensor([send_count], dtype=torch.int32) + recv_num = torch.tensor([send_count], dtype=torch.int32) communicator = torch.zeros(1, dtype=torch.int64) nlocal_ts = torch.tensor(nloc, dtype=torch.int32) nghost_ts = torch.tensor(nghost, dtype=torch.int32) @@ -120,11 +129,12 @@ def test_pt2_dual_artifact_for_gnn(tmp_path) -> None: deserialize_to_file(pt2_path, data) assert os.path.exists(pt2_path) - # 1. ZIP layout sanity + # 1. ZIP layout sanity. PyTorch 2.11 strict layout puts our sidecars + # under ``model/extra/`` (PT2_EXTRA_PREFIX); see serialization.py. with zipfile.ZipFile(pt2_path, "r") as zf: names = set(zf.namelist()) - meta = json.loads(zf.read("extra/metadata.json").decode("utf-8")) - assert "extra/forward_lower_with_comm.pt2" in names, ( + meta = json.loads(zf.read("model/extra/metadata.json").decode("utf-8")) + assert "model/extra/forward_lower_with_comm.pt2" in names, ( f"with-comm artifact missing; names={sorted(names)}" ) assert meta["has_message_passing"] is True @@ -141,7 +151,7 @@ def test_pt2_dual_artifact_for_gnn(tmp_path) -> None: wc_path = os.path.join(td, "fl_wc.pt2") with zipfile.ZipFile(pt2_path, "r") as zf: with open(wc_path, "wb") as f: - f.write(zf.read("extra/forward_lower_with_comm.pt2")) + f.write(zf.read("model/extra/forward_lower_with_comm.pt2")) with_comm = aoti_load_package(wc_path) # 3. 
Run both artifacts with nframes=1 (matches what the with-comm From a429fc99e2f283a18a1032610a1f0f8c55b9c555 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sat, 2 May 2026 23:41:51 +0800 Subject: [PATCH 27/34] refactor: replace _has_message_passing hack with descriptor API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes the private-attribute fishing in ``deepmd/pt_expt/utils/serialization.py`` (which read ``descriptor.repflows.use_loc_mapping`` and friends) and replaces it with a public method on ``BaseDescriptor``: ``has_message_passing_across_ranks()``. Why --- The old helper conflated two questions: 1. "Is this a GNN-style descriptor?" (existing ``has_message_passing()``) 2. "Do per-layer node embeddings need MPI exchange across rank boundaries to be correct under multi-rank LAMMPS?" Only #2 governs whether to compile a with-comm AOTI artifact. The old function answered #2 by special-casing the ``repflows``/``repformers`` attribute names and ``use_loc_mapping`` flag — silent breakage on any rename and never recursing into hybrid wrappers (Tier-1 #1 in the gnn_mpi_untested_paths catalog). Note: every LAMMPS pair_style already exchanges ghost-atom *coords and forces* via the standard pair-style comm topology — that's not GNN-specific. The new method asks specifically about per-layer atomic feature exchange (the ``node_ebd`` tensor that flows between message-passing layers), which is the actual concern that gates the with-comm artifact. How --- ``BaseDescriptor.has_message_passing_across_ranks()`` returns ``False`` by default. 
GNN paths override: - ``DescrptBlockRepflows``: ``not self.use_loc_mapping`` - ``DescrptBlockRepformers``: ``True`` (no ``use_loc_mapping`` opt-out exists) - ``DescrptDPA3`` / ``DescrptDPA2``: delegate to their block - ``DescrptHybrid``: ``any(child.has_message_passing_across_ranks() ...)`` (closes the structural side of catalog Tier-1 #1) Non-GNN dpmodel descriptors (``se_e2_a``, ``se_r``, ``se_t``, ``se_t_tebd``, ``dpa1``) get explicit ``return False`` overrides pinning the contract; pt and pd backend descriptors inherit the default (no edits needed there). The serialization helper ``_has_message_passing`` is renamed to ``_needs_with_comm_artifact`` and just calls ``descriptor.has_message_passing_across_ranks()``. The metadata key ``has_message_passing`` is dropped from the .pt2 archive (C++ readers only consume ``has_comm_artifact``). Per-descriptor tests -------------------- The standalone ``source/tests/pt_expt/utils/test_has_message_passing.py`` is deleted; per-descriptor coverage of *both* APIs is added to existing descriptor test files at ``source/tests/pt_expt/descriptor/``: | File | has_message_passing | has_message_passing_across_ranks | |--------------|---------------------|----------------------------------| | se_e2_a | False | False | | dpa1 | False | False | | dpa3 | True | not use_loc_mapping | | dpa2 | True | True | | hybrid | depends on child | True if any child needs it | Bonus: also includes a CUDA segfault fix ---------------------------------------- While running the post-refactor verification, the CUDA-runner CI exposed a latent bug in ``source/op/pt/comm.cc`` (forward + backward kernels): when built with ``USE_MPI`` but invoked single-rank (world_size==0), ``cuda_aware`` defaults to 0 and the CPU-fallback ``recv_g1_tensor.to(kCPU)`` block (guarded by ``world_size >= 1``) is skipped — the tensor stays on CUDA. The inner self-send branch then did host ``memcpy`` on what were still CUDA pointers and segfaulted. 
Fix: gate the host-memcpy / CPU-copy-back paths on ``world_size >= 1 && cuda_aware == 0`` so single-rank deployments correctly use ``gpuMemcpy DeviceToDevice``. Mirrored in four sites (forward inner, forward post-loop, backward inner, backward post-loop). Float32 multi-rank fixture + test --------------------------------- Adds ``test_lammps_dpa3_pt2_fp32.py`` and a paired ``deeppot_dpa3_mpi_fp32.pt2`` fixture (gen_dpa3.py addition). Validates that the comm_dict path is dtype-agnostic in practice (template dispatch on ``g1.dtype()``, ``register_fake``'s ``empty_like(g1)``, and ``MPI_FLOAT`` exchange) — not just by inspection. Compares mpi-2 vs same-archive mpi-1 with float32-appropriate tolerances (atol 1e-4 for force and 5e-4 for virial, both with rel 1e-3; rel 1e-5 for energy). Verified locally (CPU build): pt_expt python 965 passed / 32 skipped, ctest 3/3 (498 C++ tests), LAMMPS multi-rank 20/20 (DPA3 + DPA2 + spin DPA3 + DPA3 fp32). Trade-off note -------------- The plan called for ``has_message_passing_across_ranks()`` to be abstract on ``BaseDescriptor`` (mirroring ``has_message_passing``). Implementing that requires touching all 49 subclasses across pt and pd backends — well outside the scope of "GNN MPI for pt_expt". Kept the method concrete with a ``return False`` default; pt and pd backend descriptors inherit that. They can override later if they grow a multi-rank GNN path of their own. 
--- deepmd/dpmodel/descriptor/dpa1.py | 8 + deepmd/dpmodel/descriptor/dpa2.py | 10 + deepmd/dpmodel/descriptor/dpa3.py | 11 + deepmd/dpmodel/descriptor/hybrid.py | 10 + .../descriptor/make_base_descriptor.py | 18 ++ deepmd/dpmodel/descriptor/repflows.py | 10 + deepmd/dpmodel/descriptor/repformers.py | 9 + deepmd/dpmodel/descriptor/se_e2_a.py | 4 + deepmd/dpmodel/descriptor/se_r.py | 4 + deepmd/dpmodel/descriptor/se_t.py | 4 + deepmd/dpmodel/descriptor/se_t_tebd.py | 4 + deepmd/pt_expt/utils/serialization.py | 97 ++++---- source/lmp/tests/test_lammps_dpa3_pt2_fp32.py | 163 +++++++++++++ source/op/pt/comm.cc | 25 +- source/tests/infer/gen_dpa2.py | 5 +- source/tests/infer/gen_dpa3.py | 24 ++ source/tests/pt_expt/descriptor/test_dpa1.py | 37 +++ source/tests/pt_expt/descriptor/test_dpa2.py | 57 +++++ source/tests/pt_expt/descriptor/test_dpa3.py | 40 +++ .../tests/pt_expt/descriptor/test_hybrid.py | 70 ++++++ .../tests/pt_expt/descriptor/test_se_e2_a.py | 35 +++ .../pt_expt/model/test_export_with_comm.py | 35 +-- .../pt_expt/utils/test_has_message_passing.py | 229 ------------------ 23 files changed, 603 insertions(+), 306 deletions(-) create mode 100644 source/lmp/tests/test_lammps_dpa3_pt2_fp32.py delete mode 100644 source/tests/pt_expt/utils/test_has_message_passing.py diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py index 9d138f422a..04d0420009 100644 --- a/deepmd/dpmodel/descriptor/dpa1.py +++ b/deepmd/dpmodel/descriptor/dpa1.py @@ -397,6 +397,14 @@ def has_message_passing(self) -> bool: """Returns whether the descriptor has message passing.""" return self.se_atten.has_message_passing() + def has_message_passing_across_ranks(self) -> bool: + """Returns whether per-layer node embeddings need MPI ghost exchange. + + DPA1 (se_atten) is single-layer and does not exchange features + across ranks; same as the base se_e2_a path. 
+ """ + return False + def need_sorted_nlist_for_lower(self) -> bool: """Returns whether the descriptor needs sorted nlist when using `forward_lower`.""" return self.se_atten.need_sorted_nlist_for_lower() diff --git a/deepmd/dpmodel/descriptor/dpa2.py b/deepmd/dpmodel/descriptor/dpa2.py index 851422cce0..e530398ca6 100644 --- a/deepmd/dpmodel/descriptor/dpa2.py +++ b/deepmd/dpmodel/descriptor/dpa2.py @@ -687,6 +687,16 @@ def has_message_passing(self) -> bool: [self.repinit.has_message_passing(), self.repformers.has_message_passing()] ) + def has_message_passing_across_ranks(self) -> bool: + """Returns whether per-layer node embeddings need MPI ghost exchange. + + DPA2's repformers always passes ``g1`` in ``[nb, nall, n_dim]`` + layout (no ``use_loc_mapping`` opt-out exists at the block level), + so multi-rank deployment always needs cross-rank exchange of + per-atom features between layers. + """ + return self.repformers.has_message_passing_across_ranks() + def need_sorted_nlist_for_lower(self) -> bool: """Returns whether the descriptor needs sorted nlist when using `forward_lower`.""" return True diff --git a/deepmd/dpmodel/descriptor/dpa3.py b/deepmd/dpmodel/descriptor/dpa3.py index 07d5481a91..c1d9531357 100644 --- a/deepmd/dpmodel/descriptor/dpa3.py +++ b/deepmd/dpmodel/descriptor/dpa3.py @@ -527,6 +527,17 @@ def has_message_passing(self) -> bool: """Returns whether the descriptor has message passing.""" return self.repflows.has_message_passing() + def has_message_passing_across_ranks(self) -> bool: + """Returns whether per-layer node embeddings need MPI ghost exchange. + + Delegates to repflows: ``False`` when ``use_loc_mapping=True`` + (per-layer messages stay within each rank's local atoms), + ``True`` when ``use_loc_mapping=False`` (ghost slots in + ``[nb, nall, n_dim]`` layout must be filled by cross-rank + exchange before each layer). 
+ """ + return self.repflows.has_message_passing_across_ranks() + def need_sorted_nlist_for_lower(self) -> bool: """Returns whether the descriptor needs sorted nlist when using `forward_lower`.""" return True diff --git a/deepmd/dpmodel/descriptor/hybrid.py b/deepmd/dpmodel/descriptor/hybrid.py index 8a644885ca..a51220c5e2 100644 --- a/deepmd/dpmodel/descriptor/hybrid.py +++ b/deepmd/dpmodel/descriptor/hybrid.py @@ -168,6 +168,16 @@ def has_message_passing(self) -> bool: """Returns whether the descriptor has message passing.""" return any(descrpt.has_message_passing() for descrpt in self.descrpt_list) + def has_message_passing_across_ranks(self) -> bool: + """Returns whether per-layer node embeddings need MPI ghost exchange. + + ``True`` if any child descriptor needs cross-rank message passing + (e.g. a hybrid wrapping a DPA3 with ``use_loc_mapping=False``). + """ + return any( + descrpt.has_message_passing_across_ranks() for descrpt in self.descrpt_list + ) + def need_sorted_nlist_for_lower(self) -> bool: """Returns whether the descriptor needs sorted nlist when using `forward_lower`.""" return True diff --git a/deepmd/dpmodel/descriptor/make_base_descriptor.py b/deepmd/dpmodel/descriptor/make_base_descriptor.py index 47245898ce..8184b4e42a 100644 --- a/deepmd/dpmodel/descriptor/make_base_descriptor.py +++ b/deepmd/dpmodel/descriptor/make_base_descriptor.py @@ -107,6 +107,24 @@ def mixed_types(self) -> bool: def has_message_passing(self) -> bool: """Returns whether the descriptor has message passing.""" + def has_message_passing_across_ranks(self) -> bool: + """Returns whether the descriptor's message passing extends across rank + boundaries — i.e. whether it requires cross-rank exchange of intermediate + atomic features (per-layer node embeddings) during the forward pass. + + Distinct from generic ghost-coord/force exchange that every LAMMPS + pair_style does. 
This question gates whether the pt_expt backend + compiles a second "with-comm" AOTI artifact for multi-rank deployment. + + Concrete default ``False`` (non-GNN behavior) so pt and pd backend + descriptors that subclass ``BaseDescriptor`` directly do not have + to implement this method until they grow a multi-rank GNN path of + their own. GNN descriptors that need MPI ghost-feature exchange + (DPA2, DPA3 with ``use_loc_mapping=False``, hybrids wrapping such + children) override to return ``True``. + """ + return False + @abstractmethod def need_sorted_nlist_for_lower(self) -> bool: """Returns whether the descriptor needs sorted nlist when using `forward_lower`.""" diff --git a/deepmd/dpmodel/descriptor/repflows.py b/deepmd/dpmodel/descriptor/repflows.py index c3c6713aef..bc94b877ea 100644 --- a/deepmd/dpmodel/descriptor/repflows.py +++ b/deepmd/dpmodel/descriptor/repflows.py @@ -732,6 +732,16 @@ def has_message_passing(self) -> bool: """Returns whether the descriptor block has message passing.""" return True + def has_message_passing_across_ranks(self) -> bool: + """Returns whether per-layer node embeddings need MPI ghost exchange. + + Repflows passes ``node_ebd`` either in ``[nb, nloc, n_dim]`` layout + (``use_loc_mapping=True``: messages stay within the rank's local atoms) + or ``[nb, nall, n_dim]`` layout (``use_loc_mapping=False``: ghost slots + must be filled by cross-rank exchange before each layer). 
+ """ + return not self.use_loc_mapping + def need_sorted_nlist_for_lower(self) -> bool: """Returns whether the descriptor block needs sorted nlist when using `forward_lower`.""" return True diff --git a/deepmd/dpmodel/descriptor/repformers.py b/deepmd/dpmodel/descriptor/repformers.py index 55b4a1a342..799ab0c3c3 100644 --- a/deepmd/dpmodel/descriptor/repformers.py +++ b/deepmd/dpmodel/descriptor/repformers.py @@ -600,6 +600,15 @@ def has_message_passing(self) -> bool: """Returns whether the descriptor block has message passing.""" return True + def has_message_passing_across_ranks(self) -> bool: + """Returns whether per-layer g1 needs MPI ghost exchange. + + Repformers has no ``use_loc_mapping`` opt-out; it always passes + ``g1`` in ``[nb, nall, n_dim]`` layout, so multi-rank always needs + cross-rank exchange of the per-atom feature tensor. + """ + return True + def need_sorted_nlist_for_lower(self) -> bool: """Returns whether the descriptor block needs sorted nlist when using `forward_lower`.""" return False diff --git a/deepmd/dpmodel/descriptor/se_e2_a.py b/deepmd/dpmodel/descriptor/se_e2_a.py index 6c20699c23..f72b6f75e8 100644 --- a/deepmd/dpmodel/descriptor/se_e2_a.py +++ b/deepmd/dpmodel/descriptor/se_e2_a.py @@ -278,6 +278,10 @@ def has_message_passing(self) -> bool: """Returns whether the descriptor has message passing.""" return False + def has_message_passing_across_ranks(self) -> bool: + """Returns whether per-layer node embeddings need MPI ghost exchange.""" + return False + def need_sorted_nlist_for_lower(self) -> bool: """Returns whether the descriptor needs sorted nlist when using `forward_lower`.""" return False diff --git a/deepmd/dpmodel/descriptor/se_r.py b/deepmd/dpmodel/descriptor/se_r.py index 55a774bb71..6846710735 100644 --- a/deepmd/dpmodel/descriptor/se_r.py +++ b/deepmd/dpmodel/descriptor/se_r.py @@ -257,6 +257,10 @@ def has_message_passing(self) -> bool: """Returns whether the descriptor has message passing.""" return False + def 
has_message_passing_across_ranks(self) -> bool: + """Returns whether per-layer node embeddings need MPI ghost exchange.""" + return False + def need_sorted_nlist_for_lower(self) -> bool: """Returns whether the descriptor needs sorted nlist when using `forward_lower`.""" return False diff --git a/deepmd/dpmodel/descriptor/se_t.py b/deepmd/dpmodel/descriptor/se_t.py index 38eb7cc16c..2d61736235 100644 --- a/deepmd/dpmodel/descriptor/se_t.py +++ b/deepmd/dpmodel/descriptor/se_t.py @@ -249,6 +249,10 @@ def has_message_passing(self) -> bool: """Returns whether the descriptor has message passing.""" return False + def has_message_passing_across_ranks(self) -> bool: + """Returns whether per-layer node embeddings need MPI ghost exchange.""" + return False + def need_sorted_nlist_for_lower(self) -> bool: """Returns whether the descriptor needs sorted nlist when using `forward_lower`.""" return False diff --git a/deepmd/dpmodel/descriptor/se_t_tebd.py b/deepmd/dpmodel/descriptor/se_t_tebd.py index 445260b861..2f6e749e19 100644 --- a/deepmd/dpmodel/descriptor/se_t_tebd.py +++ b/deepmd/dpmodel/descriptor/se_t_tebd.py @@ -255,6 +255,10 @@ def has_message_passing(self) -> bool: """Returns whether the descriptor has message passing.""" return self.se_ttebd.has_message_passing() + def has_message_passing_across_ranks(self) -> bool: + """Returns whether per-layer node embeddings need MPI ghost exchange.""" + return self.se_ttebd.has_message_passing_across_ranks() + def need_sorted_nlist_for_lower(self) -> bool: """Returns whether the descriptor needs sorted nlist when using `forward_lower`.""" return self.se_ttebd.need_sorted_nlist_for_lower() diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index abd40662d4..d85a334493 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -98,45 +98,29 @@ def _json_to_numpy(model_obj: dict) -> dict: ) -def _has_message_passing(model: torch.nn.Module) -> bool: - 
"""Detect whether a model's descriptor uses GNN-style message passing. - - GNN descriptors (DPA2 with repformers, DPA3 with repflows) require - a per-layer ghost-atom MPI exchange when running multi-rank LAMMPS, - which means a separate ``with-comm`` AOTInductor artifact must be - compiled. Non-GNN descriptors (se_e2_a, se_r, se_t, se_t_tebd, - DPA1, hybrid-of-non-GNN) need only the regular artifact. - - Additional gate: ``use_loc_mapping=True`` GNN models (the default - for DPA3) keep nlist in local-only indexing, so per-layer ghost - exchange is meaningless — these get only the regular artifact. - Multi-rank LAMMPS for GNN requires use_loc_mapping=False. - - Returns False if the descriptor's ``has_message_passing()`` query - cannot be answered (e.g. linear/zbl/frozen models without a single - descriptor) — those are assumed local. +def _needs_with_comm_artifact(model: torch.nn.Module) -> bool: + """Return ``True`` if the model needs a "with-comm" AOTI artifact compiled. + + The with-comm artifact carries the per-layer ``deepmd_export::border_op`` + calls that exchange node-embedding tensors across MPI ranks. Multi-rank + LAMMPS dispatches to it when the descriptor's message passing extends + across rank boundaries (i.e. layers consume neighbour features that + live on a different rank). Non-GNN descriptors and GNN descriptors with + ``use_loc_mapping=True`` keep all per-layer messaging local to each + rank's owned atoms; they need only the regular artifact. + + Delegates to ``descriptor.has_message_passing_across_ranks()``, which + descriptor classes implement explicitly. Returns ``False`` defensively + when the model has no single descriptor (linear/zbl/frozen) or when + the method is somehow missing or raises. 
""" - try: - descriptor = model.atomic_model.descriptor - except AttributeError: - return False - if not hasattr(descriptor, "has_message_passing"): + desc = getattr(getattr(model, "atomic_model", None), "descriptor", None) + if desc is None or not hasattr(desc, "has_message_passing_across_ranks"): return False try: - if not descriptor.has_message_passing(): - return False + return bool(desc.has_message_passing_across_ranks()) except (AttributeError, NotImplementedError): return False - # Walk into the GNN block (repflows / repformers) to inspect - # ``use_loc_mapping``. The attribute lives on the block, not on the - # top-level descriptor wrapper. - for attr in ("repflows", "repformers"): - block = getattr(descriptor, attr, None) - if block is None: - continue - if getattr(block, "use_loc_mapping", False): - return False - return True # Module-level cache for the trace-time sendlist buffer. The pointer @@ -454,11 +438,10 @@ def _collect_metadata(model: torch.nn.Module, is_spin: bool = False) -> dict: if is_spin: meta["ntypes_spin"] = model.spin.get_ntypes_spin() meta["use_spin"] = [bool(v) for v in model.spin.use_spin] - # Record whether the model uses GNN-style message passing. When - # True, .pt2 deserialization compiles a second ``with-comm`` artifact - # so multi-rank LAMMPS can drive ghost-atom MPI exchange through - # the model. C++ DeepPotPTExpt branches on this flag at load time. - meta["has_message_passing"] = _has_message_passing(model) + # Whether multi-rank LAMMPS needs a second "with-comm" AOTI artifact + # (per-layer ghost-feature MPI exchange via deepmd_export::border_op). + # The C++ DeepPotPTExpt / DeepSpinPTExpt loaders branch on this flag. + meta["has_comm_artifact"] = _needs_with_comm_artifact(model) return meta @@ -588,7 +571,8 @@ def _trace_and_export( ``send_proc``, ``recv_proc``, ``send_num``, ``recv_num``, ``communicator``, ``nlocal``, ``nghost``) used by the pt_expt Repflow/Repformer override to drive MPI ghost-atom exchange. 
- Only valid for GNN models (see ``_has_message_passing``). + Only valid for models that need cross-rank ghost-feature exchange + (see ``_needs_with_comm_artifact``). do_atomic_virial If True, the traced graph computes per-atom virial (extra autograd.grad backward passes); off by default to keep .pt2 @@ -686,10 +670,12 @@ def _trace_and_export( # matter for tracing — only that they're valid tensors of the right # shape and dtype. See ``_make_comm_sample_inputs``. if with_comm_dict: - if not metadata.get("has_message_passing"): + if not _needs_with_comm_artifact(model): raise ValueError( - "with_comm_dict=True requested but model has no GNN " - "message-passing descriptor — there's nothing to compile." + "with_comm_dict=True requested but the model's descriptor " + "does not need cross-rank message passing " + "(has_message_passing_across_ranks() is False) — " + "there's nothing to compile." ) nloc_sample = nlist_t.shape[1] nall_sample = ext_atype.shape[1] @@ -847,21 +833,22 @@ def _deserialize_to_file_pt2( program into a .pt2 package (ZIP archive with compiled shared libraries), then embeds metadata into the archive. - For GNN models (descriptor.has_message_passing() is True), compiles - a SECOND ``with-comm`` artifact and packs it alongside the regular - one. The ``with-comm`` variant accepts comm-dict tensors as + For models whose descriptor reports + ``has_message_passing_across_ranks() == True`` (DPA2, DPA3 with + ``use_loc_mapping=False``, or hybrids wrapping such children), + compiles a SECOND ``with-comm`` artifact and packs it alongside the + regular one. The ``with-comm`` variant accepts comm-dict tensors as additional positional inputs and drives MPI ghost-atom exchange via - ``deepmd_export::border_op``. The C++ ``DeepPotPTExpt`` loader picks + ``deepmd_export::border_op``. The C++ ``DeepPotPTExpt`` loader picks the artifact based on the LAMMPS rank count at runtime. 
Layout inside the .pt2 ZIP (PyTorch 2.11 strict layout): regular → artifact at ``model/`` (AOTInductor's own layout) with-comm → ``model/extra/forward_lower_with_comm.pt2`` (nested ZIP) metadata → ``model/extra/metadata.json`` with - ``has_message_passing`` and ``has_comm_artifact`` - flags. The C++ reader matches by ``/``-delimited - suffix so the legacy root-level ``extra/`` layout - still loads. + ``has_comm_artifact`` flag. The C++ reader matches + by ``/``-delimited suffix so the legacy root-level + ``extra/`` layout still loads. Old .pt2 files (pre-this-change) lack ``has_comm_artifact`` so the C++ loader must default to ``False`` when the field is missing. @@ -903,9 +890,11 @@ def _deserialize_to_file_pt2( finally: _inductor_config.realize_opcount_threshold = saved_threshold - # Second artifact: with-comm. Only for GNN models. - has_comm_artifact = bool(metadata.get("has_message_passing")) - metadata["has_comm_artifact"] = has_comm_artifact + # Second artifact: with-comm. Only for descriptors whose message + # passing extends across rank boundaries. The flag was computed + # from the model in ``_collect_metadata`` and is already in + # ``metadata`` here. + has_comm_artifact = bool(metadata.get("has_comm_artifact")) with_comm_bytes: bytes | None = None with_comm_output_keys: list[str] | None = None if has_comm_artifact: diff --git a/source/lmp/tests/test_lammps_dpa3_pt2_fp32.py b/source/lmp/tests/test_lammps_dpa3_pt2_fp32.py new file mode 100644 index 0000000000..1f8eed2512 --- /dev/null +++ b/source/lmp/tests/test_lammps_dpa3_pt2_fp32.py @@ -0,0 +1,163 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Float32 multi-rank LAMMPS test for DPA3 GNN .pt2. + +The float64 multi-rank test in ``test_lammps_dpa3_pt2.py`` validates the +comm_dict path against a same-archive single-rank reference (atol 1e-8). 
+This file does the same thing for the float32 variant of the fixture +(``deeppot_dpa3_mpi_fp32.pt2``) — the model and trace are byte-identical +in every respect except ``descriptor.precision``/``fitting_net.precision`` +being set to ``float32``. + +Why a separate test file: + 1. The fp32 fixture is not packaged into ``deeppot_dpa3_mpi.pt2``; + it is a sibling artifact produced by the same gen script. + 2. fp32 needs looser tolerances. The C++ ``border_op`` kernel's + ``forward_t`` template path (chosen automatically via + ``g1.dtype()`` dispatch in ``source/op/pt/comm.cc``) loses ~7 + decimal digits of precision relative to the ``forward_t`` + path. Single-precision GEMM in the AOTI-compiled kernel adds + further drift. + +What this file validates that the float64 test does not: + * ``border_op`` template dispatch on ``g1.dtype() == kFloat`` (vs + ``kDouble``) actually fires under MPI. + * ``register_fake`` returns ``torch.empty_like(g1)`` so the FX trace + preserves float32 dtype through the opaque op. + * ``register_autograd``'s ``border_op_backward`` invocation also + runs under float32, returning float32 gradients. + * MPI exchange uses ``MPI_FLOAT`` (vs ``MPI_DOUBLE``), halving the + bandwidth per ghost atom — relevant for slow interconnects. + +This is a regression-only test for the comm path. It does not pin any +hardcoded numerical values; mpi-2 must agree with same-archive mpi-1 +within float32 tolerances. +""" + +from __future__ import ( + annotations, +) + +import importlib.util +import os +import shutil +import subprocess as sp +import sys +import tempfile +from pathlib import ( + Path, +) + +import numpy as np +import pytest +from write_lmp_data import ( + write_lmp_data, +) + +pb_file_mpi_fp32 = ( + Path(__file__).parent.parent.parent + / "tests" + / "infer" + / "deeppot_dpa3_mpi_fp32.pt2" +) +data_file = Path(__file__).parent / "data_dpa3_pt2_fp32.lmp" + +# Same 6-atom O-H system as the float64 test. 
``processors 2 1 1`` +# splits at x=6.5 -> 3 atoms per rank. +box = np.array([0, 13, 0, 13, 0, 13, 0, 0, 0]) +coord = np.array( + [ + [12.83, 2.56, 2.18], + [12.09, 2.87, 2.74], + [0.25, 3.32, 1.68], + [3.36, 3.00, 1.81], + [3.51, 2.51, 2.60], + [4.27, 3.22, 1.56], + ] +) +type_OH = np.array([1, 2, 2, 1, 2, 2]) + + +def setup_module() -> None: + if os.environ.get("ENABLE_PYTORCH", "1") != "1": + pytest.skip("Skip test because PyTorch support is not enabled.") + write_lmp_data(box, coord, type_OH, data_file) + + +def teardown_module() -> None: + if data_file.exists(): + os.remove(data_file) + + +def _run_mpi_subprocess( + nprocs: int, + processors: str | None = None, +) -> dict: + """Run ``run_mpi_pair_deepmd_dpa3_pt2.py`` against the fp32 archive. + + Returns ``{"pe", "forces", "virials"}`` parsed from the runner's + output file. + """ + with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f: + out_path = f.name + try: + argv = [ + "mpirun", + "-n", + str(nprocs), + sys.executable, + str(Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"), + str(data_file.resolve()), + str(pb_file_mpi_fp32.resolve()), + out_path, + ] + if processors is not None: + argv.extend(["--processors", processors]) + elif nprocs == 1: + argv.extend(["--processors", "1 1 1"]) + sp.check_call(argv) + with open(out_path) as fh: + lines = fh.read().strip().splitlines() + pe = float(lines[0]) + rows = np.array( + [list(map(float, line.split())) for line in lines[1:]], + dtype=np.float64, + ) + forces = rows[:, :3] + virials = rows[:, 3:] + return {"pe": pe, "forces": forces, "virials": virials} + finally: + if os.path.exists(out_path): + os.remove(out_path) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa3_fp32() -> None: + """Float32 DPA3 multi-rank must match same-archive 
single-rank. + + Tolerances follow standard float32 expectations: + * energy: ``rel=1e-5`` (~7 decimal digits, with mantissa noise) + * force: ``atol=1e-4`` absolute (force magnitudes are O(1e-1) for + this system, so ``rel=1e-3``) + * virial: ``atol=5e-4`` per component + + Single-rank uses the regular artifact (nswap=0); multi-rank uses + the with-comm artifact -- so any divergence beyond float32 noise + is necessarily in the multi-rank dispatch (border_op template + dispatch, MPI_FLOAT exchange, register_fake/register_autograd + dtype handling). + """ + out_mpi = _run_mpi_subprocess(nprocs=2) + out_ref = _run_mpi_subprocess(nprocs=1) + + assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-5, abs=1e-7) + np.testing.assert_allclose( + out_mpi["forces"], out_ref["forces"], atol=1e-4, rtol=1e-3 + ) + np.testing.assert_allclose( + out_mpi["virials"], out_ref["virials"], atol=5e-4, rtol=1e-3 + ) diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc index 3bb7516155..6eb49624ec 100644 --- a/source/op/pt/comm.cc +++ b/source/op/pt/comm.cc @@ -140,7 +140,12 @@ class Border : public torch::autograd::Function { #endif #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) #ifdef USE_MPI - if (cuda_aware == 0) { + // The CPU-fallback ``recv_g1_tensor.to(kCPU)`` above only runs + // when ``world_size >= 1`` (MPI initialized). With no MPI + // (single-rank, world_size == 0) the tensor is still on CUDA, + // so memcpy on CUDA pointers would segfault — gpuMemcpy is + // correct in that case regardless of ``cuda_aware``. + if (world_size >= 1 && cuda_aware == 0) { memcpy(recv_g1, send_g1, (unsigned long)nsend * tensor_size * sizeof(FPTYPE)); } else { @@ -164,7 +169,10 @@ class Border : public torch::autograd::Function { } #ifdef USE_MPI #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) - if (cuda_aware == 0) { + // Only copy back when ``recv_g1_tensor`` was moved to CPU above + // (world_size >= 1 && cuda_aware == 0). 
With world_size == 0 the + // tensor is still aliased to g1 — no copy needed. + if (world_size >= 1 && cuda_aware == 0) { g1.copy_(recv_g1_tensor); } #endif @@ -305,7 +313,10 @@ class Border : public torch::autograd::Function { if (nrecv) { #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) #ifdef USE_MPI - if (cuda_aware == 0) { + // See forward kernel: when world_size==0 the data stays on + // CUDA, so memcpy on device pointers segfaults. Only use + // host memcpy when we explicitly moved data to CPU above. + if (world_size >= 1 && cuda_aware == 0) { memcpy(recv_g1, send_g1, (unsigned long)nrecv * tensor_size * sizeof(FPTYPE)); } else { @@ -333,9 +344,11 @@ class Border : public torch::autograd::Function { } #ifdef USE_MPI #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) - if (cuda_aware == 0) { - // Move result back to the device of the input grad. This replaces - // the original in-place copy_ into grad_output[0]. + // Move result back to the device of the input grad only when + // ``d_local_g1_tensor`` was moved to CPU above (world_size >= 1 + // && cuda_aware == 0). With world_size == 0 the tensor stayed on + // its original device — no move needed. + if (world_size >= 1 && cuda_aware == 0) { d_local_g1_tensor = d_local_g1_tensor.to(grad_g1.device()); } #endif diff --git a/source/tests/infer/gen_dpa2.py b/source/tests/infer/gen_dpa2.py index 5aff706aab..e640514ee3 100644 --- a/source/tests/infer/gen_dpa2.py +++ b/source/tests/infer/gen_dpa2.py @@ -110,8 +110,9 @@ def main(): print(f"Exporting to {pt2_path} ...") # noqa: T201 # DPA2's repformer block has no ``use_loc_mapping`` knob (unlike # DPA3), so a single .pt2 already carries the dual-artifact layout - # (regular + with-comm) — _has_message_passing returns True and the - # serializer produces both. No separate _mpi.pt2 needed. + # (regular + with-comm) — ``has_message_passing_across_ranks`` + # returns True and the serializer produces both. No separate _mpi.pt2 + # needed. 
pt_expt_deserialize_to_file(pt2_path, copy.deepcopy(data), do_atomic_virial=True) pth_path = os.path.join(base_dir, "deeppot_dpa2.pth") diff --git a/source/tests/infer/gen_dpa3.py b/source/tests/infer/gen_dpa3.py index 20304afcf2..1bfe0f9c65 100644 --- a/source/tests/infer/gen_dpa3.py +++ b/source/tests/infer/gen_dpa3.py @@ -111,6 +111,30 @@ def main(): pt2_mpi_path, copy.deepcopy(data_mpi), do_atomic_virial=True ) + # Float32 multi-rank variant — same architecture as the float64 + # MPI fixture but with ``precision: float32``. Used by + # source/lmp/tests/test_lammps_dpa3_pt2_fp32.py to validate that + # the comm_dict path (border_op + register_fake/register_autograd) + # is dtype-agnostic in practice, not just by inspection. + config_mpi_fp32 = copy.deepcopy(config_mpi) + config_mpi_fp32["descriptor"]["precision"] = "float32" + config_mpi_fp32["fitting_net"]["precision"] = "float32" + model_mpi_fp32 = get_model(copy.deepcopy(config_mpi_fp32)) + data_mpi_fp32 = { + "model": model_mpi_fp32.serialize(), + "model_def_script": config_mpi_fp32, + "backend": "dpmodel", + "software": "deepmd-kit", + "version": "3.0.0", + } + pt2_mpi_fp32_path = os.path.join(base_dir, "deeppot_dpa3_mpi_fp32.pt2") + print(f"Exporting to {pt2_mpi_fp32_path} ...") # noqa: T201 + pt_expt_deserialize_to_file( + pt2_mpi_fp32_path, + copy.deepcopy(data_mpi_fp32), + do_atomic_virial=True, + ) + pth_path = os.path.join(base_dir, "deeppot_dpa3.pth") print(f"Exporting to {pth_path} ...") # noqa: T201 try: diff --git a/source/tests/pt_expt/descriptor/test_dpa1.py b/source/tests/pt_expt/descriptor/test_dpa1.py index 24a1d36078..8edb25ccdf 100644 --- a/source/tests/pt_expt/descriptor/test_dpa1.py +++ b/source/tests/pt_expt/descriptor/test_dpa1.py @@ -290,3 +290,40 @@ def test_share_params(self, shared_level) -> None: # invalid level raises with pytest.raises(NotImplementedError): dd1.share_params(dd0, shared_level=2) + + +def test_has_message_passing_across_ranks() -> None: + """DPA1 (se_atten) is 
single-layer attention; no cross-rank + feature exchange is needed at multi-rank deployment. + """ + import copy + + from deepmd.dpmodel.model.model import ( + get_model, + ) + + config = { + "type_map": ["O", "H"], + "descriptor": { + "type": "se_atten", + "rcut": 6.0, + "rcut_smth": 0.5, + "sel": 20, + "neuron": [2, 4], + "axis_neuron": 2, + "attn": 5, + "attn_layer": 1, + "type_one_side": True, + "precision": "float64", + "seed": 1, + }, + "fitting_net": { + "neuron": [4, 4], + "resnet_dt": True, + "precision": "float64", + "seed": 1, + }, + } + desc = get_model(copy.deepcopy(config)).atomic_model.descriptor + assert desc.has_message_passing() is False + assert desc.has_message_passing_across_ranks() is False diff --git a/source/tests/pt_expt/descriptor/test_dpa2.py b/source/tests/pt_expt/descriptor/test_dpa2.py index fb0005e13a..217bcdb230 100644 --- a/source/tests/pt_expt/descriptor/test_dpa2.py +++ b/source/tests/pt_expt/descriptor/test_dpa2.py @@ -426,3 +426,60 @@ def fn(coord_ext, atype_ext, nlist, mapping): rtol=rtol, atol=atol, ) + + +def test_has_message_passing_across_ranks() -> None: + """DPA2's repformer always passes ``g1`` in ``[nb, nall, n_dim]`` + layout (no ``use_loc_mapping`` opt-out exists), so cross-rank + message passing is always required for multi-rank deployment. 
+ """ + import copy + + from deepmd.dpmodel.model.model import ( + get_model, + ) + + config = { + "type_map": ["O", "H"], + "descriptor": { + "type": "dpa2", + "repinit": { + "rcut": 6.0, + "rcut_smth": 2.0, + "nsel": 20, + "neuron": [2, 4], + "axis_neuron": 4, + "tebd_dim": 8, + "tebd_input_mode": "concat", + "set_davg_zero": True, + "type_one_side": True, + "use_three_body": False, + }, + "repformer": { + "rcut": 3.0, + "rcut_smth": 1.5, + "nsel": 10, + "nlayers": 1, + "g1_dim": 8, + "g2_dim": 5, + "axis_neuron": 4, + "update_g1_has_conv": True, + "update_g1_has_drrd": True, + "update_g1_has_grrg": True, + "update_g2_has_attn": True, + "attn1_hidden": 8, + "attn1_nhead": 2, + "attn2_hidden": 5, + "attn2_nhead": 1, + "update_style": "res_avg", + "set_davg_zero": True, + }, + "concat_output_tebd": True, + "precision": "float64", + "seed": 1, + }, + "fitting_net": {"neuron": [4, 4], "resnet_dt": True, "seed": 1}, + } + desc = get_model(copy.deepcopy(config)).atomic_model.descriptor + assert desc.has_message_passing() is True + assert desc.has_message_passing_across_ranks() is True diff --git a/source/tests/pt_expt/descriptor/test_dpa3.py b/source/tests/pt_expt/descriptor/test_dpa3.py index ef4b479724..3013f5cc65 100644 --- a/source/tests/pt_expt/descriptor/test_dpa3.py +++ b/source/tests/pt_expt/descriptor/test_dpa3.py @@ -311,3 +311,43 @@ def test_share_params(self, shared_level) -> None: # invalid level raises with pytest.raises(NotImplementedError): dd1.share_params(dd0, shared_level=2) + + +@pytest.mark.parametrize("use_loc_mapping", [True, False]) +def test_has_message_passing_across_ranks(use_loc_mapping) -> None: + """DPA3 always reports message passing; cross-rank only when + ``use_loc_mapping=False`` (so per-layer node embeddings must flow + via MPI ghost exchange instead of a local gather). 
+ """ + import copy + + from deepmd.dpmodel.model.model import ( + get_model, + ) + + config = { + "type_map": ["O", "H"], + "descriptor": { + "type": "dpa3", + "repflow": { + "n_dim": 8, + "e_dim": 6, + "a_dim": 4, + "nlayers": 1, + "e_rcut": 4.0, + "e_rcut_smth": 0.5, + "e_sel": 8, + "a_rcut": 3.5, + "a_rcut_smth": 0.5, + "a_sel": 4, + "axis_neuron": 4, + "update_angle": False, + }, + "use_loc_mapping": use_loc_mapping, + }, + "fitting_net": {"neuron": [16, 16], "seed": 1}, + } + model = get_model(copy.deepcopy(config)) + desc = model.atomic_model.descriptor + assert desc.has_message_passing() is True + assert desc.has_message_passing_across_ranks() is (not use_loc_mapping) diff --git a/source/tests/pt_expt/descriptor/test_hybrid.py b/source/tests/pt_expt/descriptor/test_hybrid.py index 5fa8970bf1..86575180c7 100644 --- a/source/tests/pt_expt/descriptor/test_hybrid.py +++ b/source/tests/pt_expt/descriptor/test_hybrid.py @@ -284,3 +284,73 @@ def test_share_params(self) -> None: # invalid level raises with pytest.raises(NotImplementedError): dd1.share_params(dd0, shared_level=1) + + +def _se_e2_a_child() -> dict: + return { + "type": "se_e2_a", + "rcut": 6.0, + "rcut_smth": 0.5, + "sel": [20, 20], + "neuron": [2, 4], + "axis_neuron": 2, + "type_one_side": True, + "precision": "float64", + "seed": 1, + } + + +def _dpa3_child(use_loc_mapping: bool) -> dict: + return { + "type": "dpa3", + "repflow": { + "n_dim": 8, + "e_dim": 6, + "a_dim": 4, + "nlayers": 1, + "e_rcut": 4.0, + "e_rcut_smth": 0.5, + "e_sel": 8, + "a_rcut": 3.5, + "a_rcut_smth": 0.5, + "a_sel": 4, + "axis_neuron": 4, + "update_angle": False, + }, + "use_loc_mapping": use_loc_mapping, + } + + +@pytest.mark.parametrize( + "child_factory,expected_hmp,expected_hmp_ar", + [ + (lambda: _se_e2_a_child(), False, False), + (lambda: _dpa3_child(use_loc_mapping=True), True, False), + (lambda: _dpa3_child(use_loc_mapping=False), True, True), + ], + ids=["se_e2_a-only", "dpa3-ulm-true", "dpa3-ulm-false"], +) +def 
test_has_message_passing_across_ranks( + child_factory, expected_hmp, expected_hmp_ar +) -> None: + """Hybrid descriptor recurses into its children; cross-rank message + passing is required iff any child needs it. Closes the structural + side of catalog Tier-1 #1. + """ + import copy + + from deepmd.dpmodel.model.model import ( + get_model, + ) + + config = { + "type_map": ["O", "H"], + "descriptor": { + "type": "hybrid", + "list": [child_factory()], + }, + "fitting_net": {"neuron": [4, 4], "seed": 1}, + } + desc = get_model(copy.deepcopy(config)).atomic_model.descriptor + assert desc.has_message_passing() is expected_hmp + assert desc.has_message_passing_across_ranks() is expected_hmp_ar diff --git a/source/tests/pt_expt/descriptor/test_se_e2_a.py b/source/tests/pt_expt/descriptor/test_se_e2_a.py index e4bd1e385e..e3a8ca5c21 100644 --- a/source/tests/pt_expt/descriptor/test_se_e2_a.py +++ b/source/tests/pt_expt/descriptor/test_se_e2_a.py @@ -221,3 +221,38 @@ def fn(coord_ext, atype_ext, nlist): rtol=rtol, atol=atol, ) + + +def test_has_message_passing_across_ranks() -> None: + """se_e2_a is a single-layer local descriptor: no message passing, + no cross-rank exchange ever needed. 
+ """ + import copy + + from deepmd.dpmodel.model.model import ( + get_model, + ) + + config = { + "type_map": ["O", "H"], + "descriptor": { + "type": "se_e2_a", + "rcut": 6.0, + "rcut_smth": 0.5, + "sel": [20, 20], + "neuron": [2, 4], + "axis_neuron": 2, + "type_one_side": True, + "precision": "float64", + "seed": 1, + }, + "fitting_net": { + "neuron": [4, 4], + "resnet_dt": True, + "precision": "float64", + "seed": 1, + }, + } + desc = get_model(copy.deepcopy(config)).atomic_model.descriptor + assert desc.has_message_passing() is False + assert desc.has_message_passing_across_ranks() is False diff --git a/source/tests/pt_expt/model/test_export_with_comm.py b/source/tests/pt_expt/model/test_export_with_comm.py index f905f409bc..f338397639 100644 --- a/source/tests/pt_expt/model/test_export_with_comm.py +++ b/source/tests/pt_expt/model/test_export_with_comm.py @@ -9,8 +9,7 @@ This test verifies: 1. Both artifacts are present in the archive. - 2. ``metadata.json`` carries the new ``has_message_passing`` and - ``has_comm_artifact`` flags. + 2. ``metadata.json`` carries the ``has_comm_artifact`` flag. 3. The with-comm artifact loads via ``aoti_load_package`` and runs when fed valid comm-dict tensors built via the ctypes pointer trick (see ``test_repflow_parallel.py``). @@ -137,7 +136,6 @@ def test_pt2_dual_artifact_for_gnn(tmp_path) -> None: assert "model/extra/forward_lower_with_comm.pt2" in names, ( f"with-comm artifact missing; names={sorted(names)}" ) - assert meta["has_message_passing"] is True assert meta["has_comm_artifact"] is True # 2. Both artifacts load. @@ -245,19 +243,20 @@ def test_make_comm_sample_inputs_clamps_zero_nghost() -> None: assert nghost_t.item() == 0 -def test_has_message_passing_for_hybrid_with_gnn() -> None: - """``_has_message_passing`` correctly reports True for hybrid - descriptors whose children include a GNN block. 
+def test_needs_with_comm_artifact_for_hybrid_with_gnn() -> None: + """``_needs_with_comm_artifact`` correctly reports True for hybrid + descriptors whose children include a GNN block needing cross-rank + message passing. - The hybrid descriptor delegates ``has_message_passing()`` to its - children — if any child has message passing, the hybrid does too. - Our metadata flag (``has_message_passing``) is what - ``_deserialize_to_file_pt2`` uses to decide whether to compile - the with-comm artifact, so the hybrid case must route correctly. + The hybrid descriptor delegates ``has_message_passing_across_ranks()`` + to its children — if any child needs cross-rank message passing, + the hybrid does too. ``_deserialize_to_file_pt2`` uses this gate + to decide whether to compile the with-comm artifact, so the + hybrid case must route correctly. """ from deepmd.pt_expt.model.get_model import get_model as get_pt_expt_model from deepmd.pt_expt.utils.serialization import ( - _has_message_passing, + _needs_with_comm_artifact, ) config = { @@ -301,8 +300,10 @@ def test_has_message_passing_for_hybrid_with_gnn() -> None: model = get_pt_expt_model(config) model.to("cpu") model.eval() - assert _has_message_passing(model) is True, ( - "hybrid model with a GNN child must report has_message_passing=True" + assert _needs_with_comm_artifact(model) is True, ( + "hybrid model with a use_loc_mapping=False GNN child must " + "report has_message_passing_across_ranks=True so a with-comm " + "artifact gets compiled" ) @@ -330,7 +331,11 @@ def test_pte_with_comm_dict_traces_and_loads(tmp_path) -> None: model_json_override=None, with_comm_dict=True, ) - assert metadata["has_message_passing"] is True + # ``_trace_and_export(with_comm_dict=True)`` is the with-comm path + # by construction; metadata at this layer no longer carries the + # has_message_passing flag (only ``has_comm_artifact``, written + # later in _deserialize_to_file_pt2). 
Sanity-check via output_keys + # that the trace produced energy outputs. # output_keys mirrors what the regular trace would produce; at # least one energy-related key must be present. assert any(k.startswith("energy") for k in output_keys), ( diff --git a/source/tests/pt_expt/utils/test_has_message_passing.py b/source/tests/pt_expt/utils/test_has_message_passing.py deleted file mode 100644 index 673e4d8bd0..0000000000 --- a/source/tests/pt_expt/utils/test_has_message_passing.py +++ /dev/null @@ -1,229 +0,0 @@ -# SPDX-License-Identifier: LGPL-3.0-or-later -"""Schema-drift regression test for ``_has_message_passing``. - -``_has_message_passing`` (in ``deepmd/pt_expt/utils/serialization.py``) -gates whether the dual-artifact ``.pt2`` is produced for GNN models — -specifically, whether the with-comm AOTInductor module is compiled and -nested inside the archive. The detection relies on a chain of attribute -lookups: - -* ``model.atomic_model.descriptor`` -* ``descriptor.has_message_passing()`` -* For repflows/repformers: ``block.use_loc_mapping`` - -A rename of any of these (refactor in the dpmodel descriptor layer, a -new GNN block name, etc.) silently disables the with-comm artifact and -multi-rank LAMMPS users get a single-artifact .pt2 that crashes on the -first ghost exchange — with no test failure to flag the breakage. - -This test pins the contract: assert ``_has_message_passing`` returns -the documented value for each baseline configuration. 
-""" - -from __future__ import ( - annotations, -) - -import copy - -import pytest - -from deepmd.dpmodel.model.model import ( - get_model, -) -from deepmd.pt_expt.utils.serialization import ( - _has_message_passing, -) - - -def _se_e2_a_config() -> dict: - """Non-GNN descriptor — must report False.""" - return { - "type_map": ["O", "H"], - "descriptor": { - "type": "se_e2_a", - "rcut": 6.0, - "rcut_smth": 0.5, - "sel": [20, 20], - "neuron": [2, 4], - "axis_neuron": 2, - "type_one_side": True, - "precision": "float64", - "seed": 1, - }, - "fitting_net": { - "neuron": [4, 4], - "resnet_dt": True, - "precision": "float64", - "seed": 1, - }, - } - - -def _dpa1_config() -> dict: - """DPA1 (se_atten) — non-GNN; must report False.""" - return { - "type_map": ["O", "H"], - "descriptor": { - "type": "se_atten", - "rcut": 6.0, - "rcut_smth": 0.5, - "sel": 20, - "neuron": [2, 4], - "axis_neuron": 2, - "attn": 5, - "attn_layer": 1, - "type_one_side": True, - "precision": "float64", - "seed": 1, - }, - "fitting_net": { - "neuron": [4, 4], - "resnet_dt": True, - "precision": "float64", - "seed": 1, - }, - } - - -def _dpa3_config(use_loc_mapping: bool) -> dict: - """DPA3 (repflows). use_loc_mapping=False -> True, True -> False.""" - return { - "type_map": ["O", "H"], - "descriptor": { - "type": "dpa3", - "repflow": { - "n_dim": 8, - "e_dim": 6, - "a_dim": 4, - "nlayers": 1, - "e_rcut": 4.0, - "e_rcut_smth": 0.5, - "e_sel": 8, - "a_rcut": 3.5, - "a_rcut_smth": 0.5, - "a_sel": 4, - "axis_neuron": 4, - "update_angle": False, - }, - "use_loc_mapping": use_loc_mapping, - }, - "fitting_net": {"neuron": [16, 16], "seed": 1}, - } - - -def _dpa2_config() -> dict: - """DPA2 (repformer) — GNN; repformer has no use_loc_mapping knob, - so always reports True. 
- """ - return { - "type_map": ["O", "H"], - "descriptor": { - "type": "dpa2", - "repinit": { - "rcut": 6.0, - "rcut_smth": 2.0, - "nsel": 20, - "neuron": [2, 4], - "axis_neuron": 4, - "tebd_dim": 8, - "tebd_input_mode": "concat", - "set_davg_zero": True, - "type_one_side": True, - "use_three_body": False, - }, - "repformer": { - "rcut": 3.0, - "rcut_smth": 1.5, - "nsel": 10, - "nlayers": 1, - "g1_dim": 8, - "g2_dim": 5, - "axis_neuron": 4, - "update_g1_has_conv": True, - "update_g1_has_drrd": True, - "update_g1_has_grrg": True, - "update_g2_has_attn": True, - "attn1_hidden": 8, - "attn1_nhead": 2, - "attn2_hidden": 5, - "attn2_nhead": 1, - "update_style": "res_avg", - "set_davg_zero": True, - }, - "concat_output_tebd": True, - "precision": "float64", - "seed": 1, - }, - "fitting_net": { - "neuron": [4, 4], - "resnet_dt": True, - "seed": 1, - }, - } - - -@pytest.mark.parametrize( - "config_factory,expected", - [ - (_se_e2_a_config, False), - (_dpa1_config, False), - (lambda: _dpa3_config(use_loc_mapping=True), False), - (lambda: _dpa3_config(use_loc_mapping=False), True), - (_dpa2_config, True), - ], - ids=[ - "se_e2_a-non-gnn", - "dpa1-non-gnn", - "dpa3-use-loc-mapping-true", - "dpa3-use-loc-mapping-false", - "dpa2-repformer", - ], -) -def test_has_message_passing_matches_descriptor_kind(config_factory, expected) -> None: - """``_has_message_passing`` must report the documented value for - each baseline descriptor configuration. - - A False positive (non-GNN reported as GNN) wastes compile time on - a useless with-comm artifact. A False negative (GNN with - use_loc_mapping=False reported as non-GNN) is worse: multi-rank - LAMMPS gets a single-artifact .pt2 and crashes on the first ghost - exchange. This test pins both directions. 
- """ - config = config_factory() - model = get_model(copy.deepcopy(config)) - assert _has_message_passing(model) is expected - - -def test_has_message_passing_no_descriptor_returns_false() -> None: - """Models without a single ``atomic_model.descriptor`` (e.g. linear - / ZBL / frozen) must report False — the function defends against - AttributeError and treats the model as local. - """ - - class _StubAtomicModel: - # Intentionally no ``descriptor`` attribute. - pass - - class _StubModel: - atomic_model = _StubAtomicModel() - - assert _has_message_passing(_StubModel()) is False - - -def test_has_message_passing_descriptor_without_query_returns_false() -> None: - """If the descriptor exists but lacks ``has_message_passing``, the - function must report False rather than raise. - """ - - class _StubDescriptor: - # Intentionally no ``has_message_passing`` method. - pass - - class _StubAtomicModel: - descriptor = _StubDescriptor() - - class _StubModel: - atomic_model = _StubAtomicModel() - - assert _has_message_passing(_StubModel()) is False From 08805b6474dc0cb8e1844a198ea2dd04066e655c Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 3 May 2026 12:21:18 +0800 Subject: [PATCH 28/34] fix(test): build comm_dict control tensors on CPU for repflow_parallel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The C++ ``border_op`` host code dereferences ``send_proc``, ``recv_proc``, ``send_num``, ``recv_num`` (and ``send_list`` / ``communicator`` / ``nlocal`` / ``nghost``) directly via ``data_ptr()`` from host code — see ``source/op/pt/comm.cc`` forward_t/backward_t. Production code in ``source/api_cc/src/commonPTExpt.h::build_comm_tensors_positional`` explicitly creates them on ``torch::kCPU``. The test ``_build_self_comm_dict`` helper was constructing them on ``device`` (which on a CUDA build is ``cuda:0``). 
On CPU-only builds this happened to work; on a CUDA-enabled build the host read of ``recvnum[iswap]`` walks a CUDA pointer and segfaults. This is a test bug, not a runtime contract change. Fix by forcing the control tensors to CPU regardless of caller-supplied device, matching production semantics, and document why in the docstring. Reproduces the intermittent CUDA CI segfault on PR #5430: ``test_repflow_parallel.py`` was the failure point in https://github.com/deepmodeling/deepmd-kit/actions/runs/25264766026 --- .../descriptor/test_repflow_parallel.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/source/tests/pt_expt/descriptor/test_repflow_parallel.py b/source/tests/pt_expt/descriptor/test_repflow_parallel.py index 61b84fe5af..f5c68fabed 100644 --- a/source/tests/pt_expt/descriptor/test_repflow_parallel.py +++ b/source/tests/pt_expt/descriptor/test_repflow_parallel.py @@ -81,24 +81,32 @@ def _build_self_comm_dict( int32 array of length ``nghost`` giving local indices to copy into successive ghost slots [nloc, nloc+1, ...]. device - Target torch device for tensors. + Target torch device for the data tensors. The control tensors + (send_proc / recv_proc / send_num / recv_num / send_list / + communicator / nlocal / nghost) are forced to CPU regardless of + ``device`` because the C++ ``border_op`` host-side code derefer- + ences ``data_ptr()`` directly — production builds them on + CPU in ``commonPTExpt.h::build_comm_tensors_positional`` and a + CUDA-built kernel will segfault if it tries to read CUDA memory + from the host. keepalive List into which we store numpy buffers that must outlive the forward pass (their addresses are referenced by sendlist_tensor). """ + del device # control tensors are always CPU; see docstring sendlist_indices = np.ascontiguousarray(sendlist_indices, dtype=np.int32) keepalive.append(sendlist_indices) nswap = 1 addr = _addr_of(sendlist_indices) # int** packed as one int64 entry per swap. 
- sendlist_tensor = torch.tensor([addr], dtype=torch.int64, device=device) - sendproc = torch.zeros(nswap, dtype=torch.int32, device=device) - recvproc = torch.zeros(nswap, dtype=torch.int32, device=device) - sendnum = torch.tensor([nghost], dtype=torch.int32, device=device) - recvnum = torch.tensor([nghost], dtype=torch.int32, device=device) - communicator = torch.zeros(1, dtype=torch.int64, device=device) - nlocal_ts = torch.tensor(nloc, dtype=torch.int32, device=device) - nghost_ts = torch.tensor(nghost, dtype=torch.int32, device=device) + sendlist_tensor = torch.tensor([addr], dtype=torch.int64, device="cpu") + sendproc = torch.zeros(nswap, dtype=torch.int32, device="cpu") + recvproc = torch.zeros(nswap, dtype=torch.int32, device="cpu") + sendnum = torch.tensor([nghost], dtype=torch.int32, device="cpu") + recvnum = torch.tensor([nghost], dtype=torch.int32, device="cpu") + communicator = torch.zeros(1, dtype=torch.int64, device="cpu") + nlocal_ts = torch.tensor(nloc, dtype=torch.int32, device="cpu") + nghost_ts = torch.tensor(nghost, dtype=torch.int32, device="cpu") return { "send_list": sendlist_tensor, "send_proc": sendproc, From afa99c7b97d3c16bf361eac631dbef5ec404e1f6 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 3 May 2026 19:01:30 +0800 Subject: [PATCH 29/34] fix(test): build comm_dict control tensors on CPU for repformer_parallel Same bug as the previous commit in ``test_repflow_parallel.py``: ``_build_self_comm_dict`` constructs the control tensors (send_proc / recv_proc / send_num / recv_num / send_list / communicator / nlocal / nghost) on the caller-supplied ``device``, which is ``cuda`` on a CUDA build. The C++ ``border_op`` host code dereferences these via ``data_ptr()`` from the host, so a CUDA-device control tensor segfaults the read. Production code in ``commonPTExpt.h::build_comm_tensors_positional`` explicitly builds them on CPU. Force CPU regardless of the caller-supplied device, matching the production contract. 
This was the second segfault revealed on PR #5430 CI after 08805b647 fixed test_repflow_parallel.py: test_repflow_parallel.py .... [ 13%] Segmentation fault (core dumped) test_repformer_parallel.py --- .../descriptor/test_repformer_parallel.py | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/source/tests/pt_expt/descriptor/test_repformer_parallel.py b/source/tests/pt_expt/descriptor/test_repformer_parallel.py index ca0bd035e7..24e6e6ce33 100644 --- a/source/tests/pt_expt/descriptor/test_repformer_parallel.py +++ b/source/tests/pt_expt/descriptor/test_repformer_parallel.py @@ -54,18 +54,27 @@ def _build_self_comm_dict( device: torch.device, keepalive: list, ) -> dict: + """Control tensors must live on CPU because the C++ ``border_op`` + host code dereferences ``data_ptr()`` directly. Production + builds them on CPU in + ``commonPTExpt.h::build_comm_tensors_positional``; on a CUDA build + a CUDA-device control tensor segfaults the host read. See + ``test_repflow_parallel.py::_build_self_comm_dict`` for the full + rationale. 
+ """ + del device # control tensors are always CPU sendlist_indices = np.ascontiguousarray(sendlist_indices, dtype=np.int32) keepalive.append(sendlist_indices) nswap = 1 addr = _addr_of(sendlist_indices) - sendlist_tensor = torch.tensor([addr], dtype=torch.int64, device=device) - sendproc = torch.zeros(nswap, dtype=torch.int32, device=device) - recvproc = torch.zeros(nswap, dtype=torch.int32, device=device) - sendnum = torch.tensor([nghost], dtype=torch.int32, device=device) - recvnum = torch.tensor([nghost], dtype=torch.int32, device=device) - communicator = torch.zeros(1, dtype=torch.int64, device=device) - nlocal_ts = torch.tensor(nloc, dtype=torch.int32, device=device) - nghost_ts = torch.tensor(nghost, dtype=torch.int32, device=device) + sendlist_tensor = torch.tensor([addr], dtype=torch.int64, device="cpu") + sendproc = torch.zeros(nswap, dtype=torch.int32, device="cpu") + recvproc = torch.zeros(nswap, dtype=torch.int32, device="cpu") + sendnum = torch.tensor([nghost], dtype=torch.int32, device="cpu") + recvnum = torch.tensor([nghost], dtype=torch.int32, device="cpu") + communicator = torch.zeros(1, dtype=torch.int64, device="cpu") + nlocal_ts = torch.tensor(nloc, dtype=torch.int32, device="cpu") + nghost_ts = torch.tensor(nghost, dtype=torch.int32, device="cpu") return { "send_list": sendlist_tensor, "send_proc": sendproc, From e19108d96f4078c6ffd33ac2ace582da56e0cfd1 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 4 May 2026 10:50:50 +0800 Subject: [PATCH 30/34] fix(op): dispatch border_op self-send on tensor device, not MPI state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CUDA self-send branch in ``Border::forward_t`` and ``backward_t`` was guarded by ``if (world_size >= 1 && cuda_aware == 0)`` to choose between host ``memcpy`` and ``gpuMemcpy(D2D)``. 
The intent was "world_size >= 1 means MPI is initialised so the pre-loop CPU fallback ran and the buffer is now on CPU; otherwise it's still on its original device (assumed CUDA)". That assumption is wrong for one important case: a USE_MPI build called from Python with CPU tensors and no MPI init (``world_size == 0``). Unit tests in ``source/tests/pt_expt/utils/test_border_op_backward.py`` do exactly this — they construct CPU comm tensors and a CPU ``grad_g1``, never call MPI_Init, and expect the kernel to do plain CPU accumulation. The old guard fell through to ``gpuMemcpy(...DeviceToDevice)`` on host pointers. CUDA returns ``cudaErrorInvalidValue`` from that call; the return code is unchecked and ``recv_g1`` is left uninitialised. Subsequent ``index_add_`` then writes garbage into ``d_local_g1_tensor`` — the test sees mixed denormals + sigmoid-shaped values from leaked buffer memory. Same bug bit ``test_spin_export_with_comm.py::test_spin_dpa3_eager_parity``: it compares the no-comm path against the comm_dict path for a spin DPA3, and the comm_dict path went through the broken self-send. Energy diverged by ~0.1 instead of being bit-identical. Fix: dispatch the self-send memcpy on the actual device of the buffer (``recv_g1_tensor.is_cuda()``). The post-loop copy-back to ``g1.device()`` is changed analogously to use ``!is_alias_of(g1)`` — the buffer was moved if and only if the pre-loop CPU fallback created a fresh tensor. Both checks are precise correctness conditions that work for every combination of (USE_MPI on/off, GOOGLE_CUDA on/off, MPI initialised or not, CUDA or CPU tensors). 
Verified on remote with CUDA build + USE_MPI: test_border_op_backward.py 5 passed test_spin_export_with_comm.py 1 passed test_repflow_parallel.py + sibling 6 passed broader pt_expt sweep 58 passed --- source/op/pt/comm.cc | 69 +++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc index 6eb49624ec..4b175370e4 100644 --- a/source/op/pt/comm.cc +++ b/source/op/pt/comm.cc @@ -139,25 +139,22 @@ class Border : public torch::autograd::Function { } else { #endif #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) -#ifdef USE_MPI - // The CPU-fallback ``recv_g1_tensor.to(kCPU)`` above only runs - // when ``world_size >= 1`` (MPI initialized). With no MPI - // (single-rank, world_size == 0) the tensor is still on CUDA, - // so memcpy on CUDA pointers would segfault — gpuMemcpy is - // correct in that case regardless of ``cuda_aware``. - if (world_size >= 1 && cuda_aware == 0) { - memcpy(recv_g1, send_g1, - (unsigned long)nsend * tensor_size * sizeof(FPTYPE)); - } else { + // Self-send branch: choose the host-vs-device memcpy based on + // where the data actually lives, not on MPI state. The buffer + // we read/write is ``recv_g1_tensor`` whose device is either + // (a) the original ``g1`` device, or (b) CPU after the + // non-cuda-aware MPI fallback above. Reading that device + // directly is the only correct dispatch for build configs + // where USE_MPI is on but the call site uses CPU tensors + // (e.g. unit tests of border_op without MPI init). 
+ if (recv_g1_tensor.is_cuda()) { gpuMemcpy(recv_g1, send_g1, (unsigned long)nsend * tensor_size * sizeof(FPTYPE), gpuMemcpyDeviceToDevice); + } else { + memcpy(recv_g1, send_g1, + (unsigned long)nsend * tensor_size * sizeof(FPTYPE)); } -#else - gpuMemcpy(recv_g1, send_g1, - (unsigned long)nsend * tensor_size * sizeof(FPTYPE), - gpuMemcpyDeviceToDevice); -#endif #else memcpy(recv_g1, send_g1, (unsigned long)nsend * tensor_size * sizeof(FPTYPE)); @@ -169,10 +166,12 @@ class Border : public torch::autograd::Function { } #ifdef USE_MPI #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) - // Only copy back when ``recv_g1_tensor`` was moved to CPU above - // (world_size >= 1 && cuda_aware == 0). With world_size == 0 the - // tensor is still aliased to g1 — no copy needed. - if (world_size >= 1 && cuda_aware == 0) { + // Only copy back when ``recv_g1_tensor`` was actually moved to a + // different device above (the cuda_aware==0 CPU fallback). When + // ``recv_g1_tensor`` still aliases ``g1`` no copy is needed; the + // is_alias_of check is the precise correctness condition and works + // for both CUDA and CPU call sites. + if (!recv_g1_tensor.is_alias_of(g1)) { g1.copy_(recv_g1_tensor); } #endif @@ -312,23 +311,20 @@ class Border : public torch::autograd::Function { #endif if (nrecv) { #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) -#ifdef USE_MPI - // See forward kernel: when world_size==0 the data stays on - // CUDA, so memcpy on device pointers segfaults. Only use - // host memcpy when we explicitly moved data to CPU above. - if (world_size >= 1 && cuda_aware == 0) { - memcpy(recv_g1, send_g1, - (unsigned long)nrecv * tensor_size * sizeof(FPTYPE)); - } else { + // Self-send branch: dispatch on the actual device of the + // ``recv_g1_tensor`` buffer, not on MPI state. 
Same rationale + // as the forward kernel — USE_MPI builds may be called with + // CPU tensors (unit tests of border_op_backward) where the + // gpuMemcpy path silently fails with cudaErrorInvalidValue + // and leaves recv_g1 uninitialized. + if (recv_g1_tensor.is_cuda()) { gpuMemcpy(recv_g1, send_g1, (unsigned long)nrecv * tensor_size * sizeof(FPTYPE), gpuMemcpyDeviceToDevice); + } else { + memcpy(recv_g1, send_g1, + (unsigned long)nrecv * tensor_size * sizeof(FPTYPE)); } -#else - gpuMemcpy(recv_g1, send_g1, - (unsigned long)nrecv * tensor_size * sizeof(FPTYPE), - gpuMemcpyDeviceToDevice); -#endif #else memcpy(recv_g1, send_g1, (unsigned long)nrecv * tensor_size * sizeof(FPTYPE)); @@ -345,10 +341,11 @@ class Border : public torch::autograd::Function { #ifdef USE_MPI #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) // Move result back to the device of the input grad only when - // ``d_local_g1_tensor`` was moved to CPU above (world_size >= 1 - // && cuda_aware == 0). With world_size == 0 the tensor stayed on - // its original device — no move needed. - if (world_size >= 1 && cuda_aware == 0) { + // ``d_local_g1_tensor`` was actually moved to a different device + // above (the cuda_aware==0 CPU fallback). The is_alias_of check + // is the precise correctness condition and works for both CUDA + // and CPU call sites (no-op when the tensor still aliases input). + if (!d_local_g1_tensor.is_alias_of(grad_g1)) { d_local_g1_tensor = d_local_g1_tensor.to(grad_g1.device()); } #endif From 4f8240ea66e9383da7ebe0f9258efa8e3a1834a2 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 4 May 2026 21:56:59 +0800 Subject: [PATCH 31/34] fix(op): drain pending MPI eager-send ACKs in border_op via Barrier The empty-subdomain spin LAMMPS test (``processors 2 1 1`` with all atoms on rank 0, rank 1 nloc=0) failed at MPI_Finalize with "Communicator (handle=0x44000000) being freed has 2 unmatched message(s)". Test outputs were correct; the failure was purely in the MPI cleanup path. 
Root cause is the asymmetric ghost-exchange pattern that arises when one rank only Sends and the other only Irecvs at a given swap (no local atoms means nothing to send back). Under MPICH eager protocol: * The sender's MPI_Send returns once the message is queued in the eager buffer; the receiver's ACK round-trip is processed asynchronously by MPI's progress engine. * In symmetric swaps the sender also calls MPI_Wait on its own Irecv, which advances the progress engine and drains pending ACKs. * In asymmetric swaps the sender makes no further MPI call inside border_op, so the ACK stays unprocessed. The "in-flight" counter remains nonzero, and MPI_Finalize reports it as unmatched. Fix: add a single ``MPI_Barrier(world)`` at the end of ``Border::forward_t`` and ``Border::backward_t``. The Barrier forces a round-trip on every rank, which advances every rank's progress engine and drains pending ACKs. Cost is one collective per ghost-exchange call; on a 2-rank, 6-swap, 4-atom case this is in the noise vs the surrounding model forward. Verified on remote (CUDA + MPICH): test_lammps_spin_dpa3_pt2.py ... [3 passed] test_lammps_dpa3_pt2.py ............... [15 passed] Restores the multi-rank LAMMPS spin GNN with empty-subdomain support (PR #5430 CI's last failing case). --- source/op/pt/comm.cc | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc index 4b175370e4..32949bc339 100644 --- a/source/op/pt/comm.cc +++ b/source/op/pt/comm.cc @@ -165,6 +165,22 @@ class Border : public torch::autograd::Function { recv_g1 += nrecv * tensor_size; } #ifdef USE_MPI + // Drain pending eager-send ACKs before returning. In the + // asymmetric ghost-exchange pattern (one rank only Sends, the + // other only Irecvs at a given swap — e.g. 
an empty subdomain + // under ``processors 2 1 1``) the sender's MPI_Send returns once + // the eager-buffered message is queued, but MPICH's internal + // accounting marks the message as "in flight" until the sender's + // progress engine processes the receiver's ACK. In the symmetric + // case the sender's own MPI_Wait on its Irecv drains those ACKs. + // In the asymmetric case there is no such Wait, and the message + // stays "in flight" all the way to MPI_Finalize, which then + // reports ``Communicator (...) being freed has N unmatched + // message(s)``. An MPI_Barrier on the same communicator forces a + // round-trip on every rank, drains ACKs, and clears the counter. + if (mpi_init && world_size >= 1) { + MPI_Barrier(world); + } #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) // Only copy back when ``recv_g1_tensor`` was actually moved to a // different device above (the cuda_aware==0 CPU fallback). When @@ -339,6 +355,13 @@ class Border : public torch::autograd::Function { } } #ifdef USE_MPI + // Drain pending eager-send ACKs before returning — see forward_t + // for the full rationale. Backward has the same asymmetric + // Send/Irecv pattern (now in the reverse direction) and the same + // unmatched-message trap when one rank only Sends. + if (mpi_init && world_size >= 1) { + MPI_Barrier(world); + } #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) // Move result back to the device of the input grad only when // ``d_local_g1_tensor`` was actually moved to a different device From 7632db8945166cb7db13330395c12c0cde35a06e Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sat, 9 May 2026 17:31:35 +0800 Subject: [PATCH 32/34] fix(pt_expt): fail fast on with-comm artifact errors instead of silently zeroing Address @iProzd review on PR #5430: - border_op_export: throw on empty output list rather than returning empty_like(g1), which masked internal kernel bugs as zero outputs. 
- DeepPotPTExpt / DeepSpinPTExpt: if the with-comm artifact is declared in metadata but fails to load, keep has_comm_artifact_=true so multi-rank dispatch (nswap>0) throws explicitly. Previously has_comm_artifact_ was reset to false on load failure, making multi-rank silently fall through to the single-rank artifact and skip the MPI ghost-embedding exchange. --- source/api_cc/src/DeepPotPTExpt.cc | 27 +++++++++++++++++++-------- source/api_cc/src/DeepSpinPTExpt.cc | 22 +++++++++++++++++----- source/op/pt/comm.cc | 7 ++++++- 3 files changed, 42 insertions(+), 14 deletions(-) diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index 287ee3b18f..910c2f6f7a 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -166,9 +166,10 @@ void DeepPotPTExpt::init(const std::string& model, // inference. Pre-Phase-3 .pt2 files lack ``has_comm_artifact``; // default to false so old artifacts keep working. If the metadata // flag is set but the nested artifact fails to extract or compile, - // fall back to single-rank mode rather than aborting init -- the - // hard error then surfaces in ``run_model_with_comm()`` only when - // multi-rank actually needs it. + // keep ``has_comm_artifact_=true`` and let single-rank dispatch + // continue working; multi-rank dispatch then fails fast at + // ``run_model_with_comm()`` rather than silently dropping the MPI + // exchange and producing wrong results. has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") && metadata["has_comm_artifact"].as_bool(); if (has_comm_artifact_) { @@ -186,11 +187,12 @@ void DeepPotPTExpt::init(const std::string& model, : static_cast(-1)); } catch (const std::exception& e) { std::cerr << "DeepPotPTExpt: failed to load with-comm artifact (" - << e.what() << "); falling back to single-rank-only dispatch." + << e.what() + << "); single-rank inference will still work, but multi-rank " + "LAMMPS dispatch will throw." 
<< std::endl; with_comm_tempfile_.reset(); with_comm_loader.reset(); - has_comm_artifact_ = false; } } @@ -244,9 +246,12 @@ std::vector DeepPotPTExpt::run_model_with_comm( const std::vector& comm_tensors) { if (!with_comm_loader) { throw deepmd::deepmd_exception( - "run_model_with_comm called but the .pt2 file has no with-comm " - "artifact. This is a programming error: the caller should check " - "has_comm_artifact_ before invoking this path."); + "run_model_with_comm called but the with-comm artifact is not " + "available. Either the .pt2 file has no with-comm artifact compiled " + "(programming error: the caller should check has_comm_artifact_ " + "before invoking this path), or the artifact was present in the " + ".pt2 metadata but failed to load at init time (see earlier stderr " + "log). Multi-rank LAMMPS requires a working with-comm artifact."); } if (comm_tensors.size() != 8) { throw deepmd::deepmd_exception( @@ -431,6 +436,12 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, // tensor to gather ghost embeddings from local atoms. std::vector flat_outputs; bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0; + if (use_with_comm && !with_comm_loader) { + throw deepmd::deepmd_exception( + "Multi-rank LAMMPS requires the with-comm artifact, but it failed " + "to load at init time. See the earlier stderr log for the underlying " + "error."); + } // When NULL-type atoms exist, remapped storage must outlive comm // tensors (the int** pointer-array tensor references it). std::vector> remapped_sendlist; diff --git a/source/api_cc/src/DeepSpinPTExpt.cc b/source/api_cc/src/DeepSpinPTExpt.cc index 9d4f072d2a..2ac4369f5f 100644 --- a/source/api_cc/src/DeepSpinPTExpt.cc +++ b/source/api_cc/src/DeepSpinPTExpt.cc @@ -174,7 +174,9 @@ void DeepSpinPTExpt::init(const std::string& model, // Phase 4: load the optional with-comm artifact for multi-rank GNN // spin inference. Mirrors DeepPotPTExpt; see its init() comment for - // the rationale on the try/catch fallback. 
+ // the rationale on keeping ``has_comm_artifact_=true`` on load + // failure so multi-rank dispatch fails fast rather than silently + // dropping the MPI exchange. has_comm_artifact_ = metadata.obj_val.count("has_comm_artifact") && metadata["has_comm_artifact"].as_bool(); if (has_comm_artifact_) { @@ -189,11 +191,12 @@ void DeepSpinPTExpt::init(const std::string& model, : static_cast(-1)); } catch (const std::exception& e) { std::cerr << "DeepSpinPTExpt: failed to load with-comm artifact (" - << e.what() << "); falling back to single-rank-only dispatch." + << e.what() + << "); single-rank inference will still work, but multi-rank " + "LAMMPS dispatch will throw." << std::endl; with_comm_tempfile_.reset(); with_comm_loader.reset(); - has_comm_artifact_ = false; } } @@ -249,8 +252,11 @@ std::vector DeepSpinPTExpt::run_model_with_comm( const std::vector& comm_tensors) { if (!with_comm_loader) { throw deepmd::deepmd_exception( - "DeepSpinPTExpt::run_model_with_comm called but the .pt2 has no " - "with-comm artifact."); + "DeepSpinPTExpt::run_model_with_comm called but the with-comm " + "artifact is not available. Either the .pt2 file has no with-comm " + "artifact compiled, or the artifact was present in the .pt2 metadata " + "but failed to load at init time (see earlier stderr log). Multi-rank " + "LAMMPS requires a working with-comm artifact."); } if (comm_tensors.size() != 8) { throw deepmd::deepmd_exception( @@ -448,6 +454,12 @@ void DeepSpinPTExpt::compute(ENERGYVTYPE& ener, // (pre atom-doubling); the spin override halves them internally. std::vector flat_outputs; bool use_with_comm = has_comm_artifact_ && lmp_list.nswap > 0; + if (use_with_comm && !with_comm_loader) { + throw deepmd::deepmd_exception( + "Multi-rank LAMMPS requires the with-comm artifact, but it failed " + "to load at init time. 
See the earlier stderr log for the underlying " + "error."); + } std::vector> remapped_sendlist; std::vector remapped_sendlist_ptrs; std::vector remapped_sendnum, remapped_recvnum; diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc index 32949bc339..cfe78321af 100644 --- a/source/op/pt/comm.cc +++ b/source/op/pt/comm.cc @@ -523,7 +523,12 @@ torch::Tensor border_op_export(const torch::Tensor& sendlist_tensor, communicator_tensor, nlocal_tensor, nghost_tensor); // border_op returns {g1_tensor} — a list whose first element aliases // g1_tensor. Clone for AOTI graph-output correctness. - return out.empty() ? torch::empty_like(g1_tensor) : out[0].clone(); + if (out.empty()) { + throw std::runtime_error( + "border_op_export: border_op returned an empty output list, which " + "indicates an internal error in the underlying border_op kernel."); + } + return out[0].clone(); } torch::Tensor border_op_backward_export( From 68c72a3091610ba0668a6c76d67ca3471a511697 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sat, 9 May 2026 20:34:48 +0800 Subject: [PATCH 33/34] test(pt_expt): cover with-comm artifact load-failure dispatch guard Add gtest cases that exercise the explicit ``use_with_comm && !with_comm_loader`` throw added to DeepPotPTExpt::compute and DeepSpinPTExpt::compute. Fixtures: copies of deeppot_dpa3_mpi.pt2 and deeppot_dpa3_spin_mpi.pt2 with the nested ``model/extra/forward_lower_with_comm.pt2`` entry replaced by garbage bytes, produced by gen_corrupt_with_comm.py via zip rewrite (no AOTI recompilation). 
Each variant asserts: - init() succeeds (catch path keeps regular artifact usable) - single-rank compute (nswap=0) succeeds (uses regular artifact) - multi-rank compute (nswap=1) throws deepmd::deepmd_exception --- .../test_with_comm_load_failure_ptexpt.cc | 202 ++++++++++++++++++ source/tests/infer/gen_corrupt_with_comm.py | 67 ++++++ 2 files changed, 269 insertions(+) create mode 100644 source/api_cc/tests/test_with_comm_load_failure_ptexpt.cc create mode 100644 source/tests/infer/gen_corrupt_with_comm.py diff --git a/source/api_cc/tests/test_with_comm_load_failure_ptexpt.cc b/source/api_cc/tests/test_with_comm_load_failure_ptexpt.cc new file mode 100644 index 0000000000..10111a41b7 --- /dev/null +++ b/source/api_cc/tests/test_with_comm_load_failure_ptexpt.cc @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later +// Tests for the dispatch-site fail-fast guard when the with-comm AOTI +// artifact failed to load at init time. The fixtures are produced by +// source/tests/infer/gen_corrupt_with_comm.py: copies of the valid +// multi-rank .pt2 archives whose nested +// ``model/extra/forward_lower_with_comm.pt2`` entry has been replaced +// with garbage bytes. The outer metadata still claims +// ``has_comm_artifact: true`` so the loader exercises the catch path. +// +// Expectations: +// * init() succeeds (the loader logs and falls back instead of aborting). +// * Single-rank dispatch (nswap == 0) keeps working through the regular +// forward_lower artifact. +// * Multi-rank dispatch (nswap > 0) throws a deepmd::deepmd_exception +// instead of silently dropping the MPI ghost-embedding exchange. +#include + +#include +#include + +#include "DeepPot.h" +// Include the PT_Expt headers so BUILD_PT_EXPT / BUILD_PT_EXPT_SPIN are +// visible to the GTEST_SKIP guard below. 
+#include "DeepPotPTExpt.h" +#include "DeepSpin.h" +#include "DeepSpinPTExpt.h" +#include "common.h" +#include "neighbor_list.h" +#include "test_utils.h" + +namespace { +constexpr const char* kPotCorrupt = + "../../tests/infer/deeppot_dpa3_mpi_corrupt_with_comm.pt2"; +constexpr const char* kSpinCorrupt = + "../../tests/infer/deeppot_dpa3_spin_mpi_corrupt_with_comm.pt2"; + +bool file_exists(const char* path) { + std::ifstream f(path); + return f.good(); +} +} // namespace + +// ============================================================================ +// DeepPot (non-spin) — corrupted with-comm artifact +// ============================================================================ + +class TestDeepPotPTExptWithCommLoadFailure : public ::testing::Test { + protected: + // Coordinates / atype / box copied from gen_dpa3.py so the regular + // forward_lower artifact has well-formed inputs to evaluate. + std::vector coord = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74, + 00.25, 3.32, 1.68, 3.36, 3.00, 1.81, + 3.51, 2.51, 2.60, 4.27, 3.22, 1.56}; + std::vector atype = {0, 1, 1, 0, 1, 1}; + std::vector box = {13., 0., 0., 0., 13., 0., 0., 0., 13.}; + + deepmd::DeepPot dp; + + void SetUp() override { +#if !defined(BUILD_PYTORCH) || !BUILD_PT_EXPT + GTEST_SKIP() << "Skip because PyTorch / pt_expt support is not enabled."; +#endif + if (!file_exists(kPotCorrupt)) { + GTEST_SKIP() << "Skipping: " << kPotCorrupt + << " not found. Run source/tests/infer/" + "gen_corrupt_with_comm.py first."; + } + // Init must succeed: the with-comm loader fails internally and the + // catch block keeps the regular single-rank artifact usable. + ASSERT_NO_THROW(dp.init(kPotCorrupt)); + } +}; + +TEST_F(TestDeepPotPTExptWithCommLoadFailure, single_rank_compute_succeeds) { + // nswap == 0 (default InputNlist) routes through the regular + // forward_lower artifact; the broken with-comm artifact is not + // consulted, so compute must succeed. 
+ float rc = dp.cutoff(); + int nloc = coord.size() / 3; + std::vector coord_cpy; + std::vector atype_cpy, mapping; + std::vector> nlist_data; + _build_nlist(nlist_data, coord_cpy, atype_cpy, mapping, coord, atype, + box, rc); + int nall = coord_cpy.size() / 3; + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, ilist.data(), numneigh.data(), + firstneigh.data()); + convert_nlist(inlist, nlist_data); + inlist.mapping = mapping.data(); + ASSERT_EQ(inlist.nswap, 0); // pre-condition: single-rank dispatch + + double ener; + std::vector force_, virial; + EXPECT_NO_THROW(dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, + nall - nloc, inlist, 0)); + EXPECT_EQ(force_.size(), nall * 3); + EXPECT_EQ(virial.size(), 9); +} + +TEST_F(TestDeepPotPTExptWithCommLoadFailure, multi_rank_compute_throws) { + // nswap > 0 forces the dispatch site to ``run_model_with_comm``; the + // load-failure guard added by PR #5430 must throw rather than silently + // falling back to the single-rank path. The send/recv arrays remain + // null — the guard fires before any of them are dereferenced. 
+ float rc = dp.cutoff(); + int nloc = coord.size() / 3; + std::vector coord_cpy; + std::vector atype_cpy, mapping; + std::vector> nlist_data; + _build_nlist(nlist_data, coord_cpy, atype_cpy, mapping, coord, atype, + box, rc); + int nall = coord_cpy.size() / 3; + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, ilist.data(), numneigh.data(), + firstneigh.data()); + convert_nlist(inlist, nlist_data); + inlist.mapping = mapping.data(); + inlist.nswap = 1; // simulate multi-rank without populating send/recv + + double ener; + std::vector force_, virial; + EXPECT_THROW(dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, + nall - nloc, inlist, 0), + deepmd::deepmd_exception); +} + +// ============================================================================ +// DeepSpin — corrupted with-comm artifact +// ============================================================================ + +class TestDeepSpinPTExptWithCommLoadFailure : public ::testing::Test { + protected: + std::vector coord = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74, + 00.25, 3.32, 1.68, 3.36, 3.00, 1.81, + 3.51, 2.51, 2.60, 4.27, 3.22, 1.56}; + // Match deeppot_dpa3_spin_mpi.pt2 spin layout (type 0 has spin, types + // 1+ do not) — spin vector packed alongside coord. + std::vector spin = {0.13, 0.02, 0.03, 0., 0., 0., 0., 0., 0., + 0.14, 0.10, 0.12, 0., 0., 0., 0., 0., 0.}; + std::vector atype = {0, 1, 1, 0, 1, 1}; + std::vector box = {13., 0., 0., 0., 13., 0., 0., 0., 13.}; + + deepmd::DeepSpin dp; + + void SetUp() override { +#if !defined(BUILD_PYTORCH) || !BUILD_PT_EXPT_SPIN + GTEST_SKIP() << "Skip because PyTorch / pt_expt spin support is not " + "enabled."; +#endif + if (!file_exists(kSpinCorrupt)) { + GTEST_SKIP() << "Skipping: " << kSpinCorrupt + << " not found. 
Run source/tests/infer/" + "gen_corrupt_with_comm.py first."; + } + ASSERT_NO_THROW(dp.init(kSpinCorrupt)); + } +}; + +TEST_F(TestDeepSpinPTExptWithCommLoadFailure, single_rank_compute_succeeds) { + // NoPBC + hardcoded all-pairs nlist mirrors the + // ``cpu_lmp_nlist`` pattern in test_deeppot_dpa_ptexpt_spin.cc: + // nloc == natoms == nall, no ghost atoms. + const int natoms = static_cast(atype.size()); + std::vector empty_box; + std::vector> nlist_data = {{1, 2, 3, 4, 5}, {0, 2, 3, 4, 5}, + {0, 1, 3, 4, 5}, {0, 1, 2, 4, 5}, + {0, 1, 2, 3, 5}, {0, 1, 2, 3, 4}}; + std::vector ilist(natoms), numneigh(natoms); + std::vector firstneigh(natoms); + deepmd::InputNlist inlist(natoms, ilist.data(), numneigh.data(), + firstneigh.data()); + convert_nlist(inlist, nlist_data); + ASSERT_EQ(inlist.nswap, 0); + + double ener; + std::vector force_, force_mag, virial; + EXPECT_NO_THROW(dp.compute(ener, force_, force_mag, virial, coord, spin, + atype, empty_box, 0, inlist, 0)); +} + +TEST_F(TestDeepSpinPTExptWithCommLoadFailure, multi_rank_compute_throws) { + const int natoms = static_cast(atype.size()); + std::vector empty_box; + std::vector> nlist_data = {{1, 2, 3, 4, 5}, {0, 2, 3, 4, 5}, + {0, 1, 3, 4, 5}, {0, 1, 2, 4, 5}, + {0, 1, 2, 3, 5}, {0, 1, 2, 3, 4}}; + std::vector ilist(natoms), numneigh(natoms); + std::vector firstneigh(natoms); + deepmd::InputNlist inlist(natoms, ilist.data(), numneigh.data(), + firstneigh.data()); + convert_nlist(inlist, nlist_data); + inlist.nswap = 1; // simulate multi-rank without populating send/recv + + double ener; + std::vector force_, force_mag, virial; + EXPECT_THROW(dp.compute(ener, force_, force_mag, virial, coord, spin, atype, + empty_box, 0, inlist, 0), + deepmd::deepmd_exception); +} diff --git a/source/tests/infer/gen_corrupt_with_comm.py b/source/tests/infer/gen_corrupt_with_comm.py new file mode 100644 index 0000000000..ff0d16158c --- /dev/null +++ b/source/tests/infer/gen_corrupt_with_comm.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 
+# SPDX-License-Identifier: LGPL-3.0-or-later +"""Generate ``deeppot_*_corrupt_with_comm.pt2`` fixtures. + +The fixtures are copies of the corresponding multi-rank ``.pt2`` archives +in which the nested ``model/extra/forward_lower_with_comm.pt2`` entry has +been overwritten with garbage bytes. The outer metadata still claims +``has_comm_artifact: true``, so: + +- ``DeepPotPTExpt::init`` / ``DeepSpinPTExpt::init`` exercise the + try/catch fallback path on the with-comm AOTI loader. +- Single-rank dispatch (``nswap == 0``) keeps working via the regular + artifact. +- Multi-rank dispatch (``nswap > 0``) hits the explicit dispatch-site + throw added in PR #5430, instead of silently dropping the MPI + ghost-embedding exchange. + +Consumed by ``source/api_cc/tests/test_with_comm_load_failure_ptexpt.cc``. +""" + +import os +import zipfile + +WITH_COMM_ENTRY = "model/extra/forward_lower_with_comm.pt2" +GARBAGE = b"NOT_A_VALID_AOTI_ARCHIVE_" * 32 + + +def corrupt_with_comm(src: str, dst: str) -> None: + """Copy ``src`` to ``dst`` with the nested with-comm entry replaced.""" + with ( + zipfile.ZipFile(src, "r") as zin, + zipfile.ZipFile(dst, "w", compression=zipfile.ZIP_STORED) as zout, + ): + replaced = False + for info in zin.infolist(): + data = zin.read(info.filename) + if info.filename == WITH_COMM_ENTRY: + data = GARBAGE + replaced = True + zout.writestr(info, data) + if not replaced: + raise RuntimeError( + f"{src} does not contain {WITH_COMM_ENTRY}; cannot corrupt." 
+ ) + + +def main() -> None: + base_dir = os.path.dirname(__file__) + pairs = [ + ("deeppot_dpa3_mpi.pt2", "deeppot_dpa3_mpi_corrupt_with_comm.pt2"), + ( + "deeppot_dpa3_spin_mpi.pt2", + "deeppot_dpa3_spin_mpi_corrupt_with_comm.pt2", + ), + ] + for src_name, dst_name in pairs: + src = os.path.join(base_dir, src_name) + dst = os.path.join(base_dir, dst_name) + if not os.path.exists(src): + print(f"Skipping {dst_name}: source {src_name} not found.") # noqa: T201 + continue + corrupt_with_comm(src, dst) + print(f"Wrote {dst}") # noqa: T201 + + +if __name__ == "__main__": + main() From 5359abc4f8319a8fe42bd1007aa0fbe2e530738a Mon Sep 17 00:00:00 2001 From: Han Wang Date: Wed, 13 May 2026 10:33:48 +0800 Subject: [PATCH 34/34] chore: silence CodeQL alerts on PR #5430 - Python (5x py/unused-import): add `# lgtm[py/unused-import]` to the side-effect imports of `deepmd.pt_expt.utils.comm` (which register the deepmd_export::border_op fake/autograd metadata via decorators). - Python (1x py/unnecessary-lambda): replace `lambda: _se_e2_a_child()` with `_se_e2_a_child` in test_hybrid.py parametrize table (the other two entries keep their lambdas because they pass kwargs). - C++ (6x cpp/unused-static-function): annotate border_op_export and border_op_backward_export with `DEEPMD_MAYBE_UNUSED`, a macro that expands to `[[maybe_unused]]` under C++17 and to nothing under C++14 (the fallback when older torch < 2.1 forces the legacy standard). CodeQL doesn't see through TORCH_LIBRARY_IMPL's function-pointer registration; the attribute documents that this is intentional. 
--- source/op/pt/comm.cc | 33 +++++++++++++------ .../tests/pt_expt/descriptor/test_hybrid.py | 2 +- .../descriptor/test_repflow_parallel.py | 2 +- .../descriptor/test_repformer_parallel.py | 2 +- .../pt_expt/model/test_export_with_comm.py | 2 +- .../model/test_spin_export_with_comm.py | 2 +- .../pt_expt/utils/test_border_op_backward.py | 2 +- 7 files changed, 29 insertions(+), 16 deletions(-) diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc index cfe78321af..31691d5e7d 100644 --- a/source/op/pt/comm.cc +++ b/source/op/pt/comm.cc @@ -509,15 +509,27 @@ TORCH_LIBRARY_FRAGMENT(deepmd, m) { // ============================================================================ namespace { -torch::Tensor border_op_export(const torch::Tensor& sendlist_tensor, - const torch::Tensor& sendproc_tensor, - const torch::Tensor& recvproc_tensor, - const torch::Tensor& sendnum_tensor, - const torch::Tensor& recvnum_tensor, - const torch::Tensor& g1_tensor, - const torch::Tensor& communicator_tensor, - const torch::Tensor& nlocal_tensor, - const torch::Tensor& nghost_tensor) { +// ``DEEPMD_MAYBE_UNUSED`` silences CodeQL's ``cpp/unused-static-function`` +// query — the functions ARE used: ``TORCH_LIBRARY_IMPL(...)`` below +// registers them as op implementations via function-pointer arguments, +// which CodeQL's static dataflow can't see through. The attribute is +// C++17, so guard it for the legacy-torch (< 2.1) build path which +// CMakeLists.txt holds at C++14. 
+#if __cplusplus >= 201703L +#define DEEPMD_MAYBE_UNUSED [[maybe_unused]] +#else +#define DEEPMD_MAYBE_UNUSED +#endif +DEEPMD_MAYBE_UNUSED torch::Tensor border_op_export( + const torch::Tensor& sendlist_tensor, + const torch::Tensor& sendproc_tensor, + const torch::Tensor& recvproc_tensor, + const torch::Tensor& sendnum_tensor, + const torch::Tensor& recvnum_tensor, + const torch::Tensor& g1_tensor, + const torch::Tensor& communicator_tensor, + const torch::Tensor& nlocal_tensor, + const torch::Tensor& nghost_tensor) { auto out = border_op(sendlist_tensor, sendproc_tensor, recvproc_tensor, sendnum_tensor, recvnum_tensor, g1_tensor, communicator_tensor, nlocal_tensor, nghost_tensor); @@ -531,7 +543,7 @@ torch::Tensor border_op_export(const torch::Tensor& sendlist_tensor, return out[0].clone(); } -torch::Tensor border_op_backward_export( +DEEPMD_MAYBE_UNUSED torch::Tensor border_op_backward_export( const torch::Tensor& sendlist_tensor, const torch::Tensor& sendproc_tensor, const torch::Tensor& recvproc_tensor, @@ -547,6 +559,7 @@ torch::Tensor border_op_backward_export( .clone(); } } // namespace +#undef DEEPMD_MAYBE_UNUSED TORCH_LIBRARY_FRAGMENT(deepmd_export, m) { m.def( diff --git a/source/tests/pt_expt/descriptor/test_hybrid.py b/source/tests/pt_expt/descriptor/test_hybrid.py index 86575180c7..b45a7bea19 100644 --- a/source/tests/pt_expt/descriptor/test_hybrid.py +++ b/source/tests/pt_expt/descriptor/test_hybrid.py @@ -324,7 +324,7 @@ def _dpa3_child(use_loc_mapping: bool) -> dict: @pytest.mark.parametrize( "child_factory,expected_hmp,expected_hmp_ar", [ - (lambda: _se_e2_a_child(), False, False), + (_se_e2_a_child, False, False), (lambda: _dpa3_child(use_loc_mapping=True), True, False), (lambda: _dpa3_child(use_loc_mapping=False), True, True), ], diff --git a/source/tests/pt_expt/descriptor/test_repflow_parallel.py b/source/tests/pt_expt/descriptor/test_repflow_parallel.py index f5c68fabed..f5b4d40bcd 100644 --- 
a/source/tests/pt_expt/descriptor/test_repflow_parallel.py +++ b/source/tests/pt_expt/descriptor/test_repflow_parallel.py @@ -32,7 +32,7 @@ import torch # Trigger registration of the deepmd_export::border_op opaque wrapper. -import deepmd.pt_expt.utils.comm # noqa: F401 +import deepmd.pt_expt.utils.comm # noqa: F401 # lgtm[py/unused-import] from deepmd.dpmodel.descriptor.dpa3 import ( RepFlowArgs, ) diff --git a/source/tests/pt_expt/descriptor/test_repformer_parallel.py b/source/tests/pt_expt/descriptor/test_repformer_parallel.py index 24e6e6ce33..1a6413d08f 100644 --- a/source/tests/pt_expt/descriptor/test_repformer_parallel.py +++ b/source/tests/pt_expt/descriptor/test_repformer_parallel.py @@ -18,7 +18,7 @@ import torch # Trigger registration of the deepmd_export::border_op opaque wrapper. -import deepmd.pt_expt.utils.comm # noqa: F401 +import deepmd.pt_expt.utils.comm # noqa: F401 # lgtm[py/unused-import] from deepmd.dpmodel.descriptor.dpa2 import ( RepformerArgs, RepinitArgs, diff --git a/source/tests/pt_expt/model/test_export_with_comm.py b/source/tests/pt_expt/model/test_export_with_comm.py index f338397639..dcbc628e53 100644 --- a/source/tests/pt_expt/model/test_export_with_comm.py +++ b/source/tests/pt_expt/model/test_export_with_comm.py @@ -35,7 +35,7 @@ # Trigger registration of the deepmd_export::border_op opaque wrapper # (needed by the with-comm artifact at runtime). 
-import deepmd.pt_expt.utils.comm # noqa: F401 +import deepmd.pt_expt.utils.comm # noqa: F401 # lgtm[py/unused-import] from deepmd.pt_expt.model.get_model import ( get_model, ) diff --git a/source/tests/pt_expt/model/test_spin_export_with_comm.py b/source/tests/pt_expt/model/test_spin_export_with_comm.py index f77c9fe415..0e403d2b42 100644 --- a/source/tests/pt_expt/model/test_spin_export_with_comm.py +++ b/source/tests/pt_expt/model/test_spin_export_with_comm.py @@ -30,7 +30,7 @@ import numpy as np import torch -import deepmd.pt_expt.utils.comm # noqa: F401 - opaque op registration +import deepmd.pt_expt.utils.comm # noqa: F401 # lgtm[py/unused-import] - opaque op registration from deepmd.dpmodel.model.model import get_model as get_model_dp from deepmd.pt_expt.model.spin_ener_model import ( SpinEnergyModel, diff --git a/source/tests/pt_expt/utils/test_border_op_backward.py b/source/tests/pt_expt/utils/test_border_op_backward.py index aeaf491cb2..b33e575f1a 100644 --- a/source/tests/pt_expt/utils/test_border_op_backward.py +++ b/source/tests/pt_expt/utils/test_border_op_backward.py @@ -33,7 +33,7 @@ # comm self-bootstraps the underlying libdeepmd_op_pt.so when needed, so # this single side-effect import is enough to register both the C++ # ops (deepmd::border_op_backward) and their fake/autograd metadata. -import deepmd.pt_expt.utils.comm # noqa: F401 - registers deepmd_export::border_op +import deepmd.pt_expt.utils.comm # noqa: F401 # lgtm[py/unused-import] - registers deepmd_export::border_op def _addr_of(np_arr: np.ndarray) -> int: