diff --git a/deepmd/pt_expt/utils/comm.py b/deepmd/pt_expt/utils/comm.py index 434d2a97b0..1d209d48dc 100644 --- a/deepmd/pt_expt/utils/comm.py +++ b/deepmd/pt_expt/utils/comm.py @@ -50,25 +50,38 @@ def _check_underlying_ops_loaded() -> None: like DDP-spawned subprocesses that re-import modules from scratch and never see the test conftest's ``import deepmd.pt``. """ - if not ( - hasattr(torch.ops, "deepmd_export") - and hasattr(torch.ops.deepmd_export, "border_op") - and hasattr(torch.ops.deepmd_export, "border_op_backward") - ): + + def _ops_registered() -> bool: + return ( + hasattr(torch.ops, "deepmd_export") + and hasattr(torch.ops.deepmd_export, "border_op") + and hasattr(torch.ops.deepmd_export, "border_op_backward") + ) + + import_err: Exception | None = None + if not _ops_registered(): # Triggers cxx_op.py which torch.ops.load_library's the .so. try: import deepmd.pt # noqa: F401 - except Exception: - # If deepmd.pt itself fails to import, fall through to the - # explicit RuntimeError below — clearer than re-raising a - # potentially-unrelated import error. - pass - - if not ( - hasattr(torch.ops, "deepmd_export") - and hasattr(torch.ops.deepmd_export, "border_op") - and hasattr(torch.ops.deepmd_export, "border_op_backward") - ): + except Exception as exc: + # ``deepmd/pt/__init__.py`` loads ``cxx_op`` (which registers + # the ops) before running ``load_entry_point("deepmd.pt")``. + # A broken third-party entry point can make the import raise + # *after* the ops were already registered, so only re-raise + # when the registration is still missing — that branch is the + # one where the error (typically an ``undefined symbol`` ABI + # mismatch against libdeepmd_op_pt.so) carries the diagnostic + # detail that the generic RuntimeError below would hide. + import_err = exc + + if not _ops_registered(): + if import_err is not None: + # Surface the raw import error (typically ``ImportError`` with + # ``undefined symbol`` ABI detail) instead of burying it in a + # generic message — that detail is what tells the user the + # mismatch is between libdeepmd_op_pt.so and the runtime torch, + # not a missing build. + raise import_err raise RuntimeError( "torch.ops.deepmd_export.{border_op,border_op_backward} " "are not registered. Build libdeepmd_op_pt.so and ensure "