From 0a1dc5440edb5ed5bacaba4d75170053aeedae57 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 28 May 2026 22:10:43 +0800 Subject: [PATCH 1/3] fix(pt_expt): let deepmd.pt import errors propagate in comm op check The blanket except in _check_underlying_ops_loaded swallowed ABI / torch-version mismatches against libdeepmd_op_pt.so (e.g. 'undefined symbol' ImportError), leaving callers with only the generic 'build the .so' RuntimeError that misleads users into rebuilding an already-built library. --- deepmd/pt_expt/utils/comm.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/deepmd/pt_expt/utils/comm.py b/deepmd/pt_expt/utils/comm.py index 434d2a97b0..8326c327c3 100644 --- a/deepmd/pt_expt/utils/comm.py +++ b/deepmd/pt_expt/utils/comm.py @@ -56,13 +56,11 @@ def _check_underlying_ops_loaded() -> None: and hasattr(torch.ops.deepmd_export, "border_op_backward") ): # Triggers cxx_op.py which torch.ops.load_library's the .so. - try: - import deepmd.pt # noqa: F401 - except Exception: - # If deepmd.pt itself fails to import, fall through to the - # explicit RuntimeError below — clearer than re-raising a - # potentially-unrelated import error. - pass + # Let import errors propagate — ABI / torch-version mismatches + # against libdeepmd_op_pt.so surface here with diagnostic detail + # (e.g. ``undefined symbol``) that the generic RuntimeError below + # would otherwise hide. + import deepmd.pt # noqa: F401 if not ( hasattr(torch.ops, "deepmd_export") From add14af7562ed57abcc8d13bc5e9d29d85049732 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 28 May 2026 22:19:45 +0800 Subject: [PATCH 2/3] fix(pt_expt): re-check op registration before propagating import error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit deepmd/pt/__init__.py loads cxx_op (registers deepmd_export ops) before running load_entry_point('deepmd.pt').
A broken third-party entry point makes the import raise after the ops were already registered, so the previous unconditional propagation skipped the fake/autograd registrations even when the underlying ops were present. Catch the import error, re-check registration, and only re-raise when the ops are still missing — preserving the diagnostic detail (e.g. ABI 'undefined symbol') for the genuine .so-load-failure path. --- deepmd/pt_expt/utils/comm.py | 42 +++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/deepmd/pt_expt/utils/comm.py b/deepmd/pt_expt/utils/comm.py index 8326c327c3..9e3770f113 100644 --- a/deepmd/pt_expt/utils/comm.py +++ b/deepmd/pt_expt/utils/comm.py @@ -50,28 +50,36 @@ def _check_underlying_ops_loaded() -> None: like DDP-spawned subprocesses that re-import modules from scratch and never see the test conftest's ``import deepmd.pt``. """ - if not ( - hasattr(torch.ops, "deepmd_export") - and hasattr(torch.ops.deepmd_export, "border_op") - and hasattr(torch.ops.deepmd_export, "border_op_backward") - ): + + def _ops_registered() -> bool: + return ( + hasattr(torch.ops, "deepmd_export") + and hasattr(torch.ops.deepmd_export, "border_op") + and hasattr(torch.ops.deepmd_export, "border_op_backward") + ) + + import_err: Exception | None = None + if not _ops_registered(): # Triggers cxx_op.py which torch.ops.load_library's the .so. - # Let import errors propagate — ABI / torch-version mismatches - # against libdeepmd_op_pt.so surface here with diagnostic detail - # (e.g. ``undefined symbol``) that the generic RuntimeError below - # would otherwise hide. 
- import deepmd.pt # noqa: F401 - - if not ( - hasattr(torch.ops, "deepmd_export") - and hasattr(torch.ops.deepmd_export, "border_op") - and hasattr(torch.ops.deepmd_export, "border_op_backward") - ): + try: + import deepmd.pt # noqa: F401 + except Exception as exc: + # ``deepmd/pt/__init__.py`` loads ``cxx_op`` (which registers + # the ops) before running ``load_entry_point("deepmd.pt")``. + # A broken third-party entry point can make the import raise + # *after* the ops were already registered, so only re-raise + # when the registration is still missing — that branch is the + # one where the error (typically an ``undefined symbol`` ABI + # mismatch against libdeepmd_op_pt.so) carries the diagnostic + # detail that the generic RuntimeError below would hide. + import_err = exc + + if not _ops_registered(): raise RuntimeError( "torch.ops.deepmd_export.{border_op,border_op_backward} " "are not registered. Build libdeepmd_op_pt.so and ensure " "deepmd.pt is importable before this module." - ) + ) from import_err _check_underlying_ops_loaded() From 3d1ff0249af5721927a51f7ec3c8e4819befe9e9 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 29 May 2026 07:48:30 +0800 Subject: [PATCH 3/3] fix(pt_expt): re-raise original import error on missing ops The prior commit wrapped the captured import error in a generic RuntimeError via 'raise ... from import_err'. Callers that look at the exception type or str(exc) saw only the generic 'build the .so' message; the diagnostic detail (e.g. 'undefined symbol' for a torch-version / ABI mismatch against libdeepmd_op_pt.so) survived only in the chained traceback. Re-raise the original import error directly when ops are still missing; reserve the generic RuntimeError for the case where 'import deepmd.pt' succeeded but the ops still aren't registered. 
--- deepmd/pt_expt/utils/comm.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/deepmd/pt_expt/utils/comm.py b/deepmd/pt_expt/utils/comm.py index 9e3770f113..1d209d48dc 100644 --- a/deepmd/pt_expt/utils/comm.py +++ b/deepmd/pt_expt/utils/comm.py @@ -75,11 +75,18 @@ def _ops_registered() -> bool: import_err = exc if not _ops_registered(): + if import_err is not None: + # Surface the raw import error (typically ``ImportError`` with + # ``undefined symbol`` ABI detail) instead of burying it in a + # generic message — that detail is what tells the user the + # mismatch is between libdeepmd_op_pt.so and the runtime torch, + # not a missing build. + raise import_err raise RuntimeError( "torch.ops.deepmd_export.{border_op,border_op_backward} " "are not registered. Build libdeepmd_op_pt.so and ensure " "deepmd.pt is importable before this module." - ) from import_err + ) _check_underlying_ops_loaded()