diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml
index 5d98c37e..3155eb8f 100644
--- a/.github/workflows/codespell.yml
+++ b/.github/workflows/codespell.yml
@@ -18,4 +18,4 @@ jobs:
       - name: Codespell
         uses: codespell-project/actions-codespell@v1
         with:
-           ignore_words_list: fmpose, mpjpe, uvd, xyz, hm36, cpn, dbb
+           ignore_words_list: fmpose, mpjpe, uvd, xyz, hm36, cpn, dbb, mot
diff --git a/README.md b/README.md
index 1005e6a0..5037520e 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,7 @@ FMPose3D creates a 3D pose from a single 2D image. It leverages fast Flow Matchi
 
 ### Set up an environment
 
-Make sure you have Python 3.10+. You can set this up with:
+Use Python 3.10; the installation and demos are tested with this version. You can set this up with:
 ```bash
 conda create -n fmpose_3d python=3.10
 conda activate fmpose_3d
@@ -45,6 +45,8 @@ For the animal pipeline, install the optional DeepLabCut dependency:
 pip install "fmpose3d[animals]"
 ```
 
+> **PyTorch/CUDA note.** FMPose3D pins `torch>=2.4.1,<2.5` and `torchvision>=0.19.1,<0.20`, which use CUDA 12.1 wheels by default on Linux. If your driver does not support CUDA 12.1, or if you need a specific CUDA build, install PyTorch first using the matching command from [pytorch.org](https://pytorch.org/get-started/locally/), then install `fmpose3d`.
+
 ## Demos
 
 ### Testing on in-the-wild images (humans)
@@ -108,7 +110,7 @@ FMPose3D also ships a high-level Python API for end-to-end 3D pose estimation fr
 
 ## Experiments on non-human animals
 
-For animal training/testing and demo scripts, see [animals/README.md](animals/README.md).
+For animal training/testing and demo scripts, see [animals/README.md](animals/README.md). The animal demo **auto-downloads both checkpoints** (a 26-joint SuperAnimal-Quadruped model fine-tuned on Animal3D for 2D pose, and the FMPose3D animal flow-matching lifter for 3D) from [Hugging Face](https://huggingface.co/MLAdaptiveIntelligence/FMPose3D) on first run — no manual setup needed.
 
 ## Citation 
 
diff --git a/animals/README.md b/animals/README.md
index 5121533b..6ab16f31 100644
--- a/animals/README.md
+++ b/animals/README.md
@@ -9,8 +9,10 @@ In this part, the FMPose3D model is trained on [Animal3D](https://xujiacong.gith
 
 This visualization script is designed for single-frame based model, allowing you to easily run 3D animal pose estimation on any single image.
 
-Before testing, make sure you have the pre-trained model ready.
-You may either use the model trained by your own or download ours from [here](https://drive.google.com/drive/folders/1kL4aOyWNq0o9zB0rSTRM8KYgkySVmUTk?usp=drive_link) and place it in the `./pre_trained_models` directory.
+Both pre-trained checkpoints are **auto-downloaded from [Hugging Face](https://huggingface.co/MLAdaptiveIntelligence/FMPose3D)** on first run and cached under `~/.cache/huggingface/`. No manual downloads required.
+
+- **3D lifter** (`fmpose3d_animals.pth`) — Animal3D 26-joint flow-matching 2D→3D lifter. Override: set `saved_model_path` in `vis_animals.sh` to a local `.pth`.
+- **2D pose model** (`sa_finetune_hrnet_w32.pt`) — SuperAnimal-Quadruped HRNet-w32 fine-tuned on Animal3D for the 26-joint Animal3D output layout. Override: set `saved_2d_model_path` in `vis_animals.sh` to a local `.pt`.
 
 Next, put your test images into folder `demo/images`. Then run the visualization script:
 ```bash
@@ -49,7 +51,7 @@ Place the downloaded files in the `dataset/` folder of this project:
 ## Training
 The training logs, checkpoints, and related files of each training time will be saved in the './checkpoint' folder.
 
-For trainig on the two datasets:
+For training on the two datasets:
 
 ```bash
 cd animals
diff --git a/animals/demo/vis_animals.py b/animals/demo/vis_animals.py
index 5772060b..b31abcb0 100644
--- a/animals/demo/vis_animals.py
+++ b/animals/demo/vis_animals.py
@@ -46,21 +46,6 @@
     from fmpose3d.models import get_model
     CFM = get_model(args.model_type)
 
-try:
-    from deeplabcut.pose_estimation_pytorch.apis import (  # pyright: ignore[reportMissingImports]
-        superanimal_analyze_images,
-    )
-except ImportError:
-    raise ImportError(
-        "DeepLabCut is required for the animal demo. "
-        "Install it with: pip install \"fmpose3d[animals]\""
-    ) from None
-
-superanimal_name = "superanimal_quadruped"
-model_name = "hrnet_w32"
-detector_name = "fasterrcnn_resnet50_fpn_v2"
-max_individuals = 1
-
 def compute_limb_regularization_matrix(gt_3d):
     """
     Compute regularization matrix to align limb directions to vertical (0,0,1).
@@ -145,108 +130,39 @@ def apply_regularization(pose_3d, R):
     """
     return (R @ pose_3d.T).T
 
-def get_pose2D(path, output_dir, type):
+def build_2d_estimator():
+    """Build the 2D pose estimator once. Snapshot resolves lazily on first predict.
+
+    Empty --saved_2d_model_path -> auto-download fine-tuned snapshot from HF.
+    Non-empty path -> use as a local override.
+    """
+    from fmpose3d.common.config import SuperAnimalConfig
+    from fmpose3d.inference_api.fmpose3d import SuperAnimalEstimator
+    from fmpose3d.utils.weights import resolve_weights_path
+
+    pose_snapshot_path = resolve_weights_path(
+        args.saved_2d_model_path, "sa_finetune_hrnet_w32.pt"
+    )
+    cfg = SuperAnimalConfig(
+        pose_snapshot_path=pose_snapshot_path,
+        pytorch_config_path=args.pytorch_config_2d_path,
+    )
+    print(f"[2D] pose snapshot = {cfg.pose_snapshot_path}")
+    return SuperAnimalEstimator(cfg)
+
+
+def get_pose2D(estimator, path, output_dir, type):
 
     print('\nGenerating 2D pose...')
-    
-    # Check if this is the special debug case for 000000119761_horse
-    filename = Path(path).stem
-    is_debug_case = "000000119761_horse" in filename
-    
-    if is_debug_case:
-        print(f"DEBUG MODE: Using provided 2D pose for {filename}")
-        # User provided 2D pose (26 keypoints, x, y coordinates, ignoring the last dimension)
-        provided_pose = np.array([
-            [361, 230], [361, 237], [363, 279], [257, 359], [251, 374],
-            [164, 365], [68, 372], [99, 206], [247, 266], [253, 285],
-            [127, 275], [101, 285], [267, 217], [268, 229], [273, 318],
-            [250, 340], [128, 311], [76, 305], [313, 220], [48, 310],
-            [351, 203], [352, 210], [340, 257], [340, 261], [373, 276],
-            [55, 247]
-        ], dtype=np.float32)
-        
-        # Reshape to match expected format: (1, 26, 2) for single individual
-        provided_pose = provided_pose.reshape(1, 26, 2)
-        
-        # Create xy_preds dict with the provided pose
-        xy_preds = {path: provided_pose}
-        print(f"Using provided 2D pose with shape: {provided_pose.shape}")
-    else:
-        # Normal prediction flow
-        predictions = superanimal_analyze_images(
-            superanimal_name,
-            model_name,
-            detector_name,
-            path,
-            max_individuals,
-            out_folder=output_dir
-        )
-        print("predictions:", predictions)
-        
-        # get the 2D keypoints from the predictions
-        xy_preds = {}
-        # predictions is a dict: {image_path: {"bodyparts": (N, K, 3), "bboxes": ..., "bbox_scores": ...}}
-        for img_path, payload in predictions.items():
-            bodyparts = payload.get("bodyparts")
-            if bodyparts is None:
-                continue
-            # bodyparts shape: (num_individuals, num_keypoints, 3) -> [:, :, :2] keeps x,y
-            xy_preds[img_path] = bodyparts[..., :2]
-
-    print("2D keypoints (x,y) by image:")
-    for img_path, xy in xy_preds.items():
-        print(f"{img_path}: shape {xy.shape}")
-    
-    # For debug case, the provided pose is already in Animal3D format (26 keypoints)
-    # So we skip the mapping step
-    if is_debug_case:
-        print("DEBUG MODE: Skipping keypoint mapping (already in Animal3D format)")
-        mapped_keypoints = xy_preds
-    else:
-        # now map the keypoints to a different set of keypoints (used in Animal3D)
-        # keypoint mapping from quadruped80K super keypotints to animal3d keypoints
-        keypoint_mapping = {"quadruped80k":[10, 5, -1, 26, 29, 30, 35, 22, 24, 27, 31, 32, -1, -1, 25, 28, 33, 34, 15, 23, 11, 6, 4, 3, 0, -1]}
-        
-        # for the keypoint_mapping, -1 indicates that there is no corresponding keypoint in the source set, but we can interpolate 
-        # for index 2, we can interpolate between keypoints 3 and 4 in the source set to get a better estimate of the missing keypoint
-        # for index 25, we can interpolate between keypoints 22 and 23 in the source set
-        # for index 12, we can interpolate between keypoints 24 and 19 in the source set
-        # for index 13, we can interpolate between keypoints 27 and 19 in the source set
-        
-        # Define interpolation rules for -1 indices: {target_idx: (source_idx1, source_idx2)}
-        interpolation_rules = {
-            2: (3, 4),      # interpolate between source keypoints 3 and 4
-            12: (24, 19),   # interpolate between source keypoints 24 and 19
-            13: (27, 19),   # interpolate between source keypoints 27 and 19
-            25: (22, 23),   # interpolate between source keypoints 22 and 23
-        }
-        
-        # map the keypoints
-        mapped_keypoints = {}
-        mapping_indices = keypoint_mapping["quadruped80k"]
-
-        for img_path, xy in xy_preds.items():
-            # xy shape: (num_individuals, num_keypoints, 2)
-            num_individuals, num_keypoints, _ = xy.shape
-            num_target_keypoints = len(mapping_indices)
-            
-            # Initialize mapped array with NaN or zeros
-            mapped_xy = np.full((num_individuals, num_target_keypoints, 2), np.nan)
-            
-            for target_idx, source_idx in enumerate(mapping_indices):
-                if source_idx != -1 and source_idx < num_keypoints:
-                    # Copy the keypoint from source to target position
-                    mapped_xy[:, target_idx, :] = xy[:, source_idx, :]
-                elif source_idx == -1 and target_idx in interpolation_rules:
-                    # Perform interpolation for -1 indices
-                    src1, src2 = interpolation_rules[target_idx]
-                    if src1 < num_keypoints and src2 < num_keypoints:
-                        # Interpolate as the average of the two source keypoints
-                        mapped_xy[:, target_idx, :] = (xy[:, src1, :] + xy[:, src2, :]) / 2.0
-                        print(f"Interpolated keypoint {target_idx} from source keypoints {src1} and {src2}")
-            
-            mapped_keypoints[img_path] = mapped_xy
-            print(f"Mapped {img_path}: {xy.shape} -> {mapped_xy.shape}")
+
+    img_bgr = cv2.imread(path)
+    if img_bgr is None:
+        raise FileNotFoundError(f"Failed to read image: {path}")
+
+    # predict() returns (kpts (1, N, 26, 2), scores (1, N, 26), valid_mask (N,)).
+    kpts, _scores, _mask = estimator.predict(img_bgr[None])
+    # Pack into the {img_path: (1, 26, 2)} format expected by the save/vis code below.
+    mapped_keypoints = {path: kpts[:, 0, :, :]}
 
     print('Generating 2D pose successful!')
 
@@ -259,7 +175,6 @@ def get_pose2D(path, output_dir, type):
         # Save in the same format as vis_in_the_wild.py for compatibility
         output_npz = output_dir_2D + 'keypoints.npz'
         np.savez_compressed(output_npz, reconstruction=mapped_xy)
-        print(f"Saved keypoints to {output_npz}")
         
         # Also save as npy for backup
         img_name = Path(img_path).stem
@@ -275,7 +190,6 @@ def get_pose2D(path, output_dir, type):
                 index=[f'keypoint_{i}' for i in range(mapped_xy.shape[1])]
             )
             df.to_csv(csv_file)
-            print(f"Saved individual {ind_idx} keypoints to {csv_file}")
         
         # Visualize mapped keypoints on image
         img = Image.open(img_path)
@@ -328,39 +242,38 @@ def get_pose2D(path, output_dir, type):
         plt.tight_layout()
         plt.savefig(vis_file, dpi=150, bbox_inches='tight')
         plt.close(fig)
-        print(f"Saved visualization to {vis_file}")
 
 
-def get_pose3D(path, output_dir, type='image'):
-    """
-    Generate 3D pose from 2D keypoints using the model.
-    This function reads the 2D keypoints saved by get_pose2D and generates 3D poses.
+def build_3d_lifter():
+    """Build the 3D lifter once and return the model in eval mode.
+
+    Empty --saved_model_path -> auto-download fmpose3d_animals.pth from HF.
+    Non-empty path is used as a local override.
     """
-    print('\nGenerating 3D pose...')
-    print(f"args.n_joints: {args.n_joints}, args.out_joints: {args.out_joints}")
-    
-    ## Reload model
+    from fmpose3d.utils.weights import resolve_weights_path
+
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = CFM(args).to(device)
 
-    model = {}
-    model['CFM'] = CFM(args).to(device)
-    
-    model_dict = model['CFM'].state_dict()
-    model_path = args.saved_model_path
-    print(f"Loading model from: {model_path}")
+    model_path = resolve_weights_path(args.saved_model_path, f"{args.model_type}.pth")
+    print(f"[3D] lifter weights = {model_path}")
     pre_dict = torch.load(model_path, map_location=device, weights_only=True)
-    for name, key in model_dict.items():
+    model_dict = model.state_dict()
+    for name in model_dict:
         model_dict[name] = pre_dict[name]
-    model['CFM'].load_state_dict(model_dict)
-    print("Model loaded successfully!")
-    
-    model = model['CFM'].eval()
+    model.load_state_dict(model_dict)
+    return model.eval()
+
+
+def get_pose3D(model, path, output_dir, type='image'):
+    """
+    Generate 3D pose from 2D keypoints using the model.
+    Reads the 2D keypoints saved by get_pose2D and generates 3D poses.
+    """
+    print('\nGenerating 3D pose...')
 
-    ## Load input 2D keypoints
     keypoints = np.load(output_dir + 'input_2D/keypoints.npz', allow_pickle=True)['reconstruction']
-    print(f"Loaded keypoints shape: {keypoints.shape}")
 
-    ## Generate 3D poses
     if type == "image":
         i = 0
         img = cv2.imread(path)
@@ -422,9 +335,6 @@ def euler_sample(c_2d, y_local, steps, model_3d):
         return y_local
     
     ## Estimation (without TTA for better results)
-    print("input_2D.shape:", input_2D.shape)
-    print("input_2D:", input_2D[0, 0])
-    
     # Single inference without flip augmentation
     # Create 3D random noise with shape (1, 1, J, 3)
     y = torch.randn(input_2D.size(0), input_2D.size(1), input_2D.size(2), 3, device=device)
@@ -492,7 +402,6 @@ def euler_sample(c_2d, y_local, steps, model_3d):
         output_dir_2D_img = output_dir + 'pose2D_on_image/'
         os.makedirs(output_dir_2D_img, exist_ok=True)
         cv2.imwrite(f'{output_dir_2D_img}{i:04d}_2d.png', img_copy)
-        print(f"Saved 2D pose on image to {output_dir_2D_img}{i:04d}_2d.png")
 
     ## Save 3D pose as npz
     output_dir_3D = output_dir + 'pose3D/'
@@ -603,46 +512,46 @@ def img2gif(video_path, name, output_dir, duration=0.25):
 
 
 if __name__ == "__main__":
-    
+
     os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
 
     path = args.path # file path or folder path
-    
-    # Check if path is a directory
+
+    # Build the 2D estimator and 3D lifter ONCE; reuse across all images/frames.
+    # This avoids redundant HF resolution and DLC/torch model reloads.
+    estimator_2d = build_2d_estimator()
+    model_3d = build_3d_lifter()
+
     if os.path.isdir(path):
-        # Get all image files in the directory
         image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.JPG', '*.JPEG', '*.PNG', '*.BMP']
         image_files = []
         for ext in image_extensions:
             image_files.extend(glob.glob(os.path.join(path, ext)))
         image_files.sort()
-        
+
         if len(image_files) == 0:
             print(f"No image files found in {path}")
             exit(0)
-        
+
         print(f"Found {len(image_files)} images in {path}")
-        
-        # Process each image
+
         for img_path in tqdm(image_files, desc="Processing images"):
             filename = img_path.split('/')[-1].split('.')[0]
             output_dir = './predictions/' + filename + '/'
-            
+
             print(f"\nProcessing: {img_path}")
-            get_pose2D(img_path, output_dir, args.type)
-            get_pose3D(img_path, output_dir, args.type)
-        
+            get_pose2D(estimator_2d, img_path, output_dir, args.type)
+            get_pose3D(model_3d, img_path, output_dir, args.type)
+
         print(f'\nAll {len(image_files)} images processed successfully!')
     else:
         # Single file processing
         filename = path.split('/')[-1].split('.')[0]
         output_dir = './predictions/' + filename + '/'
 
-        get_pose2D(path, output_dir, args.type)
-        get_pose3D(path, output_dir, args.type)
+        get_pose2D(estimator_2d, path, output_dir, args.type)
+        get_pose3D(model_3d, path, output_dir, args.type)
 
-        if args.type=="video":
+        if args.type == "video":
             img2video(path, filename, output_dir)
-            img2gif(path, filename, output_dir)
-
-        print('Generating demo successful!')
\ No newline at end of file
+            img2gif(path, filename, output_dir)
\ No newline at end of file
diff --git a/animals/demo/vis_animals.sh b/animals/demo/vis_animals.sh
index e2944c2d..a2109695 100644
--- a/animals/demo/vis_animals.sh
+++ b/animals/demo/vis_animals.sh
@@ -4,12 +4,19 @@ gpu_id=1
 sample_steps=3
 batch_size=1
 sh_file='vis_animals.sh'
-# n_joints=26
-# out_joints=26
 
 model_type='fmpose3d_animals'
 # model_path=''  # set to a local file path to override the registry
-saved_model_path='../pre_trained_models/fmpose3d_animals/fmpose3d_animals_pretrained_weights.pth'
+
+# 3D lifter (2D keypoints -> 3D pose).
+# Empty -> auto-downloads fmpose3d_animals.pth from Hugging Face (cached under ~/.cache/huggingface).
+# Set to a local path to override.
+saved_model_path=''
+
+# 2D pose model (image -> 26-joint Animal3D keypoints).
+# Empty -> auto-downloads from Hugging Face on first run (cached under ~/.cache/huggingface).
+# Set to a local path to override (e.g., for a custom checkpoint).
+saved_2d_model_path=''
 
 # path='./images/image_00068.jpg'  # single image
 input_images_folder='./images/'  # folder containing multiple images
@@ -18,6 +25,7 @@ python3 vis_animals.py \
  --type 'image' \
  --path ${input_images_folder} \
  --saved_model_path "${saved_model_path}" \
+ --saved_2d_model_path "${saved_2d_model_path}" \
  ${model_path:+--model_path "$model_path"} \
  --model_type "${model_type}" \
  --sample_steps ${sample_steps} \
@@ -25,4 +33,4 @@ python3 vis_animals.py \
  --layers ${layers} \
  --dataset animal3d \
  --gpu ${gpu_id} \
- --sh_file ${sh_file}
\ No newline at end of file
+ --sh_file ${sh_file}
diff --git a/animals/scripts/main_animal3d.py b/animals/scripts/main_animal3d.py
index c90bdead..b2bdaa7a 100644
--- a/animals/scripts/main_animal3d.py
+++ b/animals/scripts/main_animal3d.py
@@ -18,6 +18,7 @@
 from fmpose3d.animals.common.arguments import opts as parse_args
 from fmpose3d.animals.common.utils import *
 from fmpose3d.animals.common.animal3d_dataset import TrainDataset
+from fmpose3d.utils.weights import resolve_weights_path
 import time
 
 args = parse_args().parse()
@@ -210,7 +211,7 @@ def get_parameter_number(net):
 
         if args.train==False:
             # create a new folder for the test results
-            args.folder_dir = os.path.dirname(args.saved_model_path)
+            args.folder_dir = os.path.dirname(args.saved_model_path) if args.saved_model_path else './checkpoint'
             args.checkpoint = os.path.join(args.folder_dir, 'test_results_' + args.create_time)
 
         if not os.path.exists(args.checkpoint):
@@ -247,8 +248,8 @@ def get_parameter_number(net):
     train_paths = args.train_dataset_path if isinstance(args.train_dataset_path, list) else [args.train_dataset_path]
     test_paths = args.test_dataset_path if isinstance(args.test_dataset_path, list) else [args.test_dataset_path]
 
-    # Rat7M doesn't have action labels, use placeholder for error calculation
-    actions = ['rat_motion']
+    # Animal3D doesn't have per-clip action labels; use a single placeholder bucket for error aggregation.
+    actions = ['animal_motion']
 
     if args.train:
         train_datasets = [TrainDataset(is_train=True, json_file=p, root_joint=args.root_joint) for p in train_paths]
@@ -268,9 +269,8 @@ def get_parameter_number(net):
 
     if args.reload:
         model_dict = model['CFM'].state_dict()
-        # Prefer explicit saved_model_path; otherwise fallback to previous_dir glob
-        model_path = args.saved_model_path
-        print(model_path)
+        model_path = resolve_weights_path(args.saved_model_path, f"{args.model_type}.pth")
+        print(f"Loading weights from: {model_path}")
         pre_dict = torch.load(model_path, weights_only=True, map_location=device)
         for name, key in model_dict.items():
             model_dict[name] = pre_dict[name]
@@ -348,4 +348,3 @@ def get_parameter_number(net):
     print(args.checkpoint)
     logging.info(args.checkpoint)
     
-
diff --git a/animals/scripts/test_animal3d.sh b/animals/scripts/test_animal3d.sh
index 207e3321..c975d136 100644
--- a/animals/scripts/test_animal3d.sh
+++ b/animals/scripts/test_animal3d.sh
@@ -11,8 +11,11 @@ n_joints=26
 out_joints=26
 epochs=300
 model_type='fmpose3d_animals'
-# model_path='' # set to a local file path to override the registry
-saved_model_path='./pre_trained_models/fmpose3d_animals/fmpose3d_animals_pretrained_weights.pth'
+model_path='' # set to a local file path to override the registry
+# By default, weights are automatically downloaded from Hugging Face Hub.
+# To use local weights instead, set saved_model_path below to a local path, e.g.:
+# saved_model_path='./pre_trained_models/fmpose3d_animals/fmpose3d_animals_pretrained_weights.pth'
+saved_model_path=''
 
 # root path denotes the path to the original dataset
 root_path="./dataset/"
@@ -47,4 +50,4 @@ python ./scripts/main_animal3d.py \
   --lr_decay_large ${lr_decay_large} \
   --train_dataset_path ${train_dataset_paths[@]} \
   --test_dataset_path ${test_dataset_paths[@]} \
-  --saved_model_path ${saved_model_path}
\ No newline at end of file
+  --saved_model_path "${saved_model_path}"
diff --git a/demo/vis_in_the_wild.py b/demo/vis_in_the_wild.py
index 8f888198..26515964 100755
--- a/demo/vis_in_the_wild.py
+++ b/demo/vis_in_the_wild.py
@@ -279,7 +279,7 @@ def get_pose3D(path, output_dir, type='image'):
     
     # if args.reload:
     model_dict = model['CFM'].state_dict()
-    model_path = resolve_weights_path(args.model_weights_path, args.model_type)
+    model_path = resolve_weights_path(args.model_weights_path, f"{args.model_type}.pth")
 
     print(f"Loading weights from: {model_path}")
     pre_dict = torch.load(model_path, map_location=device, weights_only=True)
diff --git a/fmpose3d/animals/common/animal_visualization.py b/fmpose3d/animals/common/animal_visualization.py
deleted file mode 100644
index c45d6f66..00000000
--- a/fmpose3d/animals/common/animal_visualization.py
+++ /dev/null
@@ -1,111 +0,0 @@
-"""
-FMPose3D: monocular 3D Pose Estimation via Flow Matching
-
-Official implementation of the paper:
-"FMPose3D: monocular 3D Pose Estimation via Flow Matching"
-by Ti Wang, Xiaohang Yu, and Mackenzie Weygandt Mathis
-Licensed under Apache 2.0
-"""
-
-import os
-
-import cv2
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-
-
-def save_3Dpose_colored(pre_pose, gt_pose, figure_name):
-    fig = plt.figure()
-    ax1 = fig.add_subplot(211, projection="3d")
-    ax1.scatter(
-        pre_pose[:, 0], pre_pose[:, 1], pre_pose[:, 2], c=list(range(pre_pose.shape[0])), cmap="jet"
-    )
-    # plt.axis('off')
-    ax2 = fig.add_subplot(212, projection="3d")
-    ax2.scatter(
-        gt_pose[:, 0], gt_pose[:, 1], gt_pose[:, 2], c=list(range(gt_pose.shape[0])), cmap="jet"
-    )
-    # plt.axis('off')
-    plt.show()
-    plt.savefig(figure_name, dpi=400.0)
-    plt.close()
-
-
-def save_absolute_3Dpose_image(image, pre_pose, gt_pose, vid_3D, skeleton, figure_name):
-    fig = plt.figure(figsize=(20, 9))
-    ax1 = fig.add_subplot(131, projection="3d")
-    ax1.scatter(
-        pre_pose[:, 0],
-        pre_pose[:, 2],
-        -pre_pose[:, 1],
-        c=list(range(pre_pose.shape[0])),
-        cmap="jet",
-    )
-    for i in range(skeleton.shape[0]):
-        ax1.plot(
-            [pre_pose[skeleton[i, 0], 0], pre_pose[skeleton[i, 1], 0]],
-            [pre_pose[skeleton[i, 0], 2], pre_pose[skeleton[i, 1], 2]],
-            [-pre_pose[skeleton[i, 0], 1], -pre_pose[skeleton[i, 1], 1]],
-            c="black",
-        )
-    ax1.set_xlim([-3, 3])
-    ax1.set_zlim([-1.5, 3])
-    ax1.set_ylim([12, 20])
-    ax1.title.set_text("Prediction")
-
-    # plt.axis('off')
-    ax2 = fig.add_subplot(132, projection="3d")
-    visiable_gt = gt_pose[np.where(vid_3D)[0], :]
-    ax2.scatter(
-        visiable_gt[:, 0],
-        visiable_gt[:, 2],
-        -visiable_gt[:, 1],
-        c=list(np.array(range(gt_pose.shape[0]))[np.where(vid_3D)]),
-        cmap="jet",
-    )
-    for i in range(skeleton.shape[0]):
-        if vid_3D[skeleton[i, 0]] > 0 and vid_3D[skeleton[i, 1]] > 0:
-            ax2.plot(
-                [gt_pose[skeleton[i, 0], 0], gt_pose[skeleton[i, 1], 0]],
-                [gt_pose[skeleton[i, 0], 2], gt_pose[skeleton[i, 1], 2]],
-                [-gt_pose[skeleton[i, 0], 1], -gt_pose[skeleton[i, 1], 1]],
-                c="black",
-            )
-    ax2.set_xlim([-3, 3])
-    ax2.set_zlim([-1.5, 3])
-    ax2.set_ylim([12, 20])
-    ax2.title.set_text("GT")
-    # plt.axis('off')
-    ax3 = fig.add_subplot(133)
-    ax3.imshow(image)
-    ax3.title.set_text("Camera1 view")
-    plt.show()
-    plt.savefig(figure_name, dpi=200.0)
-    plt.close()
-
-
-def save_absolute_3Dpose(pre_pose, skeleton, figure_name):
-    fig = plt.figure(figsize=(20, 9))
-    ax1 = fig.add_subplot(111, projection="3d")
-    ax1.scatter(
-        pre_pose[:, 0],
-        pre_pose[:, 2],
-        -pre_pose[:, 1],
-        c=list(range(pre_pose.shape[0])),
-        cmap="jet",
-    )
-    for i in range(skeleton.shape[0]):
-        ax1.plot(
-            [pre_pose[skeleton[i, 0], 0], pre_pose[skeleton[i, 1], 0]],
-            [pre_pose[skeleton[i, 0], 2], pre_pose[skeleton[i, 1], 2]],
-            [-pre_pose[skeleton[i, 0], 1], -pre_pose[skeleton[i, 1], 1]],
-            c="black",
-        )
-    ax1.set_xlim([-1, 1])
-    ax1.set_zlim([-1, 1])
-    ax1.set_ylim([-1, 1])
-    ax1.title.set_text("gt")
-    plt.show()
-    plt.savefig(figure_name, dpi=200.0)
-    plt.close()
diff --git a/fmpose3d/animals/common/arber_dataset.py b/fmpose3d/animals/common/arber_dataset.py
deleted file mode 100644
index 27dba171..00000000
--- a/fmpose3d/animals/common/arber_dataset.py
+++ /dev/null
@@ -1,312 +0,0 @@
-"""
-FMPose3D: monocular 3D Pose Estimation via Flow Matching
-
-Official implementation of the paper:
-"FMPose3D: monocular 3D Pose Estimation via Flow Matching"
-by Ti Wang, Xiaohang Yu, and Mackenzie Weygandt Mathis
-Licensed under Apache 2.0
-"""
-
-import copy
-import gc
-import glob
-import os
-import random
-
-import cv2
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import torch
-from torch import from_numpy as FN
-from torch.utils.data import Dataset
-from tqdm import tqdm
-
-from fmpose3d.common.camera import normalize_screen_coordinates
-from fmpose3d.animals.common.lifter3d import load_camera_params, load_h5_keypoints
-
-
-class ArberDataset(Dataset):
-    def __init__(
-        self,
-        cfg,
-        path,
-        split,
-        cam_names,
-        root_index=12,
-        joint_num=23,
-        sampling_gap=100,
-        frame_per_video=9000,
-        norm_rate=50.0,
-        img_W=2048,
-        img_H=1536,
-        arg_views=1,
-        resize_2D_scale=0.5,
-        visualize=False,
-    ):
-
-        self.cfg = cfg
-        self.cam_names = cam_names
-        self.joint_num = joint_num
-        self.root_index = root_index
-        self.img_W = img_W * resize_2D_scale
-        self.img_H = img_H * resize_2D_scale
-        self.arg_views = arg_views
-        self.split = split
-        self.visualize = visualize
-
-        # subject_index: category names
-        subject_index = os.listdir(path)
-        subject_index.sort()
-
-        # use split to define start and end frame
-        if split == "Train":
-            self.subject_list = subject_index
-            self.start_frame = 0
-            self.end_frame = 10000
-        elif split == "Valid":
-            self.subject_list = subject_index
-            self.start_frame = 3
-            self.end_frame = 8000
-        elif split == "Test":
-            self.subject_list = subject_index
-            self.start_frame = 6
-            self.end_frame = 10000
-        elif split == "Infer":
-            self.subject_list = subject_index[:1]
-            self.start_frame = 0
-            self.end_frame = 2000000
-
-        # prepare pose data
-        print("prepare the pose data...")
-        self.pose_3D_list = []
-        self.pose_2D_list = []
-        self.sample_info_list = []
-        self.cam_para_list = []
-
-        for sub_idx, subject_name in enumerate(self.subject_list):  # iterate on subject
-            print(subject_name)
-            subject_folder = os.path.join(path, subject_name)
-
-            # load asked cameras
-            yaml_files = []
-            for cam in cam_names:
-                yaml_files.extend(
-                    sorted(glob.glob(os.path.join(subject_folder, f"calibration/*{cam}*.yaml")))
-                )
-
-            # yaml_files = sorted(glob.glob(os.path.join(subject_folder,'calibration/*.yaml')))
-            cameras = [load_camera_params(yaml) for yaml in yaml_files]
-            self.cam_para_list = cameras
-
-            # load triangulated 3d points
-            # points_3d_np = np.load(os.path.join(subject_folder,'triangulated_3d.npy'))   # shape (num_frames, 23, 3)
-
-            # apply norm_rate on translation vector
-            for i in range(len(cam_names)):
-                cameras[i]["T"] = cameras[i]["T"] / norm_rate
-
-            # load all 2D keypoints from asked cameras
-            h5_files = []
-            for cam in cam_names:
-                # print("cam:", cam)
-                # h5_files.extend(sorted(glob.glob(os.path.join(subject_folder,f'pose2d_dlc/Camera_{cam}*.h5')))) # for cspnext model
-                h5_files.extend(
-                    sorted(glob.glob(os.path.join(subject_folder, f"pose2d_dlc/*{cam}*.h5")))
-                )  # for rtmpose model
-
-            keypoints_2d = [
-                load_h5_keypoints(h5) for h5 in h5_files
-            ]  # （num_cameras,num_frames,23,3) # for rtmpose model
-            # keypoints_2d = [load_h5_keypoints_cspnext(h5) for h5 in h5_files]   # （num_cameras,num_frames,23,3) # for cspnext model
-
-            # get total frame - > real end frame
-            total_frame_num = keypoints_2d[0].shape[0]
-            real_end_frame = min(self.end_frame, total_frame_num)
-
-            for idx in tqdm(
-                range(self.start_frame, real_end_frame, sampling_gap)
-            ):  # get temporal video fragment of 2D and 3D keypoints
-                idx = max(idx, self.t_pad)
-                idx = min(idx, real_end_frame - self.t_pad - 1)
-                left_frame_id = idx - self.t_pad
-                right_frame_id = idx + self.t_pad + 1
-
-                # record sample info
-                tmp_info = np.zeros(2)
-                tmp_info[0] = sub_idx
-                tmp_info[1] = idx
-
-                # extract 3d fragment, get 3D points from npy file
-
-                points_3d_fragment = (
-                    np.load(os.path.join(subject_folder, "triangulated_3d.npy"))[
-                        left_frame_id:right_frame_id, :, :3
-                    ]
-                    / norm_rate
-                )  # load from prepared .npy and apply norm_rate
-
-                # print("points_3d_fragment shape:", points_3d_fragment.shape) # (num_frames, 23, 3)
-                keypoints_2d = np.array(keypoints_2d)  # Ensure it's a NumPy array
-                # get 2D keypoint vis
-                points_2d_vis_np = keypoints_2d[:, left_frame_id:right_frame_id, :, 2:]  # N,T,K,1
-
-                # clip vis
-                points_2d_vis_np = np.clip(points_2d_vis_np, 0, 1)
-
-                # get 2D keypoint
-                points_2d_np = keypoints_2d[:, left_frame_id:right_frame_id, :, :2]  # N,T,K,2
-                # # get 3D keypoint from 3D lifting
-                # points_2d_fragment_np = np.array(points_2d_fragment) # (num_cams, num_frames, num_joints, 2) N,T,K,2
-                # points_3d_fragment = triangulate_3d_batch(points_2d_fragment_np,cameras)  #(num_frames, 23, 3) T,K,3
-
-                # get 3D pose from world to camera, with respect to different camera
-                points_3d = np.zeros(
-                    (self.t_length, self.joint_num, 3, len(self.cam_names))
-                )  # initialize 3D keypints, from T,K,3 to T,K,3,N
-                points_3d_world = np.reshape(points_3d_fragment, (-1, 3))  # T,K,3
-                # print("before and after reshape",points_3d_fragment.shape,points_3d_world.shape)
-                for cam_idx, cam in enumerate(cam_names):
-                    # todo: check transformation
-                    points_3d_cam = (
-                        np.dot(points_3d_world, cameras[cam_idx]["R"].T) + cameras[cam_idx]["T"].T
-                    )
-
-                    points_3d[:, :, :, cam_idx] = np.reshape(
-                        points_3d_cam, (self.t_length, self.joint_num, 3)
-                    )  # T,K,3,N
-
-                # get relative 3D pose
-                points_3d_root = copy.deepcopy(
-                    points_3d[:, self.root_index : self.root_index + 1, :, :]
-                )
-                rela_points_3d = points_3d - points_3d_root
-
-                del points_3d, points_3d_root
-                gc.collect()
-
-                # normalize 2D pose
-                points_2d_np = normalize_screen_coordinates(
-                    copy.deepcopy(points_2d_np), self.img_W, self.img_H
-                )  # N,T,K,2
-
-                # get fake vis3d
-                points_vis3D = np.ones((self.t_length, self.joint_num, 1))
-
-                self.pose_3D_list.append(rela_points_3d)
-                self.pose_2D_list.append(
-                    np.nan_to_num(points_2d_np.transpose(1, 2, 3, 0))
-                )  # transpose to T,K,2,N
-                self.vid2D_list.append(
-                    points_2d_vis_np.transpose(1, 2, 3, 0)
-                )  # Transpose to T,K,1,N, move N to the end
-                self.vid3D_list.append(points_vis3D)
-                self.sample_info_list.append(tmp_info)
-
-                del points_2d_np, points_vis3D, points_2d_vis_np
-                gc.collect()
-        torch.cuda.empty_cache()
-
-    def __len__(self):
-        return len(self.pose_3D_list)
-
-    def __getitem__(self, index):
-        return self.getitem(index)
-
-    def getitem(self, index):
-
-        pose_3D = self.pose_3D_list[index].copy()
-        pose_2D = self.pose_2D_list[index].copy()
-        vid_3D = self.vid3D_list[index].copy()
-        vid_2D = self.vid2D_list[index].copy()
-        sample_info = self.sample_info_list[index]
-
-        if "TRAIN" in self.split.upper() and self.arg_views > 0:
-            pose_3D, pose_2D = self.view_aug(pose_3D, pose_2D)
-            tmp_vid = np.repeat(
-                np.expand_dims(copy.deepcopy(vid_3D), axis=-1), self.arg_views, axis=-1
-            )
-            vid_2D = np.concatenate((vid_2D, tmp_vid), axis=-1)
-            # clip vid into 0,1
-            # vid_2D = np.clip(vid_2D,0,1)
-
-        pose_root = copy.deepcopy(pose_3D[:, self.root_index : self.root_index + 1, :, :])
-        pose_3D[:, self.root_index : self.root_index + 1, :, :] = 0.0
-        pose_3D = np.nan_to_num(pose_3D, nan=0)
-        pose_2D = np.concatenate((pose_2D, vid_2D), axis=2)
-
-        return (
-            FN(pose_3D).float(),
-            FN(pose_root).float(),
-            FN(pose_2D).float(),
-            FN(vid_3D).float(),
-            FN(sample_info).float(),
-        )
-
-
-if __name__ == "__main__":
-    from common.arguments import parse_args
-    from common.config import config as cfg
-    from common.config import reset_config, update_config
-
-    from scripts.reset_config_arber import reset_config_arber
-    from scripts.reset_config_rat7m import reset_config_rat7m
-
-    cam_names = ["Camera0", "Camera1", "Camera2", "Camera3", "Camera4", "Camera5"]
-    data_dir = "/workspace/MTFpose/data/Arber_tiny"
-    args = parse_args()
-    update_config(args.cfg)
-    reset_config(cfg, args)
-    reset_config_arber(cfg)
-
-    args = parse_args()
-    update_config(args.cfg)  ###config file->cfg
-    reset_config(cfg, args)  ###arg -> cfg
-    reset_config_rat7m(cfg)
-
-    print(cfg)
-    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(cfg.GPU)
-
-    root_index = cfg.TINY_DATA.ROOT_INDEX
-    sampling_gap = cfg.TINY_DATA.SAMPLING_GAP
-    joint_num = cfg.TINY_DATA.NUM_JOINTS
-    img_W, img_H = cfg.TINY_DATA.IMG_SIZE
-    use_2d_gt = cfg.DATA.USE_GT_2D
-    receptive_field = cfg.NETWORK.TEMPORAL_LENGTH
-    pad = receptive_field // 2
-    causal_shift = 0
-    train_dataset = ArberDataset(
-        cfg,
-        cfg.ARBER_DATA.ROOT_DIR,
-        "Train",
-        cam_names,
-        pad,
-        root_index=root_index,
-        use_2D_gt=use_2d_gt,
-        joint_num=23,
-        sampling_gap=60,
-        img_W=img_W,
-        img_H=img_H,
-        arg_views=0,
-        resize_2D_scale=cfg.ARBER_DATA.RESIZE_SCALE,
-    )
-
-    pose_3D, pose_root, pose_2D, vid_3D, rotation, sample_info = train_dataset.getitem(2)
-    print(
-        "output at item 250, pose_3D",
-        pose_3D.shape,
-        "pose_root",
-        pose_root.shape,
-        "pose_2D",
-        pose_2D.shape,
-        "vid_3D",
-        vid_3D.shape,
-        "rotation",
-        rotation.shape,
-        "sample_info",
-        sample_info,
-    )
-    # output at item 250, pose_3D torch.Size([7, 23, 3, 6]) pose_root torch.Size([7, 1, 3, 6]) pose_2D torch.Size([7, 23, 3, 6]) vid_3D torch.Size([7, 23, 1]) rotation torch.Size([3, 3, 1, 6, 6]) sample_info tensor([  0., 120.])
-    print("in camera 0", pose_2D[0, 0, :, 0], "in camera 1", pose_2D[0, 0, :, 1])
-    print(f"pose_2D maxime: {pose_2D.max().item():.4f}")
-    print(f"pose_2D mini: {pose_2D.min().item():.4f}")
diff --git a/fmpose3d/animals/common/arguments.py b/fmpose3d/animals/common/arguments.py
index f465f172..7fab2e4b 100755
--- a/fmpose3d/animals/common/arguments.py
+++ b/fmpose3d/animals/common/arguments.py
@@ -32,7 +32,7 @@ def init(self):
         self.parser.add_argument("--layers", default=3, type=int)
         self.parser.add_argument("--channel", default=512, type=int)
         self.parser.add_argument("--d_hid", default=1024, type=int)
-        self.parser.add_argument("--dataset", type=str, default="rat7m")
+        self.parser.add_argument("--dataset", type=str, default="animal3d")
         self.parser.add_argument("-k", "--keypoints", default="cpn_ft_h36m_dbb", type=str)
         self.parser.add_argument("--data_augmentation", type=bool, default=False)
         self.parser.add_argument("--reverse_augmentation", type=bool, default=False)
@@ -42,7 +42,7 @@ def init(self):
         )
         self.parser.add_argument("--test_augmentation_FlowAug", type=str2bool, default=False)
         self.parser.add_argument("--crop_uv", type=int, default=0)
-        self.parser.add_argument("--root_path", type=str, default="Rat7M_data/")
+        self.parser.add_argument("--root_path", type=str, default="./dataset/")
         self.parser.add_argument("-a", "--actions", default="*", type=str)
         self.parser.add_argument("--downsample", default=1, type=int)
         self.parser.add_argument("--subset", default=1, type=float)
@@ -78,6 +78,14 @@ def init(self):
         )
         self.parser.add_argument("--saved_model_path", type=str, default="")
 
+        # 2D pose model overrides (consumed by animal demo vis_animals.py).
+        # --saved_2d_model_path: empty -> auto-download the fine-tuned snapshot
+        #   from Hugging Face on first run; non-empty -> use as a local override.
+        # --pytorch_config_2d_path: DLC architecture yaml; empty -> use the
+        #   bundled fmpose3d/animals/configs/sa_finetune_hrnet_w32.yaml.
+        self.parser.add_argument("--saved_2d_model_path", type=str, default="")
+        self.parser.add_argument("--pytorch_config_2d_path", type=str, default="")
+
         self.parser.add_argument("--n_joints", type=int, default=26)
         self.parser.add_argument("--out_joints", type=int, default=26)
         self.parser.add_argument("--out_all", type=int, default=1)
@@ -193,13 +201,6 @@ def parse(self):
                 self.opt.joints_left = [4, 5, 6, 11, 12, 13]
                 self.opt.joints_right = [1, 2, 3, 14, 15, 16]
 
-        elif self.opt.dataset == "rat7m":
-            # Rat7M dataset configuration
-            self.opt.n_joints = 20
-            self.opt.out_joints = 20
-            self.opt.joints_left = [8, 10, 11, 17, 18]  # HipL, ElbowL, ArmL, KneeL, ShinL
-            self.opt.joints_right = [9, 14, 15, 16, 19]  # HipR, ElbowR, ArmR, KneeR, ShinR
-            self.opt.root_joint = 4
         elif self.opt.dataset == "animal3d":
             # Animal3D dataset configuration
 
diff --git a/fmpose3d/animals/common/graph_utils.py b/fmpose3d/animals/common/graph_utils.py
deleted file mode 100755
index aad42f8a..00000000
--- a/fmpose3d/animals/common/graph_utils.py
+++ /dev/null
@@ -1,69 +0,0 @@
-"""
-FMPose3D: monocular 3D Pose Estimation via Flow Matching
-
-Official implementation of the paper:
-"FMPose3D: monocular 3D Pose Estimation via Flow Matching"
-by Ti Wang, Xiaohang Yu, and Mackenzie Weygandt Mathis
-Licensed under Apache 2.0
-"""
-
-from __future__ import absolute_import
-
-import numpy as np
-import scipy.sparse as sp
-import torch
-
-
-def normalize(mx):
-    """Row-normalize sparse matrix"""
-    rowsum = np.array(mx.sum(1))
-    r_inv = np.power(rowsum, -1).flatten()
-    r_inv[np.isinf(r_inv)] = 0.0
-    r_mat_inv = sp.diags(r_inv)
-    mx = r_mat_inv.dot(mx)
-    return mx
-
-
-def sparse_mx_to_torch_sparse_tensor(sparse_mx):
-    """Convert a scipy sparse matrix to a torch sparse tensor."""
-    sparse_mx = sparse_mx.tocoo().astype(np.float32)
-    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
-    values = torch.from_numpy(sparse_mx.data)
-    shape = torch.Size(sparse_mx.shape)
-    return torch.sparse.FloatTensor(indices, values, shape)
-
-
-def adj_mx_from_edges(num_pts, edges, sparse=True):
-    edges = np.array(edges, dtype=np.int32)
-    data, i, j = np.ones(edges.shape[0]), edges[:, 0], edges[:, 1]
-    adj_mx = sp.coo_matrix((data, (i, j)), shape=(num_pts, num_pts), dtype=np.float32)
-    # build symmetric adjacency matrix  https://github.com/yao8839836/text_gcn/issues/17
-    adj_mx = adj_mx + adj_mx.T.multiply(adj_mx.T > adj_mx) - adj_mx.multiply(adj_mx.T > adj_mx)
-    adj_mx = normalize(adj_mx + sp.eye(adj_mx.shape[0]))
-    if sparse:
-        adj_mx = sparse_mx_to_torch_sparse_tensor(adj_mx)
-    else:
-        adj_mx = torch.tensor(adj_mx.todense(), dtype=torch.float)
-    return adj_mx
-
-
-def adj_mx_from_skeleton(skeleton):
-    num_joints = skeleton.num_joints()  # 16|17
-    # edge [16,2]
-    edges = list(
-        filter(lambda x: x[1] >= 0, zip(list(range(0, num_joints)), skeleton.parents()))
-    )  # 15  # [-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 7, 11, 12, 7, 14, 15]
-    return adj_mx_from_edges(num_joints, edges, sparse=False)
-
-
-def print_matrix(mat):
-    for i in range(len(mat)):
-        print(mat[i])
-
-
-if __name__ == "__main__":
-    num_joints = 17
-    parents = [-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 7, 11, 12, 7, 14, 15]
-    edges = list(filter(lambda x: x[1] >= 0, zip(list(range(0, num_joints)), parents)))
-    A = adj_mx_from_edges(num_joints, edges, sparse=False)
-    print_matrix(A)
diff --git a/fmpose3d/animals/common/lifter3d.py b/fmpose3d/animals/common/lifter3d.py
deleted file mode 100644
index afebffbd..00000000
--- a/fmpose3d/animals/common/lifter3d.py
+++ /dev/null
@@ -1,669 +0,0 @@
-"""
-FMPose3D: monocular 3D Pose Estimation via Flow Matching
-
-Official implementation of the paper:
-"FMPose3D: monocular 3D Pose Estimation via Flow Matching"
-by Ti Wang, Xiaohang Yu, and Mackenzie Weygandt Mathis
-Licensed under Apache 2.0
-"""
-
-import glob
-
-import cv2
-import h5py
-import matplotlib.pyplot as plt
-import numpy as np
-from matplotlib import cm, colormaps
-from scipy.optimize import least_squares
-from tqdm import tqdm  # Import tqdm for the progress bar
-
-joint_names = [
-    "snout",
-    "Right_Ear",
-    "Left_Ear",
-    "Shoulder_Center",
-    "Right_Paw",
-    "Right_Wrist",
-    "Right_Elbow",
-    "Right_Shoulder",
-    "Left_Paw",
-    "Left_Wrist",
-    "Left_Elbow",
-    "Left_Shoulder",
-    "Body_Center",
-    "Hip_Center",
-    "Right_Foot",
-    "Right_Ankle",
-    "Right_Knee",
-    "Left_Foot",
-    "Left_Ankle",
-    "Left_Knee",
-    "Tail_Tip",
-    "Tail_Middle",
-    "Tail_Root",
-]
-
-
-# joint_names = [0'snout',
-#                1'Right_Ear',
-#                2'Left_Ear',
-#                3'Shoulder_Center',
-#                4'Right_Paw',
-#                5'Right_Wrist',
-#                6'Right_Elbow',
-#                7'Right_Shoulder',
-#                8'Left_Paw',
-#                9'Left_Wrist',
-#                10'Left_Elbow',
-#                11'Left_Shoulder',
-#                12'Body_Center',
-#                13'Hip_Center',
-#                14'Right_Foot',
-#                15'Right_Ankle',
-#                16'Right_Knee',
-#                17'Left_Foot',
-#                18'Left_Ankle',
-#                19'Left_Knee',
-#                20'Tail_Tip',
-#                21'Tail_Middle',
-#                22'Tail_Root']
-
-
-# Skeleton connections
-skeleton = [
-    ["snout", "Right_Ear"],
-    ["snout", "Left_Ear"],
-    ["Shoulder_Center", "Right_Shoulder"],
-    ["Right_Shoulder", "Right_Elbow"],
-    ["Right_Elbow", "Right_Wrist"],
-    ["Right_Wrist", "Right_Paw"],
-    ["Shoulder_Center", "Left_Shoulder"],
-    ["Left_Shoulder", "Left_Elbow"],
-    ["Left_Elbow", "Left_Wrist"],
-    ["Left_Wrist", "Left_Paw"],
-    ["Shoulder_Center", "Body_Center"],
-    ["Body_Center", "Hip_Center"],
-    ["Hip_Center", "Right_Knee"],
-    ["Right_Knee", "Right_Ankle"],
-    ["Right_Ankle", "Right_Foot"],
-    ["Hip_Center", "Left_Knee"],
-    ["Left_Knee", "Left_Ankle"],
-    ["Left_Ankle", "Left_Foot"],
-    ["Tail_Root", "Tail_Middle"],
-    ["Tail_Middle", "Tail_Tip"],
-    ["Hip_Center", "Tail_Root"],
-]
-
-# from name to index
-name_to_index = {name: idx for idx, name in enumerate(joint_names)}
-
-# to skeleton
-skeleton_indices = [[name_to_index[a], name_to_index[b]] for a, b in skeleton]  # start from 0
-
-
-def compute_reprojection_errors(keypoints_2d, reprojected_2d):
-    # Euclidean distance per keypoint
-    errors = np.linalg.norm(keypoints_2d - reprojected_2d, axis=-1)  # (num_frames, num_keypoints)
-
-    # Mean error over all frames/keypoints
-    total_error = np.mean(errors)
-    # Mean error per keypoint
-    per_keypoint_error = np.mean(errors, axis=0)  # (num_keypoints,)
-
-    return total_error, per_keypoint_error
-
-
-import numpy as np
-
-
-def compute_relative_errors(keypoints_2d, reprojected_2d):
-    # Euclidean error (num_frames, num_keypoints)
-    errors = np.linalg.norm(keypoints_2d - reprojected_2d, axis=-1)
-
-    # Mean error
-    total_error = np.mean(errors)
-
-    # Mean error per keypoint (num_keypoints,)
-    per_keypoint_error = np.mean(errors, axis=0)
-
-    # Pairwise Euclidean distance between keypoints (num_frames, num_keypoints, num_keypoints)
-    pairwise_dists = np.linalg.norm(
-        keypoints_2d[:, :, None, :] - keypoints_2d[:, None, :, :], axis=-1
-    )
-
-    # Average inter-keypoint distance per frame
-    avg_keypoint_distance = np.mean(pairwise_dists, axis=(1, 2))
-
-    # Relative errors
-    relative_error = total_error / np.mean(avg_keypoint_distance)
-
-    # Relative error per keypoint (num_keypoints,)
-    per_keypoint_relative_error = per_keypoint_error / np.mean(avg_keypoint_distance)
-
-    return total_error, per_keypoint_error, relative_error, per_keypoint_relative_error
-
-
-def normalize_points(points_3d):
-    """Normalize 3D points to [-1, 1]."""
-    min_vals = points_3d.min(axis=(0, 1), keepdims=True)
-    max_vals = points_3d.max(axis=(0, 1), keepdims=True)
-    points_3d_normalized = (points_3d - min_vals) / (max_vals - min_vals) * 2 - 1
-    return points_3d_normalized
-
-
-def plot_3d_keypoints_and_save_video(points_3d, output_video_path):
-
-    num_points = len(joint_names)
-    cmap = colormaps["rainbow"]
-    colors = [cmap(i / num_points) for i in range(num_points)]
-
-    # Create a VideoWriter to save the frames to a video
-    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
-    out = cv2.VideoWriter(
-        output_video_path, fourcc, 30.0, (1024, 768)
-    )  # Adjust the frame size if needed
-
-    num_frames = len(points_3d)
-    points_3d = normalize_points(points_3d)
-
-    for frame in range(num_frames):
-        fig = plt.figure(figsize=(10, 10))
-        ax = fig.add_subplot(111, projection="3d")
-        for i, joint in enumerate(joint_names):
-            x, y, z = points_3d[frame][i]
-            ax.scatter(x, y, z, color=colors[i], s=50)
-        ax.set_xlabel("X")
-        ax.set_ylabel("Y")
-        ax.set_zlabel("Z")
-        ax.set_title(f"Frame {frame+1}")
-
-        # Adjust the view angle and limits to make the plot consistent
-        ax.view_init(elev=30, azim=45)  # Adjust the view for better 3D perspective
-        ax.set_xlim([-1, 1])  # Adjust based on your data range
-        ax.set_ylim([-1, 1])
-        ax.set_zlim([-1, 1])
-
-        # Save the current figure as an image to be added to the video
-        plt.subplots_adjust(left=0, right=1, top=1, bottom=0)  # Remove extra margins
-        fig.canvas.draw()  # Draw the figure
-        img = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
-        img = img.reshape(fig.canvas.get_width_height()[::-1] + (3,))  # Convert to RGB image format
-
-        # Resize the image to fit video frame size
-        img_resized = cv2.resize(img, (1024, 768))
-
-        # Write the frame to the video
-        out.write(img_resized)
-        plt.close(fig)
-
-        # Clear the figure to free memory
-        plt.clf()
-
-    out.release()
-
-
-def plot_3d_skeleton_and_save_video(points_3d, output_video_path, num_frames_to_save=200):
-    num_joints = len(joint_names)
-    cmap = colormaps["rainbow"]
-    colors = [cmap(i / num_joints) for i in range(num_joints)]
-
-    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
-    out = cv2.VideoWriter(output_video_path, fourcc, 30.0, (1920, 1080))
-
-    # num_frames = min(points_3d.shape[0], num_frames_to_save)
-    points_3d = normalize_points(points_3d)
-
-    for frame in range(num_frames_to_save):
-        fig = plt.figure(figsize=(10, 10))
-        ax = fig.add_subplot(111, projection="3d")
-
-        # draw keypoints
-        for i, joint in enumerate(joint_names):
-            # print("shape of 3d points",points_3d[frame][i])
-
-            x, y, z = points_3d[frame][i]
-            ax.scatter(x, y, z, color=colors[i], s=50)
-
-        # draw skeleton connections
-        for bone in skeleton:
-            if bone[0] in joint_names and bone[1] in joint_names:
-                i1, i2 = joint_names.index(bone[0]), joint_names.index(bone[1])
-                x_vals = [points_3d[frame, i1, 0], points_3d[frame, i2, 0]]
-                y_vals = [points_3d[frame, i1, 1], points_3d[frame, i2, 1]]
-                z_vals = [points_3d[frame, i1, 2], points_3d[frame, i2, 2]]
-                ax.plot(x_vals, y_vals, z_vals, color="black", linewidth=2, alpha=0.8)
-
-        ax.set_xlabel("X")
-        ax.set_ylabel("Y")
-        ax.set_zlabel("Z")
-        ax.set_title(f"Frame {frame+1}")
-        ax.view_init(elev=30, azim=45)
-        ax.set_xlim([-1, 1])
-        ax.set_ylim([-1, 1])
-        ax.set_zlim([-1, 1])
-
-        # save frame and write to video
-        fig.canvas.draw()
-        img = np.frombuffer(fig.canvas.buffer_rgba(), dtype=np.uint8)
-        img = img.reshape(fig.canvas.get_width_height()[::-1] + (4,))
-        img_bgr = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
-        img_resized = cv2.resize(img_bgr, (1920, 1080))
-        out.write(img_resized)
-
-        plt.close(fig)
-
-    out.release()
-
-
-def load_camera_params(yaml_path, scale=0.5):
-    """use opencv to read from yaml"""
-    fs = cv2.FileStorage(yaml_path, cv2.FILE_STORAGE_READ)
-    intrinsic_matrix = fs.getNode("intrinsicMatrix").mat()
-    distortion_coeffs = fs.getNode("distortionCoefficients").mat()
-    R = fs.getNode("R").mat()
-    R = R.T
-    T = fs.getNode("T").mat()
-    fs.release()
-
-    intrinsic_matrix = intrinsic_matrix.astype(np.float64)
-    # check intrinsic matrix
-    if not (np.allclose(intrinsic_matrix[2, :], [0, 0, 1], atol=1e-6)):
-        intrinsic_matrix = intrinsic_matrix.T
-    # scale the intrinsic matrix
-    scale = 0.5
-    if scale != 1.0:
-        intrinsic_matrix[0, 0] *= scale
-        intrinsic_matrix[1, 1] *= scale
-        intrinsic_matrix[0, 2] *= scale
-        intrinsic_matrix[1, 2] *= scale
-
-    return {
-        "intrinsic_matrix": intrinsic_matrix,
-        "distortion_coeffs": distortion_coeffs,
-        "R": R,
-        "T": T,
-    }
-
-
-import cv2
-import numpy as np
-
-
-def triangulate_3d_batch(points_2d_batch, cameras):
-    """
-    input (num_cams, num_frames, num_joints, 2)
-
-    parameters:
-    - points_2d_batch: (6, num_frames, 23, 2)
-    - cameras: list of length 6  `intrinsic_matrix`、`distortion_coeffs`、`R`、`T`
-
-    return:
-    - points_3d_batch: (num_frames, 23, 3)
-
-    create matrix A to avoid for iteration
-    """
-    points_2d_batch = np.array(points_2d_batch)
-    num_cams, num_frames, num_joints, _ = points_2d_batch.shape  # (6, num_frames, 23, 2)
-
-    print("num_cams,num_frames,num_joinits", points_2d_batch.shape)
-    # **1. compute projection matrices (6, 3, 4)**
-    proj_matrices = np.array(
-        [cam["intrinsic_matrix"] @ np.hstack((cam["R"], cam["T"])) for cam in cameras]
-    )  # numpy array (6, 3, 4)
-
-    # **2. undistortPoints**
-    points_2d_undistorted = np.zeros_like(points_2d_batch)  # (6, num_frames, 23, 2)
-    for i in range(num_cams):
-        K, dist = cameras[i]["intrinsic_matrix"], cameras[i]["distortion_coeffs"]
-        undistorted = cv2.undistortPoints(points_2d_batch[i].reshape(-1, 1, 2), K, dist, None, None)
-        undistorted = undistorted.reshape(num_frames, num_joints, 2)
-        print("undistorted shape:", undistorted.shape)
-        print("ones shape:", np.ones((num_frames, num_joints, 1)).shape)
-        # undistorted = (K @ np.hstack([undistorted, np.ones((num_frames, num_joints, 1))]).T).T[:, :, :2]
-        undistorted = np.concatenate(
-            [undistorted, np.ones((*undistorted.shape[:-1], 1))], axis=-1
-        )  # (50, 23, 3)
-        undistorted = (K @ undistorted[..., None])[..., 0]
-        points_2d_undistorted[i] = undistorted[..., :2]
-
-    # **3. Construct matrix A for triangulation**
-    # Formula: A = [x P_3 - P_1; y P_3 - P_2], creating 6*2=12 equations per point
-    x = points_2d_undistorted[..., 0]  # (6, num_frames, 23)
-    y = points_2d_undistorted[..., 1]  # (6, num_frames, 23)
-
-    # Extract projection matrix rows
-    P1 = proj_matrices[:, None, None, 0, :]  # (6, 1, 1, 4)
-    P2 = proj_matrices[:, None, None, 1, :]  # (6, 1, 1, 4)
-    P3 = proj_matrices[:, None, None, 2, :]  # (6, 1, 1, 4)
-
-    # Compute A (6, num_frames, 23, 2, 4)
-    A = np.stack(
-        [x[..., None] * P3 - P1, y[..., None] * P3 - P2], axis=-2
-    )  # (6, num_frames, 23, 2, 4)
-    A = A.reshape(num_cams * 2, num_frames, num_joints, 4)  # (12, num_frames, 23, 4)
-
-    # **4. Solve using batch SVD**
-    _, _, Vh = np.linalg.svd(A, full_matrices=False)  # Vh shape: (12, num_frames, 23, 4)
-    X_hom = Vh[..., -1]  # Take last row (solution) (12, num_frames, 23, 4)
-
-    # **5. Convert homogeneous coordinates to 3D**
-    points_3d_batch = X_hom[..., :3] / X_hom[..., 3:]  # (num_frames, 23, 3)
-
-    return points_3d_batch  # (num_frames, 23, 3)
-
-
-def triangulate_3d(points_2d, cameras):
-    # points_2d list of 6 in (23,2)
-    """triangulate usd SVD"""
-    proj_matrices = []
-    points_2d_undistorted = []
-
-    for i, cam in enumerate(cameras):
-        K, dist, R, T = cam["intrinsic_matrix"], cam["distortion_coeffs"], cam["R"], cam["T"]
-
-        P = K @ np.hstack((R, T))  # projection matrix
-        # print("Projection Matrix P:\n", P)
-
-        proj_matrices.append(P)
-
-        # undistortion
-        # undistorted = cv2.undistortPoints(points_2d[i].reshape(-1, 1, 2), K, dist, None, K).reshape(-1, 2)
-        undistorted = cv2.undistortPoints(
-            points_2d[i].reshape(-1, 1, 2), K, dist, None, None
-        ).reshape(-1, 2)
-        undistorted = (K @ np.hstack([undistorted, np.ones((undistorted.shape[0], 1))]).T).T[:, :2]
-
-        points_2d_undistorted.append(undistorted)
-
-    # print("before undistortion and after",points_2d[0],points_2d_undistorted[0])
-    # SVD
-    num_points = points_2d_undistorted[0].shape[0]
-    points_3d = np.zeros((num_points, 3))
-
-    for j in range(num_points):
-        A = []
-        for i in range(len(proj_matrices)):
-            P = proj_matrices[i]
-            x, y = points_2d_undistorted[i][j]
-
-            # build linear system Ax = 0
-            A.append(x * P[2, :] - P[0, :])
-            A.append(y * P[2, :] - P[1, :])
-
-        A = np.array(A)
-        _, _, Vh = np.linalg.svd(A)
-        X_hom = Vh[-1]
-        X = X_hom[:3] / X_hom[3]
-        points_3d[j] = X
-
-    # print("3D points and the 2D on camera 0",points_3d,points_2d_undistorted[0])
-    return points_3d
-
-
-def triangulate_3d_confi(points_2d, cameras):
-    # points_2d: list of 6 in (23, 3), last dim is confidence
-    """Triangulate using SVD with confidence"""
-    proj_matrices = []
-    points_2d_undistorted = []
-    confidences = []
-
-    for i, cam in enumerate(cameras):
-        K, dist, R, T = cam["intrinsic_matrix"], cam["distortion_coeffs"], cam["R"], cam["T"]
-        P = K @ np.hstack((R, T))  # Projection matrix
-        proj_matrices.append(P)
-
-        # Undistortion
-        undistorted = cv2.undistortPoints(
-            points_2d[i][:, :2].reshape(-1, 1, 2), K, dist, None, None
-        ).reshape(-1, 2)
-        undistorted = (K @ np.hstack([undistorted, np.ones((undistorted.shape[0], 1))]).T).T[:, :2]
-        points_2d_undistorted.append(undistorted)
-
-        # Collect confidences
-        confidences.append(points_2d[i][:, 2])
-
-    num_points = points_2d_undistorted[0].shape[0]
-    points_3d = np.zeros((num_points, 4))  # last dimension stores confidence
-
-    for j in range(num_points):
-        A = []
-        point_confidences = []
-        for i in range(len(proj_matrices)):
-            P = proj_matrices[i]
-            x, y = points_2d_undistorted[i][j]
-            conf = confidences[i][j]
-
-            if conf > 0:
-                # build linear system Ax = 0
-                A.append(conf * (x * P[2, :] - P[0, :]))
-                A.append(conf * (y * P[2, :] - P[1, :]))
-                point_confidences.append(conf)
-
-        if len(A) > 0:
-            A = np.array(A)
-            _, _, Vh = np.linalg.svd(A)
-            X_hom = Vh[-1]
-            X = X_hom[:3] / X_hom[3]
-            points_3d[j, :3] = X
-
-            # confidence: mean over valid views
-            points_3d[j, 3] = np.mean(point_confidences)
-        else:
-            points_3d[j, 3] = 0
-
-    return points_3d
-
-
-# def project_3d_to_2d(points_3d, camera):
-#     K, dist, R, T = camera["intrinsic_matrix"], camera["distortion_coeffs"], camera["R"], camera["T"]
-
-#     points_3d = np.asarray(points_3d, dtype=np.float32)
-
-#     R = np.array(R, dtype=np.float32)
-#     T = np.array(T, dtype=np.float32)
-#     K = np.array(K, dtype=np.float32)
-#     dist = np.array(dist, dtype=np.float32)
-
-#     points_2d_proj, _ = cv2.projectPoints(points_3d, R, T, K, dist)
-#     return points_2d_proj.reshape(-1, 2)
-
-
-def project_3d_to_2d(points_3d, camera):
-    K, dist, R, T = (
-        camera["intrinsic_matrix"],
-        camera["distortion_coeffs"],
-        camera["R"],
-        camera["T"],
-    )
-
-    points_2d_proj, _ = cv2.projectPoints(points_3d, R, T, K, dist)
-    return points_2d_proj.reshape(-1, 2)
-
-
-def load_h5_keypoints(h5_path):
-    """load 2D keypoint from h5 file"""
-    with h5py.File(h5_path, "r") as f:
-        group = f["df_with_missing"]
-        dataset = group["block0_values"]
-        # dataset = group['table']
-        return np.array(dataset).reshape(-1, 23, 3)
-
-
-def load_h5_keypoints_cspnext(h5_path):
-    """load 2D keypoint from h5 file with cspnext format"""
-    with h5py.File(h5_path, "r") as f:
-        group = f["df_with_missing"]
-        dataset = np.array(group["table"])
-        values = dataset["values_block_0"]
-        return values.reshape(-1, 23, 3)  # Reshape to (num_frames, num_joints, 3)
-
-
-def visualize_2d_on_video(
-    video_path, frame_number, original_keypoints, reprojected_keypoints, output_path
-):
-    """plot keypoint on frame n and save into png"""
-    cap = cv2.VideoCapture(video_path)
-    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)  # jump to frame
-    ret, frame = cap.read()
-
-    if not ret:
-        print(f"Failed to read frame {frame_number} from video {video_path}")
-        cap.release()
-        return
-
-    h, w = frame.shape[:2]
-    # reprojected_keypoints[:,0] /= h
-    # reprojected_keypoints[:,1] /= w
-    print("Original 2D Points (+):", original_keypoints)
-    print("Projected 2D Points (dot):", reprojected_keypoints)
-    print("height, and width", h, w)
-    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-    plt.figure(figsize=(10, 10))
-    plt.imshow(frame)
-
-    num_points = max(len(original_keypoints), len(reprojected_keypoints))
-    cmap = colormaps["rainbow"]
-    colors = [cmap(i / num_points) for i in range(num_points)]
-
-    # plot original keypoints in +
-    for i, point in enumerate(original_keypoints):
-        if not np.isnan(point).any():
-            x, y = point[0], point[1]
-            if x < 0 or y < 0 or x >= w or y >= h:
-                continue
-            plt.scatter(
-                x, y, marker="+", color=colors[i], s=15, linewidths=1, label=f"keypoints{i}"
-            )
-
-    # plot reprojected keypoints in o
-    for i, point in enumerate(reprojected_keypoints):
-        if not np.isnan(point).any():
-            x, y = point[0], point[1]
-            if x < 0 or y < 0 or x >= w or y >= h:
-                continue
-            plt.scatter(x, y, marker="o", color=colors[i], s=3, label=f"reprojected{i}")
-
-    plt.axis("off")
-
-    plt.savefig(output_path, bbox_inches="tight")
-    plt.close()
-    cap.release()
-
-
-def main():
-    yaml_files = sorted(
-        glob.glob(
-            "/workspace/MTFpose/data/Arber_data_noeva/076_02_221114_start0/calibration/*.yaml"
-        )
-    )
-    h5_files = sorted(
-        glob.glob("/workspace/MTFpose/data/Arber_data_noeva/076_02_221114_start0/pose2d_dlc/*.h5")
-    )
-    video_files = sorted(
-        glob.glob("/workspace/MTFpose/data/Arber_data_noeva/076_02_221114_start0/video_dlc/*.mp4")
-    )
-
-    cameras = [load_camera_params(yaml) for yaml in yaml_files]
-
-    # print("cameras len",len(cameras))
-    # keypoints_2d = [load_h5_keypoints(h5) for h5 in h5_files]
-    keypoints_2d = [load_h5_keypoints_cspnext(h5) for h5 in h5_files]
-
-    print(
-        "list: yaml ",
-        len(yaml_files),
-        "h5",
-        len(h5_files),
-        "video ",
-        len(video_files),
-        "cameras",
-        len(cameras),
-        "keypoints",
-        len(keypoints_2d),
-    )
-
-    total_num_frames = keypoints_2d[0].shape[0]
-    print("num_frames", total_num_frames, "for each cam", keypoints_2d[0].shape)  # (119498, 23, 3)
-
-    # choose frame
-    frame_number = 5
-
-    # Choose the number of frames to visualize
-    num_frames_to_save = total_num_frames
-
-    points_3d_list = []
-    repro_2d_list = []
-
-    # save all 3d keypoints into .npy
-
-    # visualization and save
-    for frame in tqdm(
-        range(total_num_frames), desc="Processing frames", unit="frame"
-    ):  # loop in frames
-        points_2d_frame = [keypoints_2d[cam_i][frame][:, :2] for cam_i in range(len(cameras))]
-
-        # print("len of points 2d frame",len(points_2d_frame),points_2d_frame[0].shape) #len: 6 each shape :(23,2)
-
-        # points_3d = triangulate_3d_confi(points_2d_frame, cameras)
-        points_3d = triangulate_3d(points_2d_frame, cameras)
-        points_3d_list.append(points_3d)
-
-        # for i in range(len(cameras)):
-        #     points_3d_array = np.array(points_3d_list).reshape(total_num_frames,-1,3)
-        #     reprojected_2d = np.array([project_3d_to_2d(frame_3d, cameras[i]) for frame_3d in points_3d_array])
-        #     total_error, per_keypoint_error, relative_error, per_keypoint_relative_error = compute_relative_errors(keypoints_2d[i][:total_num_frames,:,:2], reprojected_2d)
-
-        # reprojected_2d = project_3d_to_2d(points_3d, cameras[0])
-    points_3d = np.array(points_3d_list)
-    print("shape of points 3d to save", points_3d.shape)  # (num_frames, 23, 3)
-    output_npy_path = (
-        "/workspace/MTFpose/data/Arber_data_noeva/076_02_221114_start0/triangulated_3d.npy"
-    )
-    np.save(output_npy_path, points_3d)
-    # print("points_3d_fragment from 3d lift and stack, shape",points_3d.shape)
-
-    #     if frame==frame_number:# visualize nth frame and save PNG
-    #         for i in range(len(cameras)):
-    #             output_path = f"/workspace/MTFpose/results/Camera_{i}_frame_{frame_number}.png"
-    #             reprojected_2d = project_3d_to_2d(points_3d, cameras[i])
-    #             # visualize_2d_on_video(video_files[i], frame_number, points_2d_frame[i], reprojected_2d, output_path)
-
-    # # test on triangulate 3D batch
-    # left_frame_id = 10
-    # right_frame_id = 60
-
-    # points_2d_fragment = [keypoints_2d[i][left_frame_id:right_frame_id,:,:2] for i in range(len(yaml_files))]
-
-    # # get 3D keypoint from 3d lift
-    # points_2d_fragment_np = np.array(points_2d_fragment) # (num_cams, num_frames, num_joints, 2) N,T,K,2
-    # points_3d_fragment = triangulate_3d_batch(points_2d_fragment_np,cameras)  #(num_frames, 23, 3) T,K,3
-
-    # # save skeleton for several frames
-    # output_video_path = '/workspace/MTFpose/results/skeleton_batch_videodemo.mp4'
-    # plot_3d_skeleton_and_save_video(points_3d_fragment,output_video_path,num_frames_to_save = num_frames_to_save)
-
-    # compute reprojection error for each view
-    for i in range(len(cameras)):
-        # points_3d_list : (num_frames, num_keypoints, 3)
-        # print("shape of points",len(points_3d_list),points_3d_list[0].shape)
-        points_3d_array = np.array(points_3d_list).reshape(
-            total_num_frames, -1, 3
-        )  # with confidence
-
-        # points_3d_array = points_3d_array[:,:,:3]
-        # print("shape of points_3d_array",points_3d_array.shape,points_3d.dtype,points_3d_array)
-
-        # reprojected_2d = project_3d_to_2d(points_3d_array, cameras[i])
-        reprojected_2d = np.array(
-            [project_3d_to_2d(frame_3d, cameras[i]) for frame_3d in points_3d_array]
-        )
-        # print("shape of reprojected_2d and points_3d",reprojected_2d.shape, points_3d_array.shape,keypoints_2d[0].shape)
-
-        total_error, per_keypoint_error, relative_error, per_keypoint_relative_error = (
-            compute_relative_errors(keypoints_2d[i][:total_num_frames, :, :2], reprojected_2d)
-        )
-        print(f"reprojection error in camera_{i}", total_error, relative_error)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/fmpose3d/animals/common/mocap_dataset.py b/fmpose3d/animals/common/mocap_dataset.py
deleted file mode 100755
index 75d837cd..00000000
--- a/fmpose3d/animals/common/mocap_dataset.py
+++ /dev/null
@@ -1,40 +0,0 @@
-"""
-FMPose3D: monocular 3D Pose Estimation via Flow Matching
-
-Official implementation of the paper:
-"FMPose3D: monocular 3D Pose Estimation via Flow Matching"
-by Ti Wang, Xiaohang Yu, and Mackenzie Weygandt Mathis
-Licensed under Apache 2.0
-"""
-
-class MocapDataset:
-    def __init__(self, fps, skeleton):
-        self._skeleton = skeleton
-        self._fps = fps
-        self._data = None
-        self._cameras = None
-
-    def remove_joints(self, joints_to_remove):
-        kept_joints = self._skeleton.remove_joints(joints_to_remove)
-        for subject in self._data.keys():
-            for action in self._data[subject].keys():
-                s = self._data[subject][action]
-                s["positions"] = s["positions"][:, kept_joints]
-
-    def __getitem__(self, key):
-        return self._data[key]
-
-    def subjects(self):
-        return self._data.keys()
-
-    def fps(self):
-        return self._fps
-
-    def skeleton(self):
-        return self._skeleton
-
-    def cameras(self):
-        return self._cameras
-
-    def supports_semi_supervised(self):
-        return False
diff --git a/fmpose3d/animals/configs/__init__.py b/fmpose3d/animals/configs/__init__.py
new file mode 100644
index 00000000..f5646c7a
--- /dev/null
+++ b/fmpose3d/animals/configs/__init__.py
@@ -0,0 +1,26 @@
+"""
+FMPose3D: monocular 3D Pose Estimation via Flow Matching
+
+Official implementation of the paper:
+"FMPose3D: monocular 3D Pose Estimation via Flow Matching"
+by Ti Wang, Xiaohang Yu, and Mackenzie Weygandt Mathis
+Licensed under Apache 2.0
+"""
+
+"""Bundled DLC ``pytorch_config.yaml`` files for the animal 2D detector.
+
+These yamls describe FMPose3D's fine-tuned SuperAnimal-Quadruped variants
+and are loaded by :class:`fmpose3d.inference_api.SuperAnimalEstimator` when
+the user does not supply an explicit ``pytorch_config_path``. They are
+shipped as package data (see ``pyproject.toml`` ``[tool.setuptools.package-data]``).
+"""
+
+from pathlib import Path
+
+CONFIGS_DIR = Path(__file__).parent
+
+SA_FINETUNE_HRNET_W32_YAML: str = str(CONFIGS_DIR / "sa_finetune_hrnet_w32.yaml")
+"""DLC config for SA-Quadruped HRNet-w32 fine-tuned on Animal3D +
+Control-Animal3D with the 26-joint Animal3D output layout."""
+
+__all__ = ["CONFIGS_DIR", "SA_FINETUNE_HRNET_W32_YAML"]
diff --git a/fmpose3d/animals/configs/sa_finetune_hrnet_w32.yaml b/fmpose3d/animals/configs/sa_finetune_hrnet_w32.yaml
new file mode 100644
index 00000000..530ea63f
--- /dev/null
+++ b/fmpose3d/animals/configs/sa_finetune_hrnet_w32.yaml
@@ -0,0 +1,220 @@
+# DeepLabCut pytorch_config for FMPose3D's 2D animal pose model:
+# SuperAnimal-Quadruped HRNet-w32 backbone fine-tuned on Animal3D, with
+# the heatmap head re-trained for the 26-joint Animal3D output layout.
+#
+# Loaded by fmpose3d.inference_api.SuperAnimalEstimator and passed to
+# DLC's `superanimal_analyze_images(..., customized_model_config=<this yaml>,
+# customized_pose_checkpoint=<sa_finetune_hrnet_w32.pt>)`. Only the pose
+# model is fine-tuned; the bounding-box detector (Faster R-CNN) is the
+# stock SuperAnimal-Quadruped one resolved by DLC at runtime.
+data:
+  bbox_margin: 20
+  colormode: RGB
+  inference:
+    normalize_images: true
+    top_down_crop:
+      width: 256
+      height: 256
+    auto_padding:
+      pad_width_divisor: 32
+      pad_height_divisor: 32
+  train:
+    affine:
+      p: 0.5
+      rotation: 30
+      scaling:
+      - 1.0
+      - 1.0
+      translation: 0
+    gaussian_noise: 12.75
+    motion_blur: true
+    normalize_images: true
+    top_down_crop:
+      width: 256
+      height: 256
+    auto_padding:
+      pad_width_divisor: 32
+      pad_height_divisor: 32
+detector:
+  data:
+    colormode: RGB
+    inference:
+      normalize_images: true
+    train:
+      affine:
+        p: 0.5
+        rotation: 30
+        scaling:
+        - 1.0
+        - 1.0
+        translation: 40
+      collate:
+        type: ResizeFromDataSizeCollate
+        min_scale: 0.4
+        max_scale: 1.0
+        min_short_side: 128
+        max_short_side: 1152
+        multiple_of: 32
+        to_square: false
+      hflip: true
+      normalize_images: true
+  device: auto
+  model:
+    type: FasterRCNN
+    freeze_bn_stats: true
+    freeze_bn_weights: false
+    variant: fasterrcnn_resnet50_fpn_v2
+  runner:
+    type: DetectorTrainingRunner
+    key_metric: test.mAP@50:95
+    key_metric_asc: true
+    eval_interval: 10
+    optimizer:
+      type: AdamW
+      params:
+        lr: 0.0001
+    scheduler:
+      type: LRListScheduler
+      params:
+        milestones:
+        - 160
+        lr_list:
+        - - 1e-05
+    snapshots:
+      max_snapshots: 5
+      save_epochs: 25
+      save_optimizer_state: false
+  train_settings:
+    batch_size: 1
+    dataloader_workers: 0
+    dataloader_pin_memory: false
+    display_iters: 500
+    epochs: 250
+device: auto
+inference:
+  multithreading:
+    enabled: true
+    queue_length: 4
+    timeout: 30.0
+  compile:
+    enabled: false
+    backend: inductor
+  autocast:
+    enabled: false
+metadata:
+  project_path: ""
+  pose_config_path: ""
+  bodyparts:
+  - left_eye
+  - right_eye
+  - chin
+  - left_front_paw
+  - right_front_paw
+  - left_back_paw
+  - right_back_paw
+  - tail_base
+  - left_front_thigh
+  - right_front_thigh
+  - left_back_thigh
+  - right_back_thigh
+  - left_shoulder
+  - right_shoulder
+  - left_front_knee
+  - right_front_knee
+  - left_back_knee
+  - right_back_knee
+  - neck_base
+  - tail_mid
+  - left_ear_base
+  - right_ear_base
+  - left_mouth_corner
+  - right_mouth_corner
+  - nose
+  - tail_tip_first
+  unique_bodyparts: []
+  individuals:
+  - individual000
+  with_identity: false
+method: td
+model:
+  backbone:
+    type: HRNet
+    model_name: hrnet_w32
+    freeze_bn_stats: true
+    freeze_bn_weights: false
+    interpolate_branches: false
+    increased_channel_count: false
+  backbone_output_channels: 32
+  heads:
+    bodypart:
+      type: HeatmapHead
+      weight_init: normal
+      predictor:
+        type: HeatmapPredictor
+        apply_sigmoid: false
+        clip_scores: true
+        location_refinement: true
+        locref_std: 7.2801
+      target_generator:
+        type: HeatmapGaussianGenerator
+        num_heatmaps: 26
+        pos_dist_thresh: 17
+        heatmap_mode: KEYPOINT
+        gradient_masking: true
+        background_weight: 0.0
+        generate_locref: true
+        locref_std: 7.2801
+      criterion:
+        heatmap:
+          type: WeightedMSECriterion
+          weight: 1.0
+        locref:
+          type: WeightedHuberCriterion
+          weight: 0.05
+      heatmap_config:
+        channels:
+        - 32
+        kernel_size: []
+        strides: []
+        final_conv:
+          out_channels: 26
+          kernel_size: 1
+      locref_config:
+        channels:
+        - 32
+        kernel_size: []
+        strides: []
+        final_conv:
+          out_channels: 52
+          kernel_size: 1
+net_type: hrnet_w32
+runner:
+  type: PoseTrainingRunner
+  gpus:
+  key_metric: test.mAP
+  key_metric_asc: true
+  eval_interval: 10
+  optimizer:
+    type: AdamW
+    params:
+      lr: 0.0001
+  scheduler:
+    type: LRListScheduler
+    params:
+      lr_list:
+      - - 1e-05
+      - - 1e-06
+      milestones:
+      - 160
+      - 190
+  snapshots:
+    max_snapshots: 5
+    save_epochs: 10
+    save_optimizer_state: false
+train_settings:
+  batch_size: 64
+  dataloader_workers: 8
+  dataloader_pin_memory: false
+  display_iters: 500
+  epochs: 200
+  seed: 42
diff --git a/fmpose3d/animals/models/graph_frames.py b/fmpose3d/animals/models/graph_frames.py
index 7d07645d..d69f173c 100755
--- a/fmpose3d/animals/models/graph_frames.py
+++ b/fmpose3d/animals/models/graph_frames.py
@@ -19,7 +19,6 @@ class Graph():
         layout (string): must be one of the follow candidates
         - 'hm36_gt': Ground truth structure of Human3.6M, with 17 joints per frame
         - 'animal3d': Skeleton structure for Animal3D dataset, with 26 joints per frame
-        - 'rat7m': Skeleton structure for Rat7M dataset, with 20 joints per frame
 
         max_hop (int): the maximal distance between two connected nodes
         dilation (int): controls the spacing between the kernel points
@@ -48,7 +47,6 @@ def get_distance_to_center(self,layout):
         :return: get the distance of each node to center
         For hm36_gt: center is joint 7
         For animal3d: center is joint 18 (neck, root joint)
-        For rat7m: center is joint 4 (SpineM, root joint)
         """
         dist_center = np.zeros(self.num_node)
         if layout == 'hm36_gt':
@@ -338,17 +336,3 @@ def normalize_undigraph(A):
     print(f"    - Head: {graph_animal.head}")
     print(f"    - Tail: {graph_animal.tail}")
     print(f"  Distance to center (joint 18): {graph_animal.dist_center}")
-    
-    # Test Rat7M skeleton
-    print("\nTesting Rat7M skeleton (20 joints):")
-    graph_rat = Graph('rat7m', 'spatial', 1)
-    print(f"  Adjacency matrix shape: {graph_rat.A.shape}")
-    print(f"  Center joint: {graph_rat.center}")
-    print(f"  Number of nodes: {graph_rat.num_node}")
-    print(f"  Body parts:")
-    print(f"    - Left front leg: {graph_rat.left_front}")
-    print(f"    - Right front leg: {graph_rat.right_front}")
-    print(f"    - Left hind leg: {graph_rat.left_hind}")
-    print(f"    - Right hind leg: {graph_rat.right_hind}")
-    print(f"    - Spine: {graph_rat.spine}")
-    print(f"  Distance to center (joint 4): {graph_rat.dist_center}")
\ No newline at end of file
diff --git a/fmpose3d/common/config.py b/fmpose3d/common/config.py
index ded5afa5..3508c0b6 100644
--- a/fmpose3d/common/config.py
+++ b/fmpose3d/common/config.py
@@ -249,9 +249,17 @@ class SuperAnimalConfig(Pose2DConfig):
     """DeepLabCut SuperAnimal 2D pose detector configuration.
 
     Uses the DeepLabCut ``superanimal_analyze_images`` API to detect
-    animal keypoints in the quadruped80K format, then maps them to the
-    Animal3D 26-keypoint layout expected by the ``fmpose3d_animals``
-    3D lifter.
+    animal keypoints. Supports two modes:
+
+    * **Fine-tuned.** Predicts the 26-joint Animal3D layout natively
+      (no remap). Activated by either ``auto_download_finetuned=True``
+      (used by :meth:`FMPose3DInference.for_animals` — snapshot is
+      auto-downloaded from Hugging Face on first predict) or by setting
+      ``pose_snapshot_path`` to a local ``.pt`` file.
+    * **Stock SA.** Runs the published ``superanimal_quadruped`` weights
+      (39 keypoints) and remaps to the 26-joint Animal3D layout via
+      :meth:`SuperAnimalEstimator._map_keypoints`. Active by default, i.e. when
+      ``pose_snapshot_path`` is empty and ``auto_download_finetuned`` is False.
 
     Attributes
     ----------
@@ -263,12 +271,37 @@ class SuperAnimalConfig(Pose2DConfig):
         Object detector used for animal bounding boxes.
     max_individuals : int
         Maximum number of individuals to detect per image (default 1).
+    pytorch_config_path : str
+        Path to a DLC ``pytorch_config.yaml`` describing a fine-tuned
+        model. When empty, the packaged default
+        (:data:`fmpose3d.animals.configs.SA_FINETUNE_HRNET_W32_YAML`)
+        is used. Only consulted in fine-tuned mode.
+    pose_snapshot_path : str
+        Path to a fine-tuned pose ``.pt`` checkpoint. **Non-empty value
+        activates fine-tuned mode.** Empty → stock SA, unless
+        ``auto_download_finetuned`` is True.
+    detector_snapshot_path : str
+        Path to a custom Faster R-CNN checkpoint. When empty, DLC
+        resolves the stock SA detector from its modelzoo.
+    auto_download_finetuned : bool
+        When True and ``pose_snapshot_path`` is empty, the FMPose3D
+        fine-tuned snapshot is downloaded from Hugging Face on first
+        :meth:`SuperAnimalEstimator.predict` call (cached under
+        ``~/.cache/huggingface``). This is what
+        :meth:`FMPose3DInference.for_animals` uses as its default so the
+        animal pipeline runs out-of-the-box without manual downloads.
+        Standalone ``SuperAnimalConfig()`` keeps it False so that
+        stock SA + 39→26 remap remains the explicit, no-network default.
     """
     pose2d_model: str = "superanimal"
     superanimal_name: str = "superanimal_quadruped"
     sa_model_name: str = "hrnet_w32"
     detector_name: str = "fasterrcnn_resnet50_fpn_v2"
     max_individuals: int = 1
+    pytorch_config_path: str = ""
+    pose_snapshot_path: str = ""
+    detector_snapshot_path: str = ""
+    auto_download_finetuned: bool = False
 
 
 @dataclass
diff --git a/fmpose3d/inference_api/README.md b/fmpose3d/inference_api/README.md
index 5a159006..3c9f1459 100644
--- a/fmpose3d/inference_api/README.md
+++ b/fmpose3d/inference_api/README.md
@@ -225,7 +225,10 @@ Default 2D estimator for the human pipeline. Wraps HRNet + YOLO with a COCO →
 
 #### `SuperAnimalEstimator(cfg: SuperAnimalConfig | None)`
 
-2D estimator for the animal pipeline. Uses DeepLabCut SuperAnimal and maps quadruped80K keypoints to the 26-joint Animal3D layout.
+2D estimator for the animal pipeline. Produces the 26-joint Animal3D keypoint layout via DeepLabCut SuperAnimal. Supports two modes:
+
+- **Fine-tuned** (default when accessed via `FMPose3DInference.for_animals()`): runs an FMPose3D fine-tuned SA-Quadruped HRNet-w32 snapshot that natively outputs 26 joints. The snapshot is auto-downloaded from [Hugging Face](https://huggingface.co/MLAdaptiveIntelligence/FMPose3D) on the first predict call when `cfg.auto_download_finetuned=True`.
+- **Stock SA** (low-level opt-in): runs the published `superanimal_quadruped` weights (39 keypoints) and remaps to 26 joints via `_map_keypoints`. Active when `pose_snapshot_path` is empty and `auto_download_finetuned` is `False`, e.g. `SuperAnimalEstimator(SuperAnimalConfig())`.
 
 If DeepLabCut is not installed, calling this estimator raises a clear `ImportError`
 with the recommended install command: `pip install "fmpose3d[animals]"`.
diff --git a/fmpose3d/inference_api/fmpose3d.py b/fmpose3d/inference_api/fmpose3d.py
index 970a5e3d..10d0b3c7 100644
--- a/fmpose3d/inference_api/fmpose3d.py
+++ b/fmpose3d/inference_api/fmpose3d.py
@@ -256,6 +256,28 @@ def predict(
         all_mapped: list[np.ndarray] = []
         all_scores: list[np.ndarray] = []
 
+        # Resolve pose snapshot: explicit local path > HF auto-download > empty (stock).
+        pose_snapshot_path = cfg.pose_snapshot_path
+        if not pose_snapshot_path and cfg.auto_download_finetuned:
+            from fmpose3d.utils.weights import resolve_weights_path
+            pose_snapshot_path = resolve_weights_path("", "sa_finetune_hrnet_w32.pt")
+
+        # Fine-tuned mode: non-empty resolved path swaps the stock 39-joint head
+        # for a custom DLC checkpoint that predicts the 26-joint Animal3D layout
+        # natively (no _map_keypoints needed).
+        is_finetuned = bool(pose_snapshot_path)
+        if is_finetuned:
+            from fmpose3d.animals.configs import SA_FINETUNE_HRNET_W32_YAML
+            customized_kwargs = dict(
+                customized_model_config=(
+                    cfg.pytorch_config_path or SA_FINETUNE_HRNET_W32_YAML
+                ),
+                customized_pose_checkpoint=pose_snapshot_path,
+                customized_detector_checkpoint=cfg.detector_snapshot_path or None,
+            )
+        else:
+            customized_kwargs = {}
+
         with tempfile.TemporaryDirectory() as tmpdir:
             # Write each frame as an image so DLC can read it.
             paths: list[str] = []
@@ -272,10 +294,12 @@ def predict(
                 images=paths,
                 max_individuals=cfg.max_individuals,
                 out_folder=tmpdir,
-                progress_bar=False
+                progress_bar=False,
+                **customized_kwargs,
             )
             # predictions: {image_path: {"bodyparts": (N_ind, K, 3), ...}}
-            # Iterate in input order to keep frame alignment stable.
+            # In fine-tuned mode K == 26 already; in stock mode K == 39
+            # (quadruped80K) and is remapped via _map_keypoints/_map_scores.
             for img_path in paths:
                 payload = predictions.get(img_path) if isinstance(predictions, dict) else None
                 if payload is None and isinstance(predictions, dict) and len(predictions) == 1:
@@ -291,8 +315,12 @@ def predict(
 
                 xy = bodyparts[..., :2]   # (N_ind, K, 2)
                 conf = bodyparts[..., 2]  # (N_ind, K)
-                mapped = self._map_keypoints(xy)
-                mapped_scores = self._map_scores(conf)
+                if is_finetuned:
+                    mapped = xy
+                    mapped_scores = conf
+                else:
+                    mapped = self._map_keypoints(xy)
+                    mapped_scores = self._map_scores(conf)
 
                 # Take only the first individual.
                 all_mapped.append(mapped[:1])
@@ -599,7 +627,13 @@ def _default_components(
     means adding one branch here (or turning this into a registry).
     """
     if model_cfg.model_type == SupportedModel.FMPOSE3D_ANIMALS:
-        return SuperAnimalEstimator(), AnimalPostProcessor()
+        # Default to fine-tuned + lazy HF auto-download so the animal API
+        # works out-of-the-box. Construction stays cheap (no network);
+        # the download fires on the first predict() call.
+        return (
+            SuperAnimalEstimator(SuperAnimalConfig(auto_download_finetuned=True)),
+            AnimalPostProcessor(),
+        )
     return HRNetEstimator(), HumanPostProcessor()
 
 
diff --git a/fmpose3d/lib/hrnet/gen_kpts.py b/fmpose3d/lib/hrnet/gen_kpts.py
index 0049997c..1445b700 100755
--- a/fmpose3d/lib/hrnet/gen_kpts.py
+++ b/fmpose3d/lib/hrnet/gen_kpts.py
@@ -82,11 +82,11 @@ def reset_config(args):
 
 # load model
 def model_load(config):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model = pose_hrnet.get_pose_net(config, is_train=False)
-    if torch.cuda.is_available():
-        model = model.cuda()
+    model = model.to(device)
 
-    state_dict = torch.load(config.OUTPUT_DIR, weights_only=True)
+    state_dict = torch.load(config.OUTPUT_DIR, map_location=device, weights_only=True)
     from collections import OrderedDict
     new_state_dict = OrderedDict()
     for k, v in state_dict.items():
@@ -133,8 +133,8 @@ def gen_from_image(args, frame, people_sort, human_model, pose_model, det_dim=41
 
         inputs = inputs[:, [2, 1, 0]]
 
-        if torch.cuda.is_available():
-            inputs = inputs.cuda()
+        device = next(pose_model.parameters()).device
+        inputs = inputs.to(device)
         output = pose_model(inputs)
 
         # compute coordinate
diff --git a/fmpose3d/lib/hrnet/hrnet.py b/fmpose3d/lib/hrnet/hrnet.py
index 0d0b7529..fa8e6822 100644
--- a/fmpose3d/lib/hrnet/hrnet.py
+++ b/fmpose3d/lib/hrnet/hrnet.py
@@ -196,11 +196,11 @@ def _load_hrnet(config):
         """Instantiate HRNet and load checkpoint weights."""
         from fmpose3d.lib.hrnet.lib.models import pose_hrnet
 
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         model = pose_hrnet.get_pose_net(config, is_train=False)
-        if torch.cuda.is_available():
-            model = model.cuda()
+        model = model.to(device)
 
-        state_dict = torch.load(config.OUTPUT_DIR, weights_only=True)
+        state_dict = torch.load(config.OUTPUT_DIR, map_location=device, weights_only=True)
         new_state_dict = OrderedDict()
         for k, v in state_dict.items():
             new_state_dict[k] = v
@@ -258,8 +258,8 @@ def _estimate_frame(
             )
             inputs = inputs[:, [2, 1, 0]]
 
-            if torch.cuda.is_available():
-                inputs = inputs.cuda()
+            device = next(self._pose_model.parameters()).device
+            inputs = inputs.to(device)
             output = self._pose_model(inputs)
 
             preds, maxvals = get_final_preds(
@@ -277,4 +277,3 @@ def _estimate_frame(
             scores[i] = score.squeeze()
 
         return kpts, scores
-
diff --git a/fmpose3d/lib/hrnet/lib/utils/coco_h36m.py b/fmpose3d/lib/hrnet/lib/utils/coco_h36m.py
deleted file mode 100755
index 577ef0bb..00000000
--- a/fmpose3d/lib/hrnet/lib/utils/coco_h36m.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""
-FMPose3D: monocular 3D Pose Estimation via Flow Matching
-
-Official implementation of the paper:
-"FMPose3D: monocular 3D Pose Estimation via Flow Matching"
-by Ti Wang, Xiaohang Yu, and Mackenzie Weygandt Mathis
-Licensed under Apache 2.0
-"""
-
-import numpy as np
-
-
-h36m_coco_order = [9, 11, 14, 12, 15, 13, 16, 4, 1, 5, 2, 6, 3]
-coco_order = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
-spple_keypoints = [10, 8, 0, 7]
-
-
-def coco_h36m(keypoints):
-    # keypoints: (T, N, 2) or (M, N, 2)
-
-    temporal = keypoints.shape[0]
-    keypoints_h36m = np.zeros_like(keypoints, dtype=np.float32)
-    htps_keypoints = np.zeros((temporal, 4, 2), dtype=np.float32)
-
-    # htps_keypoints: head, thorax, pelvis, spine
-    htps_keypoints[:, 0, 0] = np.mean(keypoints[:, 1:5, 0], axis=1, dtype=np.float32)
-    htps_keypoints[:, 0, 1] = np.sum(keypoints[:, 1:3, 1], axis=1, dtype=np.float32) - keypoints[:, 0, 1]
-    htps_keypoints[:, 1, :] = np.mean(keypoints[:, 5:7, :], axis=1, dtype=np.float32)
-    htps_keypoints[:, 1, :] += (keypoints[:, 0, :] - htps_keypoints[:, 1, :]) / 3
-
-    htps_keypoints[:, 2, :] = np.mean(keypoints[:, 11:13, :], axis=1, dtype=np.float32)
-    htps_keypoints[:, 3, :] = np.mean(keypoints[:, [5, 6, 11, 12], :], axis=1, dtype=np.float32)
-
-    keypoints_h36m[:, spple_keypoints, :] = htps_keypoints
-    keypoints_h36m[:, h36m_coco_order, :] = keypoints[:, coco_order, :]
-
-    keypoints_h36m[:, 9, :] -= (keypoints_h36m[:, 9, :] - np.mean(keypoints[:, 5:7, :], axis=1, dtype=np.float32)) / 4
-    keypoints_h36m[:, 7, 0] += 0.3*(keypoints_h36m[:, 7, 0] - np.mean(keypoints_h36m[:, [0, 8], 0], axis=1, dtype=np.float32))
-    keypoints_h36m[:, 8, 1] -= (np.mean(keypoints[:, 1:3, 1], axis=1, dtype=np.float32) - keypoints[:, 0, 1])*2/3
-
-    # half body: the joint of ankle and knee equal to hip
-    # keypoints_h36m[:, [2, 3]] = keypoints_h36m[:, [1, 1]]
-    # keypoints_h36m[:, [5, 6]] = keypoints_h36m[:, [4, 4]]
-    return keypoints_h36m
-
-
-h36m_mpii_order = [3, 2, 1, 4, 5, 6, 0, 8, 9, 10, 16, 15, 14, 11, 12, 13]
-mpii_order = [i for i in range(16)]
-lr_hip_shouler = [2, 3, 12, 13]
-
-
-def mpii_h36m(keypoints):
-    temporal = keypoints.shape[0]
-    keypoints_h36m = np.zeros((temporal, 17, 2), dtype=np.float32)
-    keypoints_h36m[:, h36m_mpii_order] = keypoints
-    # keypoints_h36m[:, 7] = np.mean(keypoints[:, 6:8], axis=1, dtype=np.float32)
-    keypoints_h36m[:, 7] = np.mean(keypoints[:, lr_hip_shouler], axis=1, dtype=np.float32)
-    return keypoints_h36m
-
-
diff --git a/fmpose3d/lib/hrnet/lib/utils/utilitys.py b/fmpose3d/lib/hrnet/lib/utils/utilitys.py
index ba587ff7..acd65d88 100755
--- a/fmpose3d/lib/hrnet/lib/utils/utilitys.py
+++ b/fmpose3d/lib/hrnet/lib/utils/utilitys.py
@@ -14,7 +14,6 @@
 import torchvision.transforms as transforms
 from fmpose3d.lib.hrnet.lib.utils.transforms import *
 
-from fmpose3d.lib.hrnet.lib.utils.coco_h36m import coco_h36m
 import numpy as np
 
 joint_pairs = [[0, 1], [1, 3], [0, 2], [2, 4],
diff --git a/fmpose3d/lib/yolov3/cfg/tiny-yolo-voc.cfg b/fmpose3d/lib/yolov3/cfg/tiny-yolo-voc.cfg
deleted file mode 100755
index ab2c066a..00000000
--- a/fmpose3d/lib/yolov3/cfg/tiny-yolo-voc.cfg
+++ /dev/null
@@ -1,134 +0,0 @@
-[net]
-batch=64
-subdivisions=8
-width=416
-height=416
-channels=3
-momentum=0.9
-decay=0.0005
-angle=0
-saturation = 1.5
-exposure = 1.5
-hue=.1
-
-learning_rate=0.001
-max_batches = 40200
-policy=steps
-steps=-1,100,20000,30000
-scales=.1,10,.1,.1
-
-[convolutional]
-batch_normalize=1
-filters=16
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=32
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=64
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=128
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=1
-
-[convolutional]
-batch_normalize=1
-filters=1024
-size=3
-stride=1
-pad=1
-activation=leaky
-
-###########
-
-[convolutional]
-batch_normalize=1
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
-
-[convolutional]
-size=1
-stride=1
-pad=1
-filters=125
-activation=linear
-
-[region]
-anchors = 1.08,1.19,  3.42,4.41,  6.63,11.38,  9.42,5.11,  16.62,10.52
-bias_match=1
-classes=20
-coords=4
-num=5
-softmax=1
-jitter=.2
-rescore=1
-
-object_scale=5
-noobject_scale=1
-class_scale=1
-coord_scale=1
-
-absolute=1
-thresh = .6
-random=1
diff --git a/fmpose3d/lib/yolov3/cfg/yolo-voc.cfg b/fmpose3d/lib/yolov3/cfg/yolo-voc.cfg
deleted file mode 100755
index d5bdfc1c..00000000
--- a/fmpose3d/lib/yolov3/cfg/yolo-voc.cfg
+++ /dev/null
@@ -1,258 +0,0 @@
-[net]
-# Testing
-batch=64
-subdivisions=8
-# Training
-# batch=64
-# subdivisions=8
-height=416
-width=416
-channels=3
-momentum=0.9
-decay=0.0005
-angle=0
-saturation = 1.5
-exposure = 1.5
-hue=.1
-
-learning_rate=0.001
-burn_in=1000
-max_batches = 80200
-policy=steps
-steps=-1,500,40000,60000
-scales=0.1,10,.1,.1
-
-[convolutional]
-batch_normalize=1
-filters=32
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=64
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=128
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=64
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=128
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=128
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=1024
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=1024
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=1024
-size=3
-stride=1
-pad=1
-activation=leaky
-
-
-#######
-
-[convolutional]
-batch_normalize=1
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
-
-[route]
-layers=-9
-
-[convolutional]
-batch_normalize=1
-size=1
-stride=1
-pad=1
-filters=64
-activation=leaky
-
-[reorg]
-stride=2
-
-[route]
-layers=-1,-4
-
-[convolutional]
-batch_normalize=1
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
-
-[convolutional]
-size=1
-stride=1
-pad=1
-filters=125
-activation=linear
-
-
-[region]
-anchors =  1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071
-bias_match=1
-classes=20
-coords=4
-num=5
-softmax=1
-jitter=.3
-rescore=1
-
-object_scale=5
-noobject_scale=1
-class_scale=1
-coord_scale=1
-
-absolute=1
-thresh = .6
-random=1
diff --git a/fmpose3d/lib/yolov3/cfg/yolo.cfg b/fmpose3d/lib/yolov3/cfg/yolo.cfg
deleted file mode 100755
index 2a0cd98f..00000000
--- a/fmpose3d/lib/yolov3/cfg/yolo.cfg
+++ /dev/null
@@ -1,258 +0,0 @@
-[net]
-# Testing
-batch=1
-subdivisions=1
-# Training
-# batch=64
-# subdivisions=8
-width=416
-height=416
-channels=3
-momentum=0.9
-decay=0.0005
-angle=0
-saturation = 1.5
-exposure = 1.5
-hue=.1
-
-learning_rate=0.001
-burn_in=1000
-max_batches = 500200
-policy=steps
-steps=400000,450000
-scales=.1,.1
-
-[convolutional]
-batch_normalize=1
-filters=32
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=64
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=128
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=64
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=128
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=128
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=1024
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=1024
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=1024
-size=3
-stride=1
-pad=1
-activation=leaky
-
-
-#######
-
-[convolutional]
-batch_normalize=1
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
-
-[route]
-layers=-9
-
-[convolutional]
-batch_normalize=1
-size=1
-stride=1
-pad=1
-filters=64
-activation=leaky
-
-[reorg]
-stride=2
-
-[route]
-layers=-1,-4
-
-[convolutional]
-batch_normalize=1
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
-
-[convolutional]
-size=1
-stride=1
-pad=1
-filters=425
-activation=linear
-
-
-[region]
-anchors =  0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
-bias_match=1
-classes=80
-coords=4
-num=5
-softmax=1
-jitter=.3
-rescore=1
-
-object_scale=5
-noobject_scale=1
-class_scale=1
-coord_scale=1
-
-absolute=1
-thresh = .6
-random=1
diff --git a/fmpose3d/lib/yolov3/data/pallete b/fmpose3d/lib/yolov3/data/pallete
deleted file mode 100755
index 25f0143e..00000000
Binary files a/fmpose3d/lib/yolov3/data/pallete and /dev/null differ
diff --git a/fmpose3d/lib/yolov3/data/voc.names b/fmpose3d/lib/yolov3/data/voc.names
deleted file mode 100755
index 8420ab35..00000000
--- a/fmpose3d/lib/yolov3/data/voc.names
+++ /dev/null
@@ -1,20 +0,0 @@
-aeroplane
-bicycle
-bird
-boat
-bottle
-bus
-car
-cat
-chair
-cow
-diningtable
-dog
-horse
-motorbike
-person
-pottedplant
-sheep
-sofa
-train
-tvmonitor
diff --git a/fmpose3d/utils/weights.py b/fmpose3d/utils/weights.py
index 941e4817..8ca98324 100644
--- a/fmpose3d/utils/weights.py
+++ b/fmpose3d/utils/weights.py
@@ -7,30 +7,31 @@
 Licensed under Apache 2.0
 """
 
-"""Shared helpers for resolving / downloading FMPose3D model weights."""
+"""Shared helper for resolving / downloading FMPose3D model weights."""
 
 HF_REPO_ID: str = "MLAdaptiveIntelligence/FMPose3D"
 
 
-def resolve_weights_path(model_weights_path: str, model_type: str) -> str:
+def resolve_weights_path(local_path: str, filename: str) -> str:
     """Return a local weights path, downloading from Hugging Face Hub if needed.
 
     Parameters
     ----------
-    model_weights_path : str
-        User-supplied local path.  If falsy the weights are fetched from the
-        Hugging Face Hub automatically.
-    model_type : str
-        Model variant name used to derive the remote filename
-        (e.g. ``"fmpose3d_humans"`` -> ``fmpose3d_humans.pth``).
+    local_path : str
+        User-supplied local path. If falsy, ``filename`` is fetched from
+        the Hugging Face Hub (cached under ``~/.cache/huggingface``).
+    filename : str
+        The exact remote filename in the FMPose3D Hugging Face repo
+        (e.g. ``"fmpose3d_humans.pth"``, ``"fmpose3d_animals.pth"``,
+        ``"sa_finetune_hrnet_w32.pt"``).
 
     Returns
     -------
     str
         Absolute path to the weight file on disk.
     """
-    if model_weights_path:
-        return model_weights_path
+    if local_path:
+        return local_path
 
     try:
         from huggingface_hub import hf_hub_download
@@ -41,7 +42,6 @@ def resolve_weights_path(model_weights_path: str, model_type: str) -> str:
             "Or download the weights manually and pass the local path."
         ) from None
 
-    filename = f"{model_type}.pth"
     print(
         f"No local weights path specified. "
         f"Downloading '{filename}' from Hugging Face ({HF_REPO_ID})..."
diff --git a/pyproject.toml b/pyproject.toml
index e7df467d..d91a9f79 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "fmpose3d"
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.10,<3.13"
 dynamic = ["version"]
 license = {text = "Apache 2.0"}
 authors = [
@@ -24,8 +24,12 @@ classifiers = [
 ]
 
 dependencies = [
-    "torch>=2.4.1",
-    "torchvision>=0.19.1",
+    # Pinned to torch 2.4.x: PyPI's Linux wheel for this range depends on
+    # CUDA 12.1 runtime packages. Newer torch releases may pull newer CUDA
+    # runtimes by default, so keep this bound to avoid surprising NVIDIA
+    # driver requirements for users.
+    "torch>=2.4.1,<2.5",
+    "torchvision>=0.19.1,<0.20",
     "timm>=1.0.0",
     "einops>=0.4.0",
     "numpy>=1.18.5,<2.0",
@@ -77,4 +81,4 @@ markers = [
 [tool.codespell]
 skip = '.git,*.pdf,*.svg,*.css,*.txt,*.pth'
 check-hidden = true
-ignore-words-list = 'fmpose,mpjpe,uvd,xyz,hm36,cpn,dbb'
+ignore-words-list = 'fmpose,mpjpe,uvd,xyz,hm36,cpn,dbb,mot'
diff --git a/scripts/FMPose3D_main.py b/scripts/FMPose3D_main.py
index 172a4824..5378d326 100644
--- a/scripts/FMPose3D_main.py
+++ b/scripts/FMPose3D_main.py
@@ -342,7 +342,7 @@ def print_error_action(action_error_sum, is_train):
 
     if args.reload:
         model_dict = model["CFM"].state_dict()
-        model_path = resolve_weights_path(args.model_weights_path, args.model_type)
+        model_path = resolve_weights_path(args.model_weights_path, f"{args.model_type}.pth")
 
         print(f"Loading weights from: {model_path}")
         pre_dict = torch.load(model_path, map_location=device, weights_only=True)
diff --git a/tests/fmpose3d_api/test_fmpose3d.py b/tests/fmpose3d_api/test_fmpose3d.py
index f4d4a026..89af7f60 100644
--- a/tests/fmpose3d_api/test_fmpose3d.py
+++ b/tests/fmpose3d_api/test_fmpose3d.py
@@ -36,7 +36,7 @@
     apply_limb_regularization,
     compute_limb_regularization_matrix,
 )
-from fmpose3d.common.config import FMPose3DConfig, InferenceConfig
+from fmpose3d.common.config import FMPose3DConfig, InferenceConfig, SuperAnimalConfig
 
 # ---------------------------------------------------------------------------
 # Helpers
@@ -336,6 +336,10 @@ def test_animal(self):
         est, pp = _default_components(FMPose3DConfig(model_type="fmpose3d_animals"))
         assert isinstance(est, SuperAnimalEstimator)
         assert isinstance(pp, AnimalPostProcessor)
+        # Animals default to fine-tuned mode with lazy HF auto-download so the
+        # API works out-of-the-box. Construction itself stays cheap (no network).
+        assert est.cfg.auto_download_finetuned is True
+        assert est.cfg.pose_snapshot_path == ""
 
 
 # =========================================================================
@@ -791,3 +795,162 @@ def test_predict_maps_valid_bodyparts(self):
         np.testing.assert_array_equal(mask, np.array([True]))
         # target[24] ← source[0] → (0*3, 0*3+1) = (0.0, 1.0)
         np.testing.assert_allclose(kpts[0, 0, 24], fake_bp[0, 0, :2])
+
+
+# =========================================================================
+# Unit tests — SuperAnimalEstimator fine-tuned mode (mocked DLC)
+# =========================================================================
+
+
+class TestSuperAnimalFinetunedPrediction:
+    """Fine-tuned mode covers two activation paths:
+
+    * ``cfg.pose_snapshot_path`` is non-empty (explicit local override).
+    * ``cfg.auto_download_finetuned=True`` with empty ``pose_snapshot_path``
+      (lazy HF auto-download on first predict).
+
+    Both forward ``customized_*`` kwargs to DLC's ``superanimal_analyze_images``
+    and skip the 39->26 keypoint remap.
+    """
+
+    def test_finetuned_forwards_customized_kwargs(self):
+        """pose_snapshot_path set → customized_* kwargs piped to DLC; empty
+        pytorch_config_path falls back to the packaged default yaml; empty
+        detector_snapshot_path forwards None so DLC resolves the stock detector.
+        """
+        pytest.importorskip("deeplabcut")
+        from fmpose3d.animals.configs import SA_FINETUNE_HRNET_W32_YAML
+
+        cfg = SuperAnimalConfig(pose_snapshot_path="/fake/snapshot.pt")
+        estimator = SuperAnimalEstimator(cfg)
+        frames = np.random.randint(0, 255, (1, 64, 64, 3), dtype=np.uint8)
+        fake_bp = np.random.rand(1, 26, 3).astype("float32")
+
+        captured: dict = {}
+
+        def spy(*_, **kwargs):
+            captured.update(kwargs)
+            return {kwargs["images"][0]: {"bodyparts": fake_bp}}
+
+        with patch(
+            "deeplabcut.pose_estimation_pytorch.apis.superanimal_analyze_images",
+            side_effect=spy,
+        ):
+            estimator.predict(frames)
+
+        assert captured["customized_pose_checkpoint"] == "/fake/snapshot.pt"
+        assert captured["customized_model_config"] == SA_FINETUNE_HRNET_W32_YAML
+        assert captured["customized_detector_checkpoint"] is None
+
+    def test_finetuned_skips_remap(self):
+        """26-joint DLC output passes through unchanged; the stock-SA
+        ``_map_keypoints`` / ``_map_scores`` helpers must not be called."""
+        pytest.importorskip("deeplabcut")
+
+        cfg = SuperAnimalConfig(pose_snapshot_path="/fake/snapshot.pt")
+        estimator = SuperAnimalEstimator(cfg)
+        frames = np.random.randint(0, 255, (1, 64, 64, 3), dtype=np.uint8)
+        # 26-joint output — what a fine-tuned snapshot natively produces.
+        fake_bp = np.arange(78, dtype="float32").reshape(1, 26, 3)
+
+        with patch.object(SuperAnimalEstimator, "_map_keypoints") as spy_map, \
+             patch.object(SuperAnimalEstimator, "_map_scores") as spy_scores, \
+             patch(
+                 "deeplabcut.pose_estimation_pytorch.apis.superanimal_analyze_images",
+             ) as mock_fn:
+            mock_fn.return_value = {"frame.png": {"bodyparts": fake_bp}}
+            kpts, scores, mask = estimator.predict(frames)
+
+        spy_map.assert_not_called()
+        spy_scores.assert_not_called()
+        assert kpts.shape == (1, 1, 26, 2)
+        assert scores.shape == (1, 1, 26)
+        np.testing.assert_array_equal(mask, np.array([True]))
+        # Output is the raw bodyparts xy / conf, not a remap.
+        np.testing.assert_allclose(kpts[0, 0], fake_bp[0, :, :2])
+        np.testing.assert_allclose(scores[0, 0], fake_bp[0, :, 2])
+
+    def test_finetuned_custom_paths_override_packaged_defaults(self):
+        """Explicit pytorch_config_path / detector_snapshot_path override the
+        packaged defaults and are forwarded verbatim to DLC."""
+        pytest.importorskip("deeplabcut")
+
+        cfg = SuperAnimalConfig(
+            pose_snapshot_path="/fake/snapshot.pt",
+            pytorch_config_path="/custom/pytorch_config.yaml",
+            detector_snapshot_path="/custom/detector.pt",
+        )
+        estimator = SuperAnimalEstimator(cfg)
+        frames = np.random.randint(0, 255, (1, 64, 64, 3), dtype=np.uint8)
+        fake_bp = np.random.rand(1, 26, 3).astype("float32")
+
+        captured: dict = {}
+
+        def spy(*_, **kwargs):
+            captured.update(kwargs)
+            return {kwargs["images"][0]: {"bodyparts": fake_bp}}
+
+        with patch(
+            "deeplabcut.pose_estimation_pytorch.apis.superanimal_analyze_images",
+            side_effect=spy,
+        ):
+            estimator.predict(frames)
+
+        assert captured["customized_pose_checkpoint"] == "/fake/snapshot.pt"
+        assert captured["customized_model_config"] == "/custom/pytorch_config.yaml"
+        assert captured["customized_detector_checkpoint"] == "/custom/detector.pt"
+
+    def test_stock_mode_does_not_forward_customized_kwargs(self):
+        """Default config (empty pose_snapshot_path, auto_download_finetuned=False)
+        → no customized_* kwargs; DLC runs with stock SuperAnimal-Quadruped
+        weights and the 39->26 remap path is taken downstream."""
+        pytest.importorskip("deeplabcut")
+
+        estimator = SuperAnimalEstimator()  # default config (stock SA mode)
+        frames = np.random.randint(0, 255, (1, 64, 64, 3), dtype=np.uint8)
+        fake_bp = np.random.rand(1, 40, 3).astype("float32")  # fake stock output; NOTE(review): 40 joints here vs. the "39->26" remap in the docstring — confirm count
+
+        captured: dict = {}
+
+        def spy(*_, **kwargs):
+            captured.update(kwargs)
+            return {kwargs["images"][0]: {"bodyparts": fake_bp}}
+
+        with patch(
+            "deeplabcut.pose_estimation_pytorch.apis.superanimal_analyze_images",
+            side_effect=spy,
+        ):
+            estimator.predict(frames)
+
+        assert not any(k.startswith("customized_") for k in captured), (
+            f"stock mode must not forward customized_* kwargs, got: {list(captured)}"
+        )
+
+    def test_auto_download_finetuned_resolves_via_hf_at_predict_time(self):
+        """auto_download_finetuned=True with empty pose_snapshot_path triggers
+        a lazy HF resolution on the first predict() call. The resolved path
+        is forwarded to DLC as customized_pose_checkpoint."""
+        pytest.importorskip("deeplabcut")
+
+        cfg = SuperAnimalConfig(auto_download_finetuned=True)
+        assert cfg.pose_snapshot_path == ""  # trigger condition
+        estimator = SuperAnimalEstimator(cfg)
+        frames = np.random.randint(0, 255, (1, 64, 64, 3), dtype=np.uint8)
+        fake_bp = np.random.rand(1, 26, 3).astype("float32")
+        captured: dict = {}
+
+        def spy(*_, **kwargs):
+            captured.update(kwargs)
+            return {kwargs["images"][0]: {"bodyparts": fake_bp}}
+
+        with patch(
+            "fmpose3d.utils.weights.resolve_weights_path",
+            return_value="/hf/cache/sa_finetune_hrnet_w32.pt",
+        ) as mock_resolver, patch(
+            "deeplabcut.pose_estimation_pytorch.apis.superanimal_analyze_images",
+            side_effect=spy,
+        ):
+            estimator.predict(frames)
+
+        mock_resolver.assert_called_once_with("", "sa_finetune_hrnet_w32.pt")
+        assert captured["customized_pose_checkpoint"] == "/hf/cache/sa_finetune_hrnet_w32.pt"
diff --git a/tests/test_config.py b/tests/test_config.py
index 2b2983c0..78457de6 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -69,13 +69,13 @@ def test_list_defaults_are_independent(self):
 
     def test_custom_values(self):
         cfg = DatasetConfig(
-            dataset="rat7m",
-            root_path="Rat7M_data/",
+            dataset="animal3d",
+            root_path="dataset/",
             joints_left=[8, 10, 11],
             joints_right=[9, 14, 15],
         )
-        assert cfg.dataset == "rat7m"
-        assert cfg.root_path == "Rat7M_data/"
+        assert cfg.dataset == "animal3d"
+        assert cfg.root_path == "dataset/"
         assert cfg.joints_left == [8, 10, 11]
 
 
@@ -261,9 +261,9 @@ def test_from_namespace_basic(self):
             out_channels=3,
             frames=3,
             # DatasetConfig
-            dataset="rat7m",
+            dataset="animal3d",
             keypoints="cpn",
-            root_path="Rat7M_data/",
+            root_path="dataset/",
             actions="*",
             downsample=1,
             subset=1.0,
@@ -343,7 +343,7 @@ def test_from_namespace_basic(self):
         # Verify a sample from each group
         assert cfg.model_cfg.layers == 5
         assert cfg.model_cfg.channel == 256
-        assert cfg.dataset_cfg.dataset == "rat7m"
+        assert cfg.dataset_cfg.dataset == "animal3d"
         assert cfg.dataset_cfg.joints_left == [8, 10]
         assert cfg.training_cfg.train is True
         assert cfg.training_cfg.nepoch == 100