diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index 5d98c37e..3155eb8f 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -18,4 +18,4 @@ jobs: - name: Codespell uses: codespell-project/actions-codespell@v1 with: - ignore_words_list: fmpose, mpjpe, uvd, xyz, hm36, cpn, dbb + ignore_words_list: fmpose, mpjpe, uvd, xyz, hm36, cpn, dbb, mot diff --git a/README.md b/README.md index 1005e6a0..5037520e 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ FMPose3D creates a 3D pose from a single 2D image. It leverages fast Flow Matchi ### Set up an environment -Make sure you have Python 3.10+. You can set this up with: +Make sure you have Python 3.10. The installation and demos are tested with Python 3.10. You can set this up with: ```bash conda create -n fmpose_3d python=3.10 conda activate fmpose_3d @@ -45,6 +45,8 @@ For the animal pipeline, install the optional DeepLabCut dependency: pip install "fmpose3d[animals]" ``` +> **PyTorch/CUDA note.** FMPose3D pins `torch>=2.4.1,<2.5` and `torchvision>=0.19.1,<0.20`, which use CUDA 12.1 wheels by default on Linux. If your driver does not support CUDA 12.1, or if you need a specific CUDA build, install PyTorch first using the matching command from [pytorch.org](https://pytorch.org/get-started/locally/), then install `fmpose3d`. + ## Demos ### Testing on in-the-wild images (humans) @@ -108,7 +110,7 @@ FMPose3D also ships a high-level Python API for end-to-end 3D pose estimation fr ## Experiments on non-human animals -For animal training/testing and demo scripts, see [animals/README.md](animals/README.md). +For animal training/testing and demo scripts, see [animals/README.md](animals/README.md). 
The animal demo **auto-downloads both checkpoints** (a 26-joint SuperAnimal-Quadruped fine-tuned on Animal3D for 2D pose, and the FMPose3D animal flow-matching lifter for 3D) from [Hugging Face](https://huggingface.co/MLAdaptiveIntelligence/FMPose3D) on first run — no manual setup needed. ## Citation diff --git a/animals/README.md b/animals/README.md index 5121533b..6ab16f31 100644 --- a/animals/README.md +++ b/animals/README.md @@ -9,8 +9,10 @@ In this part, the FMPose3D model is trained on [Animal3D](https://xujiacong.gith This visualization script is designed for single-frame based model, allowing you to easily run 3D animal pose estimation on any single image. -Before testing, make sure you have the pre-trained model ready. -You may either use the model trained by your own or download ours from [here](https://drive.google.com/drive/folders/1kL4aOyWNq0o9zB0rSTRM8KYgkySVmUTk?usp=drive_link) and place it in the `./pre_trained_models` directory. +Both pre-trained checkpoints are **auto-downloaded from [Hugging Face](https://huggingface.co/MLAdaptiveIntelligence/FMPose3D)** on first run and cached under `~/.cache/huggingface/`. No manual downloads required. + +- **3D lifter** (`fmpose3d_animals.pth`) — Animal3D 26-joint flow-matching 2D→3D lifter. Override: set `saved_model_path` in `vis_animals.sh` to a local `.pth`. +- **2D pose model** (`sa_finetune_hrnet_w32.pt`) — SuperAnimal-Quadruped HRNet-w32 fine-tuned on Animal3D for the 26-joint Animal3D output layout. Override: set `saved_2d_model_path` in `vis_animals.sh` to a local `.pt`. Next, put your test images into folder `demo/images`. Then run the visualization script: ```bash @@ -49,7 +51,7 @@ Place the downloaded files in the `dataset/` folder of this project: ## Training The training logs, checkpoints, and related files of each training time will be saved in the './checkpoint' folder. 
-For trainig on the two datasets: +For training on the two datasets: ```bash cd animals diff --git a/animals/demo/vis_animals.py b/animals/demo/vis_animals.py index 5772060b..b31abcb0 100644 --- a/animals/demo/vis_animals.py +++ b/animals/demo/vis_animals.py @@ -46,21 +46,6 @@ from fmpose3d.models import get_model CFM = get_model(args.model_type) -try: - from deeplabcut.pose_estimation_pytorch.apis import ( # pyright: ignore[reportMissingImports] - superanimal_analyze_images, - ) -except ImportError: - raise ImportError( - "DeepLabCut is required for the animal demo. " - "Install it with: pip install \"fmpose3d[animals]\"" - ) from None - -superanimal_name = "superanimal_quadruped" -model_name = "hrnet_w32" -detector_name = "fasterrcnn_resnet50_fpn_v2" -max_individuals = 1 - def compute_limb_regularization_matrix(gt_3d): """ Compute regularization matrix to align limb directions to vertical (0,0,1). @@ -145,108 +130,39 @@ def apply_regularization(pose_3d, R): """ return (R @ pose_3d.T).T -def get_pose2D(path, output_dir, type): +def build_2d_estimator(): + """Build the 2D pose estimator once. Snapshot resolves lazily on first predict. + + Empty --saved_2d_model_path -> auto-download fine-tuned snapshot from HF. + Non-empty path -> use as a local override. 
+ """ + from fmpose3d.common.config import SuperAnimalConfig + from fmpose3d.inference_api.fmpose3d import SuperAnimalEstimator + from fmpose3d.utils.weights import resolve_weights_path + + pose_snapshot_path = resolve_weights_path( + args.saved_2d_model_path, "sa_finetune_hrnet_w32.pt" + ) + cfg = SuperAnimalConfig( + pose_snapshot_path=pose_snapshot_path, + pytorch_config_path=args.pytorch_config_2d_path, + ) + print(f"[2D] pose snapshot = {cfg.pose_snapshot_path}") + return SuperAnimalEstimator(cfg) + + +def get_pose2D(estimator, path, output_dir, type): print('\nGenerating 2D pose...') - - # Check if this is the special debug case for 000000119761_horse - filename = Path(path).stem - is_debug_case = "000000119761_horse" in filename - - if is_debug_case: - print(f"DEBUG MODE: Using provided 2D pose for {filename}") - # User provided 2D pose (26 keypoints, x, y coordinates, ignoring the last dimension) - provided_pose = np.array([ - [361, 230], [361, 237], [363, 279], [257, 359], [251, 374], - [164, 365], [68, 372], [99, 206], [247, 266], [253, 285], - [127, 275], [101, 285], [267, 217], [268, 229], [273, 318], - [250, 340], [128, 311], [76, 305], [313, 220], [48, 310], - [351, 203], [352, 210], [340, 257], [340, 261], [373, 276], - [55, 247] - ], dtype=np.float32) - - # Reshape to match expected format: (1, 26, 2) for single individual - provided_pose = provided_pose.reshape(1, 26, 2) - - # Create xy_preds dict with the provided pose - xy_preds = {path: provided_pose} - print(f"Using provided 2D pose with shape: {provided_pose.shape}") - else: - # Normal prediction flow - predictions = superanimal_analyze_images( - superanimal_name, - model_name, - detector_name, - path, - max_individuals, - out_folder=output_dir - ) - print("predictions:", predictions) - - # get the 2D keypoints from the predictions - xy_preds = {} - # predictions is a dict: {image_path: {"bodyparts": (N, K, 3), "bboxes": ..., "bbox_scores": ...}} - for img_path, payload in predictions.items(): 
- bodyparts = payload.get("bodyparts") - if bodyparts is None: - continue - # bodyparts shape: (num_individuals, num_keypoints, 3) -> [:, :, :2] keeps x,y - xy_preds[img_path] = bodyparts[..., :2] - - print("2D keypoints (x,y) by image:") - for img_path, xy in xy_preds.items(): - print(f"{img_path}: shape {xy.shape}") - - # For debug case, the provided pose is already in Animal3D format (26 keypoints) - # So we skip the mapping step - if is_debug_case: - print("DEBUG MODE: Skipping keypoint mapping (already in Animal3D format)") - mapped_keypoints = xy_preds - else: - # now map the keypoints to a different set of keypoints (used in Animal3D) - # keypoint mapping from quadruped80K super keypotints to animal3d keypoints - keypoint_mapping = {"quadruped80k":[10, 5, -1, 26, 29, 30, 35, 22, 24, 27, 31, 32, -1, -1, 25, 28, 33, 34, 15, 23, 11, 6, 4, 3, 0, -1]} - - # for the keypoint_mapping, -1 indicates that there is no corresponding keypoint in the source set, but we can interpolate - # for index 2, we can interpolate between keypoints 3 and 4 in the source set to get a better estimate of the missing keypoint - # for index 25, we can interpolate between keypoints 22 and 23 in the source set - # for index 12, we can interpolate between keypoints 24 and 19 in the source set - # for index 13, we can interpolate between keypoints 27 and 19 in the source set - - # Define interpolation rules for -1 indices: {target_idx: (source_idx1, source_idx2)} - interpolation_rules = { - 2: (3, 4), # interpolate between source keypoints 3 and 4 - 12: (24, 19), # interpolate between source keypoints 24 and 19 - 13: (27, 19), # interpolate between source keypoints 27 and 19 - 25: (22, 23), # interpolate between source keypoints 22 and 23 - } - - # map the keypoints - mapped_keypoints = {} - mapping_indices = keypoint_mapping["quadruped80k"] - - for img_path, xy in xy_preds.items(): - # xy shape: (num_individuals, num_keypoints, 2) - num_individuals, num_keypoints, _ = xy.shape - 
num_target_keypoints = len(mapping_indices) - - # Initialize mapped array with NaN or zeros - mapped_xy = np.full((num_individuals, num_target_keypoints, 2), np.nan) - - for target_idx, source_idx in enumerate(mapping_indices): - if source_idx != -1 and source_idx < num_keypoints: - # Copy the keypoint from source to target position - mapped_xy[:, target_idx, :] = xy[:, source_idx, :] - elif source_idx == -1 and target_idx in interpolation_rules: - # Perform interpolation for -1 indices - src1, src2 = interpolation_rules[target_idx] - if src1 < num_keypoints and src2 < num_keypoints: - # Interpolate as the average of the two source keypoints - mapped_xy[:, target_idx, :] = (xy[:, src1, :] + xy[:, src2, :]) / 2.0 - print(f"Interpolated keypoint {target_idx} from source keypoints {src1} and {src2}") - - mapped_keypoints[img_path] = mapped_xy - print(f"Mapped {img_path}: {xy.shape} -> {mapped_xy.shape}") + + img_bgr = cv2.imread(path) + if img_bgr is None: + raise FileNotFoundError(f"Failed to read image: {path}") + + # predict() returns (kpts (1, N, 26, 2), scores (1, N, 26), valid_mask (N,)). + kpts, _scores, _mask = estimator.predict(img_bgr[None]) + # Pack into the {img_path: (1, 26, 2)} format expected by the save/vis code below. 
+ mapped_keypoints = {path: kpts[:, 0, :, :]} print('Generating 2D pose successful!') @@ -259,7 +175,6 @@ def get_pose2D(path, output_dir, type): # Save in the same format as vis_in_the_wild.py for compatibility output_npz = output_dir_2D + 'keypoints.npz' np.savez_compressed(output_npz, reconstruction=mapped_xy) - print(f"Saved keypoints to {output_npz}") # Also save as npy for backup img_name = Path(img_path).stem @@ -275,7 +190,6 @@ def get_pose2D(path, output_dir, type): index=[f'keypoint_{i}' for i in range(mapped_xy.shape[1])] ) df.to_csv(csv_file) - print(f"Saved individual {ind_idx} keypoints to {csv_file}") # Visualize mapped keypoints on image img = Image.open(img_path) @@ -328,39 +242,38 @@ def get_pose2D(path, output_dir, type): plt.tight_layout() plt.savefig(vis_file, dpi=150, bbox_inches='tight') plt.close(fig) - print(f"Saved visualization to {vis_file}") -def get_pose3D(path, output_dir, type='image'): - """ - Generate 3D pose from 2D keypoints using the model. - This function reads the 2D keypoints saved by get_pose2D and generates 3D poses. +def build_3d_lifter(): + """Build the 3D lifter once and return the model in eval mode. + + Empty --saved_model_path -> auto-download fmpose3d_animals.pth from HF. + Non-empty path is used as a local override. 
""" - print('\nGenerating 3D pose...') - print(f"args.n_joints: {args.n_joints}, args.out_joints: {args.out_joints}") - - ## Reload model + from fmpose3d.utils.weights import resolve_weights_path + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = CFM(args).to(device) - model = {} - model['CFM'] = CFM(args).to(device) - - model_dict = model['CFM'].state_dict() - model_path = args.saved_model_path - print(f"Loading model from: {model_path}") + model_path = resolve_weights_path(args.saved_model_path, f"{args.model_type}.pth") + print(f"[3D] lifter weights = {model_path}") pre_dict = torch.load(model_path, map_location=device, weights_only=True) - for name, key in model_dict.items(): + model_dict = model.state_dict() + for name in model_dict: model_dict[name] = pre_dict[name] - model['CFM'].load_state_dict(model_dict) - print("Model loaded successfully!") - - model = model['CFM'].eval() + model.load_state_dict(model_dict) + return model.eval() + + +def get_pose3D(model, path, output_dir, type='image'): + """ + Generate 3D pose from 2D keypoints using the model. + Reads the 2D keypoints saved by get_pose2D and generates 3D poses. 
+ """ + print('\nGenerating 3D pose...') - ## Load input 2D keypoints keypoints = np.load(output_dir + 'input_2D/keypoints.npz', allow_pickle=True)['reconstruction'] - print(f"Loaded keypoints shape: {keypoints.shape}") - ## Generate 3D poses if type == "image": i = 0 img = cv2.imread(path) @@ -422,9 +335,6 @@ def euler_sample(c_2d, y_local, steps, model_3d): return y_local ## Estimation (without TTA for better results) - print("input_2D.shape:", input_2D.shape) - print("input_2D:", input_2D[0, 0]) - # Single inference without flip augmentation # Create 3D random noise with shape (1, 1, J, 3) y = torch.randn(input_2D.size(0), input_2D.size(1), input_2D.size(2), 3, device=device) @@ -492,7 +402,6 @@ def euler_sample(c_2d, y_local, steps, model_3d): output_dir_2D_img = output_dir + 'pose2D_on_image/' os.makedirs(output_dir_2D_img, exist_ok=True) cv2.imwrite(f'{output_dir_2D_img}{i:04d}_2d.png', img_copy) - print(f"Saved 2D pose on image to {output_dir_2D_img}{i:04d}_2d.png") ## Save 3D pose as npz output_dir_3D = output_dir + 'pose3D/' @@ -603,46 +512,46 @@ def img2gif(video_path, name, output_dir, duration=0.25): if __name__ == "__main__": - + os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu path = args.path # file path or folder path - - # Check if path is a directory + + # Build the 2D estimator and 3D lifter ONCE; reuse across all images/frames. + # This avoids redundant HF resolution and DLC/torch model reloads. 
+ estimator_2d = build_2d_estimator() + model_3d = build_3d_lifter() + if os.path.isdir(path): - # Get all image files in the directory image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.JPG', '*.JPEG', '*.PNG', '*.BMP'] image_files = [] for ext in image_extensions: image_files.extend(glob.glob(os.path.join(path, ext))) image_files.sort() - + if len(image_files) == 0: print(f"No image files found in {path}") exit(0) - + print(f"Found {len(image_files)} images in {path}") - - # Process each image + for img_path in tqdm(image_files, desc="Processing images"): filename = img_path.split('/')[-1].split('.')[0] output_dir = './predictions/' + filename + '/' - + print(f"\nProcessing: {img_path}") - get_pose2D(img_path, output_dir, args.type) - get_pose3D(img_path, output_dir, args.type) - + get_pose2D(estimator_2d, img_path, output_dir, args.type) + get_pose3D(model_3d, img_path, output_dir, args.type) + print(f'\nAll {len(image_files)} images processed successfully!') else: # Single file processing filename = path.split('/')[-1].split('.')[0] output_dir = './predictions/' + filename + '/' - get_pose2D(path, output_dir, args.type) - get_pose3D(path, output_dir, args.type) + get_pose2D(estimator_2d, path, output_dir, args.type) + get_pose3D(model_3d, path, output_dir, args.type) - if args.type=="video": + if args.type == "video": img2video(path, filename, output_dir) - img2gif(path, filename, output_dir) - - print('Generating demo successful!') \ No newline at end of file + img2gif(path, filename, output_dir) \ No newline at end of file diff --git a/animals/demo/vis_animals.sh b/animals/demo/vis_animals.sh index e2944c2d..a2109695 100644 --- a/animals/demo/vis_animals.sh +++ b/animals/demo/vis_animals.sh @@ -4,12 +4,19 @@ gpu_id=1 sample_steps=3 batch_size=1 sh_file='vis_animals.sh' -# n_joints=26 -# out_joints=26 model_type='fmpose3d_animals' # model_path='' # set to a local file path to override the registry 
-saved_model_path='../pre_trained_models/fmpose3d_animals/fmpose3d_animals_pretrained_weights.pth' + +# 3D lifter (2D keypoints -> 3D pose). +# Empty -> auto-downloads fmpose3d_animals.pth from Hugging Face (cached under ~/.cache/huggingface). +# Set to a local path to override. +saved_model_path='' + +# 2D pose model (image -> 26-joint Animal3D keypoints). +# Empty -> auto-downloads from Hugging Face on first run (cached under ~/.cache/huggingface). +# Set to a local path to override (e.g., for a custom checkpoint). +saved_2d_model_path='' # path='./images/image_00068.jpg' # single image input_images_folder='./images/' # folder containing multiple images @@ -18,6 +25,7 @@ python3 vis_animals.py \ --type 'image' \ --path ${input_images_folder} \ --saved_model_path "${saved_model_path}" \ + --saved_2d_model_path "${saved_2d_model_path}" \ ${model_path:+--model_path "$model_path"} \ --model_type "${model_type}" \ --sample_steps ${sample_steps} \ @@ -25,4 +33,4 @@ python3 vis_animals.py \ --layers ${layers} \ --dataset animal3d \ --gpu ${gpu_id} \ - --sh_file ${sh_file} \ No newline at end of file + --sh_file ${sh_file} diff --git a/animals/scripts/main_animal3d.py b/animals/scripts/main_animal3d.py index c90bdead..b2bdaa7a 100644 --- a/animals/scripts/main_animal3d.py +++ b/animals/scripts/main_animal3d.py @@ -18,6 +18,7 @@ from fmpose3d.animals.common.arguments import opts as parse_args from fmpose3d.animals.common.utils import * from fmpose3d.animals.common.animal3d_dataset import TrainDataset +from fmpose3d.utils.weights import resolve_weights_path import time args = parse_args().parse() @@ -210,7 +211,7 @@ def get_parameter_number(net): if args.train==False: # create a new folder for the test results - args.folder_dir = os.path.dirname(args.saved_model_path) + args.folder_dir = os.path.dirname(args.saved_model_path) if args.saved_model_path else './checkpoint' args.checkpoint = os.path.join(args.folder_dir, 'test_results_' + args.create_time) if not 
os.path.exists(args.checkpoint): @@ -247,8 +248,8 @@ def get_parameter_number(net): train_paths = args.train_dataset_path if isinstance(args.train_dataset_path, list) else [args.train_dataset_path] test_paths = args.test_dataset_path if isinstance(args.test_dataset_path, list) else [args.test_dataset_path] - # Rat7M doesn't have action labels, use placeholder for error calculation - actions = ['rat_motion'] + # Animal3D doesn't have per-clip action labels; use a single placeholder bucket for error aggregation. + actions = ['animal_motion'] if args.train: train_datasets = [TrainDataset(is_train=True, json_file=p, root_joint=args.root_joint) for p in train_paths] @@ -268,9 +269,8 @@ def get_parameter_number(net): if args.reload: model_dict = model['CFM'].state_dict() - # Prefer explicit saved_model_path; otherwise fallback to previous_dir glob - model_path = args.saved_model_path - print(model_path) + model_path = resolve_weights_path(args.saved_model_path, f"{args.model_type}.pth") + print(f"Loading weights from: {model_path}") pre_dict = torch.load(model_path, weights_only=True, map_location=device) for name, key in model_dict.items(): model_dict[name] = pre_dict[name] @@ -348,4 +348,3 @@ def get_parameter_number(net): print(args.checkpoint) logging.info(args.checkpoint) - diff --git a/animals/scripts/test_animal3d.sh b/animals/scripts/test_animal3d.sh index 207e3321..c975d136 100644 --- a/animals/scripts/test_animal3d.sh +++ b/animals/scripts/test_animal3d.sh @@ -11,8 +11,11 @@ n_joints=26 out_joints=26 epochs=300 model_type='fmpose3d_animals' -# model_path='' # set to a local file path to override the registry -saved_model_path='./pre_trained_models/fmpose3d_animals/fmpose3d_animals_pretrained_weights.pth' +model_path='' # set to a local file path to override the registry +# By default, weights are automatically downloaded from Hugging Face Hub. 
+# To use local weights instead, uncomment the line below: +# saved_model_path='./pre_trained_models/fmpose3d_animals/fmpose3d_animals_pretrained_weights.pth' +saved_model_path='' # root path denotes the path to the original dataset root_path="./dataset/" @@ -47,4 +50,4 @@ python ./scripts/main_animal3d.py \ --lr_decay_large ${lr_decay_large} \ --train_dataset_path ${train_dataset_paths[@]} \ --test_dataset_path ${test_dataset_paths[@]} \ - --saved_model_path ${saved_model_path} \ No newline at end of file + --saved_model_path "${saved_model_path}" diff --git a/demo/vis_in_the_wild.py b/demo/vis_in_the_wild.py index 8f888198..26515964 100755 --- a/demo/vis_in_the_wild.py +++ b/demo/vis_in_the_wild.py @@ -279,7 +279,7 @@ def get_pose3D(path, output_dir, type='image'): # if args.reload: model_dict = model['CFM'].state_dict() - model_path = resolve_weights_path(args.model_weights_path, args.model_type) + model_path = resolve_weights_path(args.model_weights_path, f"{args.model_type}.pth") print(f"Loading weights from: {model_path}") pre_dict = torch.load(model_path, map_location=device, weights_only=True) diff --git a/fmpose3d/animals/common/animal_visualization.py b/fmpose3d/animals/common/animal_visualization.py deleted file mode 100644 index c45d6f66..00000000 --- a/fmpose3d/animals/common/animal_visualization.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -FMPose3D: monocular 3D Pose Estimation via Flow Matching - -Official implementation of the paper: -"FMPose3D: monocular 3D Pose Estimation via Flow Matching" -by Ti Wang, Xiaohang Yu, and Mackenzie Weygandt Mathis -Licensed under Apache 2.0 -""" - -import os - -import cv2 -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd - - -def save_3Dpose_colored(pre_pose, gt_pose, figure_name): - fig = plt.figure() - ax1 = fig.add_subplot(211, projection="3d") - ax1.scatter( - pre_pose[:, 0], pre_pose[:, 1], pre_pose[:, 2], c=list(range(pre_pose.shape[0])), cmap="jet" - ) - # plt.axis('off') - ax2 = 
fig.add_subplot(212, projection="3d") - ax2.scatter( - gt_pose[:, 0], gt_pose[:, 1], gt_pose[:, 2], c=list(range(gt_pose.shape[0])), cmap="jet" - ) - # plt.axis('off') - plt.show() - plt.savefig(figure_name, dpi=400.0) - plt.close() - - -def save_absolute_3Dpose_image(image, pre_pose, gt_pose, vid_3D, skeleton, figure_name): - fig = plt.figure(figsize=(20, 9)) - ax1 = fig.add_subplot(131, projection="3d") - ax1.scatter( - pre_pose[:, 0], - pre_pose[:, 2], - -pre_pose[:, 1], - c=list(range(pre_pose.shape[0])), - cmap="jet", - ) - for i in range(skeleton.shape[0]): - ax1.plot( - [pre_pose[skeleton[i, 0], 0], pre_pose[skeleton[i, 1], 0]], - [pre_pose[skeleton[i, 0], 2], pre_pose[skeleton[i, 1], 2]], - [-pre_pose[skeleton[i, 0], 1], -pre_pose[skeleton[i, 1], 1]], - c="black", - ) - ax1.set_xlim([-3, 3]) - ax1.set_zlim([-1.5, 3]) - ax1.set_ylim([12, 20]) - ax1.title.set_text("Prediction") - - # plt.axis('off') - ax2 = fig.add_subplot(132, projection="3d") - visiable_gt = gt_pose[np.where(vid_3D)[0], :] - ax2.scatter( - visiable_gt[:, 0], - visiable_gt[:, 2], - -visiable_gt[:, 1], - c=list(np.array(range(gt_pose.shape[0]))[np.where(vid_3D)]), - cmap="jet", - ) - for i in range(skeleton.shape[0]): - if vid_3D[skeleton[i, 0]] > 0 and vid_3D[skeleton[i, 1]] > 0: - ax2.plot( - [gt_pose[skeleton[i, 0], 0], gt_pose[skeleton[i, 1], 0]], - [gt_pose[skeleton[i, 0], 2], gt_pose[skeleton[i, 1], 2]], - [-gt_pose[skeleton[i, 0], 1], -gt_pose[skeleton[i, 1], 1]], - c="black", - ) - ax2.set_xlim([-3, 3]) - ax2.set_zlim([-1.5, 3]) - ax2.set_ylim([12, 20]) - ax2.title.set_text("GT") - # plt.axis('off') - ax3 = fig.add_subplot(133) - ax3.imshow(image) - ax3.title.set_text("Camera1 view") - plt.show() - plt.savefig(figure_name, dpi=200.0) - plt.close() - - -def save_absolute_3Dpose(pre_pose, skeleton, figure_name): - fig = plt.figure(figsize=(20, 9)) - ax1 = fig.add_subplot(111, projection="3d") - ax1.scatter( - pre_pose[:, 0], - pre_pose[:, 2], - -pre_pose[:, 1], - 
c=list(range(pre_pose.shape[0])), - cmap="jet", - ) - for i in range(skeleton.shape[0]): - ax1.plot( - [pre_pose[skeleton[i, 0], 0], pre_pose[skeleton[i, 1], 0]], - [pre_pose[skeleton[i, 0], 2], pre_pose[skeleton[i, 1], 2]], - [-pre_pose[skeleton[i, 0], 1], -pre_pose[skeleton[i, 1], 1]], - c="black", - ) - ax1.set_xlim([-1, 1]) - ax1.set_zlim([-1, 1]) - ax1.set_ylim([-1, 1]) - ax1.title.set_text("gt") - plt.show() - plt.savefig(figure_name, dpi=200.0) - plt.close() diff --git a/fmpose3d/animals/common/arber_dataset.py b/fmpose3d/animals/common/arber_dataset.py deleted file mode 100644 index 27dba171..00000000 --- a/fmpose3d/animals/common/arber_dataset.py +++ /dev/null @@ -1,312 +0,0 @@ -""" -FMPose3D: monocular 3D Pose Estimation via Flow Matching - -Official implementation of the paper: -"FMPose3D: monocular 3D Pose Estimation via Flow Matching" -by Ti Wang, Xiaohang Yu, and Mackenzie Weygandt Mathis -Licensed under Apache 2.0 -""" - -import copy -import gc -import glob -import os -import random - -import cv2 -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import torch -from torch import from_numpy as FN -from torch.utils.data import Dataset -from tqdm import tqdm - -from fmpose3d.common.camera import normalize_screen_coordinates -from fmpose3d.animals.common.lifter3d import load_camera_params, load_h5_keypoints - - -class ArberDataset(Dataset): - def __init__( - self, - cfg, - path, - split, - cam_names, - root_index=12, - joint_num=23, - sampling_gap=100, - frame_per_video=9000, - norm_rate=50.0, - img_W=2048, - img_H=1536, - arg_views=1, - resize_2D_scale=0.5, - visualize=False, - ): - - self.cfg = cfg - self.cam_names = cam_names - self.joint_num = joint_num - self.root_index = root_index - self.img_W = img_W * resize_2D_scale - self.img_H = img_H * resize_2D_scale - self.arg_views = arg_views - self.split = split - self.visualize = visualize - - # subject_index: category names - subject_index = os.listdir(path) - 
subject_index.sort() - - # use split to define start and end frame - if split == "Train": - self.subject_list = subject_index - self.start_frame = 0 - self.end_frame = 10000 - elif split == "Valid": - self.subject_list = subject_index - self.start_frame = 3 - self.end_frame = 8000 - elif split == "Test": - self.subject_list = subject_index - self.start_frame = 6 - self.end_frame = 10000 - elif split == "Infer": - self.subject_list = subject_index[:1] - self.start_frame = 0 - self.end_frame = 2000000 - - # prepare pose data - print("prepare the pose data...") - self.pose_3D_list = [] - self.pose_2D_list = [] - self.sample_info_list = [] - self.cam_para_list = [] - - for sub_idx, subject_name in enumerate(self.subject_list): # iterate on subject - print(subject_name) - subject_folder = os.path.join(path, subject_name) - - # load asked cameras - yaml_files = [] - for cam in cam_names: - yaml_files.extend( - sorted(glob.glob(os.path.join(subject_folder, f"calibration/*{cam}*.yaml"))) - ) - - # yaml_files = sorted(glob.glob(os.path.join(subject_folder,'calibration/*.yaml'))) - cameras = [load_camera_params(yaml) for yaml in yaml_files] - self.cam_para_list = cameras - - # load triangulated 3d points - # points_3d_np = np.load(os.path.join(subject_folder,'triangulated_3d.npy')) # shape (num_frames, 23, 3) - - # apply norm_rate on translation vector - for i in range(len(cam_names)): - cameras[i]["T"] = cameras[i]["T"] / norm_rate - - # load all 2D keypoints from asked cameras - h5_files = [] - for cam in cam_names: - # print("cam:", cam) - # h5_files.extend(sorted(glob.glob(os.path.join(subject_folder,f'pose2d_dlc/Camera_{cam}*.h5')))) # for cspnext model - h5_files.extend( - sorted(glob.glob(os.path.join(subject_folder, f"pose2d_dlc/*{cam}*.h5"))) - ) # for rtmpose model - - keypoints_2d = [ - load_h5_keypoints(h5) for h5 in h5_files - ] # (num_cameras,num_frames,23,3) # for rtmpose model - # keypoints_2d = [load_h5_keypoints_cspnext(h5) for h5 in h5_files] # 
(num_cameras,num_frames,23,3) # for cspnext model - - # get total frame - > real end frame - total_frame_num = keypoints_2d[0].shape[0] - real_end_frame = min(self.end_frame, total_frame_num) - - for idx in tqdm( - range(self.start_frame, real_end_frame, sampling_gap) - ): # get temporal video fragment of 2D and 3D keypoints - idx = max(idx, self.t_pad) - idx = min(idx, real_end_frame - self.t_pad - 1) - left_frame_id = idx - self.t_pad - right_frame_id = idx + self.t_pad + 1 - - # record sample info - tmp_info = np.zeros(2) - tmp_info[0] = sub_idx - tmp_info[1] = idx - - # extract 3d fragment, get 3D points from npy file - - points_3d_fragment = ( - np.load(os.path.join(subject_folder, "triangulated_3d.npy"))[ - left_frame_id:right_frame_id, :, :3 - ] - / norm_rate - ) # load from prepared .npy and apply norm_rate - - # print("points_3d_fragment shape:", points_3d_fragment.shape) # (num_frames, 23, 3) - keypoints_2d = np.array(keypoints_2d) # Ensure it's a NumPy array - # get 2D keypoint vis - points_2d_vis_np = keypoints_2d[:, left_frame_id:right_frame_id, :, 2:] # N,T,K,1 - - # clip vis - points_2d_vis_np = np.clip(points_2d_vis_np, 0, 1) - - # get 2D keypoint - points_2d_np = keypoints_2d[:, left_frame_id:right_frame_id, :, :2] # N,T,K,2 - # # get 3D keypoint from 3D lifting - # points_2d_fragment_np = np.array(points_2d_fragment) # (num_cams, num_frames, num_joints, 2) N,T,K,2 - # points_3d_fragment = triangulate_3d_batch(points_2d_fragment_np,cameras) #(num_frames, 23, 3) T,K,3 - - # get 3D pose from world to camera, with respect to different camera - points_3d = np.zeros( - (self.t_length, self.joint_num, 3, len(self.cam_names)) - ) # initialize 3D keypints, from T,K,3 to T,K,3,N - points_3d_world = np.reshape(points_3d_fragment, (-1, 3)) # T,K,3 - # print("before and after reshape",points_3d_fragment.shape,points_3d_world.shape) - for cam_idx, cam in enumerate(cam_names): - # todo: check transformation - points_3d_cam = ( - np.dot(points_3d_world, 
cameras[cam_idx]["R"].T) + cameras[cam_idx]["T"].T - ) - - points_3d[:, :, :, cam_idx] = np.reshape( - points_3d_cam, (self.t_length, self.joint_num, 3) - ) # T,K,3,N - - # get relative 3D pose - points_3d_root = copy.deepcopy( - points_3d[:, self.root_index : self.root_index + 1, :, :] - ) - rela_points_3d = points_3d - points_3d_root - - del points_3d, points_3d_root - gc.collect() - - # normalize 2D pose - points_2d_np = normalize_screen_coordinates( - copy.deepcopy(points_2d_np), self.img_W, self.img_H - ) # N,T,K,2 - - # get fake vis3d - points_vis3D = np.ones((self.t_length, self.joint_num, 1)) - - self.pose_3D_list.append(rela_points_3d) - self.pose_2D_list.append( - np.nan_to_num(points_2d_np.transpose(1, 2, 3, 0)) - ) # transpose to T,K,2,N - self.vid2D_list.append( - points_2d_vis_np.transpose(1, 2, 3, 0) - ) # Transpose to T,K,1,N, move N to the end - self.vid3D_list.append(points_vis3D) - self.sample_info_list.append(tmp_info) - - del points_2d_np, points_vis3D, points_2d_vis_np - gc.collect() - torch.cuda.empty_cache() - - def __len__(self): - return len(self.pose_3D_list) - - def __getitem__(self, index): - return self.getitem(index) - - def getitem(self, index): - - pose_3D = self.pose_3D_list[index].copy() - pose_2D = self.pose_2D_list[index].copy() - vid_3D = self.vid3D_list[index].copy() - vid_2D = self.vid2D_list[index].copy() - sample_info = self.sample_info_list[index] - - if "TRAIN" in self.split.upper() and self.arg_views > 0: - pose_3D, pose_2D = self.view_aug(pose_3D, pose_2D) - tmp_vid = np.repeat( - np.expand_dims(copy.deepcopy(vid_3D), axis=-1), self.arg_views, axis=-1 - ) - vid_2D = np.concatenate((vid_2D, tmp_vid), axis=-1) - # clip vid into 0,1 - # vid_2D = np.clip(vid_2D,0,1) - - pose_root = copy.deepcopy(pose_3D[:, self.root_index : self.root_index + 1, :, :]) - pose_3D[:, self.root_index : self.root_index + 1, :, :] = 0.0 - pose_3D = np.nan_to_num(pose_3D, nan=0) - pose_2D = np.concatenate((pose_2D, vid_2D), axis=2) - - return ( - 
FN(pose_3D).float(), - FN(pose_root).float(), - FN(pose_2D).float(), - FN(vid_3D).float(), - FN(sample_info).float(), - ) - - -if __name__ == "__main__": - from common.arguments import parse_args - from common.config import config as cfg - from common.config import reset_config, update_config - - from scripts.reset_config_arber import reset_config_arber - from scripts.reset_config_rat7m import reset_config_rat7m - - cam_names = ["Camera0", "Camera1", "Camera2", "Camera3", "Camera4", "Camera5"] - data_dir = "/workspace/MTFpose/data/Arber_tiny" - args = parse_args() - update_config(args.cfg) - reset_config(cfg, args) - reset_config_arber(cfg) - - args = parse_args() - update_config(args.cfg) ###config file->cfg - reset_config(cfg, args) ###arg -> cfg - reset_config_rat7m(cfg) - - print(cfg) - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(cfg.GPU) - - root_index = cfg.TINY_DATA.ROOT_INDEX - sampling_gap = cfg.TINY_DATA.SAMPLING_GAP - joint_num = cfg.TINY_DATA.NUM_JOINTS - img_W, img_H = cfg.TINY_DATA.IMG_SIZE - use_2d_gt = cfg.DATA.USE_GT_2D - receptive_field = cfg.NETWORK.TEMPORAL_LENGTH - pad = receptive_field // 2 - causal_shift = 0 - train_dataset = ArberDataset( - cfg, - cfg.ARBER_DATA.ROOT_DIR, - "Train", - cam_names, - pad, - root_index=root_index, - use_2D_gt=use_2d_gt, - joint_num=23, - sampling_gap=60, - img_W=img_W, - img_H=img_H, - arg_views=0, - resize_2D_scale=cfg.ARBER_DATA.RESIZE_SCALE, - ) - - pose_3D, pose_root, pose_2D, vid_3D, rotation, sample_info = train_dataset.getitem(2) - print( - "output at item 250, pose_3D", - pose_3D.shape, - "pose_root", - pose_root.shape, - "pose_2D", - pose_2D.shape, - "vid_3D", - vid_3D.shape, - "rotation", - rotation.shape, - "sample_info", - sample_info, - ) - # output at item 250, pose_3D torch.Size([7, 23, 3, 6]) pose_root torch.Size([7, 1, 3, 6]) pose_2D torch.Size([7, 23, 3, 6]) vid_3D torch.Size([7, 23, 1]) rotation torch.Size([3, 3, 1, 6, 6]) sample_info tensor([ 0., 120.]) - print("in camera 0", pose_2D[0, 0, 
:, 0], "in camera 1", pose_2D[0, 0, :, 1]) - print(f"pose_2D maxime: {pose_2D.max().item():.4f}") - print(f"pose_2D mini: {pose_2D.min().item():.4f}") diff --git a/fmpose3d/animals/common/arguments.py b/fmpose3d/animals/common/arguments.py index f465f172..7fab2e4b 100755 --- a/fmpose3d/animals/common/arguments.py +++ b/fmpose3d/animals/common/arguments.py @@ -32,7 +32,7 @@ def init(self): self.parser.add_argument("--layers", default=3, type=int) self.parser.add_argument("--channel", default=512, type=int) self.parser.add_argument("--d_hid", default=1024, type=int) - self.parser.add_argument("--dataset", type=str, default="rat7m") + self.parser.add_argument("--dataset", type=str, default="animal3d") self.parser.add_argument("-k", "--keypoints", default="cpn_ft_h36m_dbb", type=str) self.parser.add_argument("--data_augmentation", type=bool, default=False) self.parser.add_argument("--reverse_augmentation", type=bool, default=False) @@ -42,7 +42,7 @@ def init(self): ) self.parser.add_argument("--test_augmentation_FlowAug", type=str2bool, default=False) self.parser.add_argument("--crop_uv", type=int, default=0) - self.parser.add_argument("--root_path", type=str, default="Rat7M_data/") + self.parser.add_argument("--root_path", type=str, default="./dataset/") self.parser.add_argument("-a", "--actions", default="*", type=str) self.parser.add_argument("--downsample", default=1, type=int) self.parser.add_argument("--subset", default=1, type=float) @@ -78,6 +78,14 @@ def init(self): ) self.parser.add_argument("--saved_model_path", type=str, default="") + # 2D pose model overrides (consumed by animal demo vis_animals.py). + # --saved_2d_model_path: empty -> auto-download the fine-tuned snapshot + # from Hugging Face on first run; non-empty -> use as a local override. + # --pytorch_config_2d_path: DLC architecture yaml; empty -> use the + # bundled fmpose3d/animals/configs/sa_finetune_hrnet_w32.yaml. 
+ self.parser.add_argument("--saved_2d_model_path", type=str, default="") + self.parser.add_argument("--pytorch_config_2d_path", type=str, default="") + self.parser.add_argument("--n_joints", type=int, default=26) self.parser.add_argument("--out_joints", type=int, default=26) self.parser.add_argument("--out_all", type=int, default=1) @@ -193,13 +201,6 @@ def parse(self): self.opt.joints_left = [4, 5, 6, 11, 12, 13] self.opt.joints_right = [1, 2, 3, 14, 15, 16] - elif self.opt.dataset == "rat7m": - # Rat7M dataset configuration - self.opt.n_joints = 20 - self.opt.out_joints = 20 - self.opt.joints_left = [8, 10, 11, 17, 18] # HipL, ElbowL, ArmL, KneeL, ShinL - self.opt.joints_right = [9, 14, 15, 16, 19] # HipR, ElbowR, ArmR, KneeR, ShinR - self.opt.root_joint = 4 elif self.opt.dataset == "animal3d": # Animal3D dataset configuration diff --git a/fmpose3d/animals/common/graph_utils.py b/fmpose3d/animals/common/graph_utils.py deleted file mode 100755 index aad42f8a..00000000 --- a/fmpose3d/animals/common/graph_utils.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -FMPose3D: monocular 3D Pose Estimation via Flow Matching - -Official implementation of the paper: -"FMPose3D: monocular 3D Pose Estimation via Flow Matching" -by Ti Wang, Xiaohang Yu, and Mackenzie Weygandt Mathis -Licensed under Apache 2.0 -""" - -from __future__ import absolute_import - -import numpy as np -import scipy.sparse as sp -import torch - - -def normalize(mx): - """Row-normalize sparse matrix""" - rowsum = np.array(mx.sum(1)) - r_inv = np.power(rowsum, -1).flatten() - r_inv[np.isinf(r_inv)] = 0.0 - r_mat_inv = sp.diags(r_inv) - mx = r_mat_inv.dot(mx) - return mx - - -def sparse_mx_to_torch_sparse_tensor(sparse_mx): - """Convert a scipy sparse matrix to a torch sparse tensor.""" - sparse_mx = sparse_mx.tocoo().astype(np.float32) - indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)) - values = torch.from_numpy(sparse_mx.data) - shape = torch.Size(sparse_mx.shape) - return 
torch.sparse.FloatTensor(indices, values, shape) - - -def adj_mx_from_edges(num_pts, edges, sparse=True): - edges = np.array(edges, dtype=np.int32) - data, i, j = np.ones(edges.shape[0]), edges[:, 0], edges[:, 1] - adj_mx = sp.coo_matrix((data, (i, j)), shape=(num_pts, num_pts), dtype=np.float32) - # build symmetric adjacency matrix https://github.com/yao8839836/text_gcn/issues/17 - adj_mx = adj_mx + adj_mx.T.multiply(adj_mx.T > adj_mx) - adj_mx.multiply(adj_mx.T > adj_mx) - adj_mx = normalize(adj_mx + sp.eye(adj_mx.shape[0])) - if sparse: - adj_mx = sparse_mx_to_torch_sparse_tensor(adj_mx) - else: - adj_mx = torch.tensor(adj_mx.todense(), dtype=torch.float) - return adj_mx - - -def adj_mx_from_skeleton(skeleton): - num_joints = skeleton.num_joints() # 16|17 - # edge [16,2] - edges = list( - filter(lambda x: x[1] >= 0, zip(list(range(0, num_joints)), skeleton.parents())) - ) # 15 # [-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 7, 11, 12, 7, 14, 15] - return adj_mx_from_edges(num_joints, edges, sparse=False) - - -def print_matrix(mat): - for i in range(len(mat)): - print(mat[i]) - - -if __name__ == "__main__": - num_joints = 17 - parents = [-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 7, 11, 12, 7, 14, 15] - edges = list(filter(lambda x: x[1] >= 0, zip(list(range(0, num_joints)), parents))) - A = adj_mx_from_edges(num_joints, edges, sparse=False) - print_matrix(A) diff --git a/fmpose3d/animals/common/lifter3d.py b/fmpose3d/animals/common/lifter3d.py deleted file mode 100644 index afebffbd..00000000 --- a/fmpose3d/animals/common/lifter3d.py +++ /dev/null @@ -1,669 +0,0 @@ -""" -FMPose3D: monocular 3D Pose Estimation via Flow Matching - -Official implementation of the paper: -"FMPose3D: monocular 3D Pose Estimation via Flow Matching" -by Ti Wang, Xiaohang Yu, and Mackenzie Weygandt Mathis -Licensed under Apache 2.0 -""" - -import glob - -import cv2 -import h5py -import matplotlib.pyplot as plt -import numpy as np -from matplotlib import cm, colormaps -from scipy.optimize import 
least_squares -from tqdm import tqdm # Import tqdm for the progress bar - -joint_names = [ - "snout", - "Right_Ear", - "Left_Ear", - "Shoulder_Center", - "Right_Paw", - "Right_Wrist", - "Right_Elbow", - "Right_Shoulder", - "Left_Paw", - "Left_Wrist", - "Left_Elbow", - "Left_Shoulder", - "Body_Center", - "Hip_Center", - "Right_Foot", - "Right_Ankle", - "Right_Knee", - "Left_Foot", - "Left_Ankle", - "Left_Knee", - "Tail_Tip", - "Tail_Middle", - "Tail_Root", -] - - -# joint_names = [0'snout', -# 1'Right_Ear', -# 2'Left_Ear', -# 3'Shoulder_Center', -# 4'Right_Paw', -# 5'Right_Wrist', -# 6'Right_Elbow', -# 7'Right_Shoulder', -# 8'Left_Paw', -# 9'Left_Wrist', -# 10'Left_Elbow', -# 11'Left_Shoulder', -# 12'Body_Center', -# 13'Hip_Center', -# 14'Right_Foot', -# 15'Right_Ankle', -# 16'Right_Knee', -# 17'Left_Foot', -# 18'Left_Ankle', -# 19'Left_Knee', -# 20'Tail_Tip', -# 21'Tail_Middle', -# 22'Tail_Root'] - - -# Skeleton connections -skeleton = [ - ["snout", "Right_Ear"], - ["snout", "Left_Ear"], - ["Shoulder_Center", "Right_Shoulder"], - ["Right_Shoulder", "Right_Elbow"], - ["Right_Elbow", "Right_Wrist"], - ["Right_Wrist", "Right_Paw"], - ["Shoulder_Center", "Left_Shoulder"], - ["Left_Shoulder", "Left_Elbow"], - ["Left_Elbow", "Left_Wrist"], - ["Left_Wrist", "Left_Paw"], - ["Shoulder_Center", "Body_Center"], - ["Body_Center", "Hip_Center"], - ["Hip_Center", "Right_Knee"], - ["Right_Knee", "Right_Ankle"], - ["Right_Ankle", "Right_Foot"], - ["Hip_Center", "Left_Knee"], - ["Left_Knee", "Left_Ankle"], - ["Left_Ankle", "Left_Foot"], - ["Tail_Root", "Tail_Middle"], - ["Tail_Middle", "Tail_Tip"], - ["Hip_Center", "Tail_Root"], -] - -# from name to index -name_to_index = {name: idx for idx, name in enumerate(joint_names)} - -# to skeleton -skeleton_indices = [[name_to_index[a], name_to_index[b]] for a, b in skeleton] # start from 0 - - -def compute_reprojection_errors(keypoints_2d, reprojected_2d): - # Euclidean distance per keypoint - errors = np.linalg.norm(keypoints_2d - 
reprojected_2d, axis=-1) # (num_frames, num_keypoints) - - # Mean error over all frames/keypoints - total_error = np.mean(errors) - # Mean error per keypoint - per_keypoint_error = np.mean(errors, axis=0) # (num_keypoints,) - - return total_error, per_keypoint_error - - -import numpy as np - - -def compute_relative_errors(keypoints_2d, reprojected_2d): - # Euclidean error (num_frames, num_keypoints) - errors = np.linalg.norm(keypoints_2d - reprojected_2d, axis=-1) - - # Mean error - total_error = np.mean(errors) - - # Mean error per keypoint (num_keypoints,) - per_keypoint_error = np.mean(errors, axis=0) - - # Pairwise Euclidean distance between keypoints (num_frames, num_keypoints, num_keypoints) - pairwise_dists = np.linalg.norm( - keypoints_2d[:, :, None, :] - keypoints_2d[:, None, :, :], axis=-1 - ) - - # Average inter-keypoint distance per frame - avg_keypoint_distance = np.mean(pairwise_dists, axis=(1, 2)) - - # Relative errors - relative_error = total_error / np.mean(avg_keypoint_distance) - - # Relative error per keypoint (num_keypoints,) - per_keypoint_relative_error = per_keypoint_error / np.mean(avg_keypoint_distance) - - return total_error, per_keypoint_error, relative_error, per_keypoint_relative_error - - -def normalize_points(points_3d): - """Normalize 3D points to [-1, 1].""" - min_vals = points_3d.min(axis=(0, 1), keepdims=True) - max_vals = points_3d.max(axis=(0, 1), keepdims=True) - points_3d_normalized = (points_3d - min_vals) / (max_vals - min_vals) * 2 - 1 - return points_3d_normalized - - -def plot_3d_keypoints_and_save_video(points_3d, output_video_path): - - num_points = len(joint_names) - cmap = colormaps["rainbow"] - colors = [cmap(i / num_points) for i in range(num_points)] - - # Create a VideoWriter to save the frames to a video - fourcc = cv2.VideoWriter_fourcc(*"mp4v") - out = cv2.VideoWriter( - output_video_path, fourcc, 30.0, (1024, 768) - ) # Adjust the frame size if needed - - num_frames = len(points_3d) - points_3d = 
normalize_points(points_3d) - - for frame in range(num_frames): - fig = plt.figure(figsize=(10, 10)) - ax = fig.add_subplot(111, projection="3d") - for i, joint in enumerate(joint_names): - x, y, z = points_3d[frame][i] - ax.scatter(x, y, z, color=colors[i], s=50) - ax.set_xlabel("X") - ax.set_ylabel("Y") - ax.set_zlabel("Z") - ax.set_title(f"Frame {frame+1}") - - # Adjust the view angle and limits to make the plot consistent - ax.view_init(elev=30, azim=45) # Adjust the view for better 3D perspective - ax.set_xlim([-1, 1]) # Adjust based on your data range - ax.set_ylim([-1, 1]) - ax.set_zlim([-1, 1]) - - # Save the current figure as an image to be added to the video - plt.subplots_adjust(left=0, right=1, top=1, bottom=0) # Remove extra margins - fig.canvas.draw() # Draw the figure - img = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) - img = img.reshape(fig.canvas.get_width_height()[::-1] + (3,)) # Convert to RGB image format - - # Resize the image to fit video frame size - img_resized = cv2.resize(img, (1024, 768)) - - # Write the frame to the video - out.write(img_resized) - plt.close(fig) - - # Clear the figure to free memory - plt.clf() - - out.release() - - -def plot_3d_skeleton_and_save_video(points_3d, output_video_path, num_frames_to_save=200): - num_joints = len(joint_names) - cmap = colormaps["rainbow"] - colors = [cmap(i / num_joints) for i in range(num_joints)] - - fourcc = cv2.VideoWriter_fourcc(*"mp4v") - out = cv2.VideoWriter(output_video_path, fourcc, 30.0, (1920, 1080)) - - # num_frames = min(points_3d.shape[0], num_frames_to_save) - points_3d = normalize_points(points_3d) - - for frame in range(num_frames_to_save): - fig = plt.figure(figsize=(10, 10)) - ax = fig.add_subplot(111, projection="3d") - - # draw keypoints - for i, joint in enumerate(joint_names): - # print("shape of 3d points",points_3d[frame][i]) - - x, y, z = points_3d[frame][i] - ax.scatter(x, y, z, color=colors[i], s=50) - - # draw skeleton connections - for bone in 
skeleton: - if bone[0] in joint_names and bone[1] in joint_names: - i1, i2 = joint_names.index(bone[0]), joint_names.index(bone[1]) - x_vals = [points_3d[frame, i1, 0], points_3d[frame, i2, 0]] - y_vals = [points_3d[frame, i1, 1], points_3d[frame, i2, 1]] - z_vals = [points_3d[frame, i1, 2], points_3d[frame, i2, 2]] - ax.plot(x_vals, y_vals, z_vals, color="black", linewidth=2, alpha=0.8) - - ax.set_xlabel("X") - ax.set_ylabel("Y") - ax.set_zlabel("Z") - ax.set_title(f"Frame {frame+1}") - ax.view_init(elev=30, azim=45) - ax.set_xlim([-1, 1]) - ax.set_ylim([-1, 1]) - ax.set_zlim([-1, 1]) - - # save frame and write to video - fig.canvas.draw() - img = np.frombuffer(fig.canvas.buffer_rgba(), dtype=np.uint8) - img = img.reshape(fig.canvas.get_width_height()[::-1] + (4,)) - img_bgr = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR) - img_resized = cv2.resize(img_bgr, (1920, 1080)) - out.write(img_resized) - - plt.close(fig) - - out.release() - - -def load_camera_params(yaml_path, scale=0.5): - """use opencv to read from yaml""" - fs = cv2.FileStorage(yaml_path, cv2.FILE_STORAGE_READ) - intrinsic_matrix = fs.getNode("intrinsicMatrix").mat() - distortion_coeffs = fs.getNode("distortionCoefficients").mat() - R = fs.getNode("R").mat() - R = R.T - T = fs.getNode("T").mat() - fs.release() - - intrinsic_matrix = intrinsic_matrix.astype(np.float64) - # check intrinsic matrix - if not (np.allclose(intrinsic_matrix[2, :], [0, 0, 1], atol=1e-6)): - intrinsic_matrix = intrinsic_matrix.T - # scale the intrinsic matrix - scale = 0.5 - if scale != 1.0: - intrinsic_matrix[0, 0] *= scale - intrinsic_matrix[1, 1] *= scale - intrinsic_matrix[0, 2] *= scale - intrinsic_matrix[1, 2] *= scale - - return { - "intrinsic_matrix": intrinsic_matrix, - "distortion_coeffs": distortion_coeffs, - "R": R, - "T": T, - } - - -import cv2 -import numpy as np - - -def triangulate_3d_batch(points_2d_batch, cameras): - """ - input (num_cams, num_frames, num_joints, 2) - - parameters: - - points_2d_batch: (6, num_frames, 
23, 2) - - cameras: list of length 6 `intrinsic_matrix`、`distortion_coeffs`、`R`、`T` - - return: - - points_3d_batch: (num_frames, 23, 3) - - create matrix A to avoid for iteration - """ - points_2d_batch = np.array(points_2d_batch) - num_cams, num_frames, num_joints, _ = points_2d_batch.shape # (6, num_frames, 23, 2) - - print("num_cams,num_frames,num_joinits", points_2d_batch.shape) - # **1. compute projection matrices (6, 3, 4)** - proj_matrices = np.array( - [cam["intrinsic_matrix"] @ np.hstack((cam["R"], cam["T"])) for cam in cameras] - ) # numpy array (6, 3, 4) - - # **2. undistortPoints** - points_2d_undistorted = np.zeros_like(points_2d_batch) # (6, num_frames, 23, 2) - for i in range(num_cams): - K, dist = cameras[i]["intrinsic_matrix"], cameras[i]["distortion_coeffs"] - undistorted = cv2.undistortPoints(points_2d_batch[i].reshape(-1, 1, 2), K, dist, None, None) - undistorted = undistorted.reshape(num_frames, num_joints, 2) - print("undistorted shape:", undistorted.shape) - print("ones shape:", np.ones((num_frames, num_joints, 1)).shape) - # undistorted = (K @ np.hstack([undistorted, np.ones((num_frames, num_joints, 1))]).T).T[:, :, :2] - undistorted = np.concatenate( - [undistorted, np.ones((*undistorted.shape[:-1], 1))], axis=-1 - ) # (50, 23, 3) - undistorted = (K @ undistorted[..., None])[..., 0] - points_2d_undistorted[i] = undistorted[..., :2] - - # **3. 
Construct matrix A for triangulation** - # Formula: A = [x P_3 - P_1; y P_3 - P_2], creating 6*2=12 equations per point - x = points_2d_undistorted[..., 0] # (6, num_frames, 23) - y = points_2d_undistorted[..., 1] # (6, num_frames, 23) - - # Extract projection matrix rows - P1 = proj_matrices[:, None, None, 0, :] # (6, 1, 1, 4) - P2 = proj_matrices[:, None, None, 1, :] # (6, 1, 1, 4) - P3 = proj_matrices[:, None, None, 2, :] # (6, 1, 1, 4) - - # Compute A (6, num_frames, 23, 2, 4) - A = np.stack( - [x[..., None] * P3 - P1, y[..., None] * P3 - P2], axis=-2 - ) # (6, num_frames, 23, 2, 4) - A = A.reshape(num_cams * 2, num_frames, num_joints, 4) # (12, num_frames, 23, 4) - - # **4. Solve using batch SVD** - _, _, Vh = np.linalg.svd(A, full_matrices=False) # Vh shape: (12, num_frames, 23, 4) - X_hom = Vh[..., -1] # Take last row (solution) (12, num_frames, 23, 4) - - # **5. Convert homogeneous coordinates to 3D** - points_3d_batch = X_hom[..., :3] / X_hom[..., 3:] # (num_frames, 23, 3) - - return points_3d_batch # (num_frames, 23, 3) - - -def triangulate_3d(points_2d, cameras): - # points_2d list of 6 in (23,2) - """triangulate usd SVD""" - proj_matrices = [] - points_2d_undistorted = [] - - for i, cam in enumerate(cameras): - K, dist, R, T = cam["intrinsic_matrix"], cam["distortion_coeffs"], cam["R"], cam["T"] - - P = K @ np.hstack((R, T)) # projection matrix - # print("Projection Matrix P:\n", P) - - proj_matrices.append(P) - - # undistortion - # undistorted = cv2.undistortPoints(points_2d[i].reshape(-1, 1, 2), K, dist, None, K).reshape(-1, 2) - undistorted = cv2.undistortPoints( - points_2d[i].reshape(-1, 1, 2), K, dist, None, None - ).reshape(-1, 2) - undistorted = (K @ np.hstack([undistorted, np.ones((undistorted.shape[0], 1))]).T).T[:, :2] - - points_2d_undistorted.append(undistorted) - - # print("before undistortion and after",points_2d[0],points_2d_undistorted[0]) - # SVD - num_points = points_2d_undistorted[0].shape[0] - points_3d = np.zeros((num_points, 3)) - 
- for j in range(num_points): - A = [] - for i in range(len(proj_matrices)): - P = proj_matrices[i] - x, y = points_2d_undistorted[i][j] - - # build linear system Ax = 0 - A.append(x * P[2, :] - P[0, :]) - A.append(y * P[2, :] - P[1, :]) - - A = np.array(A) - _, _, Vh = np.linalg.svd(A) - X_hom = Vh[-1] - X = X_hom[:3] / X_hom[3] - points_3d[j] = X - - # print("3D points and the 2D on camera 0",points_3d,points_2d_undistorted[0]) - return points_3d - - -def triangulate_3d_confi(points_2d, cameras): - # points_2d: list of 6 in (23, 3), last dim is confidence - """Triangulate using SVD with confidence""" - proj_matrices = [] - points_2d_undistorted = [] - confidences = [] - - for i, cam in enumerate(cameras): - K, dist, R, T = cam["intrinsic_matrix"], cam["distortion_coeffs"], cam["R"], cam["T"] - P = K @ np.hstack((R, T)) # Projection matrix - proj_matrices.append(P) - - # Undistortion - undistorted = cv2.undistortPoints( - points_2d[i][:, :2].reshape(-1, 1, 2), K, dist, None, None - ).reshape(-1, 2) - undistorted = (K @ np.hstack([undistorted, np.ones((undistorted.shape[0], 1))]).T).T[:, :2] - points_2d_undistorted.append(undistorted) - - # Collect confidences - confidences.append(points_2d[i][:, 2]) - - num_points = points_2d_undistorted[0].shape[0] - points_3d = np.zeros((num_points, 4)) # last dimension stores confidence - - for j in range(num_points): - A = [] - point_confidences = [] - for i in range(len(proj_matrices)): - P = proj_matrices[i] - x, y = points_2d_undistorted[i][j] - conf = confidences[i][j] - - if conf > 0: - # build linear system Ax = 0 - A.append(conf * (x * P[2, :] - P[0, :])) - A.append(conf * (y * P[2, :] - P[1, :])) - point_confidences.append(conf) - - if len(A) > 0: - A = np.array(A) - _, _, Vh = np.linalg.svd(A) - X_hom = Vh[-1] - X = X_hom[:3] / X_hom[3] - points_3d[j, :3] = X - - # confidence: mean over valid views - points_3d[j, 3] = np.mean(point_confidences) - else: - points_3d[j, 3] = 0 - - return points_3d - - -# def 
project_3d_to_2d(points_3d, camera): -# K, dist, R, T = camera["intrinsic_matrix"], camera["distortion_coeffs"], camera["R"], camera["T"] - -# points_3d = np.asarray(points_3d, dtype=np.float32) - -# R = np.array(R, dtype=np.float32) -# T = np.array(T, dtype=np.float32) -# K = np.array(K, dtype=np.float32) -# dist = np.array(dist, dtype=np.float32) - -# points_2d_proj, _ = cv2.projectPoints(points_3d, R, T, K, dist) -# return points_2d_proj.reshape(-1, 2) - - -def project_3d_to_2d(points_3d, camera): - K, dist, R, T = ( - camera["intrinsic_matrix"], - camera["distortion_coeffs"], - camera["R"], - camera["T"], - ) - - points_2d_proj, _ = cv2.projectPoints(points_3d, R, T, K, dist) - return points_2d_proj.reshape(-1, 2) - - -def load_h5_keypoints(h5_path): - """load 2D keypoint from h5 file""" - with h5py.File(h5_path, "r") as f: - group = f["df_with_missing"] - dataset = group["block0_values"] - # dataset = group['table'] - return np.array(dataset).reshape(-1, 23, 3) - - -def load_h5_keypoints_cspnext(h5_path): - """load 2D keypoint from h5 file with cspnext format""" - with h5py.File(h5_path, "r") as f: - group = f["df_with_missing"] - dataset = np.array(group["table"]) - values = dataset["values_block_0"] - return values.reshape(-1, 23, 3) # Reshape to (num_frames, num_joints, 3) - - -def visualize_2d_on_video( - video_path, frame_number, original_keypoints, reprojected_keypoints, output_path -): - """plot keypoint on frame n and save into png""" - cap = cv2.VideoCapture(video_path) - cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number) # jump to frame - ret, frame = cap.read() - - if not ret: - print(f"Failed to read frame {frame_number} from video {video_path}") - cap.release() - return - - h, w = frame.shape[:2] - # reprojected_keypoints[:,0] /= h - # reprojected_keypoints[:,1] /= w - print("Original 2D Points (+):", original_keypoints) - print("Projected 2D Points (dot):", reprojected_keypoints) - print("height, and width", h, w) - frame = cv2.cvtColor(frame, 
cv2.COLOR_BGR2RGB) - plt.figure(figsize=(10, 10)) - plt.imshow(frame) - - num_points = max(len(original_keypoints), len(reprojected_keypoints)) - cmap = colormaps["rainbow"] - colors = [cmap(i / num_points) for i in range(num_points)] - - # plot original keypoints in + - for i, point in enumerate(original_keypoints): - if not np.isnan(point).any(): - x, y = point[0], point[1] - if x < 0 or y < 0 or x >= w or y >= h: - continue - plt.scatter( - x, y, marker="+", color=colors[i], s=15, linewidths=1, label=f"keypoints{i}" - ) - - # plot reprojected keypoints in o - for i, point in enumerate(reprojected_keypoints): - if not np.isnan(point).any(): - x, y = point[0], point[1] - if x < 0 or y < 0 or x >= w or y >= h: - continue - plt.scatter(x, y, marker="o", color=colors[i], s=3, label=f"reprojected{i}") - - plt.axis("off") - - plt.savefig(output_path, bbox_inches="tight") - plt.close() - cap.release() - - -def main(): - yaml_files = sorted( - glob.glob( - "/workspace/MTFpose/data/Arber_data_noeva/076_02_221114_start0/calibration/*.yaml" - ) - ) - h5_files = sorted( - glob.glob("/workspace/MTFpose/data/Arber_data_noeva/076_02_221114_start0/pose2d_dlc/*.h5") - ) - video_files = sorted( - glob.glob("/workspace/MTFpose/data/Arber_data_noeva/076_02_221114_start0/video_dlc/*.mp4") - ) - - cameras = [load_camera_params(yaml) for yaml in yaml_files] - - # print("cameras len",len(cameras)) - # keypoints_2d = [load_h5_keypoints(h5) for h5 in h5_files] - keypoints_2d = [load_h5_keypoints_cspnext(h5) for h5 in h5_files] - - print( - "list: yaml ", - len(yaml_files), - "h5", - len(h5_files), - "video ", - len(video_files), - "cameras", - len(cameras), - "keypoints", - len(keypoints_2d), - ) - - total_num_frames = keypoints_2d[0].shape[0] - print("num_frames", total_num_frames, "for each cam", keypoints_2d[0].shape) # (119498, 23, 3) - - # choose frame - frame_number = 5 - - # Choose the number of frames to visualize - num_frames_to_save = total_num_frames - - points_3d_list = [] - 
repro_2d_list = [] - - # save all 3d keypoints into .npy - - # visualization and save - for frame in tqdm( - range(total_num_frames), desc="Processing frames", unit="frame" - ): # loop in frames - points_2d_frame = [keypoints_2d[cam_i][frame][:, :2] for cam_i in range(len(cameras))] - - # print("len of points 2d frame",len(points_2d_frame),points_2d_frame[0].shape) #len: 6 each shape :(23,2) - - # points_3d = triangulate_3d_confi(points_2d_frame, cameras) - points_3d = triangulate_3d(points_2d_frame, cameras) - points_3d_list.append(points_3d) - - # for i in range(len(cameras)): - # points_3d_array = np.array(points_3d_list).reshape(total_num_frames,-1,3) - # reprojected_2d = np.array([project_3d_to_2d(frame_3d, cameras[i]) for frame_3d in points_3d_array]) - # total_error, per_keypoint_error, relative_error, per_keypoint_relative_error = compute_relative_errors(keypoints_2d[i][:total_num_frames,:,:2], reprojected_2d) - - # reprojected_2d = project_3d_to_2d(points_3d, cameras[0]) - points_3d = np.array(points_3d_list) - print("shape of points 3d to save", points_3d.shape) # (num_frames, 23, 3) - output_npy_path = ( - "/workspace/MTFpose/data/Arber_data_noeva/076_02_221114_start0/triangulated_3d.npy" - ) - np.save(output_npy_path, points_3d) - # print("points_3d_fragment from 3d lift and stack, shape",points_3d.shape) - - # if frame==frame_number:# visualize nth frame and save PNG - # for i in range(len(cameras)): - # output_path = f"/workspace/MTFpose/results/Camera_{i}_frame_{frame_number}.png" - # reprojected_2d = project_3d_to_2d(points_3d, cameras[i]) - # # visualize_2d_on_video(video_files[i], frame_number, points_2d_frame[i], reprojected_2d, output_path) - - # # test on triangulate 3D batch - # left_frame_id = 10 - # right_frame_id = 60 - - # points_2d_fragment = [keypoints_2d[i][left_frame_id:right_frame_id,:,:2] for i in range(len(yaml_files))] - - # # get 3D keypoint from 3d lift - # points_2d_fragment_np = np.array(points_2d_fragment) # (num_cams, 
num_frames, num_joints, 2) N,T,K,2 - # points_3d_fragment = triangulate_3d_batch(points_2d_fragment_np,cameras) #(num_frames, 23, 3) T,K,3 - - # # save skeleton for several frames - # output_video_path = '/workspace/MTFpose/results/skeleton_batch_videodemo.mp4' - # plot_3d_skeleton_and_save_video(points_3d_fragment,output_video_path,num_frames_to_save = num_frames_to_save) - - # compute reprojection error for each view - for i in range(len(cameras)): - # points_3d_list : (num_frames, num_keypoints, 3) - # print("shape of points",len(points_3d_list),points_3d_list[0].shape) - points_3d_array = np.array(points_3d_list).reshape( - total_num_frames, -1, 3 - ) # with confidence - - # points_3d_array = points_3d_array[:,:,:3] - # print("shape of points_3d_array",points_3d_array.shape,points_3d.dtype,points_3d_array) - - # reprojected_2d = project_3d_to_2d(points_3d_array, cameras[i]) - reprojected_2d = np.array( - [project_3d_to_2d(frame_3d, cameras[i]) for frame_3d in points_3d_array] - ) - # print("shape of reprojected_2d and points_3d",reprojected_2d.shape, points_3d_array.shape,keypoints_2d[0].shape) - - total_error, per_keypoint_error, relative_error, per_keypoint_relative_error = ( - compute_relative_errors(keypoints_2d[i][:total_num_frames, :, :2], reprojected_2d) - ) - print(f"reprojection error in camera_{i}", total_error, relative_error) - - -if __name__ == "__main__": - main() diff --git a/fmpose3d/animals/common/mocap_dataset.py b/fmpose3d/animals/common/mocap_dataset.py deleted file mode 100755 index 75d837cd..00000000 --- a/fmpose3d/animals/common/mocap_dataset.py +++ /dev/null @@ -1,40 +0,0 @@ -""" -FMPose3D: monocular 3D Pose Estimation via Flow Matching - -Official implementation of the paper: -"FMPose3D: monocular 3D Pose Estimation via Flow Matching" -by Ti Wang, Xiaohang Yu, and Mackenzie Weygandt Mathis -Licensed under Apache 2.0 -""" - -class MocapDataset: - def __init__(self, fps, skeleton): - self._skeleton = skeleton - self._fps = fps - 
self._data = None - self._cameras = None - - def remove_joints(self, joints_to_remove): - kept_joints = self._skeleton.remove_joints(joints_to_remove) - for subject in self._data.keys(): - for action in self._data[subject].keys(): - s = self._data[subject][action] - s["positions"] = s["positions"][:, kept_joints] - - def __getitem__(self, key): - return self._data[key] - - def subjects(self): - return self._data.keys() - - def fps(self): - return self._fps - - def skeleton(self): - return self._skeleton - - def cameras(self): - return self._cameras - - def supports_semi_supervised(self): - return False diff --git a/fmpose3d/animals/configs/__init__.py b/fmpose3d/animals/configs/__init__.py new file mode 100644 index 00000000..f5646c7a --- /dev/null +++ b/fmpose3d/animals/configs/__init__.py @@ -0,0 +1,26 @@ +""" +FMPose3D: monocular 3D Pose Estimation via Flow Matching + +Official implementation of the paper: +"FMPose3D: monocular 3D Pose Estimation via Flow Matching" +by Ti Wang, Xiaohang Yu, and Mackenzie Weygandt Mathis +Licensed under Apache 2.0 +""" + +"""Bundled DLC ``pytorch_config.yaml`` files for the animal 2D detector. + +These yamls describe FMPose3D's fine-tuned SuperAnimal-Quadruped variants +and are loaded by :class:`fmpose3d.inference_api.SuperAnimalEstimator` when +the user does not supply an explicit ``pytorch_config_path``. They are +shipped as package data (see ``pyproject.toml`` ``[tool.setuptools.package-data]``). 
+""" + +from pathlib import Path + +CONFIGS_DIR = Path(__file__).parent + +SA_FINETUNE_HRNET_W32_YAML: str = str(CONFIGS_DIR / "sa_finetune_hrnet_w32.yaml") +"""DLC config for SA-Quadruped HRNet-w32 fine-tuned on Animal3D + +Control-Animal3D with the 26-joint Animal3D output layout.""" + +__all__ = ["CONFIGS_DIR", "SA_FINETUNE_HRNET_W32_YAML"] diff --git a/fmpose3d/animals/configs/sa_finetune_hrnet_w32.yaml b/fmpose3d/animals/configs/sa_finetune_hrnet_w32.yaml new file mode 100644 index 00000000..530ea63f --- /dev/null +++ b/fmpose3d/animals/configs/sa_finetune_hrnet_w32.yaml @@ -0,0 +1,220 @@ +# DeepLabCut pytorch_config for FMPose3D's 2D animal pose model: +# SuperAnimal-Quadruped HRNet-w32 backbone fine-tuned on Animal3D, with +# the heatmap head re-trained for the 26-joint Animal3D output layout. +# +# Loaded by fmpose3d.inference_api.SuperAnimalEstimator and passed to +# DLC's `superanimal_analyze_images(..., customized_model_config=, +# customized_pose_checkpoint=)`. Only the pose +# model is fine-tuned; the bounding-box detector (Faster R-CNN) is the +# stock SuperAnimal-Quadruped one resolved by DLC at runtime. 
+data: + bbox_margin: 20 + colormode: RGB + inference: + normalize_images: true + top_down_crop: + width: 256 + height: 256 + auto_padding: + pad_width_divisor: 32 + pad_height_divisor: 32 + train: + affine: + p: 0.5 + rotation: 30 + scaling: + - 1.0 + - 1.0 + translation: 0 + gaussian_noise: 12.75 + motion_blur: true + normalize_images: true + top_down_crop: + width: 256 + height: 256 + auto_padding: + pad_width_divisor: 32 + pad_height_divisor: 32 +detector: + data: + colormode: RGB + inference: + normalize_images: true + train: + affine: + p: 0.5 + rotation: 30 + scaling: + - 1.0 + - 1.0 + translation: 40 + collate: + type: ResizeFromDataSizeCollate + min_scale: 0.4 + max_scale: 1.0 + min_short_side: 128 + max_short_side: 1152 + multiple_of: 32 + to_square: false + hflip: true + normalize_images: true + device: auto + model: + type: FasterRCNN + freeze_bn_stats: true + freeze_bn_weights: false + variant: fasterrcnn_resnet50_fpn_v2 + runner: + type: DetectorTrainingRunner + key_metric: test.mAP@50:95 + key_metric_asc: true + eval_interval: 10 + optimizer: + type: AdamW + params: + lr: 0.0001 + scheduler: + type: LRListScheduler + params: + milestones: + - 160 + lr_list: + - - 1e-05 + snapshots: + max_snapshots: 5 + save_epochs: 25 + save_optimizer_state: false + train_settings: + batch_size: 1 + dataloader_workers: 0 + dataloader_pin_memory: false + display_iters: 500 + epochs: 250 +device: auto +inference: + multithreading: + enabled: true + queue_length: 4 + timeout: 30.0 + compile: + enabled: false + backend: inductor + autocast: + enabled: false +metadata: + project_path: "" + pose_config_path: "" + bodyparts: + - left_eye + - right_eye + - chin + - left_front_paw + - right_front_paw + - left_back_paw + - right_back_paw + - tail_base + - left_front_thigh + - right_front_thigh + - left_back_thigh + - right_back_thigh + - left_shoulder + - right_shoulder + - left_front_knee + - right_front_knee + - left_back_knee + - right_back_knee + - neck_base + - tail_mid + 
- left_ear_base + - right_ear_base + - left_mouth_corner + - right_mouth_corner + - nose + - tail_tip_first + unique_bodyparts: [] + individuals: + - individual000 + with_identity: false +method: td +model: + backbone: + type: HRNet + model_name: hrnet_w32 + freeze_bn_stats: true + freeze_bn_weights: false + interpolate_branches: false + increased_channel_count: false + backbone_output_channels: 32 + heads: + bodypart: + type: HeatmapHead + weight_init: normal + predictor: + type: HeatmapPredictor + apply_sigmoid: false + clip_scores: true + location_refinement: true + locref_std: 7.2801 + target_generator: + type: HeatmapGaussianGenerator + num_heatmaps: 26 + pos_dist_thresh: 17 + heatmap_mode: KEYPOINT + gradient_masking: true + background_weight: 0.0 + generate_locref: true + locref_std: 7.2801 + criterion: + heatmap: + type: WeightedMSECriterion + weight: 1.0 + locref: + type: WeightedHuberCriterion + weight: 0.05 + heatmap_config: + channels: + - 32 + kernel_size: [] + strides: [] + final_conv: + out_channels: 26 + kernel_size: 1 + locref_config: + channels: + - 32 + kernel_size: [] + strides: [] + final_conv: + out_channels: 52 + kernel_size: 1 +net_type: hrnet_w32 +runner: + type: PoseTrainingRunner + gpus: + key_metric: test.mAP + key_metric_asc: true + eval_interval: 10 + optimizer: + type: AdamW + params: + lr: 0.0001 + scheduler: + type: LRListScheduler + params: + lr_list: + - - 1e-05 + - - 1e-06 + milestones: + - 160 + - 190 + snapshots: + max_snapshots: 5 + save_epochs: 10 + save_optimizer_state: false +train_settings: + batch_size: 64 + dataloader_workers: 8 + dataloader_pin_memory: false + display_iters: 500 + epochs: 200 + seed: 42 diff --git a/fmpose3d/animals/models/graph_frames.py b/fmpose3d/animals/models/graph_frames.py index 7d07645d..d69f173c 100755 --- a/fmpose3d/animals/models/graph_frames.py +++ b/fmpose3d/animals/models/graph_frames.py @@ -19,7 +19,6 @@ class Graph(): layout (string): must be one of the follow candidates - 'hm36_gt': 
Ground truth structure of Human3.6M, with 17 joints per frame - 'animal3d': Skeleton structure for Animal3D dataset, with 26 joints per frame - - 'rat7m': Skeleton structure for Rat7M dataset, with 20 joints per frame max_hop (int): the maximal distance between two connected nodes dilation (int): controls the spacing between the kernel points @@ -48,7 +47,6 @@ def get_distance_to_center(self,layout): :return: get the distance of each node to center For hm36_gt: center is joint 7 For animal3d: center is joint 18 (neck, root joint) - For rat7m: center is joint 4 (SpineM, root joint) """ dist_center = np.zeros(self.num_node) if layout == 'hm36_gt': @@ -338,17 +336,4 @@ def normalize_undigraph(A): print(f" - Head: {graph_animal.head}") print(f" - Tail: {graph_animal.tail}") print(f" Distance to center (joint 18): {graph_animal.dist_center}") - - # Test Rat7M skeleton - print("\nTesting Rat7M skeleton (20 joints):") - graph_rat = Graph('rat7m', 'spatial', 1) - print(f" Adjacency matrix shape: {graph_rat.A.shape}") - print(f" Center joint: {graph_rat.center}") - print(f" Number of nodes: {graph_rat.num_node}") - print(f" Body parts:") - print(f" - Left front leg: {graph_rat.left_front}") - print(f" - Right front leg: {graph_rat.right_front}") - print(f" - Left hind leg: {graph_rat.left_hind}") - print(f" - Right hind leg: {graph_rat.right_hind}") - print(f" - Spine: {graph_rat.spine}") print(f" Distance to center (joint 4): {graph_rat.dist_center}") \ No newline at end of file diff --git a/fmpose3d/common/config.py b/fmpose3d/common/config.py index ded5afa5..3508c0b6 100644 --- a/fmpose3d/common/config.py +++ b/fmpose3d/common/config.py @@ -249,9 +249,17 @@ class SuperAnimalConfig(Pose2DConfig): """DeepLabCut SuperAnimal 2D pose detector configuration. Uses the DeepLabCut ``superanimal_analyze_images`` API to detect - animal keypoints in the quadruped80K format, then maps them to the - Animal3D 26-keypoint layout expected by the ``fmpose3d_animals`` - 3D lifter. 
+ animal keypoints. Supports two modes: + + * **Fine-tuned.** Predicts the 26-joint Animal3D layout natively + (no remap). Activated by either ``auto_download_finetuned=True`` + (used by :meth:`FMPose3DInference.for_animals` — snapshot is + auto-downloaded from Hugging Face on first predict) or by setting + ``pose_snapshot_path`` to a local ``.pt`` file. + * **Stock SA.** Runs the published ``superanimal_quadruped`` weights + (39 keypoints) and remaps to the 26-joint Animal3D layout via + :meth:`SuperAnimalEstimator._map_keypoints`. Active when the bare + ``SuperAnimalConfig()`` default is used (all paths/flag empty). Attributes ---------- @@ -263,12 +271,37 @@ class SuperAnimalConfig(Pose2DConfig): Object detector used for animal bounding boxes. max_individuals : int Maximum number of individuals to detect per image (default 1). + pytorch_config_path : str + Path to a DLC ``pytorch_config.yaml`` describing a fine-tuned + model. When empty, the packaged default + (:data:`fmpose3d.animals.configs.SA_FINETUNE_HRNET_W32_YAML`) + is used. Only consulted in fine-tuned mode. + pose_snapshot_path : str + Path to a fine-tuned pose ``.pt`` checkpoint. **Non-empty value + activates fine-tuned mode.** Empty → stock SA, unless + ``auto_download_finetuned`` is True. + detector_snapshot_path : str + Path to a custom Faster R-CNN checkpoint. When empty, DLC + resolves the stock SA detector from its modelzoo. + auto_download_finetuned : bool + When True and ``pose_snapshot_path`` is empty, the FMPose3D + fine-tuned snapshot is downloaded from Hugging Face on first + :meth:`SuperAnimalEstimator.predict` call (cached under + ``~/.cache/huggingface``). This is what + :meth:`FMPose3DInference.for_animals` uses as its default so the + animal pipeline runs out-of-the-box without manual downloads. + Standalone ``SuperAnimalConfig()`` keeps it False so that + stock SA + 39→26 remap remains the explicit, no-network default. 
""" pose2d_model: str = "superanimal" superanimal_name: str = "superanimal_quadruped" sa_model_name: str = "hrnet_w32" detector_name: str = "fasterrcnn_resnet50_fpn_v2" max_individuals: int = 1 + pytorch_config_path: str = "" + pose_snapshot_path: str = "" + detector_snapshot_path: str = "" + auto_download_finetuned: bool = False @dataclass diff --git a/fmpose3d/inference_api/README.md b/fmpose3d/inference_api/README.md index 5a159006..3c9f1459 100644 --- a/fmpose3d/inference_api/README.md +++ b/fmpose3d/inference_api/README.md @@ -225,7 +225,10 @@ Default 2D estimator for the human pipeline. Wraps HRNet + YOLO with a COCO → #### `SuperAnimalEstimator(cfg: SuperAnimalConfig | None)` -2D estimator for the animal pipeline. Uses DeepLabCut SuperAnimal and maps quadruped80K keypoints to the 26-joint Animal3D layout. +2D estimator for the animal pipeline. Produces the 26-joint Animal3D keypoint layout via DeepLabCut SuperAnimal. Supports two modes: + +- **Fine-tuned** (default when accessed via `FMPose3DInference.for_animals()`): runs an FMPose3D fine-tuned SA-Quadruped HRNet-w32 snapshot that natively outputs 26 joints. The snapshot is auto-downloaded from [Hugging Face](https://huggingface.co/MLAdaptiveIntelligence/FMPose3D) on the first predict call when `cfg.auto_download_finetuned=True`. +- **Stock SA** (low-level opt-in): runs the published `superanimal_quadruped` weights (39 keypoints) and remaps to 26 joints via `_map_keypoints`. Activated by `SuperAnimalEstimator(SuperAnimalConfig())` with all paths/flags empty. If DeepLabCut is not installed, calling this estimator raises a clear `ImportError` with the recommended install command: `pip install "fmpose3d[animals]"`. 
diff --git a/fmpose3d/inference_api/fmpose3d.py b/fmpose3d/inference_api/fmpose3d.py index 970a5e3d..10d0b3c7 100644 --- a/fmpose3d/inference_api/fmpose3d.py +++ b/fmpose3d/inference_api/fmpose3d.py @@ -256,6 +256,28 @@ def predict( all_mapped: list[np.ndarray] = [] all_scores: list[np.ndarray] = [] + # Resolve pose snapshot: explicit local path > HF auto-download > empty (stock). + pose_snapshot_path = cfg.pose_snapshot_path + if not pose_snapshot_path and cfg.auto_download_finetuned: + from fmpose3d.utils.weights import resolve_weights_path + pose_snapshot_path = resolve_weights_path("", "sa_finetune_hrnet_w32.pt") + + # Fine-tuned mode: non-empty resolved path swaps the stock 39-joint head + # for a custom DLC checkpoint that predicts the 26-joint Animal3D layout + # natively (no _map_keypoints needed). + is_finetuned = bool(pose_snapshot_path) + if is_finetuned: + from fmpose3d.animals.configs import SA_FINETUNE_HRNET_W32_YAML + customized_kwargs = dict( + customized_model_config=( + cfg.pytorch_config_path or SA_FINETUNE_HRNET_W32_YAML + ), + customized_pose_checkpoint=pose_snapshot_path, + customized_detector_checkpoint=cfg.detector_snapshot_path or None, + ) + else: + customized_kwargs = {} + with tempfile.TemporaryDirectory() as tmpdir: # Write each frame as an image so DLC can read it. paths: list[str] = [] @@ -272,10 +294,12 @@ def predict( images=paths, max_individuals=cfg.max_individuals, out_folder=tmpdir, - progress_bar=False + progress_bar=False, + **customized_kwargs, ) # predictions: {image_path: {"bodyparts": (N_ind, K, 3), ...}} - # Iterate in input order to keep frame alignment stable. + # In fine-tuned mode K == 26 already; in stock mode K == 39 + # (quadruped80K) and is remapped via _map_keypoints/_map_scores. 
for img_path in paths: payload = predictions.get(img_path) if isinstance(predictions, dict) else None if payload is None and isinstance(predictions, dict) and len(predictions) == 1: @@ -291,8 +315,12 @@ def predict( xy = bodyparts[..., :2] # (N_ind, K, 2) conf = bodyparts[..., 2] # (N_ind, K) - mapped = self._map_keypoints(xy) - mapped_scores = self._map_scores(conf) + if is_finetuned: + mapped = xy + mapped_scores = conf + else: + mapped = self._map_keypoints(xy) + mapped_scores = self._map_scores(conf) # Take only the first individual. all_mapped.append(mapped[:1]) @@ -599,7 +627,13 @@ def _default_components( means adding one branch here (or turning this into a registry). """ if model_cfg.model_type == SupportedModel.FMPOSE3D_ANIMALS: - return SuperAnimalEstimator(), AnimalPostProcessor() + # Default to fine-tuned + lazy HF auto-download so the animal API + # works out-of-the-box. Construction stays cheap (no network); + # the download fires on the first predict() call. + return ( + SuperAnimalEstimator(SuperAnimalConfig(auto_download_finetuned=True)), + AnimalPostProcessor(), + ) return HRNetEstimator(), HumanPostProcessor() diff --git a/fmpose3d/lib/hrnet/gen_kpts.py b/fmpose3d/lib/hrnet/gen_kpts.py index 0049997c..1445b700 100755 --- a/fmpose3d/lib/hrnet/gen_kpts.py +++ b/fmpose3d/lib/hrnet/gen_kpts.py @@ -82,11 +82,11 @@ def reset_config(args): # load model def model_load(config): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = pose_hrnet.get_pose_net(config, is_train=False) - if torch.cuda.is_available(): - model = model.cuda() + model = model.to(device) - state_dict = torch.load(config.OUTPUT_DIR, weights_only=True) + state_dict = torch.load(config.OUTPUT_DIR, map_location=device, weights_only=True) from collections import OrderedDict new_state_dict = OrderedDict() for k, v in state_dict.items(): @@ -133,8 +133,8 @@ def gen_from_image(args, frame, people_sort, human_model, pose_model, det_dim=41 inputs = inputs[:, [2, 1, 0]] 
- if torch.cuda.is_available(): - inputs = inputs.cuda() + device = next(pose_model.parameters()).device + inputs = inputs.to(device) output = pose_model(inputs) # compute coordinate diff --git a/fmpose3d/lib/hrnet/hrnet.py b/fmpose3d/lib/hrnet/hrnet.py index 0d0b7529..fa8e6822 100644 --- a/fmpose3d/lib/hrnet/hrnet.py +++ b/fmpose3d/lib/hrnet/hrnet.py @@ -196,11 +196,11 @@ def _load_hrnet(config): """Instantiate HRNet and load checkpoint weights.""" from fmpose3d.lib.hrnet.lib.models import pose_hrnet + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = pose_hrnet.get_pose_net(config, is_train=False) - if torch.cuda.is_available(): - model = model.cuda() + model = model.to(device) - state_dict = torch.load(config.OUTPUT_DIR, weights_only=True) + state_dict = torch.load(config.OUTPUT_DIR, map_location=device, weights_only=True) new_state_dict = OrderedDict() for k, v in state_dict.items(): new_state_dict[k] = v @@ -258,8 +258,8 @@ def _estimate_frame( ) inputs = inputs[:, [2, 1, 0]] - if torch.cuda.is_available(): - inputs = inputs.cuda() + device = next(self._pose_model.parameters()).device + inputs = inputs.to(device) output = self._pose_model(inputs) preds, maxvals = get_final_preds( @@ -277,4 +277,3 @@ def _estimate_frame( scores[i] = score.squeeze() return kpts, scores - diff --git a/fmpose3d/lib/hrnet/lib/utils/coco_h36m.py b/fmpose3d/lib/hrnet/lib/utils/coco_h36m.py deleted file mode 100755 index 577ef0bb..00000000 --- a/fmpose3d/lib/hrnet/lib/utils/coco_h36m.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -FMPose3D: monocular 3D Pose Estimation via Flow Matching - -Official implementation of the paper: -"FMPose3D: monocular 3D Pose Estimation via Flow Matching" -by Ti Wang, Xiaohang Yu, and Mackenzie Weygandt Mathis -Licensed under Apache 2.0 -""" - -import numpy as np - - -h36m_coco_order = [9, 11, 14, 12, 15, 13, 16, 4, 1, 5, 2, 6, 3] -coco_order = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] -spple_keypoints = [10, 8, 0, 7] - - -def 
coco_h36m(keypoints): - # keypoints: (T, N, 2) or (M, N, 2) - - temporal = keypoints.shape[0] - keypoints_h36m = np.zeros_like(keypoints, dtype=np.float32) - htps_keypoints = np.zeros((temporal, 4, 2), dtype=np.float32) - - # htps_keypoints: head, thorax, pelvis, spine - htps_keypoints[:, 0, 0] = np.mean(keypoints[:, 1:5, 0], axis=1, dtype=np.float32) - htps_keypoints[:, 0, 1] = np.sum(keypoints[:, 1:3, 1], axis=1, dtype=np.float32) - keypoints[:, 0, 1] - htps_keypoints[:, 1, :] = np.mean(keypoints[:, 5:7, :], axis=1, dtype=np.float32) - htps_keypoints[:, 1, :] += (keypoints[:, 0, :] - htps_keypoints[:, 1, :]) / 3 - - htps_keypoints[:, 2, :] = np.mean(keypoints[:, 11:13, :], axis=1, dtype=np.float32) - htps_keypoints[:, 3, :] = np.mean(keypoints[:, [5, 6, 11, 12], :], axis=1, dtype=np.float32) - - keypoints_h36m[:, spple_keypoints, :] = htps_keypoints - keypoints_h36m[:, h36m_coco_order, :] = keypoints[:, coco_order, :] - - keypoints_h36m[:, 9, :] -= (keypoints_h36m[:, 9, :] - np.mean(keypoints[:, 5:7, :], axis=1, dtype=np.float32)) / 4 - keypoints_h36m[:, 7, 0] += 0.3*(keypoints_h36m[:, 7, 0] - np.mean(keypoints_h36m[:, [0, 8], 0], axis=1, dtype=np.float32)) - keypoints_h36m[:, 8, 1] -= (np.mean(keypoints[:, 1:3, 1], axis=1, dtype=np.float32) - keypoints[:, 0, 1])*2/3 - - # half body: the joint of ankle and knee equal to hip - # keypoints_h36m[:, [2, 3]] = keypoints_h36m[:, [1, 1]] - # keypoints_h36m[:, [5, 6]] = keypoints_h36m[:, [4, 4]] - return keypoints_h36m - - -h36m_mpii_order = [3, 2, 1, 4, 5, 6, 0, 8, 9, 10, 16, 15, 14, 11, 12, 13] -mpii_order = [i for i in range(16)] -lr_hip_shouler = [2, 3, 12, 13] - - -def mpii_h36m(keypoints): - temporal = keypoints.shape[0] - keypoints_h36m = np.zeros((temporal, 17, 2), dtype=np.float32) - keypoints_h36m[:, h36m_mpii_order] = keypoints - # keypoints_h36m[:, 7] = np.mean(keypoints[:, 6:8], axis=1, dtype=np.float32) - keypoints_h36m[:, 7] = np.mean(keypoints[:, lr_hip_shouler], axis=1, dtype=np.float32) - return 
keypoints_h36m - - diff --git a/fmpose3d/lib/hrnet/lib/utils/utilitys.py b/fmpose3d/lib/hrnet/lib/utils/utilitys.py index ba587ff7..acd65d88 100755 --- a/fmpose3d/lib/hrnet/lib/utils/utilitys.py +++ b/fmpose3d/lib/hrnet/lib/utils/utilitys.py @@ -14,7 +14,6 @@ import torchvision.transforms as transforms from fmpose3d.lib.hrnet.lib.utils.transforms import * -from fmpose3d.lib.hrnet.lib.utils.coco_h36m import coco_h36m import numpy as np joint_pairs = [[0, 1], [1, 3], [0, 2], [2, 4], diff --git a/fmpose3d/lib/yolov3/cfg/tiny-yolo-voc.cfg b/fmpose3d/lib/yolov3/cfg/tiny-yolo-voc.cfg deleted file mode 100755 index ab2c066a..00000000 --- a/fmpose3d/lib/yolov3/cfg/tiny-yolo-voc.cfg +++ /dev/null @@ -1,134 +0,0 @@ -[net] -batch=64 -subdivisions=8 -width=416 -height=416 -channels=3 -momentum=0.9 -decay=0.0005 -angle=0 -saturation = 1.5 -exposure = 1.5 -hue=.1 - -learning_rate=0.001 -max_batches = 40200 -policy=steps -steps=-1,100,20000,30000 -scales=.1,10,.1,.1 - -[convolutional] -batch_normalize=1 -filters=16 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=2 - -[convolutional] -batch_normalize=1 -filters=32 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=2 - -[convolutional] -batch_normalize=1 -filters=64 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=2 - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=2 - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=2 - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=1 - -[convolutional] -batch_normalize=1 -filters=1024 -size=3 -stride=1 -pad=1 -activation=leaky - -########### - -[convolutional] -batch_normalize=1 -size=3 -stride=1 -pad=1 -filters=1024 -activation=leaky - -[convolutional] -size=1 -stride=1 -pad=1 -filters=125 -activation=linear - 
-[region] -anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 -bias_match=1 -classes=20 -coords=4 -num=5 -softmax=1 -jitter=.2 -rescore=1 - -object_scale=5 -noobject_scale=1 -class_scale=1 -coord_scale=1 - -absolute=1 -thresh = .6 -random=1 diff --git a/fmpose3d/lib/yolov3/cfg/yolo-voc.cfg b/fmpose3d/lib/yolov3/cfg/yolo-voc.cfg deleted file mode 100755 index d5bdfc1c..00000000 --- a/fmpose3d/lib/yolov3/cfg/yolo-voc.cfg +++ /dev/null @@ -1,258 +0,0 @@ -[net] -# Testing -batch=64 -subdivisions=8 -# Training -# batch=64 -# subdivisions=8 -height=416 -width=416 -channels=3 -momentum=0.9 -decay=0.0005 -angle=0 -saturation = 1.5 -exposure = 1.5 -hue=.1 - -learning_rate=0.001 -burn_in=1000 -max_batches = 80200 -policy=steps -steps=-1,500,40000,60000 -scales=0.1,10,.1,.1 - -[convolutional] -batch_normalize=1 -filters=32 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=2 - -[convolutional] -batch_normalize=1 -filters=64 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=2 - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=64 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=2 - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=128 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=2 - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 
-size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=2 - -[convolutional] -batch_normalize=1 -filters=1024 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=1024 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=1024 -size=3 -stride=1 -pad=1 -activation=leaky - - -####### - -[convolutional] -batch_normalize=1 -size=3 -stride=1 -pad=1 -filters=1024 -activation=leaky - -[convolutional] -batch_normalize=1 -size=3 -stride=1 -pad=1 -filters=1024 -activation=leaky - -[route] -layers=-9 - -[convolutional] -batch_normalize=1 -size=1 -stride=1 -pad=1 -filters=64 -activation=leaky - -[reorg] -stride=2 - -[route] -layers=-1,-4 - -[convolutional] -batch_normalize=1 -size=3 -stride=1 -pad=1 -filters=1024 -activation=leaky - -[convolutional] -size=1 -stride=1 -pad=1 -filters=125 -activation=linear - - -[region] -anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 -bias_match=1 -classes=20 -coords=4 -num=5 -softmax=1 -jitter=.3 -rescore=1 - -object_scale=5 -noobject_scale=1 -class_scale=1 -coord_scale=1 - -absolute=1 -thresh = .6 -random=1 diff --git a/fmpose3d/lib/yolov3/cfg/yolo.cfg b/fmpose3d/lib/yolov3/cfg/yolo.cfg deleted file mode 100755 index 2a0cd98f..00000000 --- a/fmpose3d/lib/yolov3/cfg/yolo.cfg +++ /dev/null @@ -1,258 +0,0 @@ -[net] -# Testing -batch=1 -subdivisions=1 -# Training -# batch=64 -# subdivisions=8 -width=416 -height=416 -channels=3 -momentum=0.9 -decay=0.0005 -angle=0 -saturation = 1.5 -exposure = 1.5 -hue=.1 - -learning_rate=0.001 -burn_in=1000 -max_batches = 500200 -policy=steps -steps=400000,450000 -scales=.1,.1 - -[convolutional] 
-batch_normalize=1 -filters=32 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=2 - -[convolutional] -batch_normalize=1 -filters=64 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=2 - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=64 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=2 - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=128 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=2 - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=1 -pad=1 -activation=leaky - -[maxpool] -size=2 -stride=2 - -[convolutional] -batch_normalize=1 -filters=1024 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=1024 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=1024 -size=3 -stride=1 -pad=1 -activation=leaky - - -####### - -[convolutional] -batch_normalize=1 -size=3 -stride=1 -pad=1 -filters=1024 -activation=leaky - -[convolutional] -batch_normalize=1 -size=3 -stride=1 -pad=1 
-filters=1024 -activation=leaky - -[route] -layers=-9 - -[convolutional] -batch_normalize=1 -size=1 -stride=1 -pad=1 -filters=64 -activation=leaky - -[reorg] -stride=2 - -[route] -layers=-1,-4 - -[convolutional] -batch_normalize=1 -size=3 -stride=1 -pad=1 -filters=1024 -activation=leaky - -[convolutional] -size=1 -stride=1 -pad=1 -filters=425 -activation=linear - - -[region] -anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 -bias_match=1 -classes=80 -coords=4 -num=5 -softmax=1 -jitter=.3 -rescore=1 - -object_scale=5 -noobject_scale=1 -class_scale=1 -coord_scale=1 - -absolute=1 -thresh = .6 -random=1 diff --git a/fmpose3d/lib/yolov3/data/pallete b/fmpose3d/lib/yolov3/data/pallete deleted file mode 100755 index 25f0143e..00000000 Binary files a/fmpose3d/lib/yolov3/data/pallete and /dev/null differ diff --git a/fmpose3d/lib/yolov3/data/voc.names b/fmpose3d/lib/yolov3/data/voc.names deleted file mode 100755 index 8420ab35..00000000 --- a/fmpose3d/lib/yolov3/data/voc.names +++ /dev/null @@ -1,20 +0,0 @@ -aeroplane -bicycle -bird -boat -bottle -bus -car -cat -chair -cow -diningtable -dog -horse -motorbike -person -pottedplant -sheep -sofa -train -tvmonitor diff --git a/fmpose3d/utils/weights.py b/fmpose3d/utils/weights.py index 941e4817..8ca98324 100644 --- a/fmpose3d/utils/weights.py +++ b/fmpose3d/utils/weights.py @@ -7,30 +7,31 @@ Licensed under Apache 2.0 """ -"""Shared helpers for resolving / downloading FMPose3D model weights.""" +"""Shared helper for resolving / downloading FMPose3D model weights.""" HF_REPO_ID: str = "MLAdaptiveIntelligence/FMPose3D" -def resolve_weights_path(model_weights_path: str, model_type: str) -> str: +def resolve_weights_path(local_path: str, filename: str) -> str: """Return a local weights path, downloading from Hugging Face Hub if needed. Parameters ---------- - model_weights_path : str - User-supplied local path. 
If falsy the weights are fetched from the - Hugging Face Hub automatically. - model_type : str - Model variant name used to derive the remote filename - (e.g. ``"fmpose3d_humans"`` -> ``fmpose3d_humans.pth``). + local_path : str + User-supplied local path. If falsy, ``filename`` is fetched from + the Hugging Face Hub (cached under ``~/.cache/huggingface``). + filename : str + The exact remote filename in the FMPose3D Hugging Face repo + (e.g. ``"fmpose3d_humans.pth"``, ``"fmpose3d_animals.pth"``, + ``"sa_finetune_hrnet_w32.pt"``). Returns ------- str Absolute path to the weight file on disk. """ - if model_weights_path: - return model_weights_path + if local_path: + return local_path try: from huggingface_hub import hf_hub_download @@ -41,7 +42,6 @@ def resolve_weights_path(model_weights_path: str, model_type: str) -> str: "Or download the weights manually and pass the local path." ) from None - filename = f"{model_type}.pth" print( f"No local weights path specified. " f"Downloading '{filename}' from Hugging Face ({HF_REPO_ID})..." diff --git a/pyproject.toml b/pyproject.toml index e7df467d..d91a9f79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "fmpose3d" readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.10,<3.13" dynamic = ["version"] license = {text = "Apache 2.0"} authors = [ @@ -24,8 +24,12 @@ classifiers = [ ] dependencies = [ - "torch>=2.4.1", - "torchvision>=0.19.1", + # Pinned to torch 2.4.x: PyPI's Linux wheel for this range depends on + # CUDA 12.1 runtime packages. Newer torch releases may pull newer CUDA + # runtimes by default, so keep this bound to avoid surprising NVIDIA + # driver requirements for users. 
+ "torch>=2.4.1,<2.5", + "torchvision>=0.19.1,<0.20", "timm>=1.0.0", "einops>=0.4.0", "numpy>=1.18.5,<2.0", @@ -77,4 +81,4 @@ markers = [ [tool.codespell] skip = '.git,*.pdf,*.svg,*.css,*.txt,*.pth' check-hidden = true -ignore-words-list = 'fmpose,mpjpe,uvd,xyz,hm36,cpn,dbb' +ignore-words-list = 'fmpose,mpjpe,uvd,xyz,hm36,cpn,dbb,mot' diff --git a/scripts/FMPose3D_main.py b/scripts/FMPose3D_main.py index 172a4824..5378d326 100644 --- a/scripts/FMPose3D_main.py +++ b/scripts/FMPose3D_main.py @@ -342,7 +342,7 @@ def print_error_action(action_error_sum, is_train): if args.reload: model_dict = model["CFM"].state_dict() - model_path = resolve_weights_path(args.model_weights_path, args.model_type) + model_path = resolve_weights_path(args.model_weights_path, f"{args.model_type}.pth") print(f"Loading weights from: {model_path}") pre_dict = torch.load(model_path, map_location=device, weights_only=True) diff --git a/tests/fmpose3d_api/test_fmpose3d.py b/tests/fmpose3d_api/test_fmpose3d.py index f4d4a026..89af7f60 100644 --- a/tests/fmpose3d_api/test_fmpose3d.py +++ b/tests/fmpose3d_api/test_fmpose3d.py @@ -36,7 +36,7 @@ apply_limb_regularization, compute_limb_regularization_matrix, ) -from fmpose3d.common.config import FMPose3DConfig, InferenceConfig +from fmpose3d.common.config import FMPose3DConfig, InferenceConfig, SuperAnimalConfig # --------------------------------------------------------------------------- # Helpers @@ -336,6 +336,10 @@ def test_animal(self): est, pp = _default_components(FMPose3DConfig(model_type="fmpose3d_animals")) assert isinstance(est, SuperAnimalEstimator) assert isinstance(pp, AnimalPostProcessor) + # Animals default to fine-tuned mode with lazy HF auto-download so the + # API works out-of-the-box. Construction itself stays cheap (no network). 
+ assert est.cfg.auto_download_finetuned is True + assert est.cfg.pose_snapshot_path == "" # ========================================================================= @@ -791,3 +795,162 @@ def test_predict_maps_valid_bodyparts(self): np.testing.assert_array_equal(mask, np.array([True])) # target[24] ← source[0] → (0*3, 0*3+1) = (0.0, 1.0) np.testing.assert_allclose(kpts[0, 0, 24], fake_bp[0, 0, :2]) + + +# ========================================================================= +# Unit tests — SuperAnimalEstimator fine-tuned mode (mocked DLC) +# ========================================================================= + + +class TestSuperAnimalFinetunedPrediction: + """Fine-tuned mode covers two activation paths: + + * ``cfg.pose_snapshot_path`` is non-empty (explicit local override). + * ``cfg.auto_download_finetuned=True`` with empty ``pose_snapshot_path`` + (lazy HF auto-download on first predict). + + Both forward ``customized_*`` kwargs to DLC's ``superanimal_analyze_images`` + and skip the 39->26 keypoint remap. + """ + + def test_finetuned_forwards_customized_kwargs(self): + """pose_snapshot_path set → customized_* kwargs piped to DLC; empty + pytorch_config_path falls back to the packaged default yaml; empty + detector_snapshot_path forwards None so DLC resolves the stock detector. 
+ """ + pytest.importorskip("deeplabcut") + from fmpose3d.animals.configs import SA_FINETUNE_HRNET_W32_YAML + + cfg = SuperAnimalConfig(pose_snapshot_path="/fake/snapshot.pt") + estimator = SuperAnimalEstimator(cfg) + frames = np.random.randint(0, 255, (1, 64, 64, 3), dtype=np.uint8) + fake_bp = np.random.rand(1, 26, 3).astype("float32") + + captured: dict = {} + + def spy(*_, **kwargs): + captured.update(kwargs) + return {kwargs["images"][0]: {"bodyparts": fake_bp}} + + with patch( + "deeplabcut.pose_estimation_pytorch.apis.superanimal_analyze_images", + side_effect=spy, + ): + estimator.predict(frames) + + assert captured["customized_pose_checkpoint"] == "/fake/snapshot.pt" + assert captured["customized_model_config"] == SA_FINETUNE_HRNET_W32_YAML + assert captured["customized_detector_checkpoint"] is None + + def test_finetuned_skips_remap(self): + """26-joint DLC output passes through unchanged; the stock-SA + ``_map_keypoints`` / ``_map_scores`` helpers must not be called.""" + pytest.importorskip("deeplabcut") + + cfg = SuperAnimalConfig(pose_snapshot_path="/fake/snapshot.pt") + estimator = SuperAnimalEstimator(cfg) + frames = np.random.randint(0, 255, (1, 64, 64, 3), dtype=np.uint8) + # 26-joint output — what a fine-tuned snapshot natively produces. + fake_bp = np.arange(78, dtype="float32").reshape(1, 26, 3) + + with patch.object(SuperAnimalEstimator, "_map_keypoints") as spy_map, \ + patch.object(SuperAnimalEstimator, "_map_scores") as spy_scores, \ + patch( + "deeplabcut.pose_estimation_pytorch.apis.superanimal_analyze_images", + ) as mock_fn: + mock_fn.return_value = {"frame.png": {"bodyparts": fake_bp}} + kpts, scores, mask = estimator.predict(frames) + + spy_map.assert_not_called() + spy_scores.assert_not_called() + assert kpts.shape == (1, 1, 26, 2) + assert scores.shape == (1, 1, 26) + np.testing.assert_array_equal(mask, np.array([True])) + # Output is the raw bodyparts xy / conf, not a remap. 
+        np.testing.assert_allclose(kpts[0, 0], fake_bp[0, :, :2])
+        np.testing.assert_allclose(scores[0, 0], fake_bp[0, :, 2])
+
+    def test_finetuned_custom_paths_override_packaged_defaults(self):
+        """Explicit pytorch_config_path / detector_snapshot_path override the
+        packaged defaults and are forwarded verbatim to DLC."""
+        pytest.importorskip("deeplabcut")
+
+        cfg = SuperAnimalConfig(
+            pose_snapshot_path="/fake/snapshot.pt",
+            pytorch_config_path="/custom/pytorch_config.yaml",
+            detector_snapshot_path="/custom/detector.pt",
+        )
+        estimator = SuperAnimalEstimator(cfg)
+        # Random stand-in inputs: the DLC call is patched below, and this test
+        # only asserts on the keyword arguments forwarded to it.
+        frames = np.random.randint(0, 255, (1, 64, 64, 3), dtype=np.uint8)
+        fake_bp = np.random.rand(1, 26, 3).astype("float32")
+
+        captured: dict = {}
+
+        # Replacement for the patched function: records the kwargs it is called
+        # with and returns one result entry keyed by the first "images" item.
+        def spy(*_, **kwargs):
+            captured.update(kwargs)
+            return {kwargs["images"][0]: {"bodyparts": fake_bp}}
+
+        with patch(
+            "deeplabcut.pose_estimation_pytorch.apis.superanimal_analyze_images",
+            side_effect=spy,
+        ):
+            estimator.predict(frames)
+
+        # Each configured path must arrive unchanged under its customized_* kwarg.
+        assert captured["customized_pose_checkpoint"] == "/fake/snapshot.pt"
+        assert captured["customized_model_config"] == "/custom/pytorch_config.yaml"
+        assert captured["customized_detector_checkpoint"] == "/custom/detector.pt"
+
+    def test_stock_mode_does_not_forward_customized_kwargs(self):
+        """Default config (empty pose_snapshot_path, auto_download_finetuned=False)
+        → no customized_* kwargs; DLC runs with stock SuperAnimal-Quadruped
+        weights and the 39->26 remap path is taken downstream."""
+        pytest.importorskip("deeplabcut")
+
+        estimator = SuperAnimalEstimator()  # default config (stock SA mode)
+        frames = np.random.randint(0, 255, (1, 64, 64, 3), dtype=np.uint8)
+        # NOTE(review): the docstring above mentions a 39->26 remap while this
+        # fake output has 40 joints — confirm which joint count is intended.
+        fake_bp = np.random.rand(1, 40, 3).astype("float32")  # 40-joint stock output
+
+        captured: dict = {}
+
+        # Replacement for the patched function: records the kwargs it is called
+        # with and returns one result entry keyed by the first "images" item.
+        def spy(*_, **kwargs):
+            captured.update(kwargs)
+            return {kwargs["images"][0]: {"bodyparts": fake_bp}}
+
+        with patch(
+            "deeplabcut.pose_estimation_pytorch.apis.superanimal_analyze_images",
+            side_effect=spy,
+        ):
+            estimator.predict(frames)
+
+        # No recorded keyword may carry the "customized_" prefix; the failure
+        # message lists the keywords that were actually forwarded.
+        assert not any(k.startswith("customized_") for k in captured), (
+            f"stock mode must not forward customized_* kwargs, got: {list(captured)}"
+        )
+
+    def test_auto_download_finetuned_resolves_via_hf_at_predict_time(self):
+        """auto_download_finetuned=True with empty pose_snapshot_path triggers
+        a lazy HF resolution on the first predict() call. The resolved path
+        is forwarded to DLC as customized_pose_checkpoint."""
+        pytest.importorskip("deeplabcut")
+
+        cfg = SuperAnimalConfig(auto_download_finetuned=True)
+        assert cfg.pose_snapshot_path == ""  # trigger condition
+        estimator = SuperAnimalEstimator(cfg)
+        frames = np.random.randint(0, 255, (1, 64, 64, 3), dtype=np.uint8)
+        fake_bp = np.random.rand(1, 26, 3).astype("float32")
+        captured: dict = {}
+
+        # Replacement for the patched function: records the kwargs it is called
+        # with and returns one result entry keyed by the first "images" item.
+        def spy(*_, **kwargs):
+            captured.update(kwargs)
+            return {kwargs["images"][0]: {"bodyparts": fake_bp}}
+
+        # Two patches are active at once: the weights resolver is replaced by a
+        # mock returning a fixed path, and the DLC entry point is routed to spy.
+        with patch(
+            "fmpose3d.utils.weights.resolve_weights_path",
+            return_value="/hf/cache/sa_finetune_hrnet_w32.pt",
+        ) as mock_resolver, patch(
+            "deeplabcut.pose_estimation_pytorch.apis.superanimal_analyze_images",
+            side_effect=spy,
+        ):
+            estimator.predict(frames)
+
+        # The resolver must have been called exactly once with these arguments,
+        # and the path it returned must be the checkpoint forwarded to DLC.
+        mock_resolver.assert_called_once_with("", "sa_finetune_hrnet_w32.pt")
+        assert captured["customized_pose_checkpoint"] == "/hf/cache/sa_finetune_hrnet_w32.pt"
diff --git a/tests/test_config.py b/tests/test_config.py
index 2b2983c0..78457de6 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -69,13 +69,13 @@ def test_list_defaults_are_independent(self):
 
     def test_custom_values(self):
         cfg = DatasetConfig(
-            dataset="rat7m",
-            root_path="Rat7M_data/",
+            dataset="animal3d",
+            root_path="dataset/",
             joints_left=[8, 10, 11],
             joints_right=[9, 14, 15],
         )
-        assert cfg.dataset == "rat7m"
-        assert cfg.root_path == "Rat7M_data/"
+        assert cfg.dataset == "animal3d"
+        assert cfg.root_path == "dataset/"
         assert cfg.joints_left == [8, 10, 11]
 
 
@@ -261,9 +261,9 @@ def test_from_namespace_basic(self):
             out_channels=3,
             frames=3,
             # DatasetConfig
-            dataset="rat7m",
+            dataset="animal3d",
             keypoints="cpn",
-            root_path="Rat7M_data/",
+            root_path="dataset/",
             actions="*",
             downsample=1,
             subset=1.0,
@@ -343,7 +343,7 @@ def test_from_namespace_basic(self):
         # Verify a sample from each group
         assert cfg.model_cfg.layers == 5
         assert cfg.model_cfg.channel == 256
-        assert cfg.dataset_cfg.dataset == "rat7m"
+        assert cfg.dataset_cfg.dataset == "animal3d"
         assert cfg.dataset_cfg.joints_left == [8, 10]
         assert cfg.training_cfg.train is True
         assert cfg.training_cfg.nepoch == 100