diff --git a/.gitignore b/.gitignore index 58be3ff..8e2662d 100644 --- a/.gitignore +++ b/.gitignore @@ -41,4 +41,5 @@ out*/ output/ *.npz *.pkl +recordings/ diff --git a/examples/handpose_ros2_publisher.py b/examples/handpose_ros2_publisher.py index 15ae2d2..7e7f9fe 100644 --- a/examples/handpose_ros2_publisher.py +++ b/examples/handpose_ros2_publisher.py @@ -31,6 +31,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent)) from handpose.ik_retargeting import ( # noqa: E402 + FINGER_TARGET_BODIES, ORCA_JOINT_NAMES, ORCAHandIKConfig, ORCAHandIKRetargeting, @@ -38,10 +39,15 @@ from handpose.tracker.hamer import HaMeRTracker # noqa: E402 -def inject_target_bodies(mjcf_path: Path) -> str: - """Injects mocap bodies and tip sites into the MJCF XML string for IK targeting. +def inject_target_bodies(mjcf_path: Path, target_joint_types: tuple[str, ...]) -> str: + """Inject mocap bodies for tracker sites and ensure IK target sites exist. - This matches the approach in live_demo_ik.py to ensure tip sites exist for all fingers. + Args: + mjcf_path: Path to MJCF model file + target_joint_types: Tuple of joint types to visualize (tip, ip, pip, mcp) + + Returns: + Modified MJCF XML string """ tree = ET.parse(mjcf_path) root = tree.getroot() @@ -51,37 +57,99 @@ def inject_target_bodies(mjcf_path: Path) -> str: raise ValueError("Could not find worldbody in MJCF") # Convert relative paths to absolute paths - # MuJoCo can't resolve relative paths when loading from string model_dir = mjcf_path.parent for asset in root.findall(".//asset"): for mesh in asset.findall("mesh"): file_attr = mesh.get("file") if file_attr and not Path(file_attr).is_absolute(): - # Convert relative path to absolute abs_path = (model_dir / file_attr).resolve() mesh.set("file", str(abs_path)) - # Add tip sites for all fingers (required for tip tracking) + # All IK target sites use blue for consistency + ik_site_color = "0 0 1 0.9" # Blue for all IK targets + + # Tip site offsets: [x, y, z] where z is upward direction tip_site_specs = { - "thumb": ("right_thumb_dp", np.array([0.0, 0.0, 0.018])), - "index": ("right_index_ip", np.array([0.0, 0.0, 0.020])), - "middle": ("right_middle_ip", np.array([0.0, 0.0, 0.022])), - "ring": ("right_ring_ip", np.array([0.0, 0.0, 0.021])), - "pinky": ("right_pinky_ip", np.array([0.0, 0.0, 0.018])), + "thumb": ("right_thumb_dp", np.array([0.0, 0.0, 0.025])), + "index": ("right_index_ip", np.array([0.0, 0.0, 0.035])), + "middle": ("right_middle_ip", np.array([0.0, 0.0, 0.037])), + "ring": ("right_ring_ip", np.array([0.0, 0.0, 0.036])), + "pinky": ("right_pinky_ip", np.array([0.0, 0.0, 0.032])), } - for finger, (parent_body, offset) in tip_site_specs.items(): - body_elem = root.find(f".//body[@name='{parent_body}']") - if body_elem is None: - continue - site_name = f"right_{finger}_tip_site" - if body_elem.find(f"./site[@name='{site_name}']") is not None: - continue - site = ET.SubElement(body_elem, "site") - site.set("name", site_name) - site.set("pos", " ".join(f"{value:.5f}" for value in offset)) - site.set("size", "0.0025") - site.set("rgba", "1 0.6 0.2 0.8") + if "tip" in target_joint_types: + tip_sites_created = [] + for finger, (parent_body, offset) in tip_site_specs.items(): + body_elem = root.find(f".//body[@name='{parent_body}']") + if body_elem is None: + print(f"Warning: Parent body '{parent_body}' not found for {finger} tip site") + continue + site_name = f"right_{finger}_tip_site" + existing_site = body_elem.find(f"./site[@name='{site_name}']") + if existing_site is not None: + # Update existing site - make it larger and blue + existing_site.set("type", "sphere") # Ensure type is set + existing_site.set("size", "0.005") + existing_site.set("rgba", ik_site_color) + tip_sites_created.append(f"{finger} (updated)") + else: + # Create new tip site + site = ET.SubElement(body_elem, "site") + site.set("name", site_name) + site.set("type", "sphere") # Explicit type for visibility + site.set("pos", " ".join(f"{value:.5f}" for value in offset)) + site.set("size", "0.005") + site.set("rgba", ik_site_color) + tip_sites_created.append(f"{finger} (created)") + if tip_sites_created: + print(f"Tip sites: {', '.join(tip_sites_created)}") + + # Add visual markers for IK target bodies (MCP, PIP, IP) that might not have visual geoms + allowed_types = set(target_joint_types) + markers_added = [] + for finger_name, bodies in FINGER_TARGET_BODIES.items(): + for joint_type, (frame_type, frame_name) in bodies.items(): + if joint_type not in allowed_types: + continue + + if frame_type == "body": + # Add a visual site to the body for visibility + body_elem = root.find(f".//body[@name='{frame_name}']") + if body_elem is not None: + # Check if marker site already exists + marker_site_name = f"ik_marker_{frame_name}" + existing_marker = body_elem.find(f"./site[@name='{marker_site_name}']") + if existing_marker is not None: + # Update existing marker + existing_marker.set("size", "0.004") # Same size as tip sites + existing_marker.set("rgba", ik_site_color) + markers_added.append(f"{finger_name}_{joint_type} (updated)") + else: + # Create new marker site + site = ET.SubElement(body_elem, "site") + site.set("name", marker_site_name) + site.set("pos", "0 0 0") # At body origin + site.set("size", "0.004") # Same size as tip sites for consistency + site.set("rgba", ik_site_color) # Blue for all IK targets + markers_added.append(f"{finger_name}_{joint_type} (created)") + else: + print(f"Warning: Body '{frame_name}' not found in MJCF for {finger_name}_{joint_type}") + elif frame_type == "site": + # This is a tip site - find and update it (should already be handled above, but double-check) + found = False + for body in root.findall(".//body"): + site_elem = body.find(f"./site[@name='{frame_name}']") + if site_elem is not None: + site_elem.set("size", "0.005") + site_elem.set("rgba", ik_site_color) + found = True + markers_added.append(f"{finger_name}_{joint_type} (site updated)") + break + if not found: + print(f"Warning: Site '{frame_name}' not found in MJCF for {finger_name}_{joint_type}") + + if markers_added: + print(f"Added/updated {len(markers_added)} IK target markers: {', '.join(markers_added)}") return ET.tostring(root, encoding="unicode") @@ -101,11 +169,8 @@ def __init__( hold_last: bool = False, width: int = 1280, height: int = 720, - joint_smoothing: float = 1.0, + joint_smoothing: float = 0.7, target_joint_types: tuple[str, ...] = ("tip", "ip"), - auto_scale: bool = False, - auto_scale_once: bool = False, - auto_scale_update_rate: float = 0.1, ) -> None: """Initialize the HandPose ROS2 publisher. @@ -120,11 +185,8 @@ def __init__( hold_last: If True, publish last valid joint values when hand is not detected width: Camera frame width height: Camera frame height - joint_smoothing: Smoothing factor for joint positions (0.0-1.0). Default: 1.0 (no smoothing) + joint_smoothing: Smoothing factor for joint positions (0.0-1.0). Default: 0.7 target_joint_types: Tuple of joint types to target for IK (tip, ip, pip, mcp). Default: ("tip", "ip") - auto_scale: Enable automatic scale factor computation based on fingertip distances - auto_scale_once: Compute scale factor once on first hand detection (default: update every frame) - auto_scale_update_rate: Exponential smoothing rate for auto-scaling (0.0-1.0). Default: 0.1 """ super().__init__("handpose_hamer_publisher") @@ -154,27 +216,34 @@ def __init__( raise FileNotFoundError(f"Model file not found: {model_file}") self.get_logger().info(f"Loading MuJoCo model from {model_file}") - # Inject tip sites into MJCF (required for tip tracking, matches live_demo_ik.py) - xml_string = inject_target_bodies(model_file) + # Inject tip sites into MJCF (required for tip tracking) + xml_string = inject_target_bodies(model_file, target_joint_types) self.model = mujoco.MjModel.from_xml_string(xml_string) + # Verify tip sites exist in the loaded model + if "tip" in target_joint_types: + self.get_logger().info("Verifying tip sites in loaded model:") + data_temp = mujoco.MjData(self.model) + for finger in ["thumb", "index", "middle", "ring", "pinky"]: + site_name = f"right_{finger}_tip_site" + site_id = mujoco.mj_name2id(self.model, mujoco.mjtObj.mjOBJ_SITE, site_name) + if site_id >= 0: + site_pos = data_temp.site(site_id).xpos.copy() + self.get_logger().info( + f" ✓ {site_name}: found at position [{site_pos[0]:.4f}, {site_pos[1]:.4f}, {site_pos[2]:.4f}]" + ) + else: + self.get_logger().warn(f" ✗ {site_name}: NOT FOUND in model!") + # IK retargeting - # Use the same wrist_offset_palm as live_demo_ik.py to match USD model coordinate frame - # The default MJCF offset [0.002, -0.00144, -0.03872] may not match the USD model ik_cfg = ORCAHandIKConfig( scale_factor=scale, target_joint_types=target_joint_types, - wrist_offset_palm=np.array([0.000, 0.0, -0.05]), # Match live_demo_ik.py - auto_scale=auto_scale, - auto_scale_update_rate=auto_scale_update_rate if auto_scale else 0.0, - auto_scale_use_neutral_pose=True, + lm_damping=0.5, + ik_iterations=15, ) self.ik = ORCAHandIKRetargeting(self.model, config=ik_cfg) - # Track if we've computed the scale factor once (for auto-scale-once mode) - self.scale_computed_once = False - self.auto_scale_once = auto_scale_once - # Initialize joint position smoothing state and MuJoCo data for forward kinematics self.data = mujoco.MjData(self.model) self.smoothed_qpos = self.data.qpos.copy() @@ -224,16 +293,6 @@ def tick(self) -> None: joint_vals = None if hand is not None: try: - # Auto-scale (once mode) - if self.auto_scale_once and not self.scale_computed_once and self.ik.config.auto_scale: - # Temporarily disable continuous auto-scaling - self.ik.config.auto_scale = False - # Compute scale factor once - computed_scale = self.ik.compute_auto_scale_factor(hand, use_neutral_robot_pose=True) - self.ik.config.scale_factor = computed_scale - self.scale_computed_once = True - self.get_logger().info(f"[Auto-Scale] Computed scale factor: {computed_scale:.3f}") - # Update IK solver configuration with current robot state (required for proper IK solving) # This matches the approach in live_demo_ik.py self.data.qpos[:] = self.smoothed_qpos @@ -357,31 +416,15 @@ def main() -> None: parser.add_argument( "--joint-smoothing", type=float, - default=1.0, + default=0.7, help="Smoothing factor for joint positions (0.0-1.0). \ - Lower values = more smoothing, higher = less smoothing. Default: 1.0 (no smoothing)", + Lower values = more smoothing, higher = less smoothing. Default: 0.7", ) parser.add_argument( "--targets", type=str, - default="tip, ip", - help="Comma-separated joint targets (tip,ip,pip,mcp). Default: tip, ip", - ) - parser.add_argument( - "--auto-scale", - action="store_true", - help="Enable automatic scale factor computation based on fingertip distances", - ) - parser.add_argument( - "--auto-scale-once", - action="store_true", - help="Compute scale factor once on first hand detection (default: update every frame)", - ) - parser.add_argument( - "--auto-scale-update-rate", - type=float, - default=0.1, - help="Exponential smoothing rate for auto-scaling (0.0-1.0). Lower = smoother. Default: 0.1", + default="tip", + help="Comma-separated joint targets (tip,ip,pip,mcp). Default: tip", ) args = parser.parse_args() @@ -400,14 +443,6 @@ def main() -> None: if not (0.0 < args.joint_smoothing <= 1.0): parser.error("--joint-smoothing must be between 0.0 and 1.0 (exclusive of 0.0)") - # Validate auto-scale update rate - if args.auto_scale and not (0.0 <= args.auto_scale_update_rate <= 1.0): - parser.error("--auto-scale-update-rate must be between 0.0 and 1.0") - - # Warn if auto-scale-once is used without auto-scale - if args.auto_scale_once and not args.auto_scale: - parser.error("--auto-scale-once requires --auto-scale to be enabled") - # Initialize ROS2 rclpy.init() @@ -425,9 +460,6 @@ def main() -> None: height=args.height, joint_smoothing=args.joint_smoothing, target_joint_types=target_joints, - auto_scale=args.auto_scale, - auto_scale_once=args.auto_scale_once, - auto_scale_update_rate=args.auto_scale_update_rate, ) rclpy.spin(node) node.destroy_node() diff --git a/examples/live_demo_ik.py b/examples/live_demo_ik.py index 579b337..be9a7a9 100644 --- a/examples/live_demo_ik.py +++ b/examples/live_demo_ik.py @@ -53,8 +53,16 @@ def dual_window_process(frame_queue: mp.Queue, running_flag: Flag, window_name: cv2.destroyAllWindows() -def inject_target_bodies(mjcf_path: Path) -> str: - """Injects mocap bodies into the MJCF XML string for visualization.""" +def inject_target_bodies(mjcf_path: Path, target_joint_types: tuple[str, ...]) -> str: + """Inject mocap bodies for tracker sites and ensure IK target sites exist. + + Args: + mjcf_path: Path to MJCF model file + target_joint_types: Tuple of joint types to visualize (tip, ip, pip, mcp) + + Returns: + Modified MJCF XML string + """ tree = ET.parse(mjcf_path) root = tree.getroot() worldbody = root.find("worldbody") @@ -63,60 +71,121 @@ def inject_target_bodies(mjcf_path: Path) -> str: raise ValueError("Could not find worldbody in MJCF") # Convert relative paths to absolute paths - # MuJoCo can't resolve relative paths when loading from string model_dir = mjcf_path.parent for asset in root.findall(".//asset"): for mesh in asset.findall("mesh"): file_attr = mesh.get("file") if file_attr and not Path(file_attr).is_absolute(): - # Convert relative path to absolute abs_path = (model_dir / file_attr).resolve() mesh.set("file", str(abs_path)) - # Add mocap bodies for all keypoints (MCP, PIP, TIP, and IP for thumb) - # Use different colors for different joint types - joint_colors = { - "mcp": "0 1 0 0.7", # Green for MCP - "pip": "0 0 1 0.7", # Blue for PIP - "ip": "1 1 0 0.7", # Yellow for IP (thumb only) - "tip": "1 0 0 0.7", # Red for TIP + # All IK target sites use blue for consistency + ik_site_color = "0 0 1 0.9" # Blue for all IK targets + + # Tip site offsets: [x, y, z] where z is upward direction + tip_site_specs = { + "thumb": ("right_thumb_dp", np.array([0.0, 0.0, 0.025])), + "index": ("right_index_ip", np.array([0.0, 0.0, 0.035])), + "middle": ("right_middle_ip", np.array([0.0, 0.0, 0.037])), + "ring": ("right_ring_ip", np.array([0.0, 0.0, 0.036])), + "pinky": ("right_pinky_ip", np.array([0.0, 0.0, 0.032])), } - for finger, joints in FINGER_TARGET_BODIES.items(): - for joint_type in joints.keys(): - body = ET.SubElement(worldbody, "body") - body.set("name", f"target_{finger}_{joint_type}") - body.set("mocap", "true") - body.set("pos", "0 0 0") + if "tip" in target_joint_types: + tip_sites_created = [] + for finger, (parent_body, offset) in tip_site_specs.items(): + body_elem = root.find(f".//body[@name='{parent_body}']") + if body_elem is None: + print(f"Warning: Parent body '{parent_body}' not found for {finger} tip site") + continue + site_name = f"right_{finger}_tip_site" + existing_site = body_elem.find(f"./site[@name='{site_name}']") + if existing_site is not None: + # Update existing site - make it larger and blue + existing_site.set("type", "sphere") # Ensure type is set + existing_site.set("size", "0.005") + existing_site.set("rgba", ik_site_color) + tip_sites_created.append(f"{finger} (updated)") + else: + # Create new tip site + site = ET.SubElement(body_elem, "site") + site.set("name", site_name) + site.set("type", "sphere") # Explicit type for visibility + site.set("pos", " ".join(f"{value:.5f}" for value in offset)) + site.set("size", "0.005") + site.set("rgba", ik_site_color) + tip_sites_created.append(f"{finger} (created)") + print(f"Tip sites: {', '.join(tip_sites_created)}") + + # Add visual markers for IK target bodies (MCP, PIP, IP) that might not have visual geoms + allowed_types = set(target_joint_types) + markers_added = [] + for finger_name, bodies in FINGER_TARGET_BODIES.items(): + for joint_type, (frame_type, frame_name) in bodies.items(): + if joint_type not in allowed_types: + continue - # Add a visual sphere (smaller than before) - geom = ET.SubElement(body, "geom") - geom.set("type", "sphere") - geom.set("size", "0.003") # 3mm radius (smaller) - geom.set("rgba", joint_colors.get(joint_type, "0.5 0.5 0.5 0.7")) - geom.set("contype", "0") # No collision - geom.set("conaffinity", "0") + if frame_type == "body": + # Add a visual site to the body for visibility + body_elem = root.find(f".//body[@name='{frame_name}']") + if body_elem is not None: + # Check if marker site already exists + marker_site_name = f"ik_marker_{frame_name}" + existing_marker = body_elem.find(f"./site[@name='{marker_site_name}']") + if existing_marker is not None: + # Update existing marker + existing_marker.set("size", "0.004") # Same size as tip sites + existing_marker.set("rgba", ik_site_color) + markers_added.append(f"{finger_name}_{joint_type} (updated)") + else: + # Create new marker site + site = ET.SubElement(body_elem, "site") + site.set("name", marker_site_name) + site.set("pos", "0 0 0") # At body origin + site.set("size", "0.004") # Same size as tip sites for consistency + site.set("rgba", ik_site_color) # Blue for all IK targets + markers_added.append(f"{finger_name}_{joint_type} (created)") + else: + print(f"Warning: Body '{frame_name}' not found in MJCF for {finger_name}_{joint_type}") + elif frame_type == "site": + # This is a tip site - find and update it (should already be handled above, but double-check) + found = False + for body in root.findall(".//body"): + site_elem = body.find(f"./site[@name='{frame_name}']") + if site_elem is not None: + site_elem.set("size", "0.005") + site_elem.set("rgba", ik_site_color) + found = True + markers_added.append(f"{finger_name}_{joint_type} (site updated)") + break + if not found: + print(f"Warning: Site '{frame_name}' not found in MJCF for {finger_name}_{joint_type}") + + print(f"Added/updated {len(markers_added)} IK target markers: {', '.join(markers_added)}") + + # Add mocap bodies for tracker sites (MediaPipe landmarks) + # All tracker sites use red for consistency - these represent the actual tracked hand + tracker_color = "1 0 0 0.9" # Red for all tracker sites (actual hand) + + allowed_types = set(target_joint_types) + for finger_name, mp_mapping in MP_LANDMARK_INDICES.items(): + for joint_type, mp_idx in mp_mapping.items(): + if joint_type not in allowed_types: + continue - tip_site_specs = { - "thumb": ("right_thumb_dp", np.array([0.0, 0.0, 0.018])), - "index": ("right_index_ip", np.array([0.0, 0.0, 0.020])), - "middle": ("right_middle_ip", np.array([0.0, 0.0, 0.022])), - "ring": ("right_ring_ip", np.array([0.0, 0.0, 0.021])), - "pinky": ("right_pinky_ip", np.array([0.0, 0.0, 0.018])), - } + # Create mocap body for tracker site + mocap_body = ET.SubElement(worldbody, "body") + mocap_body.set("name", f"tracker_{finger_name}_{joint_type}") + mocap_body.set("mocap", "true") + mocap_body.set("pos", "0 0 0") - for finger, (parent_body, offset) in tip_site_specs.items(): - body_elem = root.find(f".//body[@name='{parent_body}']") - if body_elem is None: - continue - site_name = f"right_{finger}_tip_site" - if body_elem.find(f"./site[@name='{site_name}']") is not None: - continue - site = ET.SubElement(body_elem, "site") - site.set("name", site_name) - site.set("pos", " ".join(f"{value:.5f}" for value in offset)) - site.set("size", "0.0025") - site.set("rgba", "1 0.6 0.2 0.8") + # Add visual sphere for tracker site (larger, red) + geom = ET.SubElement(mocap_body, "geom") + geom.set("type", "sphere") + geom.set("size", "0.005") # 5mm radius (larger than IK targets for visibility) + geom.set("rgba", tracker_color) # Red for all tracker sites + geom.set("contype", "0") + geom.set("conaffinity", "0") return ET.tostring(root, encoding="unicode") @@ -253,7 +322,6 @@ async def main_async( label_lookup: dict[int, list[str]], camera_matrix: np.ndarray | None = None, joint_smoothing: float = 1.0, - auto_scale_once: bool = False, ) -> None: """Async main loop with keyboard handling.""" running = True @@ -279,9 +347,6 @@ async def key_handler(key: str) -> None: # Initialize joint position smoothing state smoothed_qpos = data.qpos.copy() - # Track if we've computed scale once (for auto-scale-once mode) - scale_computed_once = False - with mujoco.viewer.launch_passive(model, data) as viewer: while running and viewer.is_running() and cap.isOpened(): ret, frame = cap.read() @@ -298,16 +363,6 @@ async def key_handler(key: str) -> None: if hand_structures: structure = hand_structures[0] - # --- AUTO-SCALE (once mode) --- - if auto_scale_once and not scale_computed_once and ik_solver.config.auto_scale: - # Temporarily disable continuous auto-scaling - ik_solver.config.auto_scale = False - # Compute scale factor once - computed_scale = ik_solver.compute_auto_scale_factor(structure, use_neutral_robot_pose=True) - ik_solver.config.scale_factor = computed_scale - scale_computed_once = True - print(f"[Auto-Scale] Computed scale factor: {computed_scale:.3f}") - # --- IK SOLVE --- # 1. Update the configuration object with current robot state mujoco.mj_forward(model, data) @@ -410,7 +465,7 @@ def main() -> None: parser.add_argument( "--targets", type=str, - default="tip, ip", + default="tip", help="Comma-separated joint targets (tip,ip,pip,mcp). Default: tip, ip", ) parser.add_argument( @@ -429,25 +484,9 @@ def main() -> None: parser.add_argument( "--joint-smoothing", type=float, - default=1.0, + default=0.7, help="Smoothing factor for joint positions (0.0-1.0). \ - Lower values = more smoothing, higher = less smoothing. Default: 1.0 (no smoothing)", - ) - parser.add_argument( - "--auto-scale", - action="store_true", - help="Enable automatic scale factor computation based on fingertip distances", - ) - parser.add_argument( - "--auto-scale-once", - action="store_true", - help="Compute scale factor once on first hand detection (default: update every frame)", - ) - parser.add_argument( - "--auto-scale-update-rate", - type=float, - default=0.1, - help="Exponential smoothing rate for auto-scaling (0.0-1.0). Lower = smoother. Default: 0.1", + Lower values = more smoothing, higher = less smoothing. Default: 0.7", ) args = parser.parse_args() @@ -461,16 +500,15 @@ def main() -> None: parser.error(f"Unsupported target joint types: {', '.join(invalid)}") # 1. Setup paths and model - script_dir = Path(__file__).parent - model_path = script_dir.parent / "models" / args.model if not Path(args.model).is_absolute() else Path(args.model) - - if not model_path.exists(): - print(f"Error: Model not found at {model_path}") - return + model_file = Path(args.model) + if not model_file.is_absolute(): + model_file = Path(__file__).parent.parent / args.model + if not model_file.exists(): + raise FileNotFoundError(f"Model file not found: {model_file}") # 2. Inject visualization bodies (Mocap) - print("Injecting visualization targets...") - xml_string = inject_target_bodies(model_path) + print(f"Loading model from {model_file}") + xml_string = inject_target_bodies(model_file, target_joints) # 3. Load MuJoCo Model model = mujoco.MjModel.from_xml_string(xml_string) @@ -487,25 +525,38 @@ def main() -> None: ik_config = ORCAHandIKConfig( scale_factor=args.scale, - wrist_offset_palm=np.array([0.000, 0.0, -0.05]), target_joint_types=target_joints, - auto_scale=args.auto_scale, - auto_scale_update_rate=args.auto_scale_update_rate if args.auto_scale else 0.0, - auto_scale_use_neutral_pose=True, + lm_damping=0.5, + ik_iterations=15, ) ik_solver = ORCAHandIKRetargeting(model, config=ik_config) + # Verify tip sites exist in the loaded model + if "tip" in target_joints: + print("\nVerifying tip sites in loaded model:") + for finger in ["thumb", "index", "middle", "ring", "pinky"]: + site_name = f"right_{finger}_tip_site" + site_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_SITE, site_name) + if site_id >= 0: + site_pos = data.site(site_id).xpos.copy() + print(f" ✓ {site_name}: found at position [{site_pos[0]:.4f}, {site_pos[1]:.4f}, {site_pos[2]:.4f}]") + else: + print(f" ✗ {site_name}: NOT FOUND in model!") + allowed_joint_types = set(target_joints) # 5. Helper to map finger names and joint types to mocap body IDs + # Note: These use "tracker_" prefix to match the visualization bodies created by inject_target_bodies target_body_ids: dict[str, dict[str, int]] = {} for finger, joints in FINGER_TARGET_BODIES.items(): for joint_type in joints.keys(): if joint_type not in allowed_joint_types: continue finger_dict = target_body_ids.setdefault(finger, {}) - bid = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_BODY, f"target_{finger}_{joint_type}") + bid = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_BODY, f"tracker_{finger}_{joint_type}") + if bid < 0: + print(f"Warning: Mocap body 'tracker_{finger}_{joint_type}' not found") finger_dict[joint_type] = bid # 6. Main Loop @@ -538,14 +589,6 @@ def main() -> None: if not (0.0 < args.joint_smoothing <= 1.0): parser.error("--joint-smoothing must be between 0.0 and 1.0 (exclusive of 0.0)") - # Validate auto-scale update rate - if args.auto_scale and not (0.0 <= args.auto_scale_update_rate <= 1.0): - parser.error("--auto-scale-update-rate must be between 0.0 and 1.0") - - # Warn if auto-scale-once is used without auto-scale - if args.auto_scale_once and not args.auto_scale: - parser.error("--auto-scale-once requires --auto-scale to be enabled") - # Run async main loop asyncio.run( main_async( @@ -560,7 +603,6 @@ def main() -> None: mp_label_lookup, camera_matrix, joint_smoothing=args.joint_smoothing, - auto_scale_once=args.auto_scale_once, ) ) diff --git a/handpose/ik_retargeting.py b/handpose/ik_retargeting.py index 4aed301..72453a2 100644 --- a/handpose/ik_retargeting.py +++ b/handpose/ik_retargeting.py @@ -5,6 +5,7 @@ import mink import mujoco import numpy as np +from mink.limits import CollisionAvoidanceLimit, ConfigurationLimit, Limit from handpose.tracker.base import HandStructure @@ -44,22 +45,22 @@ }, "index": { "mcp": ("body", "right_index_mp"), - "pip": ("body", "right_index_pp"), + "pip": ("body", "right_index_ip"), # _ip body contains the PIP joint "tip": ("site", "right_index_tip_site"), }, "middle": { "mcp": ("body", "right_middle_mp"), - "pip": ("body", "right_middle_pp"), + "pip": ("body", "right_middle_ip"), # _ip body contains the PIP joint "tip": ("site", "right_middle_tip_site"), }, "ring": { "mcp": ("body", "right_ring_mp"), - "pip": ("body", "right_ring_pp"), + "pip": ("body", "right_ring_ip"), # _ip body contains the PIP joint "tip": ("site", "right_ring_tip_site"), }, "pinky": { "mcp": ("body", "right_pinky_mp"), - "pip": ("body", "right_pinky_pp"), + "pip": ("body", "right_pinky_ip"), # _ip body contains the PIP joint "tip": ("site", "right_pinky_tip_site"), }, } @@ -106,16 +107,23 @@ class ORCAHandIKConfig: # Default from MJCF: right_wrist joint pos="0.002 -0.00144 -0.03872" wrist_offset_palm: np.ndarray | None = None - # IK solver parameters + # IK solver parameters (matching manus Mink configs) dt: float = 0.05 # Timestep for IK integration (seconds) - damping: float = 1e-2 # Levenberg-Marquardt damping - solver: str = "daqp" # QP solver + damping: float = 1e-5 # Levenberg-Marquardt damping (matching manus: 1e-5) + solver: str = "quadprog" # QP solver (matching manus: "quadprog") ik_iterations: int = 5 # Number of IK passes to perform before returning (for better convergence) - # Task costs - position_cost: float = 3.0 # Cost for position tracking + # Task costs (matching manus Mink configs) + position_cost: float = 1.0 # Cost for position tracking (matching manus: 1.0) orientation_cost: float = 0.0 # Cost for orientation tracking - posture_cost: float = 1e-4 # Cost for posture task (keeps hand near neutral) + lm_damping: float = 1.0 # Levenberg-Marquardt damping for FrameTask (matching manus: 1.0) + + # Collision avoidance (optional) + use_collision_avoidance: bool = False # Enable CollisionAvoidanceLimit + collision_geom_pairs: list[tuple[list[str], list[str]]] | None = None # Geom pairs for collision avoidance + collision_gain: float = 0.85 # Collision avoidance gain (default from Mink) + collision_min_distance: float = 0.005 # Minimum distance between geoms (meters) + collision_detection_distance: float = 0.01 # Distance at which collision avoidance activates (meters) # Coordinate frame transformation # MediaPipe to Robot coordinate mapping: [MP_X, MP_Y, MP_Z] -> [Robot_X, Robot_Y, Robot_Z] @@ -123,13 +131,6 @@ class ORCAHandIKConfig: coord_transform: np.ndarray | None = None target_joint_types: tuple[str, ...] = ("tip",) - # Auto-scaling options - auto_scale: bool = False # If True, automatically computes scale_factor from fingertip distances - auto_scale_update_rate: float = ( - 0.1 # Exponential smoothing factor for auto-scaling (0.0 = no smoothing, 1.0 = instant) - ) - auto_scale_use_neutral_pose: bool = True # Use neutral robot pose for scale computation - def __post_init__(self) -> None: """Initialize default values if None.""" if self.wrist_offset_palm is None: @@ -168,10 +169,6 @@ def __init__( self.model = model self.configuration = mink.Configuration(model) - # Initialize config - if config is None: - config = ORCAHandIKConfig.default_config() - self.config = config # Identify the qpos indices for the finger joints we want to control @@ -201,7 +198,7 @@ def __init__( frame_type=frame_type, position_cost=self.config.position_cost, orientation_cost=self.config.orientation_cost, - lm_damping=self.config.damping, + lm_damping=self.config.lm_damping, ) finger_tasks[joint_type] = task if finger_tasks: @@ -213,127 +210,17 @@ def __init__( "(tip tracking requires fingertip sites to be injected)." ) - # We also need a posture task to encourage the hand to stay close to a "neutral" pose - # when not reaching for extremes. This prevents weird internal configurations. - self.posture_task = mink.PostureTask( - model, - cost=self.config.posture_cost, - lm_damping=self.config.damping, - ) - - self.target_pose = np.zeros(model.nq) - self.posture_task.set_target(self.target_pose) - - def compute_auto_scale_factor(self, hand_structure: HandStructure, use_neutral_robot_pose: bool = True) -> float: - """Compute automatic scale factor based on fingertip distances. - - Compares the distance from wrist to fingertips in the human hand - with the distance from wrist to fingertips in the ORCA hand model. - - Args: - hand_structure: The tracked human hand structure - use_neutral_robot_pose: If True, uses neutral robot pose (q=0). - If False, uses current robot configuration. - - Returns: - Scale factor (robot_fingertip_distance / human_fingertip_distance) - """ - # Get human hand fingertip positions relative to wrist - wrist_pos = hand_structure.wrist_position - human_fingertips = { - "thumb": hand_structure.thumb.tip, - "index": hand_structure.index.tip, - "middle": hand_structure.middle.tip, - "ring": hand_structure.ring.tip, - "pinky": hand_structure.pinky.tip, - } - - # Compute average distance from wrist to fingertips in human hand - human_distances = [] - for finger, tip_pos in human_fingertips.items(): - dist = np.linalg.norm(tip_pos - wrist_pos) - if dist > 0.01: # Filter out invalid/too-small distances - human_distances.append(dist) - - if len(human_distances) == 0: - return self.config.scale_factor # Fallback to current scale - - avg_human_distance = np.mean(human_distances) - - # Get robot hand fingertip positions - if use_neutral_robot_pose: - # Save current configuration - saved_q = self.configuration.q.copy() - # Set to neutral pose - self.configuration.q[:] = 0.0 - self.configuration.update() - - # Get wrist position in robot model - t_palm = self.configuration.get_transform_frame_to_world("right_palm", "body") - p_palm = t_palm.translation() - r_palm = t_palm.rotation().as_matrix() - wrist_offset_world = r_palm @ self.config.wrist_offset_palm - p_wrist_robot = p_palm + wrist_offset_world - - # Get fingertip positions from robot model using mink Configuration - robot_fingertips = {} - for finger_name in ["thumb", "index", "middle", "ring", "pinky"]: - if finger_name not in self.tasks: - continue - finger_tasks = self.tasks[finger_name] - if "tip" not in finger_tasks: - continue - - # Get the frame name and type - frame_type, frame_name = FINGER_TARGET_BODIES[finger_name]["tip"] - - # Get transform from mink Configuration - try: - if frame_type == "body": - t_tip = self.configuration.get_transform_frame_to_world(frame_name, "body") - else: # site - t_tip = self.configuration.get_transform_frame_to_world(frame_name, "site") - tip_pos = t_tip.translation() - robot_fingertips[finger_name] = tip_pos - except Exception: - # Fallback: try to get from MuJoCo model directly - obj_type = mujoco.mjtObj.mjOBJ_BODY if frame_type == "body" else mujoco.mjtObj.mjOBJ_SITE - frame_id = mujoco.mj_name2id(self.model, obj_type, frame_name) - if frame_id >= 0: - # Need to forward kinematics - use data - data = mujoco.MjData(self.model) - data.qpos[:] = self.configuration.q - mujoco.mj_forward(self.model, data) - if obj_type == mujoco.mjtObj.mjOBJ_SITE: - tip_pos = data.site(frame_id).xpos.copy() - else: - tip_pos = data.body(frame_id).xpos.copy() - robot_fingertips[finger_name] = tip_pos - - # Restore configuration if we changed it - if use_neutral_robot_pose: - self.configuration.q[:] = saved_q - self.configuration.update() - - # Compute average distance from wrist to fingertips in robot hand - robot_distances = [] - for finger, tip_pos in robot_fingertips.items(): - dist = np.linalg.norm(tip_pos - p_wrist_robot) - if dist > 0.01: # Filter out invalid/too-small distances - robot_distances.append(dist) - - if len(robot_distances) == 0: - return self.config.scale_factor # Fallback to current scale - - avg_robot_distance = np.mean(robot_distances) - - # Compute scale factor - if avg_human_distance > 1e-6: - scale_factor = avg_robot_distance / avg_human_distance - else: - return self.config.scale_factor # Fallback - - return scale_factor + # Initialize collision avoidance limits if enabled + self.limits: list[Limit] = [ConfigurationLimit(model=model)] + if self.config.use_collision_avoidance and self.config.collision_geom_pairs: + collision_limit = CollisionAvoidanceLimit( + model=model, + geom_pairs=self.config.collision_geom_pairs, + gain=self.config.collision_gain, + minimum_distance_from_collisions=self.config.collision_min_distance, + collision_detection_distance=self.config.collision_detection_distance, + ) + self.limits.append(collision_limit) def _hand_structure_to_landmarks(self, hand_structure: HandStructure) -> np.ndarray: """Convert HandStructure to 21x3 landmarks array (MediaPipe format). @@ -455,20 +342,6 @@ def solve(self, hand_input: HandStructure | np.ndarray) -> np.ndarray: Returns: The full qpos array for the robot. """ - # Auto-scale if enabled - if self.config.auto_scale and isinstance(hand_input, HandStructure): - new_scale = self.compute_auto_scale_factor( - hand_input, use_neutral_robot_pose=self.config.auto_scale_use_neutral_pose - ) - # Apply exponential smoothing to avoid jitter - if self.config.auto_scale_update_rate > 0.0: - self.config.scale_factor = ( - self.config.auto_scale_update_rate * new_scale - + (1.0 - self.config.auto_scale_update_rate) * self.config.scale_factor - ) - else: - self.config.scale_factor = new_scale - targets = self.compute_target_positions(hand_input) # Use configurable parameters @@ -478,8 +351,8 @@ def solve(self, hand_input: HandStructure | np.ndarray) -> np.ndarray: # Perform multiple IK iterations for better convergence for iteration in range(self.config.ik_iterations): - # Build active tasks list (posture task + finger tasks) - active_tasks: list[mink.Task] = [self.posture_task] + # Build active tasks list (finger tasks only) + active_tasks: list[mink.Task] = [] # Get palm transform and compute wrist position (consistent with compute_target_positions) # Recompute each iteration in case configuration changed @@ -520,8 +393,8 @@ def solve(self, hand_input: HandStructure | np.ndarray) -> np.ndarray: task.set_target(target_se3) active_tasks.append(task) - # Solve IK - vel = mink.solve_ik(self.configuration, active_tasks, dt, solver, damping) + # Solve IK with limits (collision avoidance if enabled) + vel = mink.solve_ik(self.configuration, active_tasks, dt, solver, damping, limits=self.limits) # Integrate velocity to update configuration self.configuration.integrate_inplace(vel, dt) diff --git a/models/orca_hand_fixed.mjcf b/models/orca_hand_fixed.mjcf index 89209ae..2c48d66 100644 --- a/models/orca_hand_fixed.mjcf +++ b/models/orca_hand_fixed.mjcf @@ -3,6 +3,11 @@ + + + + + @@ -26,8 +31,8 @@ - - + + diff --git a/pyproject.toml b/pyproject.toml index afe77f0..59a84fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,6 +94,7 @@ module = [ 'sensor_msgs.*', 'h5py', "colorlogging", + 'mink.*', ] ignore_missing_imports = true diff --git a/scripts/record_and_process_video.py b/scripts/record_and_process_video.py new file mode 100644 index 0000000..fd54852 --- /dev/null +++ b/scripts/record_and_process_video.py @@ -0,0 +1,862 @@ +"""Record video, process with HAMER, visualize in MuJoCo, and concatenate videos. + +This script: +1. Listens for key press ('r' or 's') to start recording +2. Records video from camera +3. Saves the recorded video +4. Processes video with HAMER frame by frame +5. Opens MuJoCo and visualizes the processed hand tracking +6. Records the MuJoCo visualization +7. Concatenates both videos together +""" + +import argparse +import sys +import time +import xml.etree.ElementTree as ET +from pathlib import Path +from typing import Optional + +import cv2 +import mujoco +import mujoco.viewer +import numpy as np + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from handpose.ik_retargeting import ( + FINGER_TARGET_BODIES, + MP_LANDMARK_INDICES, + ORCAHandIKConfig, + ORCAHandIKRetargeting, +) +from handpose.tracker.base import HandStructure +from handpose.tracker.hamer import HaMeRTracker + + +def inject_visualization_bodies(mjcf_path: Path, target_joint_types: tuple[str, ...]) -> str: + """Inject mocap bodies for tracker sites and ensure IK target sites exist.""" + tree = ET.parse(mjcf_path) + root = tree.getroot() + worldbody = root.find("worldbody") + + if worldbody is None: + raise ValueError("Could not find worldbody in MJCF") + + # Convert relative paths to absolute paths + model_dir = mjcf_path.parent + for asset in root.findall(".//asset"): + for mesh in asset.findall("mesh"): + file_attr = mesh.get("file") + if file_attr and not Path(file_attr).is_absolute(): + abs_path = (model_dir / file_attr).resolve() + mesh.set("file", str(abs_path)) + + # IK target sites use blue + ik_site_color = "0 0 1 0.9" + + # Tip site offsets + tip_site_specs = { + "thumb": ("right_thumb_dp", np.array([0.0, 0.0, 0.025])), + "index": ("right_index_ip", np.array([0.0, 0.0, 0.035])), + "middle": ("right_middle_ip", np.array([0.0, 0.0, 0.037])), + "ring": ("right_ring_ip", np.array([0.0, 0.0, 0.036])), + "pinky": ("right_pinky_ip", np.array([0.0, 0.0, 0.032])), + } + + if "tip" in target_joint_types: + for finger, (parent_body, offset) in tip_site_specs.items(): + body_elem = root.find(f".//body[@name='{parent_body}']") + if body_elem is None: + continue + site_name = f"right_{finger}_tip_site" + existing_site = body_elem.find(f"./site[@name='{site_name}']") + if existing_site is not None: + existing_site.set("type", "sphere") + existing_site.set("size", "0.005") + existing_site.set("rgba", ik_site_color) + else: + site = ET.SubElement(body_elem, "site") + site.set("name", site_name) + site.set("type", "sphere") + site.set("pos", " ".join(f"{value:.5f}" for value in offset)) + site.set("size", "0.005") + site.set("rgba", ik_site_color) + + # Add visual markers for IK target bodies + allowed_types = set(target_joint_types) + for finger_name, bodies in FINGER_TARGET_BODIES.items(): + for joint_type, (frame_type, frame_name) in bodies.items(): + if joint_type not in allowed_types: + continue + + if frame_type == "body": + body_elem = root.find(f".//body[@name='{frame_name}']") + if body_elem is not None: + marker_site_name = f"ik_marker_{frame_name}" + existing_marker = body_elem.find(f"./site[@name='{marker_site_name}']") + if existing_marker is not None: + existing_marker.set("size", "0.004") + existing_marker.set("rgba", ik_site_color) + else: + site = ET.SubElement(body_elem, "site") + site.set("name", marker_site_name) + site.set("pos", "0 0 0") + site.set("size", "0.004") + site.set("rgba", ik_site_color) + + # Add mocap bodies for tracker sites (red) + tracker_color = "1 0 0 0.9" + # Add wrist tracker body + wrist_body = ET.SubElement(worldbody, "body") + wrist_body.set("name", "tracker_wrist") + wrist_body.set("mocap", "true") + wrist_body.set("pos", "0 0 0") + wrist_geom = ET.SubElement(wrist_body, "geom") + wrist_geom.set("type", "sphere") + wrist_geom.set("size", "0.005") + wrist_geom.set("rgba", tracker_color) + wrist_geom.set("contype", "0") + wrist_geom.set("conaffinity", "0") + + for finger_name, mp_mapping in MP_LANDMARK_INDICES.items(): + for joint_type, mp_idx in mp_mapping.items(): + if joint_type not in allowed_types: + continue + + mocap_body = ET.SubElement(worldbody, "body") + mocap_body.set("name", f"tracker_{finger_name}_{joint_type}") + mocap_body.set("mocap", "true") + mocap_body.set("pos", "0 0 0") + + geom = ET.SubElement(mocap_body, "geom") + geom.set("type", "sphere") + geom.set("size", "0.005") + geom.set("rgba", tracker_color) + geom.set("contype", "0") + geom.set("conaffinity", "0") + + # Add skeleton lines (cylinders) between joints for skeleton overlay + skeleton_color = "0 1 0 0.8" # Green skeleton lines + skeleton_connections = [ + # Thumb chain + ("tracker_thumb_mcp", "tracker_thumb_ip"), + ("tracker_thumb_ip", "tracker_thumb_tip"), + # Index chain + ("tracker_index_mcp", "tracker_index_pip"), + ("tracker_index_pip", "tracker_index_tip"), + # Middle chain + ("tracker_middle_mcp", "tracker_middle_pip"), + ("tracker_middle_pip", "tracker_middle_tip"), + # Ring chain + ("tracker_ring_mcp", "tracker_ring_pip"), + ("tracker_ring_pip", "tracker_ring_tip"), + # Pinky chain + ("tracker_pinky_mcp", "tracker_pinky_pip"), + ("tracker_pinky_pip", "tracker_pinky_tip"), + ] + + # Add wrist connections (from wrist to each finger MCP) + wrist_connections = [ + ("tracker_index_mcp", "tracker_wrist"), + ("tracker_middle_mcp", "tracker_wrist"), + ("tracker_ring_mcp", "tracker_wrist"), + ("tracker_pinky_mcp", "tracker_wrist"), + ("tracker_thumb_mcp", "tracker_wrist"), + ] + + # Create skeleton line geoms (will be positioned dynamically) + for start_joint, end_joint in skeleton_connections + wrist_connections: + # Create a mocap body for the skeleton line + line_body = ET.SubElement(worldbody, "body") + line_body.set("name", f"skeleton_line_{start_joint}_{end_joint}") + line_body.set("mocap", "true") + line_body.set("pos", "0 0 0") + + # Add cylinder geom for the line + line_geom = ET.SubElement(line_body, "geom") + line_geom.set("type", "cylinder") + line_geom.set("size", "0.001 0.02") # Thin cylinder (radius, half-height) - will be updated dynamically + line_geom.set("rgba", skeleton_color) + line_geom.set("contype", "0") + line_geom.set("conaffinity", "0") + + return ET.tostring(root, encoding="unicode") + + +def hand_structure_to_landmarks(structure: HandStructure) -> np.ndarray: + """Convert HandStructure to 21x3 landmarks array (MediaPipe format).""" + landmarks = np.zeros((21, 3)) + + landmarks[0] = structure.wrist_position + landmarks[1] = structure.thumb.mcp + landmarks[2] = structure.thumb.mcp + landmarks[3] = structure.thumb.ip if structure.thumb.ip is not None else structure.thumb.mcp + landmarks[4] = structure.thumb.tip + landmarks[5] = structure.index.mcp + landmarks[6] = structure.index.pip if structure.index.pip is not None else structure.index.mcp + landmarks[7] = structure.index.dip if structure.index.dip is not None else structure.index.tip + landmarks[8] = structure.index.tip + landmarks[9] = structure.middle.mcp + landmarks[10] = structure.middle.pip if structure.middle.pip is not None else structure.middle.mcp + landmarks[11] = structure.middle.dip if structure.middle.dip is not None else structure.middle.tip + landmarks[12] = structure.middle.tip + landmarks[13] = structure.ring.mcp + landmarks[14] = structure.ring.pip if structure.ring.pip is not None else structure.ring.mcp + landmarks[15] = structure.ring.dip if structure.ring.dip is not None else structure.ring.tip + landmarks[16] = structure.ring.tip + landmarks[17] = structure.pinky.mcp + landmarks[18] = structure.pinky.pip if structure.pinky.pip is not None else structure.pinky.mcp + landmarks[19] = structure.pinky.dip if structure.pinky.dip is not None else structure.pinky.tip + landmarks[20] = structure.pinky.tip + + return landmarks + + +def get_joint_position_world( + model: mujoco.MjModel, + data: mujoco.MjData, + joint_name: str, + hand_structure: HandStructure, + ik_solver: ORCAHandIKRetargeting, + target_joint_types: tuple[str, ...], +) -> np.ndarray | None: + """Get world position of a joint by name.""" + # Check if it's a mocap body we can query directly + if joint_name.startswith("tracker_"): + body_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_BODY, joint_name) + if body_id >= 0: + mocap_idx = model.body_mocapid[body_id] + if mocap_idx >= 0 and mocap_idx < model.nmocap: + return data.mocap_pos[mocap_idx].copy() + + # Fallback: calculate from hand structure + if joint_name in ("wrist", "tracker_wrist"): + t_palm = ik_solver.configuration.get_transform_frame_to_world("right_palm", "body") + p_palm = t_palm.translation() + r_palm = t_palm.rotation().as_matrix() + wrist_offset_world = r_palm @ ik_solver.config.wrist_offset_palm + return p_palm + wrist_offset_world + + return None + + +def update_skeleton_lines( + model: mujoco.MjModel, + data: mujoco.MjData, + hand_structure: HandStructure, + target_joint_types: tuple[str, ...], + ik_solver: ORCAHandIKRetargeting, +) -> None: + """Update skeleton line positions and orientations between joints.""" + skeleton_connections = [ + ("tracker_thumb_mcp", "tracker_thumb_ip"), + ("tracker_thumb_ip", "tracker_thumb_tip"), + ("tracker_index_mcp", "tracker_index_pip"), + ("tracker_index_pip", "tracker_index_tip"), + ("tracker_middle_mcp", "tracker_middle_pip"), + ("tracker_middle_pip", "tracker_middle_tip"), + ("tracker_ring_mcp", "tracker_ring_pip"), + ("tracker_ring_pip", "tracker_ring_tip"), + ("tracker_pinky_mcp", "tracker_pinky_pip"), + ("tracker_pinky_pip", "tracker_pinky_tip"), + ("tracker_index_mcp", "tracker_wrist"), + ("tracker_middle_mcp", "tracker_wrist"), + ("tracker_ring_mcp", "tracker_wrist"), + ("tracker_pinky_mcp", "tracker_wrist"), + ("tracker_thumb_mcp", "tracker_wrist"), + ] + + for start_joint, end_joint in skeleton_connections: + p1 = get_joint_position_world(model, data, start_joint, hand_structure, ik_solver, target_joint_types) + p2 = get_joint_position_world(model, data, end_joint, hand_structure, ik_solver, target_joint_types) + + if p1 is None or p2 is None: + continue + + # Calculate midpoint and direction + midpoint = (p1 + p2) / 2.0 + direction = p2 - p1 + length = np.linalg.norm(direction) + + if length < 1e-6: + continue + + direction = direction / length + + # Create rotation quaternion to align cylinder with direction + # Default cylinder axis is [0, 0, 1], we want it along direction + z_axis = np.array([0, 0, 1]) + if np.abs(np.dot(direction, z_axis)) > 0.99: + # Parallel to z-axis, use identity + quat = np.array([1, 0, 0, 0]) + else: + # Calculate rotation axis and angle + rot_axis = np.cross(z_axis, direction) + rot_axis = rot_axis / np.linalg.norm(rot_axis) + cos_angle = np.dot(z_axis, direction) + angle = np.arccos(np.clip(cos_angle, -1, 1)) + quat = np.array([ + np.cos(angle / 2), + rot_axis[0] * np.sin(angle / 2), + rot_axis[1] * np.sin(angle / 2), + rot_axis[2] * np.sin(angle / 2), + ]) + + # Update mocap body position and orientation + line_body_name = f"skeleton_line_{start_joint}_{end_joint}" + body_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_BODY, line_body_name) + if body_id >= 0: + mocap_idx = model.body_mocapid[body_id] + if mocap_idx >= 0 and mocap_idx < model.nmocap: + data.mocap_pos[mocap_idx] = midpoint + data.mocap_quat[mocap_idx] = quat + + +def update_tracker_sites( + model: mujoco.MjModel, + data: mujoco.MjData, + hand_structure: HandStructure, + target_joint_types: tuple[str, ...], + ik_solver: ORCAHandIKRetargeting, +) -> None: + """Update mocap body positions for tracker sites based on hand structure.""" + landmarks_hand_frame = hand_structure_to_landmarks(hand_structure) + + t_palm = ik_solver.configuration.get_transform_frame_to_world("right_palm", "body") + p_palm = t_palm.translation() + r_palm = t_palm.rotation().as_matrix() + + wrist_offset_world = r_palm @ ik_solver.config.wrist_offset_palm + p_wrist = p_palm + wrist_offset_world + + scale = ik_solver.config.scale_factor + coord_transform = ik_solver.config.coord_transform + + allowed_types = set(target_joint_types) + for finger_name, mp_mapping in MP_LANDMARK_INDICES.items(): + for joint_type, mp_idx in mp_mapping.items(): + if joint_type not in allowed_types: + continue + + landmark_pos_hand_frame = landmarks_hand_frame[mp_idx] + scaled_vec = landmark_pos_hand_frame * scale + target_pos_world = p_wrist + scaled_vec + + rel_vec = target_pos_world - p_wrist + transformed_vec = coord_transform @ rel_vec + final_pos = p_wrist + transformed_vec + + mocap_body_name = f"tracker_{finger_name}_{joint_type}" + body_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_BODY, mocap_body_name) + if body_id >= 0: + mocap_idx = model.body_mocapid[body_id] + if mocap_idx >= 0 and mocap_idx < model.nmocap: + data.mocap_pos[mocap_idx] = final_pos + + # Update wrist tracker position + wrist_body_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_BODY, "tracker_wrist") + if wrist_body_id >= 0: + wrist_mocap_idx = model.body_mocapid[wrist_body_id] + if wrist_mocap_idx >= 0 and wrist_mocap_idx < model.nmocap: + data.mocap_pos[wrist_mocap_idx] = p_wrist + + # Update skeleton lines after updating tracker sites + update_skeleton_lines(model, data, hand_structure, target_joint_types, ik_solver) + + +def record_camera_video(cap: cv2.VideoCapture, output_path: Path, fps: float = 30.0) -> bool: + """Record video from camera until 'q' is pressed. + + Returns True if recording was successful, False otherwise. + """ + print(f"\n{'='*70}") + print("VIDEO RECORDING") + print(f"{'='*70}") + print("Press 'r' or 's' to START recording") + print("Press 'q' to STOP recording and save") + print(f"{'='*70}\n") + + # Set camera FPS if possible + cap.set(cv2.CAP_PROP_FPS, fps) + + # Wait for start key + recording = False + out = None + frame_count = 0 + start_time = None + frame_time = 1.0 / fps # Time per frame in seconds + target_time = time.time() + frame_time + + while True: + ret, frame = cap.read() + if not ret: + print("Failed to read from camera") + if out is not None: + out.release() + cv2.destroyAllWindows() + return False + + display_frame = frame.copy() + if not recording: + cv2.putText( + display_frame, + "Press 'r' or 's' to START recording", + (10, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1, + (0, 255, 0), + 2, + ) + else: + elapsed = time.time() - start_time + cv2.putText( + display_frame, + f"RECORDING... {elapsed:.1f}s ({frame_count} frames) - Press 'q' to STOP", + (10, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 0.7, + (0, 0, 255), + 2, + ) + + cv2.imshow("Camera Feed - Recording", display_frame) + key = cv2.waitKey(1) & 0xFF + + if key == ord("q") and recording: + break + elif (key == ord("r") or key == ord("s")) and not recording: + recording = True + # Get frame dimensions + height, width = frame.shape[:2] + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height)) + if not out.isOpened(): + print(f"Error: Failed to open video writer for {output_path}") + cv2.destroyAllWindows() + return False + start_time = time.time() + print(f"Recording started at {fps} FPS. Saving to: {output_path}") + + if recording and out is not None: + # Write frame and control timing + out.write(frame) + frame_count += 1 + + # Sleep to maintain target FPS + sleep_time = max(0, target_time - time.time()) + if sleep_time > 0: + time.sleep(sleep_time) + target_time += frame_time + + + if out is not None: + out.release() + elapsed = time.time() - start_time if start_time else 0 + print(f"Recording stopped. Saved {frame_count} frames ({elapsed:.2f}s) to {output_path}") + print(f"Actual FPS: {frame_count / elapsed:.2f} (target: {fps:.2f})") + + cv2.destroyAllWindows() + return recording + + +def process_video_with_hamer( + video_path: Path, + tracker: HaMeRTracker, + ik_solver: ORCAHandIKRetargeting, + model: mujoco.MjModel, + data: mujoco.MjData, + target_joint_types: tuple[str, ...], +) -> list[tuple[np.ndarray, Optional[HandStructure]]]: + """Process video with HAMER and return frames with hand structures.""" + print(f"\n{'='*70}") + print("PROCESSING VIDEO WITH HAMER") + print(f"{'='*70}") + print(f"Processing: {video_path}") + + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + raise RuntimeError(f"Failed to open video: {video_path}") + + fps = cap.get(cv2.CAP_PROP_FPS) + frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + print(f"Video: {frame_count} frames at {fps} FPS") + + processed_frames = [] + timestamp = 0.0 + dt = 1.0 / fps if fps > 0 else 1.0 / 30.0 + + frame_idx = 0 + while True: + ret, frame = cap.read() + if not ret: + break + + # Use previous frame's solution as warm start for better IK convergence + # Only reset on first frame + if frame_idx == 0: + mujoco.mj_resetData(model, data) + + timestamp = frame_idx * dt + hand_structures = tracker.detect_hands(frame, timestamp=timestamp) + + hand_structure = hand_structures[0] if hand_structures else None + + if hand_structure: + # Update IK solver with current state (warm start from previous frame) + mujoco.mj_forward(model, data) + ik_solver.configuration.update(data.qpos) + + # Solve IK + qpos = ik_solver.solve(hand_structure) + if not np.any(np.isnan(qpos)) and not np.any(np.isinf(qpos)): + data.qpos[:] = qpos + data.qvel[:] = 0 + + # Update tracker site positions + update_tracker_sites(model, data, hand_structure, target_joint_types, ik_solver) + + # Forward kinematics + mujoco.mj_forward(model, data) + + processed_frames.append((frame.copy(), hand_structure)) + frame_idx += 1 + + if frame_idx % 10 == 0: + print(f"Processed {frame_idx}/{frame_count} frames...") + + cap.release() + print(f"Processing complete. Processed {len(processed_frames)} frames.") + return processed_frames + + +def record_mujoco_visualization( + processed_frames: list[tuple[np.ndarray, Optional[HandStructure]]], + model: mujoco.MjModel, + data: mujoco.MjData, + output_path: Path, + fps: float = 30.0, + width: int = 1920, + height: int = 1080, + ik_solver: Optional[ORCAHandIKRetargeting] = None, + target_joint_types: tuple[str, ...] = ("tip",), +) -> None: + """Record MuJoCo visualization from processed frames.""" + print(f"\n{'='*70}") + print("RECORDING MUJOCO VISUALIZATION") + print(f"{'='*70}") + print(f"Recording {len(processed_frames)} frames to: {output_path}") + + # Create renderer + # The model should have framebuffer settings in MJCF for larger resolutions + try: + renderer = mujoco.Renderer(model, height=height, width=width, max_geom=10000) + except ValueError as e: + # Fallback to smaller resolution if framebuffer is too small + print(f"Warning: {e}") + print("Falling back to 640x480 resolution") + width = 640 + height = 480 + renderer = mujoco.Renderer(model, height=height, width=width, max_geom=10000) + + # Setup camera + cam = mujoco.MjvCamera() + cam.type = mujoco.mjtCamera.mjCAMERA_FREE + cam.distance = 0.6 # Further back to fit entire hand + cam.azimuth = -20 + cam.elevation = 35 + cam.lookat[:] = [0.04, 0.0, 0.08] # Look at palm area + + # Setup options - enable transparency and joint visualization for skeleton overlay + option = mujoco.MjvOption() + option.flags[mujoco.mjtVisFlag.mjVIS_CONTACTPOINT] = False + option.flags[mujoco.mjtVisFlag.mjVIS_CONTACTFORCE] = False + option.flags[mujoco.mjtVisFlag.mjVIS_TRANSPARENT] = True # Make meshes transparent + option.flags[mujoco.mjtVisFlag.mjVIS_JOINT] = True # Show joints for skeleton overlay + + # Create video writer + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height)) + + # Process each frame + for frame_idx, (frame, hand_structure) in enumerate(processed_frames): + # Only reset on first frame + if frame_idx == 0: + mujoco.mj_resetData(model, data) + + if hand_structure is not None and ik_solver is not None: + # Update IK solver configuration + mujoco.mj_forward(model, data) + ik_solver.configuration.update(data.qpos) + + # Solve IK + qpos = ik_solver.solve(hand_structure) + if not np.any(np.isnan(qpos)) and not np.any(np.isinf(qpos)): + data.qpos[:] = qpos + data.qvel[:] = 0 + + # Update tracker site positions + update_tracker_sites(model, data, hand_structure, target_joint_types, ik_solver) + + # Forward kinematics + mujoco.mj_forward(model, data) + + # Update scene with current state + renderer.update_scene(data, camera=cam, scene_option=option) + + # Render to numpy array (returns RGB, shape: (height, width, 3)) + rgb_frame = renderer.render() + + # Convert RGB to BGR for OpenCV + # Note: MuJoCo Renderer already returns correct orientation, no need to flip + bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR) + + out.write(bgr_frame) + + if (frame_idx + 1) % 10 == 0: + print(f"Rendered {frame_idx + 1}/{len(processed_frames)} frames...") + + out.release() + renderer.close() + print(f"MuJoCo visualization recording complete. Saved to: {output_path}") + + +def concatenate_videos(video1_path: Path, video2_path: Path, output_path: Path) -> None: + """Concatenate two videos side by side.""" + print(f"\n{'='*70}") + print("CONCATENATING VIDEOS") + print(f"{'='*70}") + print(f"Concatenating {video1_path} and {video2_path}") + print(f"Output: {output_path}") + + cap1 = cv2.VideoCapture(str(video1_path)) + cap2 = cv2.VideoCapture(str(video2_path)) + + if not cap1.isOpened() or not cap2.isOpened(): + raise RuntimeError("Failed to open one or both videos") + + # Get properties + fps1 = cap1.get(cv2.CAP_PROP_FPS) + fps2 = cap2.get(cv2.CAP_PROP_FPS) + fps = min(fps1, fps2) if fps1 > 0 and fps2 > 0 else 30.0 + + width1 = int(cap1.get(cv2.CAP_PROP_FRAME_WIDTH)) + height1 = int(cap1.get(cv2.CAP_PROP_FRAME_HEIGHT)) + width2 = int(cap2.get(cv2.CAP_PROP_FRAME_WIDTH)) + height2 = int(cap2.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + # Use the maximum height and sum of widths for side-by-side + max_height = max(height1, height2) + total_width = width1 + width2 + + # Resize videos to same height if needed + if height1 != height2: + scale1 = max_height / height1 + scale2 = max_height / height2 + width1 = int(width1 * scale1) + width2 = int(width2 * scale2) + total_width = width1 + width2 + + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + out = cv2.VideoWriter(str(output_path), fourcc, fps, (total_width, max_height)) + + frame_count = 0 + while True: + ret1, frame1 = cap1.read() + ret2, frame2 = cap2.read() + + if not ret1 or not ret2: + break + + # Resize if needed + if height1 != max_height: + frame1 = cv2.resize(frame1, (width1, max_height)) + if height2 != max_height: + frame2 = cv2.resize(frame2, (width2, max_height)) + + # Concatenate side by side + concatenated = np.hstack([frame1, frame2]) + out.write(concatenated) + + frame_count += 1 + if frame_count % 10 == 0: + print(f"Concatenated {frame_count} frames...") + + cap1.release() + cap2.release() + out.release() + print(f"Concatenation complete. Saved {frame_count} frames to {output_path}") + + +def main() -> None: + """Main entry point.""" + parser = argparse.ArgumentParser( + description="Record video, process with HAMER, visualize in MuJoCo, and concatenate", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("--camera", type=int, default=0, help="Camera device index") + parser.add_argument( + "--model", + type=str, + default="models/orca_hand_fixed.mjcf", + help="Path to MuJoCo MJCF model file", + ) + parser.add_argument( + "--targets", + type=str, + default="tip", + help="Comma-separated joint targets (tip,ip,pip,mcp)", + ) + parser.add_argument( + "--scale", + type=float, + default=1.0, + help="Hand scale factor (robot/human)", + ) + parser.add_argument( + "--fps", + type=float, + default=30.0, + help="Video FPS", + ) + parser.add_argument( + "--output-dir", + type=str, + default="recordings", + help="Output directory for videos", + ) + parser.add_argument( + "--video-path", + type=str, + default=None, + help="Path to input video file (if provided, skips camera recording and uses this video)", + ) + args = parser.parse_args() + + # Parse target joint types + raw_targets = [part.strip().lower() for part in args.targets.split(",")] + target_joints = tuple(dict.fromkeys(jt for jt in raw_targets if jt)) + if not target_joints: + target_joints = ("tip",) + supported_targets = {"tip", "ip", "pip", "mcp"} + invalid = [jt for jt in target_joints if jt not in supported_targets] + if invalid: + parser.error(f"Unsupported target joint types: {', '.join(invalid)}") + + # Create output directory + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Generate output filenames + timestamp = int(time.time()) + if args.video_path: + # Use input video name as base for output files + input_video_name = Path(args.video_path).stem + camera_video_path = Path(args.video_path) # Use input video directly + mujoco_video_path = output_dir / f"mujoco_{input_video_name}_{timestamp}.mp4" + final_video_path = output_dir / f"concatenated_{input_video_name}_{timestamp}.mp4" + else: + camera_video_path = output_dir / f"camera_recording_{timestamp}.mp4" + mujoco_video_path = output_dir / f"mujoco_recording_{timestamp}.mp4" + final_video_path = output_dir / f"concatenated_{timestamp}.mp4" + + # Load model + model_file = Path(args.model) + if not model_file.is_absolute(): + model_file = Path(__file__).parent.parent / args.model + if not model_file.exists(): + raise FileNotFoundError(f"Model file not found: {model_file}") + + print(f"Loading model from {model_file}") + xml_string = inject_visualization_bodies(model_file, target_joints) + model = mujoco.MjModel.from_xml_string(xml_string) + data = mujoco.MjData(model) + + # Initialize tracker and IK solver + tracker = HaMeRTracker() + ik_config = ORCAHandIKConfig( + scale_factor=args.scale, + target_joint_types=target_joints, + lm_damping=0.5, + ik_iterations=15, + ) + ik_solver = ORCAHandIKRetargeting(model, config=ik_config) + + # Verify tip sites exist in the loaded model + if "tip" in target_joints: + print("\nVerifying tip sites in loaded model:") + for finger in ["thumb", "index", "middle", "ring", "pinky"]: + site_name = f"right_{finger}_tip_site" + site_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_SITE, site_name) + if site_id >= 0: + site_pos = data.site(site_id).xpos.copy() + print(f" ✓ {site_name}: found at position [{site_pos[0]:.4f}, {site_pos[1]:.4f}, {site_pos[2]:.4f}]") + else: + print(f" ✗ {site_name}: NOT FOUND in model!") + + # Step 1: Record camera video or use provided video file + if args.video_path: + camera_video_path = Path(args.video_path) + if not camera_video_path.exists(): + raise FileNotFoundError(f"Video file not found: {camera_video_path}") + print(f"Using existing video: {camera_video_path}") + else: + cap = cv2.VideoCapture(args.camera) + if not cap.isOpened(): + raise RuntimeError(f"Failed to open camera {args.camera}") + + if not record_camera_video(cap, camera_video_path, args.fps): + print("Recording cancelled or failed.") + cap.release() + return + + cap.release() + + # Step 2: Process video with HAMER + processed_frames = process_video_with_hamer( + camera_video_path, + tracker, + ik_solver, + model, + data, + target_joints, + ) + + # Get video FPS (use input video FPS if provided, otherwise use args.fps) + if args.video_path: + cap = cv2.VideoCapture(str(camera_video_path)) + video_fps = cap.get(cv2.CAP_PROP_FPS) + cap.release() + if video_fps > 0: + fps = video_fps + else: + fps = args.fps + else: + fps = args.fps + + # Step 3: Record MuJoCo visualization + record_mujoco_visualization( + processed_frames, + model, + data, + mujoco_video_path, + fps, + ik_solver=ik_solver, + target_joint_types=target_joints, + ) + + # Step 4: Concatenate videos + concatenate_videos(camera_video_path, mujoco_video_path, final_video_path) + + print(f"\n{'='*70}") + print("ALL DONE!") + print(f"{'='*70}") + print(f"Camera video: {camera_video_path}") + print(f"MuJoCo video: {mujoco_video_path}") + print(f"Concatenated video: {final_video_path}") + print(f"{'='*70}\n") + + +if __name__ == "__main__": + main() diff --git a/scripts/visualize_tracker_ik_sites.py b/scripts/visualize_tracker_ik_sites.py new file mode 100644 index 0000000..ac99398 --- /dev/null +++ b/scripts/visualize_tracker_ik_sites.py @@ -0,0 +1,557 @@ +"""Visualize tracker sites and IK target sites on ORCA hand model. + +This script overlays: +1. Tracker sites (from MediaPipe/HaMeR hand tracking) - shown as colored spheres +2. IK target sites (on ORCA hand model) - shown as different colored spheres + +Both are color-coded and labeled for easy comparison. +""" + +import argparse +import multiprocessing as mp +import sys +import time +import xml.etree.ElementTree as ET +from pathlib import Path +from queue import Empty +from typing import Protocol + +import cv2 +import mujoco +import mujoco.viewer +import numpy as np + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from handpose.ik_retargeting import ( + FINGER_TARGET_BODIES, + MP_LANDMARK_INDICES, + ORCAHandIKConfig, + ORCAHandIKRetargeting, +) +from handpose.tracker.base import HandStructure +from handpose.tracker.hamer import HaMeRTracker +from handpose.tracker.mediapipe import MediaPipeTracker + + +def inject_visualization_bodies(mjcf_path: Path, target_joint_types: tuple[str, ...]) -> str: + """Inject mocap bodies for tracker sites and ensure IK target sites exist. + + Args: + mjcf_path: Path to MJCF model file + target_joint_types: Tuple of joint types to visualize (tip, ip, pip, mcp) + + Returns: + Modified MJCF XML string + """ + tree = ET.parse(mjcf_path) + root = tree.getroot() + worldbody = root.find("worldbody") + + if worldbody is None: + raise ValueError("Could not find worldbody in MJCF") + + # Convert relative paths to absolute paths + model_dir = mjcf_path.parent + for asset in root.findall(".//asset"): + for mesh in asset.findall("mesh"): + file_attr = mesh.get("file") + if file_attr and not Path(file_attr).is_absolute(): + abs_path = (model_dir / file_attr).resolve() + mesh.set("file", str(abs_path)) + + # All IK target sites use blue for consistency + ik_site_color = "0 0 1 0.9" # Blue for all IK targets + + # Tip site offsets: [x, y, z] where z is upward direction + tip_site_specs = { + "thumb": ("right_thumb_dp", np.array([0.0, 0.0, 0.025])), + "index": ("right_index_ip", np.array([0.0, 0.0, 0.035])), + "middle": ("right_middle_ip", np.array([0.0, 0.0, 0.037])), + "ring": ("right_ring_ip", np.array([0.0, 0.0, 0.036])), + "pinky": ("right_pinky_ip", np.array([0.0, 0.0, 0.032])), + } + + if "tip" in target_joint_types: + tip_sites_created = [] + for finger, (parent_body, offset) in tip_site_specs.items(): + body_elem = root.find(f".//body[@name='{parent_body}']") + if body_elem is None: + print(f"Warning: Parent body '{parent_body}' not found for {finger} tip site") + continue + site_name = f"right_{finger}_tip_site" + existing_site = body_elem.find(f"./site[@name='{site_name}']") + if existing_site is not None: + # Update existing site - make it larger and blue + existing_site.set("type", "sphere") # Ensure type is set + existing_site.set("size", "0.005") + existing_site.set("rgba", ik_site_color) + tip_sites_created.append(f"{finger} (updated)") + else: + # Create new tip site + site = ET.SubElement(body_elem, "site") + site.set("name", site_name) + site.set("type", "sphere") # Explicit type for visibility + site.set("pos", " ".join(f"{value:.5f}" for value in offset)) + site.set("size", "0.005") + site.set("rgba", ik_site_color) + tip_sites_created.append(f"{finger} (created)") + print(f"Tip sites: {', '.join(tip_sites_created)}") + + # Add visual markers for IK target bodies (MCP, PIP, IP) that might not have visual geoms + allowed_types = set(target_joint_types) + markers_added = [] + for finger_name, bodies in FINGER_TARGET_BODIES.items(): + for joint_type, (frame_type, frame_name) in bodies.items(): + if joint_type not in allowed_types: + continue + + if frame_type == "body": + # Add a visual site to the body for visibility + body_elem = root.find(f".//body[@name='{frame_name}']") + if body_elem is not None: + # Check if marker site already exists + marker_site_name = f"ik_marker_{frame_name}" + existing_marker = body_elem.find(f"./site[@name='{marker_site_name}']") + if existing_marker is not None: + # Update existing marker + existing_marker.set("size", "0.004") # Same size as tip sites + existing_marker.set("rgba", ik_site_color) + markers_added.append(f"{finger_name}_{joint_type} (updated)") + else: + # Create new marker site + site = ET.SubElement(body_elem, "site") + site.set("name", marker_site_name) + site.set("pos", "0 0 0") # At body origin + site.set("size", "0.004") # Same size as tip sites for consistency + site.set("rgba", ik_site_color) # Blue for all IK targets + markers_added.append(f"{finger_name}_{joint_type} (created)") + else: + print(f"Warning: Body '{frame_name}' not found in MJCF for {finger_name}_{joint_type}") + elif frame_type == "site": + # This is a tip site - find and update it (should already be handled above, but double-check) + found = False + for body in root.findall(".//body"): + site_elem = body.find(f"./site[@name='{frame_name}']") + if site_elem is not None: + site_elem.set("size", "0.005") + site_elem.set("rgba", ik_site_color) + found = True + markers_added.append(f"{finger_name}_{joint_type} (site updated)") + break + if not found: + print(f"Warning: Site '{frame_name}' not found in MJCF for {finger_name}_{joint_type}") + + print(f"Added/updated {len(markers_added)} IK target markers: {', '.join(markers_added)}") + + # Add mocap bodies for tracker sites (MediaPipe landmarks) + # All tracker sites use red for consistency - these represent the actual tracked hand + tracker_color = "1 0 0 0.9" # Red for all tracker sites (actual hand) + + allowed_types = set(target_joint_types) + for finger_name, mp_mapping in MP_LANDMARK_INDICES.items(): + for joint_type, mp_idx in mp_mapping.items(): + if joint_type not in allowed_types: + continue + + # Create mocap body for tracker site + mocap_body = ET.SubElement(worldbody, "body") + mocap_body.set("name", f"tracker_{finger_name}_{joint_type}") + mocap_body.set("mocap", "true") + mocap_body.set("pos", "0 0 0") + + # Add visual sphere for tracker site (larger, red) + geom = ET.SubElement(mocap_body, "geom") + geom.set("type", "sphere") + geom.set("size", "0.005") # 5mm radius (larger than IK targets for visibility) + geom.set("rgba", tracker_color) # Red for all tracker sites + geom.set("contype", "0") + geom.set("conaffinity", "0") + + return ET.tostring(root, encoding="unicode") + + +def hand_structure_to_landmarks(structure: HandStructure) -> np.ndarray: + """Convert HandStructure to 21x3 landmarks array (MediaPipe format). + + Returns landmarks in hand frame (wrist at origin). + """ + landmarks = np.zeros((21, 3)) + + # Wrist (index 0) + landmarks[0] = structure.wrist_position + + # Thumb: CMC(1), MCP(2), IP(3), tip(4) + landmarks[1] = structure.thumb.mcp # CMC approximate + landmarks[2] = structure.thumb.mcp + landmarks[3] = structure.thumb.ip if structure.thumb.ip is not None else structure.thumb.mcp + landmarks[4] = structure.thumb.tip + + # Index: MCP(5), PIP(6), DIP(7), tip(8) + landmarks[5] = structure.index.mcp + landmarks[6] = structure.index.pip if structure.index.pip is not None else structure.index.mcp + landmarks[7] = structure.index.dip if structure.index.dip is not None else structure.index.tip + landmarks[8] = structure.index.tip + + # Middle: MCP(9), PIP(10), DIP(11), tip(12) + landmarks[9] = structure.middle.mcp + landmarks[10] = structure.middle.pip if structure.middle.pip is not None else structure.middle.mcp + landmarks[11] = structure.middle.dip if structure.middle.dip is not None else structure.middle.tip + landmarks[12] = structure.middle.tip + + # Ring: MCP(13), PIP(14), DIP(15), tip(16) + landmarks[13] = structure.ring.mcp + landmarks[14] = structure.ring.pip if structure.ring.pip is not None else structure.ring.mcp + landmarks[15] = structure.ring.dip if structure.ring.dip is not None else structure.ring.tip + landmarks[16] = structure.ring.tip + + # Pinky: MCP(17), PIP(18), DIP(19), tip(20) + landmarks[17] = structure.pinky.mcp + landmarks[18] = structure.pinky.pip if structure.pinky.pip is not None else structure.pinky.mcp + landmarks[19] = structure.pinky.dip if structure.pinky.dip is not None else structure.pinky.tip + landmarks[20] = structure.pinky.tip + + return landmarks + + +def update_tracker_sites( + model: mujoco.MjModel, + data: mujoco.MjData, + hand_structure: HandStructure, + target_joint_types: tuple[str, ...], + ik_solver: ORCAHandIKRetargeting, +) -> None: + """Update mocap body positions for tracker sites based on hand structure. + + Args: + model: MuJoCo model + data: MuJoCo data + hand_structure: Tracked hand structure + target_joint_types: Joint types to visualize + ik_solver: IK solver (for coordinate transformation) + """ + landmarks_hand_frame = hand_structure_to_landmarks(hand_structure) + + # Get palm transform to convert hand frame to world frame + t_palm = ik_solver.configuration.get_transform_frame_to_world("right_palm", "body") + p_palm = t_palm.translation() + r_palm = t_palm.rotation().as_matrix() + + # Wrist offset in palm frame + wrist_offset_world = r_palm @ ik_solver.config.wrist_offset_palm + p_wrist = p_palm + wrist_offset_world + + # Scale factor + scale = ik_solver.config.scale_factor + + # Coordinate transformation + coord_transform = ik_solver.config.coord_transform + + allowed_types = set(target_joint_types) + for finger_name, mp_mapping in MP_LANDMARK_INDICES.items(): + for joint_type, mp_idx in mp_mapping.items(): + if joint_type not in allowed_types: + continue + + # Get landmark position in hand frame + landmark_pos_hand_frame = landmarks_hand_frame[mp_idx] + + # Scale to robot size + scaled_vec = landmark_pos_hand_frame * scale + + # Transform to world frame (anchor to wrist) + target_pos_world = p_wrist + scaled_vec + + # Apply coordinate transformation + rel_vec = target_pos_world - p_wrist + transformed_vec = coord_transform @ rel_vec + final_pos = p_wrist + transformed_vec + + # Update mocap body position + mocap_body_name = f"tracker_{finger_name}_{joint_type}" + body_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_BODY, mocap_body_name) + if body_id >= 0: + # Get the mocap index for this body (mocap bodies have mocapid >= 0) + mocap_idx = model.body_mocapid[body_id] + if mocap_idx >= 0 and mocap_idx < model.nmocap: + data.mocap_pos[mocap_idx] = final_pos + + +def visualize_ik_target_sites( + model: mujoco.MjModel, + data: mujoco.MjData, + target_joint_types: tuple[str, ...], +) -> dict[str, dict[str, np.ndarray]]: + """Get positions of IK target sites on ORCA hand. + + Args: + model: MuJoCo model + data: MuJoCo data + target_joint_types: Joint types to visualize + + Returns: + Dictionary mapping finger names to joint_type -> position + """ + ik_positions = {} + allowed_types = set(target_joint_types) + + for finger_name, bodies in FINGER_TARGET_BODIES.items(): + finger_positions = {} + for joint_type, (frame_type, frame_name) in bodies.items(): + if joint_type not in allowed_types: + continue + + obj_type = mujoco.mjtObj.mjOBJ_BODY if frame_type == "body" else mujoco.mjtObj.mjOBJ_SITE + frame_id = mujoco.mj_name2id(model, obj_type, frame_name) + + if frame_id >= 0: + if obj_type == mujoco.mjtObj.mjOBJ_SITE: + pos = data.site(frame_id).xpos.copy() + else: + pos = data.body(frame_id).xpos.copy() + finger_positions[joint_type] = pos + + if finger_positions: + ik_positions[finger_name] = finger_positions + + return ik_positions + + +class Flag(Protocol): + value: int + + +def display_feed_process(frame_queue: mp.Queue, running_flag: Flag, window_name: str) -> None: + """Display frames in a separate process to avoid GLFW/OpenCV conflicts.""" + while True: + if not running_flag.value and frame_queue.empty(): + break + + try: + frame = frame_queue.get(timeout=0.05) + except Empty: + if not running_flag.value: + break + continue + + cv2.imshow(window_name, frame) + key = cv2.waitKey(1) & 0xFF + if key == ord("q"): + running_flag.value = 0 + break + + cv2.destroyAllWindows() + + +def main() -> None: + """Main entry point.""" + parser = argparse.ArgumentParser( + description="Visualize tracker sites and IK target sites on ORCA hand", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("--camera", type=int, default=0, help="Camera device index") + parser.add_argument( + "--model", + type=str, + default="models/orca_hand_fixed.mjcf", + help="Path to MuJoCo MJCF model file", + ) + parser.add_argument( + "--tracker", + type=str, + default="hamer", + choices=["hamer", "mediapipe"], + help="Hand tracking backend", + ) + parser.add_argument( + "--targets", + type=str, + default="tip", + help="Comma-separated joint targets (tip,ip,pip,mcp). Default: tip", + ) + parser.add_argument( + "--scale", + type=float, + default=1.0, + help="Hand scale factor (robot/human)", + ) + parser.add_argument( + "--show-feed", + action="store_true", + help="Show camera feed with hand tracking overlay", + ) + parser.add_argument( + "--joint-smoothing", + type=float, + default=0.7, + help="Smoothing factor for joint positions (0.0-1.0). \ + Lower values = more smoothing, higher = less smoothing. Default: 0.7", + ) + args = parser.parse_args() + + # Parse target joint types + raw_targets = [part.strip().lower() for part in args.targets.split(",")] + target_joints = tuple(dict.fromkeys(jt for jt in raw_targets if jt)) + if not target_joints: + target_joints = ("tip",) + supported_targets = {"tip", "ip", "pip", "mcp"} + invalid = [jt for jt in target_joints if jt not in supported_targets] + if invalid: + parser.error(f"Unsupported target joint types: {', '.join(invalid)}") + + # Load model + model_file = Path(args.model) + if not model_file.is_absolute(): + model_file = Path(__file__).parent.parent / args.model + if not model_file.exists(): + raise FileNotFoundError(f"Model file not found: {model_file}") + + print(f"Loading model from {model_file}") + xml_string = inject_visualization_bodies(model_file, target_joints) + model = mujoco.MjModel.from_xml_string(xml_string) + data = mujoco.MjData(model) + + # Verify tip sites exist in the loaded model + if "tip" in target_joints: + print("\nVerifying tip sites in loaded model:") + for finger in ["thumb", "index", "middle", "ring", "pinky"]: + site_name = f"right_{finger}_tip_site" + site_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_SITE, site_name) + if site_id >= 0: + site_pos = data.site(site_id).xpos.copy() + print(f" ✓ {site_name}: found at position [{site_pos[0]:.4f}, {site_pos[1]:.4f}, {site_pos[2]:.4f}]") + else: + print(f" ✗ {site_name}: NOT FOUND in model!") + + # Initialize tracker + if args.tracker == "hamer": + tracker = HaMeRTracker() + else: + tracker = MediaPipeTracker() + + # Initialize IK solver + ik_config = ORCAHandIKConfig( + scale_factor=args.scale, + target_joint_types=target_joints, + lm_damping=0.5, + ik_iterations=15, + ) + ik_solver = ORCAHandIKRetargeting(model, config=ik_config) + + # Open camera + cap = cv2.VideoCapture(args.camera) + if not cap.isOpened(): + raise RuntimeError(f"Failed to open camera {args.camera}") + + print("\n" + "=" * 70) + print("VISUALIZATION GUIDE:") + print("=" * 70) + print("TRACKER SITES (larger RED spheres - actual tracked hand from camera):") + print(" Red spheres = All tracker sites (thumb, index, middle, ring, pinky)") + print(" These represent the actual human hand position") + print("\nIK TARGET SITES (smaller BLUE spheres - on ORCA hand model):") + print(" Blue spheres = All IK target sites (thumb, index, middle, ring, pinky)") + print(" These represent where the robot hand should move") + print("\nNOTE: RED spheres (tracker) should align with BLUE spheres (IK targets)") + print(" when IK is solving correctly. RED = actual hand, BLUE = robot targets.") + print("\nPress 'q' in MuJoCo viewer to quit") + print("=" * 70 + "\n") + + # Validate joint smoothing value + if not (0.0 < args.joint_smoothing <= 1.0): + parser.error("--joint-smoothing must be between 0.0 and 1.0 (exclusive of 0.0)") + + start_time = time.time() + + # Initialize joint position smoothing state + smoothed_qpos = data.qpos.copy() + + # Set up multiprocessing for display window if requested + display_process: mp.Process | None = None + frame_queue: mp.Queue | None = None + running_flag: Flag | None = None + + if args.show_feed: + print("Starting camera feed display in separate process...") + frame_queue = mp.Queue(maxsize=2) # Small buffer to avoid lag + running_flag = mp.Value("i", 1) + display_process = mp.Process( + target=display_feed_process, + args=(frame_queue, running_flag, "Hand Tracking Feed"), + ) + display_process.start() + + try: + with mujoco.viewer.launch_passive(model, data) as viewer: + while viewer.is_running() and cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + timestamp = time.time() - start_time + hand_structures = tracker.detect_hands(frame, timestamp=timestamp) + + # Send frame to display process if enabled + if args.show_feed and frame_queue is not None and running_flag is not None: + if hand_structures: + vis_frame = tracker.visualize(frame, hand_structures) + else: + vis_frame = frame.copy() + try: + frame_queue.put_nowait(vis_frame) + except Exception: + pass # Queue full, skip this frame + + if hand_structures: + hand = hand_structures[0] + + # Update IK solver configuration with current smoothed state + mujoco.mj_forward(model, data) + data.qpos[:] = smoothed_qpos + mujoco.mj_forward(model, data) + ik_solver.configuration.update(data.qpos) + + # Solve IK + qpos = ik_solver.solve(hand) + + # Apply joint position smoothing + if not np.any(np.isnan(qpos)) and not np.any(np.isinf(qpos)): + # Exponential moving average: smoothed = joint_smoothing * new + (1 - joint_smoothing) * old + smoothed_qpos = args.joint_smoothing * qpos + (1.0 - args.joint_smoothing) * smoothed_qpos + data.qpos[:] = smoothed_qpos + data.qvel[:] = 0 + + # Update tracker site positions + update_tracker_sites(model, data, hand, target_joints, ik_solver) + + # Forward kinematics to update IK target site positions + mujoco.mj_forward(model, data) + + # Get IK target positions (sites are automatically rendered by MuJoCo) + ik_positions = visualize_ik_target_sites(model, data, target_joints) + + # Print current positions for debugging (first frame only) + if hand_structures: + print("\nCurrent IK Target Positions:") + for finger, positions in ik_positions.items(): + for joint_type, pos in positions.items(): + print(f" {finger}_{joint_type}: [{pos[0]:.4f}, {pos[1]:.4f}, {pos[2]:.4f}]") + + # Sync viewer + viewer.sync() + + finally: + # Clean up display process + if running_flag is not None: + running_flag.value = 0 + if display_process is not None: + display_process.join(timeout=1.0) + if display_process.is_alive(): + display_process.terminate() + cap.release() + print("\nVisualization closed.") + + +if __name__ == "__main__": + main()