diff --git a/vila_hd/nvila_hd_video/README.md b/vila_hd/nvila_hd_video/README.md
index f8bda527..40966d0a 100644
--- a/vila_hd/nvila_hd_video/README.md
+++ b/vila_hd/nvila_hd_video/README.md
@@ -50,15 +50,15 @@ num_video_frames_thumbnail = 64  # Total sampled frames for thumbnails
 max_tiles_video = 48             # Max spatial tiles per video (one tile is 392x392)
 
 # ----- AutoGaze args (tiles) -----
-gazing_ratio_tile = [0.2] + [0.06] * 15  # Per-frame max gazing ratios (single float or list)
-task_loss_requirement_tile = 0.6
+gazing_ratio_tile = [0.2] + [0.06] * 15  # Per-frame max gazing ratios (single float or list). Videos with higher resolution/FPS usually need a lower gazing ratio.
+task_loss_requirement_tile = 0.6         # AutoGaze stops gazing at a frame once that frame's estimated reconstruction loss falls below this threshold.
 
 # ----- AutoGaze args (thumbnails) -----
-gazing_ratio_thumbnail = 1       # Set to None to skip gazing on thumbnails
+gazing_ratio_thumbnail = 1       # Set the gazing ratio to 1 and the task loss requirement to None to skip gazing on thumbnails
 task_loss_requirement_thumbnail = None
 
 # ----- Batching -----
-max_batch_size_autogaze = 16
+max_batch_size_autogaze = 16     # Use smaller mini-batch sizes for AutoGaze and SigLIP if GPU memory is limited
 max_batch_size_siglip = 32
 
 # Load processor and model