From df4b69b566f4b87db958178cb55cf34b581f3290 Mon Sep 17 00:00:00 2001 From: Cao Hanzhe Date: Fri, 8 May 2026 23:29:06 +0800 Subject: [PATCH] fix: account for multi-turn chunks in agentic LR scheduler budget MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using AgentNativeStepEnvManager for step-level agentic training, each trajectory produces multiple training samples (one per agent turn). The base PPOConfig.set_max_steps() only accounts for rollout_batch_size (number of trajectories), causing the LR scheduler to exhaust its step budget far before all pipeline steps complete. For example, with 4 trajectories × ~10 turns each = ~40 training samples per pipeline step. With backward_batch_size=4, that's ~10 optimizer steps per pipeline step instead of the budgeted 1. This fix: - Adds estimated_chunks_per_traj config field (default 0 = auto-detect) - Overrides set_max_steps in AgenticConfig to multiply optimizer step budget by the chunks-per-trajectory estimate - Auto-detects the estimate from custom_envs max_steps (conservative: max_steps // 2) when not explicitly configured - Users can override via estimated_chunks_per_traj for precise control Fixes alibaba/ROLL#407 --- roll/pipeline/agentic/agentic_config.py | 87 +++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/roll/pipeline/agentic/agentic_config.py b/roll/pipeline/agentic/agentic_config.py index 231f3859d..7f07e8de4 100644 --- a/roll/pipeline/agentic/agentic_config.py +++ b/roll/pipeline/agentic/agentic_config.py @@ -200,6 +200,16 @@ class AgenticConfig(PPOConfig): open_feedback_turn: bool = field(default=False, metadata={"help": "open feedback turn"}) use_token_reward: bool = field(default=False, metadata={"help": "use token reward"}) + estimated_chunks_per_traj: int = field( + default=0, + metadata={ + "help": "Estimated average number of training chunks (turns) per trajectory for step-level " + "agentic training (e.g., AgentNativeStepEnvManager). When > 0, the LR scheduler " + "budget is multiplied by this factor to prevent premature exhaustion. " + "When 0 (default), auto-detected from custom_envs max_steps (conservative: max_steps // 2)." + }, + ) + batch_adjust_mode: Literal["copy", "delete", "auto", "random_sample"] = field( default="copy", metadata={"help": "batch adjust mode: copy or delete"} ) @@ -357,6 +367,83 @@ def __post_init__(self): # Apply OPD configuration at the end (handles student_train/student_infer/teacher mapping) self._apply_opd_config() + def _get_chunks_per_traj_estimate(self) -> int: + """Estimate the average number of training chunks per trajectory. + + For step-level env managers (e.g., AgentNativeStepEnvManager), each trajectory + produces multiple training samples (one per turn/step). This method estimates + the multiplier from custom_envs config or the explicit override field. + + Returns: + int: estimated chunks per trajectory (minimum 1) + """ + if self.estimated_chunks_per_traj > 0: + return self.estimated_chunks_per_traj + + # Auto-detect from custom_envs: look for max_steps in env configs + max_env_steps = 0 + if self.custom_envs: + for tag, cfg in self.custom_envs.items(): + if hasattr(cfg, 'max_steps') and cfg.max_steps is not None: + max_env_steps = max(max_env_steps, int(cfg.max_steps)) + elif isinstance(cfg, dict) and 'max_steps' in cfg: + max_env_steps = max(max_env_steps, int(cfg['max_steps'])) + + if max_env_steps > 1: + # Conservative estimate: half of max_steps as average turns per trajectory + estimate = max(1, max_env_steps // 2) + logger.info( + f"Auto-detected estimated_chunks_per_traj={estimate} " + f"(from custom_envs max_steps={max_env_steps}, using max_steps // 2)" + ) + return estimate + + return 1 + + def set_max_steps(self, max_steps: int): + """Override to account for multi-turn chunking in agentic training. + + For step-level env managers (AgentNativeStepEnvManager), each trajectory + produces multiple training samples (chunks), one per agent turn. The parent + implementation only accounts for rollout_batch_size (trajectories), causing + the LR scheduler to exhaust early when chunks >> trajectories. + + This override multiplies the optimizer step budget by an estimate of chunks + per trajectory to align the scheduler with actual training dynamics. + """ + chunks_multiplier = self._get_chunks_per_traj_estimate() + + actor_backward_batch_size = ( + self.actor_train.training_args.per_device_train_batch_size + * self.actor_train.training_args.gradient_accumulation_steps + ) + critic_backward_batch_size = ( + self.critic.training_args.per_device_train_batch_size + * self.critic.training_args.gradient_accumulation_steps + ) + + self.actor_train.training_args.max_steps = max(1, ( + max_steps + * self.rollout_batch_size + * self.actor_infer.generating_args.num_return_sequences + * self.ppo_epochs + * chunks_multiplier + // actor_backward_batch_size + )) + self.critic.training_args.max_steps = max(1, ( + max_steps + * self.rollout_batch_size + * self.actor_infer.generating_args.num_return_sequences + * chunks_multiplier + // critic_backward_batch_size + )) + + logger.info(f"pipeline max_steps: {self.max_steps} to {max_steps}") + logger.info(f"chunks_per_traj_multiplier: {chunks_multiplier}") + logger.info(f"actor train max_steps without dp_size: {self.actor_train.training_args.max_steps}") + logger.info(f"critic train max_steps without dp_size: {self.critic.training_args.max_steps}") + self.max_steps = max_steps + def make_env_configs(self, env_manager_config: EnvManagerConfig): # construct env configs env_configs = defaultdict(defaultdict)