请教一下Qwen2.5-3B-instruct用mix_chord训练，reward很低的原因

作者您好，非常感谢您的项目。我尝试用 mix_chord 训练 Qwen2.5-3B-Instruct，但 reward 一直很低，想请教一下原因：是否是超参数设置的问题？以下是我的配置文件，训练环境为 8 张 40G 的 A800。
```yaml
# Experiment identifiers for this run.
project: "mix_chord"
name: "test_mix_2.5_3B"
# Resolved from the TRINITY_CHECKPOINT_ROOT_DIR environment variable;
# falls back to ./checkpoints when the variable is not set.
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}
# Algorithm settings for the mix_chord run.
# The batch-size numbers below are tied together arithmetically:
#   train_batch_size_usual  = buffer.batch_size (8) * repeat_times (8) = 64
#   train_batch_size_expert = 16
#   ppo_mini_batch_size     = 64 + 16 = 80 (same value as buffer.train_batch_size)
algorithm:
  algorithm_type: mix_chord
  repeat_times: 8 # or 16 for better performance in math related tasks
  optimizer:
    lr: 1e-6 # or 5e-6, larger lr with warm up can result in better performance for SFT training.
  kl_loss_fn_args:
    kl_coef: 0.0
  entropy_loss_fn: mix
  sample_strategy_args:
    expert_data_ratio: 0.20 # matches 16 expert / 80 total samples configured below
  policy_loss_fn_args: # feel free to change, we encourage you to try out different hyperparameters
    mu_warmup_steps: 0  # 0 for chord-mu and chord-phi
    mu_decay_steps: 200 # 200 for chord-mu and 0 for chord-phi
    mu_peak: 0.9 # 0.9 for chord-mu and 0.1 for chord-phi
    mu_valley: 0.05 # 0.05 for chord-mu and 0.1 for chord-phi
    enable_phi_function: false # false for chord-mu and true for chord-phi
    clip_range: 0.2
    sft_loss_agg_mode: "token-mean"
    use_dynamic_bsz: true
    ppo_mini_batch_size: 80   # 80 = 64 (train_batch_size_usual) + 16 (train_batch_size_expert); with repeat_times = 16 this would presumably be 8 * 16 + 16 = 144
    ppo_micro_batch_size_per_gpu: 4
    ngpus_trainer: 4
    train_batch_size_expert: 16   # expert (SFT) samples per training batch
    train_batch_size_usual: 64    # buffer.batch_size (8) * repeat_times (8)
# Model location and sequence-length limits.
model:
  # Resolved from TRINITY_MODEL_PATH; falls back to the local path after the comma.
  model_path: ${oc.env:TRINITY_MODEL_PATH, /home/dataset-assist-0/wl/cyt/models/Qwen2.5-3B-Instruct}
  max_response_tokens: 10240    # maximum length of the model output (response)
  max_model_len: 11264          # upper bound on total length, input + output (leaves 11264 - 10240 = 1024 for the prompt)
# Hardware layout: one node with 8 GPUs.
# NOTE(review): presumably split as explorer.rollout_model.engine_num (4) for
# rollout plus ngpus_trainer (4) for training — confirm against the framework docs.
cluster:
  node_num: 1
  gpu_per_node: 8
# Data pipeline: task sets read by the explorer and buffers read by the trainer.
buffer:
  total_epochs: 2
  batch_size: 8
  train_batch_size: 80    # train_batch_size_usual + train_batch_size_expert (64 + 16)
  explorer_input:
    # Training task set; prompts come from the 'problem' field and reference
    # answers from the 'answer' field.
    taskset:
      name: openr1_data_filtered_int
      storage_type: file
      path: ${oc.env:TRINITY_TASKSET_PATH, /home/dataset-assist-0/wl/cyt/Trinity-RFT/examples/mix_chord/openr1_rl_dataset}
      format:
        prompt_key: 'problem'
        response_key: 'answer'
      rollout_args:
        temperature: 1.0
        logprobs: 0
      workflow_args:
        with_think: true
        #use_base: true   # added
    eval_tasksets:
      # NOTE(review): this entry reads TRINITY_TASKSET_PATH, the same environment
      # variable as the training taskset above. If that variable is set, both
      # paths resolve to the same location and only the fallbacks differ —
      # confirm this is intended.
      - name: AIME2024
        storage_type: file
        path: ${oc.env:TRINITY_TASKSET_PATH, /home/dataset-assist-0/wl/cyt/datasets/aime24}  # e.g. path to AIME2024
        split: 'train'
        repeat_times: 4
        format:
            prompt_key: 'problem'
            response_key: 'solution'
        rollout_args:
            temperature: 1.0
            top_p: 0.7
        default_reward_fn_type: 'math_boxed_reward'
        reward_fn_args: {}
    default_workflow_type: 'math_boxed_workflow'
  trainer_input:
    # Queue-type buffer with a SQLite path named after this run.
    experience_buffer:
      name: math_buffer
      storage_type: queue
      path: 'sqlite:///test_mix_2.5_3B.db'            # change this per experiment
    auxiliary_buffers:
      # Expert (SFT) data in chat 'messages' format; it has its own epoch count.
      sft_dataset:
        total_epochs: 25
        name: SFT_data
        storage_type: file
        schema_type: sft
        path: ${oc.env:TRINITY_SFT_DATASET_PATH, /home/dataset-assist-0/wl/cyt/datasets/open-r1/mixture-of-thoughts/math}
        split: 'train'
        format:
          prompt_type: messages
          messages_key: 'messages'
# Explorer (rollout) settings: evaluation cadence and inference-engine options.
explorer:
  eval_interval: 20
  runner_per_model: 8
  rollout_model:
    # 4 engines with tensor_parallel_size 1.
    engine_num: 4
    tensor_parallel_size: 1
    enable_prefix_caching: false
    enforce_eager: true
    dtype: bfloat16
    seed: 42
    gpu_memory_utilization: 0.9
# Explorer/trainer synchronization settings: NCCL method, interval of 1.
synchronizer:
  sync_method: 'nccl'
  sync_interval: 1
  sync_timeout: 1200
# Trainer settings: checkpoint cadence, gradient clipping and batching options.
trainer:
  save_interval: 50
  grad_clip: 1.0
  use_dynamic_bsz: true
  # NOTE(review): dynamic batch sizing is enabled while the per-GPU token cap
  # below is commented out, so whatever default the framework defines applies —
  # confirm it is adequate for max_model_len = 11264.
  #ppo_max_token_len_per_gpu: 25600
  ulysses_sequence_parallel_size: 2
# Experiment tracking via Weights & Biases.
monitor:
  monitor_type: wandb
  monitor_args:
    # Never hard-code or post the W&B API key. It is read from the
    # WANDB_API_KEY environment variable; resolution fails with an explicit
    # error if the variable is unset.
    # NOTE(review): the key previously pasted in this spot is now public and
    # should be revoked and regenerated in the W&B account settings.
    api_key: ${oc.env:WANDB_API_KEY}
```
下面是截图
<img width="1649" height="515" alt="Image" src="https://github.com/user-attachments/assets/b11cca7a-2516-4b69-ae46-f637b09b56ab" />
非常感谢

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

请教一下Qwen2.5-3B-instruct用mix_chord训练，reward很低的原因 #360

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

请教一下Qwen2.5-3B-instruct用mix_chord训练，reward很低的原因 #360

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions