From c6c758a33e8a06a26387fe072c5da2c41a7b0234 Mon Sep 17 00:00:00 2001 From: kkscilife <1658148753@qq.com> Date: Fri, 27 Mar 2026 15:34:21 +0800 Subject: [PATCH 1/4] add sp size to avoid of OOM --- autotest/config.yaml | 2 +- autotest/config/qwen3_5_recompute.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index 7a2fe274e..563cb4117 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -390,7 +390,7 @@ case: assert_info: base_metric: qwen3-5-sft-recompute/625c0018/tracker.jsonl check_metrics: - grad_norm: 0.000001 + grad_norm: 0.02 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 diff --git a/autotest/config/qwen3_5_recompute.py b/autotest/config/qwen3_5_recompute.py index 4a72b4559..4cac7c514 100644 --- a/autotest/config/qwen3_5_recompute.py +++ b/autotest/config/qwen3_5_recompute.py @@ -43,6 +43,7 @@ model_cfg=moe_cfg, optim_cfg=optim_cfg, fsdp_cfg=fsdp_cfg, + sp_size=4, dataset_cfg=dataset_config, dataloader_cfg=dataloader_config, lr_cfg=lr_cfg, From 3db207a6bed30fee1ab065788145a296249a79e0 Mon Sep 17 00:00:00 2001 From: kkscilife <1658148753@qq.com> Date: Wed, 1 Apr 2026 15:43:07 +0800 Subject: [PATCH 2/4] bigger timeout because of more cases --- .github/workflows/e2e_test.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index 03febe772..0a6412921 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -29,6 +29,7 @@ jobs: ete_test: if: ${{!cancelled() }} runs-on: [h_cluster_ete] + timeout-minutes: 720 steps: - name: Clean workdir run: sudo git clean -ffdx From 92f65f6179df052ab932acef2937a57f827a771f Mon Sep 17 00:00:00 2001 From: kkscilife <1658148753@qq.com> Date: Thu, 2 Apr 2026 09:50:45 +0800 Subject: [PATCH 3/4] add more loss verification --- autotest/config.yaml | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/autotest/config.yaml b/autotest/config.yaml index 563cb4117..e7f9576c9 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -40,6 +40,9 @@ case: base_metric: qwen3-sft/20251117105949/tracker.jsonl check_metrics: grad_norm: 0.000001 + loss/maxvio: 0.000001 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 @@ -64,6 +67,9 @@ case: base_metric: qwen3-sft-ep8/cec3a8d2/tracker.jsonl check_metrics: grad_norm: 0.000001 + loss/maxvio: 0.000001 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 @@ -88,6 +94,9 @@ case: base_metric: qwen3-sft-ep8/cec3a8d2_resume/tracker.jsonl check_metrics: grad_norm: 0.000001 + loss/maxvio: 0.000001 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 @@ -109,6 +118,9 @@ case: base_metric: qwen3-sft-tp2/cec3a8d2/tracker.jsonl check_metrics: grad_norm: 0.000001 + loss/maxvio: 0.000001 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 @@ -133,6 +145,9 @@ case: base_metric: qwen3-sft-recompute/d76995/tracker.jsonl check_metrics: grad_norm: 0.000001 + loss/maxvio: 0.000001 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 @@ -159,6 +174,9 @@ case: base_metric: qwen3-sft-fp8/d76995/tracker.jsonl check_metrics: grad_norm: 0.1 + loss/maxvio: 0.000001 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 @@ -183,6 +201,9 @@ case: base_metric: qwen3-sft/20251117105949/tracker.jsonl check_metrics: grad_norm: 1 + loss/maxvio: 0.000001 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.02 lr: 0 timeout: 10800 @@ -204,6 +225,9 @@ case: base_metric: qwen3-sft-celoss/812c1021/tracker.jsonl check_metrics: grad_norm: 0.000001 + loss/maxvio: 0.000001 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 @@ -226,6 +250,9 @@ case: base_metric: qwen3-30B-sp4-intralayer2/c0eba147/tracker.jsonl check_metrics: grad_norm: 0.000001 + loss/maxvio: 0.000001 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 @@ -248,6 +275,9 @@ case: base_metric: qwen3-30B-sp8-intralayer2/c0eba147/tracker.jsonl check_metrics: grad_norm: 0.025 + loss/maxvio: 0.000001 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 @@ -270,6 +300,9 @@ case: base_metric: gptoss-sft/7b774a0e2/tracker.jsonl check_metrics: grad_norm: 0.9 + loss/maxvio: 0.000001 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.1 lr: 0 memory/max_memory_GB: 0.2 @@ -294,6 +327,9 @@ case: base_metric: qwen3-sft-cache/e968368a/tracker.jsonl check_metrics: grad_norm: 0.000001 + loss/maxvio: 0.000001 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 @@ -317,6 +353,9 @@ case: base_metric: qwen3-sft-vl-dense/812c1021/tracker.jsonl check_metrics: grad_norm: 0.000001 + loss/maxvio: 0.000001 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 From 75112f256fc551db20814890a9dfbe6be4f03061 Mon Sep 17 00:00:00 2001 From: kkscilife <1658148753@qq.com> Date: Thu, 2 Apr 2026 09:55:56 +0800 Subject: [PATCH 4/4] add more loss verification --- autotest/config.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/autotest/config.yaml b/autotest/config.yaml index e7f9576c9..a8563d3be 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -380,6 +380,8 @@ case: base_metric: qwen3-5-sft-vl-moe/e968368a/tracker.jsonl check_metrics: grad_norm: 5 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 5 lr: 0 memory/max_memory_GB: 0.2 @@ -406,6 +408,8 @@ case: base_metric: qwen3-5-sft-fp8/625c0018/tracker.jsonl check_metrics: grad_norm: 0.1 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 @@ -430,6 +434,8 @@ case: base_metric: qwen3-5-sft-recompute/625c0018/tracker.jsonl check_metrics: grad_norm: 0.02 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 @@ -452,6 +458,8 @@ case: base_metric: qwen3-5-sft-tp2/625c0018/tracker.jsonl check_metrics: grad_norm: 0.05 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 @@ -474,6 +482,8 @@ case: base_metric: qwen3-5-sft-sp4-resume/625c0018/tracker.jsonl check_metrics: grad_norm: 0.02 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 @@ -496,6 +506,8 @@ case: base_metric: qwen3-5-sft-sp4-resume/625c0018_resume/tracker.jsonl check_metrics: grad_norm: 0.02 + loss/local_loss: 0.000001 + loss/reduced_balancing_loss: 0.000001 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2