25 commits
f4b26a7
fleet glm lora
xiaoguoguo626807 Nov 25, 2025
26178fb
fleet glm lora
xiaoguoguo626807 Nov 25, 2025
2244951
revert lora layer
xiaoguoguo626807 Nov 25, 2025
9aff9f9
revert fleet
xiaoguoguo626807 Nov 25, 2025
9e663fc
Update trainer.py
xiaoguoguo626807 Nov 25, 2025
25b7208
fix conflict
xiaoguoguo626807 Dec 4, 2025
9c88c04
Merge branch 'develop', commit 'refs/pull/3003/head' of https://githu…
xiaoguoguo626807 Dec 5, 2025
e7bd4ce
support fleet save_pretrained from_pretrained
changeyoung98 Dec 5, 2025
1d066f1
fix
xiaoguoguo626807 Dec 5, 2025
b052ca7
add use_fleet flage
xiaoguoguo626807 Dec 5, 2025
716d68c
Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleForme…
xiaoguoguo626807 Dec 5, 2025
c684ea5
fix trainargs
xiaoguoguo626807 Dec 5, 2025
8344c7e
temp test
xiaoguoguo626807 Dec 5, 2025
139b345
Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleForme…
xiaoguoguo626807 Dec 5, 2025
474c4fe
Merge commit 'refs/pull/3003/head' of https://github.com/PaddlePaddle…
xiaoguoguo626807 Dec 5, 2025
6ba4ffe
recover
xiaoguoguo626807 Dec 5, 2025
ce4f9e5
merge develop
changeyoung98 Dec 5, 2025
9b654ac
Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleForme…
xiaoguoguo626807 Dec 5, 2025
76c795a
Merge commit 'refs/pull/3003/head' of https://github.com/PaddlePaddle…
xiaoguoguo626807 Dec 5, 2025
a526db0
fix fuse
changeyoung98 Dec 5, 2025
757c03c
Merge commit 'refs/pull/3109/head' of https://github.com/PaddlePaddle…
xiaoguoguo626807 Dec 5, 2025
85bffce
Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleForme…
xiaoguoguo626807 Dec 8, 2025
8d96e86
update fleet pt use cli
xiaoguoguo626807 Dec 8, 2025
0412ac2
Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleForme…
xiaoguoguo626807 Dec 8, 2025
ef42bb9
add pt yaml
xiaoguoguo626807 Dec 8, 2025
5 changes: 3 additions & 2 deletions examples/experiments/paddlefleet/glm45_provider.py
@@ -21,9 +21,10 @@

import paddle
import paddle.nn.functional as F
from gpt_provider import GPTModelProvider
from paddlefleet.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec

from paddleformers.transformers.gpt_provider import GPTModelProvider

if TYPE_CHECKING:
from paddlefleet.spec_utils import LayerSpec

@@ -147,7 +148,7 @@ class GLM45AirModelDebugProvider(GLM45AirModelProvider106B):
num_nextn_predict_layers: Optional[int] = 0
use_bias: bool = False
vocab_size: int = 37888
sequence_parallel: bool = True
sequence_parallel: bool = True
expert_model_parallel_size: int = 16
tensor_model_parallel_size: int = 4

57 changes: 57 additions & 0 deletions examples/experiments/paddlefleet/pt_full.yaml
@@ -0,0 +1,57 @@
### data
dataset_type: "pretrain"
input_dir: "1.0 ../data/pre-training/llama_openwebtext_100k"
split: "998,1,1"
max_seq_len: 8192
mix_strategy: concat

### model
model_name_or_path: ../zai-org/GLM-4.5-Air
# attn_impl: flashmask

### finetuning
# base
stage: PT
fine_tuning: full
seed: 23
do_train: true
do_eval: true
per_device_eval_batch_size: 1
per_device_train_batch_size: 2
num_train_epochs: 1
max_steps: 50
eval_steps: 100
eval_iters: 100
evaluation_strategy: steps
save_steps: 100
save_strategy: steps
logging_steps: 1
gradient_accumulation_steps: 1
logging_dir: ./4_layer_tp4_ep8_pp4_sp2_glm_vdl_log
output_dir: ./checkpoints/4layer_tp4_ep8_pp4_sp2_glm_pretrain_ckpts
disable_tqdm: true
eval_accumulation_steps: 16

# train
warmup_steps: 20
learning_rate: 1.0e-5

# performance
fuse_attention_qkv: true
fuse_attention_ffn: true
tensor_parallel_degree: 2
pipeline_parallel_degree: 1
use_expert_parallel: true
expert_parallel_degree: 4
sequence_parallel: true
sharding_parallel_config: split_param
amp_master_grad: true
sharding: stage1
recompute: false
bf16: true
fp16_opt_level: O2
load_via_cpu: true
save_to_hf: false
save_checkpoint_format: "flex_checkpoint"
load_checkpoint_format: "flex_checkpoint"
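This multi-GPU config is the file consumed by the (currently commented-out) `paddleformers-cli train ./examples/experiments/paddlefleet/pt_full.yaml` line in run_glm45.sh further down. As a quick pre-launch sanity check, the parallelism-related keys can be inspected with a minimal sketch; it assumes PyYAML is installed and that the repo-relative path below matches your checkout.

```python
# Minimal sketch (assumes PyYAML and the repo-relative path below).
# Prints the parallelism settings from pt_full.yaml so they can be
# checked against the number of available GPUs before launching.
import yaml

with open("examples/experiments/paddlefleet/pt_full.yaml") as f:
    cfg = yaml.safe_load(f)

for key in ("tensor_parallel_degree", "pipeline_parallel_degree",
            "expert_parallel_degree", "use_expert_parallel",
            "sequence_parallel", "sharding"):
    print(f"{key}: {cfg.get(key)}")
```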

61 changes: 61 additions & 0 deletions examples/experiments/paddlefleet/pt_single_full.yaml
@@ -0,0 +1,61 @@
### data
dataset_type: "pretrain"
input_dir: "1.0 ../data/pre-training/llama_openwebtext_100k"
split: "998,1,1"
max_seq_len: 8192
mix_strategy: concat

### model
model_name_or_path: ../zai-org/GLM-4.5-Air
# attn_impl: flashmask

### finetuning
# base
stage: PT
fine_tuning: full
seed: 23
do_train: true
do_eval: true
per_device_eval_batch_size: 1
per_device_train_batch_size: 1
num_train_epochs: 1
max_steps: 10
eval_steps: 100
eval_iters: 100
evaluation_strategy: steps
save_steps: 100
save_strategy: steps
logging_steps: 1
gradient_accumulation_steps: 1
logging_dir: ./4_layer_single_card_glm_vdl_log
output_dir: ./checkpoints/4_layer_single_card_glm_pretrain_ckpts
disable_tqdm: true
eval_accumulation_steps: 16

# train
warmup_steps: 0
learning_rate: 1.0e-5

# special
max_grad_norm: 0
min_learning_rate: 0

# performance
fuse_attention_qkv: true
fuse_attention_ffn: true
tensor_parallel_degree: 1
pipeline_parallel_degree: 1
use_expert_parallel: false
sequence_parallel: false
sharding_parallel_config: split_param
amp_master_grad: true
sharding: stage1
recompute: false
bf16: true
fp16_opt_level: O2
load_via_cpu: true
save_to_hf: false
#unified_checkpoint: true
save_checkpoint_format: "flex_checkpoint"
load_checkpoint_format: "flex_checkpoint"
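pt_single_full.yaml is the single-card variant driven by the active `paddleformers-cli train` line in run_glm45.sh below; it differs from pt_full.yaml mainly in the parallelism degrees, step counts, and logging/output paths. A small sketch (same PyYAML and path assumptions as above) lists exactly which keys differ between the two files:

```python
# Sketch: compare the two configs key by key (assumes PyYAML and that
# both files sit under examples/experiments/paddlefleet/).
import yaml


def load(path):
    with open(path) as f:
        return yaml.safe_load(f)


full = load("examples/experiments/paddlefleet/pt_full.yaml")
single = load("examples/experiments/paddlefleet/pt_single_full.yaml")

for key in sorted(set(full) | set(single)):
    if full.get(key) != single.get(key):
        print(f"{key}: full={full.get(key)!r}  single={single.get(key)!r}")
```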

7 changes: 4 additions & 3 deletions examples/experiments/paddlefleet/qwen_provider.py
@@ -16,14 +16,15 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.

import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Callable, List, Optional, Union
from dataclasses import dataclass
from typing import TYPE_CHECKING, Callable, Optional, Union

import paddle
import paddle.nn.functional as F
from gpt_provider import GPTModelProvider
from paddlefleet.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec

from paddleformers.transformers.gpt_provider import GPTModelProvider

if TYPE_CHECKING:
from paddlefleet.spec_utils import LayerSpec

26 changes: 17 additions & 9 deletions examples/experiments/paddlefleet/run_glm45.sh
@@ -13,7 +13,7 @@
# limitations under the License.

START_RANK=0 # change to the machine rank that actually runs the job
END_RANK=2 # change to the machine rank that actually runs the job
END_RANK=1 # change to the machine rank that actually runs the job

if [[ $rank -lt $START_RANK ]]; then
exit 0
@@ -33,11 +33,19 @@ rank=$(($rank-$START_RANK))
# Compute in standard FP32 format to improve precision
export NVIDIA_TF32_OVERRIDE=0

python -m paddle.distributed.launch \
--log_dir ./outputs/output_$rank/paddle_distributed_logs \
--master $master:$port \
--nnodes $nnodes \
--rank $rank \
--run_mode=collective \
run_pretrain.py glm45.json \
--output_dir . # 改成自己的保存模型目录
root_path="/root/paddlejob/share-storage/gpfs/system-public/wangruting/wangruting"
export PYTHONPATH=$root_path/PaddleFleet/src:$root_path/PaddleFormers/examples/experiments/paddlefleet # change to your own paddlefleet path
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# paddleformers-cli train ./examples/experiments/paddlefleet/pt_full.yaml

export CUDA_VISIBLE_DEVICES=0
paddleformers-cli train ./examples/experiments/paddlefleet/pt_single_full.yaml

# python -m paddle.distributed.launch \
# --log_dir ./outputs/output_$rank/paddle_distributed_logs \
# --master $master:$port \
# --nnodes $nnodes \
# --rank $rank \
# --run_mode=collective \
# ./examples/experiments/paddlefleet/run_pretrain.py ./examples/experiments/paddlefleet/glm45_single_card.json \
# --output_dir . # change to your own model save directory
6 changes: 4 additions & 2 deletions paddleformers/transformers/glm4_moe/modeling.py
@@ -66,6 +66,8 @@ class GLMMoEModelProvider(GPTModelProvider):

bias_activation_fusion: bool = True

transform_rules = {"tensor_parallel_degree": "tensor_model_parallel_size", "dtype": "params_dtype"}


def eager_attention_forward(
module: nn.Layer,
@@ -1493,7 +1495,7 @@ def forward(
)


class Glm4MoeForCausalLMFleet(Glm4MoePreTrainedModel):
class Glm4MoeForCausalLM(Glm4MoePreTrainedModel):
is_fleet = True

def __new__(cls, config):
@@ -1506,7 +1508,7 @@ def __new__(cls, config):
return gpt_model


class Glm4MoeForCausalLM(Glm4MoePreTrainedModel):
class Glm4MoeForCausalLMFleet(Glm4MoePreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
_tp_plan = {"lm_head": "colwise_rep"}
_pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
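These hunks swap which class name carries the Fleet-backed implementation: the class with `is_fleet = True`, whose `__new__` builds and returns a provider-constructed GPT model, and the eager class with `_tied_weights_keys`/`_tp_plan`/`_pp_plan` exchange the `Glm4MoeForCausalLM` and `Glm4MoeForCausalLMFleet` names, and `GLMMoEModelProvider` gains a `transform_rules` mapping from trainer-style config names to provider field names. The sketch below illustrates that `__new__`-dispatch-plus-`transform_rules` pattern using purely hypothetical stand-in classes; it does not use the real PaddleFleet or PaddleFormers APIs.

```python
# Illustrative sketch only: hypothetical stand-ins mirroring the shape of
# the change above (is_fleet flag, transform_rules, __new__ returning a
# provider-built model).  None of these names are real PaddleFleet APIs.
from dataclasses import dataclass, fields


@dataclass
class FakeProvider:
    # Provider-side field names (mirrors GLMMoEModelProvider's side of transform_rules).
    tensor_model_parallel_size: int = 1
    params_dtype: str = "bfloat16"

    def provide(self):
        # In the real code this would build and return a paddlefleet GPT model.
        return f"GPTModel(tp={self.tensor_model_parallel_size}, dtype={self.params_dtype})"


# Maps trainer/config-style names to provider-style names, like transform_rules above.
TRANSFORM_RULES = {"tensor_parallel_degree": "tensor_model_parallel_size", "dtype": "params_dtype"}


class FakeForCausalLM:
    is_fleet = True

    def __new__(cls, config: dict):
        # Translate config keys into provider keys, then return the
        # provider-built model instead of an instance of this class.
        provider_fields = {f.name for f in fields(FakeProvider)}
        kwargs = {
            TRANSFORM_RULES.get(k, k): v
            for k, v in config.items()
            if TRANSFORM_RULES.get(k, k) in provider_fields
        }
        return FakeProvider(**kwargs).provide()


print(FakeForCausalLM({"tensor_parallel_degree": 4, "dtype": "bfloat16"}))
# -> GPTModel(tp=4, dtype=bfloat16)
```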