From 3c9f5c4953dae7ebb6bb888e52e63c6b8f1b1c68 Mon Sep 17 00:00:00 2001 From: NuojCheng Date: Mon, 15 Jun 2026 17:21:56 +0000 Subject: [PATCH] add ep as dp --- .../optimization/custom_mesh_and_rule.md | 4 + src/maxtext/common/common_types.py | 1 + .../configs/custom_mesh_and_rule/ep-as-dp.yml | 83 + tests/utils/sharding_dump.py | 7 + .../input_shardings.json | 178 ++ .../logical_shardings.json | 980 ++++++++ .../named_shardings.json | 2228 +++++++++++++++++ 7 files changed, 3481 insertions(+) create mode 100644 src/maxtext/configs/custom_mesh_and_rule/ep-as-dp.yml create mode 100644 tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/rule_ep-as-dp_ici_fsdp_parallelism=-1_ici_expert_parallelism=2_use_ring_of_experts=true/input_shardings.json create mode 100644 tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/rule_ep-as-dp_ici_fsdp_parallelism=-1_ici_expert_parallelism=2_use_ring_of_experts=true/logical_shardings.json create mode 100644 tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/rule_ep-as-dp_ici_fsdp_parallelism=-1_ici_expert_parallelism=2_use_ring_of_experts=true/named_shardings.json diff --git a/docs/guides/optimization/custom_mesh_and_rule.md b/docs/guides/optimization/custom_mesh_and_rule.md index 605d77e160..a2d9d1dfc7 100644 --- a/docs/guides/optimization/custom_mesh_and_rule.md +++ b/docs/guides/optimization/custom_mesh_and_rule.md @@ -51,6 +51,10 @@ This rule utilizes the `data`, `stage`, `fsdp`, and `expert` axes. Its defining Similar in philosophy to `ep-as-cp.yml`, this configuration explicitly includes the `context` axis in the mesh layout alongside `data`, `stage`, `fsdp`, and `expert`. While context sharding is mapped to the `context` axis globally, within MoE components, this `context` axis dynamically shifts to perform expert parallelism instead of FSDP. This custom rule supports using CP and EP together. 
+### `ep-as-dp.yml` + +Unlike the rule in `base.yml`, this rule configures the `expert` physical axis to function as data parallelism rather than FSDP. This removes the constraint that the combined FSDPxEP sharding degree is limited by specific model dimensions, particularly for small tensors such as attention projections. Ultimately, this change benefits large-scale training. + ### `pipeline-large-moe.yml` Designed specifically to optimize pipeline parallelism for extremely large-scale MoE jobs (such as DeepSeek models). It defines the physical axes: `data`, `stage`, `fsdp`, `tensor`, `context`, and `expert`. To prevent dimension limit errors, it intentionally disables expert weight sharding on the (typically small) `q_lora` dimension. Furthermore, tensor and expert parallelism are strictly preserved to support advanced pipelining features like `pipeline_fsdp_ag_one` and `pipeline_fsdp_ag_per_repeat`. diff --git a/src/maxtext/common/common_types.py b/src/maxtext/common/common_types.py index d4b52207fc..344b85a2f8 100644 --- a/src/maxtext/common/common_types.py +++ b/src/maxtext/common/common_types.py @@ -152,3 +152,4 @@ class CustomRule(enum.Enum): EP_AS_CP = "ep-as-cp" # Support EP only PIPELINE_LARGE_MOE = "pipeline-large-moe" FSDP_2D = "2d-fsdp" + EP_AS_DP = "ep-as-dp" diff --git a/src/maxtext/configs/custom_mesh_and_rule/ep-as-dp.yml b/src/maxtext/configs/custom_mesh_and_rule/ep-as-dp.yml new file mode 100644 index 0000000000..40e54ed1d3 --- /dev/null +++ b/src/maxtext/configs/custom_mesh_and_rule/ep-as-dp.yml @@ -0,0 +1,83 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This rule uses the data, fsdp, fsdp_transpose and expert mesh axes. The expert axis acts as +# data parallelism in all components except the core MoE part (between the EP all-to-alls). + +mesh_axes: ['data', 'fsdp', 'fsdp_transpose', 'expert'] +data_sharding: [['data', 'fsdp', 'fsdp_transpose', 'expert']] +logical_axis_rules: [ + # ========================================== + # Vocabulary Embedding + # ========================================== + # Vocab Activations + ['activation_embed_and_logits_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']], + ['activation_embed_and_logits_batch_sequence', ['data', 'fsdp', 'fsdp_transpose', 'expert']], + ['activation_vocab', []], + # Vocab Weights + ['vocab', []], + ['embed_vocab', ['fsdp', 'fsdp_transpose']], + # ========================================== + # Attention + # ========================================== + # Attention Activations + ['activation_batch_attn', ['data', 'fsdp', 'fsdp_transpose', 'expert']], + ['activation_heads', []], + ['activation_kv_heads', []], + ['activation_length_attn', ['context']], + ['activation_q_length', ['context']], + ['activation_kv_length', []], + ['activation_embed_attn', ['tensor', 'tensor_transpose']], + ['activation_kv', []], + ['activation_kv_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']], + ['activation_kv_head_dim', []], + # Attention Weights + ['q_lora', ['fsdp']], + ["q_lora_up_proj", []], + ['kv_lora', ['fsdp']], + ["kv_lora_up_proj", []], + # ========================================== + # Mixture of Experts (MoE) + # ========================================== + # MoE Activations + 
['activation_batch_moe', ['data', 'fsdp', 'fsdp_transpose', 'expert']], + ['activation_length_moe', []], + ['activation_norm_length_moe', []], + ['activation_embed_moe', []], + ['activation_mlp_moe', []], + ['activation_exp', ['expert']], + # MoE Weights + ['exp', 'expert'], + ['mlp_moe', ['fsdp_transpose']], + ['embed_moe', ['fsdp']], + # ========================================== + # Standard MLP / Dense Layers / Model Structure + # ========================================== + # Dense Activations + ['activation_mlp', []], + # Note activation batch and length also get used in vocab + ['activation_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']], + ['activation_length', []], + ['activation_norm_length', []], + ['activation_embed', []], + ['activation_stage', []], + # General Weights + ['mlp', ['fsdp_transpose']], + ['embed', ['fsdp', 'fsdp_transpose']], + ['embed', ['fsdp']], + # ========================================== + # Deprecated / Scheduled for Removal + # ========================================== + ['exp_with_fsdp', 'fsdp'], + ] diff --git a/tests/utils/sharding_dump.py b/tests/utils/sharding_dump.py index 7c8007b63f..c4e149669b 100644 --- a/tests/utils/sharding_dump.py +++ b/tests/utils/sharding_dump.py @@ -69,6 +69,13 @@ "2d-fsdp", ("ici_fsdp_parallelism=-1", "ici_fsdp_transpose_parallelism=2"), ), + ( + "deepseek2-16b", + "tpu7x-16", + 1, + "ep-as-dp", + ("ici_fsdp_parallelism=-1", "ici_expert_parallelism=2", "use_ring_of_experts=true"), + ), ("qwen3-0.6b", "tpu7x-16", 1, "", ()), ("gpt-oss-20b", "tpu7x-16", 1, "", ()), ("gpt-oss-20b", "tpu7x-16", 1, "", ("ici_fsdp_parallelism=-1", "ici_expert_parallelism=2")), diff --git a/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/rule_ep-as-dp_ici_fsdp_parallelism=-1_ici_expert_parallelism=2_use_ring_of_experts=true/input_shardings.json 
b/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/rule_ep-as-dp_ici_fsdp_parallelism=-1_ici_expert_parallelism=2_use_ring_of_experts=true/input_shardings.json new file mode 100644 index 0000000000..f556d60c89 --- /dev/null +++ b/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/rule_ep-as-dp_ici_fsdp_parallelism=-1_ici_expert_parallelism=2_use_ring_of_experts=true/input_shardings.json @@ -0,0 +1,178 @@ +{ + "Activation Sharding Dump": [ + { + "deepseek/inputs: bfloat16[192,2048,2048]": { + "logic_axes": "('activation_batch', 'activation_norm_length', 'activation_embed')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None)" + } + }, + { + "deepseek/pre_attention_norm: bfloat16[192,2048,2048]": { + "logic_axes": "('activation_batch', 'activation_norm_length', 'activation_embed')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None)" + } + }, + { + "attention_mla/inputs_q: bfloat16[192,2048,2048]": { + "logic_axes": "('activation_batch_attn', 'activation_length', 'activation_embed')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None)" + } + }, + { + "attention_mla/inputs_kv: bfloat16[192,2048,2048]": { + "logic_axes": "('activation_batch_attn', 'activation_length', 'activation_embed')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None)" + } + }, + { + "attention_mla/q_nope: bfloat16[192,2048,16,128]": { + "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None, None)" + } + }, + { + "attention_mla/q_pe: bfloat16[192,2048,16,64]": { + "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None, None)" + } + }, + { + "attention_mla/query: bfloat16[192,2048,16,192]": { + "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None, 
None)" + } + }, + { + "attention_mla/key_nope: bfloat16[192,2048,16,128]": { + "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None, None)" + } + }, + { + "attention_mla/key_rope: bfloat16[192,2048,16,64]": { + "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None, None)" + } + }, + { + "attention_mla/key: bfloat16[192,2048,16,192]": { + "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None, None)" + } + }, + { + "attention_mla/value: bfloat16[192,2048,16,128]": { + "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None, None)" + } + }, + { + "attention_op/arr: int8[1,4,4]": { + "logic_axes": "Unknown", + "PartitionSpec": "P(None, None)" + } + }, + { + "attention_op/arr: int32[2048]": { + "logic_axes": "Unknown", + "PartitionSpec": "P(None,)" + } + }, + { + "attention_op/query: bfloat16[192,16,2048,192]": { + "logic_axes": "Unknown", + "PartitionSpec": "P(('fsdp', 'expert'), None, None, None)" + } + }, + { + "attention_op/key: bfloat16[192,16,2048,192]": { + "logic_axes": "Unknown", + "PartitionSpec": "P(('fsdp', 'expert'), None, None, None)" + } + }, + { + "attention_op/value: bfloat16[192,16,2048,128]": { + "logic_axes": "Unknown", + "PartitionSpec": "P(('fsdp', 'expert'), None, None, None)" + } + }, + { + "attention_mla/out: bfloat16[192,2048,16,128]": { + "logic_axes": "('activation_batch_attn', 'activation_length', 'activation_heads', 'activation_kv')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None, None)" + } + }, + { + "deepseek/attention_result: bfloat16[192,2048,2048]": { + "logic_axes": "('activation_batch', 
'activation_norm_length', 'activation_embed')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None)" + } + }, + { + "deepseek/post_attention_norm: bfloat16[192,2048,2048]": { + "logic_axes": "('activation_batch', 'activation_norm_length', 'activation_embed')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None)" + } + }, + { + "linears/x: bfloat16[192,2048,10944]": { + "logic_axes": "('activation_batch', 'activation_length', 'activation_mlp')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None)" + } + }, + { + "deepseek/mlp: bfloat16[192,2048,2048]": { + "logic_axes": "('activation_batch', 'activation_norm_length', 'activation_embed')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None)" + } + }, + { + "deepseek/x: bfloat16[192,2048,2048]": { + "logic_axes": "('activation_batch', 'activation_norm_length', 'activation_embed')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None)" + } + }, + { + "moe/inputs: bfloat16[192,2048,2048]": { + "logic_axes": "('activation_batch', 'activation_norm_length', None)", + "PartitionSpec": "P(('fsdp', 'expert'), None, None)" + } + }, + { + "moe/gate_logits: bfloat16[192,2048,64]": { + "logic_axes": "('activation_batch', 'activation_norm_length', None)", + "PartitionSpec": "P(('fsdp', 'expert'), None, None)" + } + }, + { + "moe/w0_kernel: bfloat16[64,2048,1408]": { + "logic_axes": "Unknown", + "PartitionSpec": "P('expert', None, None)" + } + }, + { + "moe/w1_kernel: bfloat16[64,2048,1408]": { + "logic_axes": "Unknown", + "PartitionSpec": "P('expert', None, None)" + } + }, + { + "moe/wo_kernel: bfloat16[64,1408,2048]": { + "logic_axes": "Unknown", + "PartitionSpec": "P('expert', None, None)" + } + }, + { + "linears/x: bfloat16[192,2048,2816]": { + "logic_axes": "('activation_batch', 'activation_length', 'activation_mlp')", + "PartitionSpec": "P(('fsdp', 'expert'), None, None)" + } + }, + { + "deepseek/mlp_lnx: bfloat16[192,2048,2048]": { + "logic_axes": "('activation_batch', 'activation_norm_length', 'activation_embed')", 
+ "PartitionSpec": "P(('fsdp', 'expert'), None, None)" + } + } + ] +} \ No newline at end of file diff --git a/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/rule_ep-as-dp_ici_fsdp_parallelism=-1_ici_expert_parallelism=2_use_ring_of_experts=true/logical_shardings.json b/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/rule_ep-as-dp_ici_fsdp_parallelism=-1_ici_expert_parallelism=2_use_ring_of_experts=true/logical_shardings.json new file mode 100644 index 0000000000..8d30b919f8 --- /dev/null +++ b/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/rule_ep-as-dp_ici_fsdp_parallelism=-1_ici_expert_parallelism=2_use_ring_of_experts=true/logical_shardings.json @@ -0,0 +1,980 @@ +{ + ".step": { + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ 
+ 512, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed_vocab", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed_moe", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_moe", + "mlp_moe" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_moe", + "mlp_moe" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp_moe", + "embed_moe" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + 
".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + 
"kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed_vocab" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.count": { + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + 
"dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed_vocab", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed_moe", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_moe", + "mlp_moe" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_moe", + "mlp_moe" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp_moe", + "embed_moe" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + 
".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed_vocab" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "partition_spec": [ + "norm" + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "mlp" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "dense_layers", + "embed" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "dense_layers" + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "dense_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "dense_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "dense_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "partition_spec": [ + "embed_vocab", + "vocab" + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "partition_spec": [ + "embed_moe", + "moe_layers", + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_moe", + "mlp_moe" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "partition_spec": [ + "exp", + "moe_layers", + "embed_moe", + "mlp_moe" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "partition_spec": [ + "exp", + "moe_layers", + "mlp_moe", + "embed_moe" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "mlp" + ], + 
"shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "partition_spec": [ + "mlp", + "moe_layers", + "embed" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "partition_spec": [ + "norm", + "moe_layers" + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "partition_spec": [ + "heads", + "moe_layers", + "kv", + "embed" + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "q_heads", + "kv" + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "partition_spec": [ + "embed", + "moe_layers", + "kv_lora_up_proj" + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "partition_spec": [ + "kv_lora", + "moe_layers", + "kv_heads", + "kv_head_dim" + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "partition_spec": [ + "vocab", + "embed_vocab" + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[2]/.count": { + "partition_spec": [], + "shape": [] + } +} \ No 
newline at end of file diff --git a/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/rule_ep-as-dp_ici_fsdp_parallelism=-1_ici_expert_parallelism=2_use_ring_of_experts=true/named_shardings.json b/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/rule_ep-as-dp_ici_fsdp_parallelism=-1_ici_expert_parallelism=2_use_ring_of_experts=true/named_shardings.json new file mode 100644 index 0000000000..a19c528418 --- /dev/null +++ b/tests/utils/sharding_info/deepseek2-16b/tpu7x-16/slice_1/rule_ep-as-dp_ici_fsdp_parallelism=-1_ici_expert_parallelism=2_use_ring_of_experts=true/named_shardings.json @@ -0,0 +1,2228 @@ +{ + ".step": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [], + "shape": [] + }, + ".params/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null + ], + "shape": [ + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + "fsdp_transpose" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + "fsdp_transpose" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + 
"fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp_transpose", + null, + "fsdp" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 512, + 1 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null, + null, + [ + "fsdp", + "fsdp_transpose" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose" + ], + 
null, + null, + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".params/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + null, + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".params/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose" + ], + null + ], + "shape": [ + 2048, + 102400 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "expert", + null, + "fsdp", + "fsdp_transpose" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + 
".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "expert", + null, + "fsdp", + "fsdp_transpose" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "expert", + null, + "fsdp_transpose", + "fsdp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + "fsdp_transpose" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + "fsdp_transpose" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp_transpose", + null, + "fsdp" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + 
".params/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 512, + 26 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null, + null, + [ + "fsdp", + "fsdp_transpose" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose" + ], + null, + null, + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + 
"expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".params/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + null, + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".params/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + [ + "fsdp", + "fsdp_transpose" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.count": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [], + "shape": [] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + "fsdp_transpose" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + 
"fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + "fsdp_transpose" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp_transpose", + null, + "fsdp" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null, + null, + [ 
+ "fsdp", + "fsdp_transpose" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose" + ], + null, + null, + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + null, + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose" + ], + null + ], + "shape": [ + 2048, + 102400 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + null + ], + "shape": [ + 
2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "expert", + null, + "fsdp", + "fsdp_transpose" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "expert", + null, + "fsdp", + "fsdp_transpose" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "expert", + null, + "fsdp_transpose", + "fsdp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + "fsdp_transpose" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + 
"fsdp_transpose" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp_transpose", + null, + "fsdp" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null, + null, + [ + "fsdp", + "fsdp_transpose" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + 
".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose" + ], + null, + null, + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.mu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + null, + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.mu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + [ + "fsdp", + "fsdp_transpose" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['decoder_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null + ], + "shape": [ + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + 
"fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + "fsdp_transpose" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + "fsdp_transpose" + ], + "shape": [ + 2048, + 1, + 10944 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['mlp']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp_transpose", + null, + "fsdp" + ], + "shape": [ + 10944, + 1, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 2048, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + 
], + "shape": [ + 512, + 1 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null, + null, + [ + "fsdp", + "fsdp_transpose" + ] + ], + "shape": [ + 16, + 1, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose" + ], + null, + null, + null + ], + "shape": [ + 2048, + 1, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose" + ], + null, + null + ], + "shape": [ + 2048, + 1, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['dense_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + null, + null + ], + "shape": [ + 512, + 1, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['logits_dense']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose" + ], + null + ], + "shape": [ + 2048, + 102400 + ] + }, + 
".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['gate']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + null + ], + "shape": [ + 2048, + 26, + 64 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_0']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "expert", + null, + "fsdp", + "fsdp_transpose" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wi_1']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "expert", + null, + "fsdp", + "fsdp_transpose" + ], + "shape": [ + 64, + 26, + 2048, + 1408 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['MoeBlock_0']/['wo']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "expert", + null, + "fsdp_transpose", + "fsdp" + ], + "shape": [ + 64, + 26, + 1408, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_0']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + "fsdp_transpose" + ], + "shape": [ + 2048, + 26, + 2816 + 
] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wi_1']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + "fsdp_transpose" + ], + "shape": [ + 2048, + 26, + 2816 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['DeepSeekMoeBlock_0']/['shared_experts']/['wo']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp_transpose", + null, + "fsdp" + ], + "shape": [ + 2816, + 26, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['post_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['pre_self_attention_layer_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 2048, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['kv_norm']/['scale']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null + ], + "shape": [ + 512, + 26 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['out']/['kernel']": { + "mesh": { + 
"axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + null, + null, + [ + "fsdp", + "fsdp_transpose" + ] + ], + "shape": [ + 16, + 26, + 128, + 2048 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['query']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose" + ], + null, + null, + null + ], + "shape": [ + 2048, + 26, + 16, + 192 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_a']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + [ + "fsdp", + "fsdp_transpose" + ], + null, + null + ], + "shape": [ + 2048, + 26, + 576 + ] + }, + ".opt_state/[0]/.nu/['params']/['decoder']/['moe_layers']/['self_attention']/['wkv_b']/['kernel']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + "fsdp", + null, + null, + null + ], + "shape": [ + 512, + 26, + 16, + 256 + ] + }, + ".opt_state/[0]/.nu/['params']/['token_embedder']/['embedding']": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + "fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [ + null, + [ + "fsdp", + "fsdp_transpose" + ] + ], + "shape": [ + 102400, + 2048 + ] + }, + ".opt_state/[2]/.count": { + "mesh": { + "axis_names": [ + "data", + "fsdp", + "fsdp_transpose", + "expert" + ], + "shape": { + "data": 1, + "fsdp": 8, + 
"fsdp_transpose": 1, + "expert": 2 + } + }, + "partition_spec": [], + "shape": [] + } +} \ No newline at end of file