|
@@ -2,7 +2,7 @@
|
|
"cells": [
|
|
"cells": [
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "prerequisite-disaster",
|
|
|
|
|
|
+ "id": "selected-material",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"# Understanding Megatron-LM's core - MPU\n",
|
|
"# Understanding Megatron-LM's core - MPU\n",
|
|
@@ -26,7 +26,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "peaceful-article",
|
|
|
|
|
|
+ "id": "renewable-simon",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"---------------------------------------------------------------------------\n",
|
|
"---------------------------------------------------------------------------\n",
|
|
@@ -74,7 +74,7 @@
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"execution_count": 1,
|
|
- "id": "reflected-israeli",
|
|
|
|
|
|
+ "id": "greek-simpson",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
@@ -154,7 +154,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "moving-strip",
|
|
|
|
|
|
+ "id": "transparent-myanmar",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"---\n",
|
|
"---\n",
|
|
@@ -175,7 +175,7 @@
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"execution_count": 2,
|
|
- "id": "promotional-stack",
|
|
|
|
|
|
+ "id": "physical-lightweight",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [
|
|
"outputs": [
|
|
{
|
|
{
|
|
@@ -215,7 +215,7 @@
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"execution_count": 3,
|
|
- "id": "loose-haven",
|
|
|
|
|
|
+ "id": "confidential-mills",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [
|
|
"outputs": [
|
|
{
|
|
{
|
|
@@ -257,7 +257,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "nearby-immunology",
|
|
|
|
|
|
+ "id": "designed-guidance",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"----------------------------------------------------------------------\n",
|
|
"----------------------------------------------------------------------\n",
|
|
@@ -268,8 +268,8 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 5,
|
|
|
|
- "id": "consolidated-operation",
|
|
|
|
|
|
+ "execution_count": 3,
|
|
|
|
+ "id": "organized-orange",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
@@ -282,14 +282,14 @@
|
|
"import random\n",
|
|
"import random\n",
|
|
"from megatron import *\n",
|
|
"from megatron import *\n",
|
|
"from megatron.mpu.tests import *\n",
|
|
"from megatron.mpu.tests import *\n",
|
|
- "tensor_model_parallel_size=4\n",
|
|
|
|
|
|
+ "\n",
|
|
"from megatron.mpu.utils import *"
|
|
"from megatron.mpu.utils import *"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 6,
|
|
|
|
- "id": "expressed-builder",
|
|
|
|
|
|
+ "execution_count": 4,
|
|
|
|
+ "id": "compound-morning",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
@@ -391,8 +391,8 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 7,
|
|
|
|
- "id": "elect-detail",
|
|
|
|
|
|
+ "execution_count": 5,
|
|
|
|
+ "id": "editorial-refund",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
@@ -444,10 +444,9 @@
|
|
" print(\"per_partition_per_stride_size \",per_partition_per_stride_size)\n",
|
|
" print(\"per_partition_per_stride_size \",per_partition_per_stride_size)\n",
|
|
" weight_list = torch.split(master_weight, per_partition_per_stride_size,\n",
|
|
" weight_list = torch.split(master_weight, per_partition_per_stride_size,\n",
|
|
" dim=partition_dim)\n",
|
|
" dim=partition_dim)\n",
|
|
- " \n",
|
|
|
|
- " #print(\"weight_list\", [wl.size() for wl in weight_list] , len(weight_list))\n",
|
|
|
|
- " #print(\"----\"*5)\n",
|
|
|
|
- " tensor_model_parallel_gp=[[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13], [14, 15]]\n",
|
|
|
|
|
|
+ " ######## tensor_model_parallel_gp below is hard-coded for tensor_model_parallel_size= 2 , pipeline_model_parallel_size= 4 ########\n",
|
|
|
|
+ " ######## if you use other model parallel configuration , please copy and paste it below ########\n",
|
|
|
|
+ " tensor_model_parallel_gp=[[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13], [14, 15]] \n",
|
|
" my_weight_list = get_weight_list(master_weight,tensor_model_parallel_gp)\n",
|
|
" my_weight_list = get_weight_list(master_weight,tensor_model_parallel_gp)\n",
|
|
" \n",
|
|
" \n",
|
|
" with torch.no_grad():\n",
|
|
" with torch.no_grad():\n",
|
|
@@ -459,7 +458,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "dress-proportion",
|
|
|
|
|
|
+ "id": "distinguished-rhythm",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"## Peek inside Column Parallel Class"
|
|
"## Peek inside Column Parallel Class"
|
|
@@ -467,8 +466,8 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 8,
|
|
|
|
- "id": "german-method",
|
|
|
|
|
|
+ "execution_count": 6,
|
|
|
|
+ "id": "under-secondary",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [
|
|
"outputs": [
|
|
{
|
|
{
|
|
@@ -486,8 +485,8 @@
|
|
}
|
|
}
|
|
],
|
|
],
|
|
"source": [
|
|
"source": [
|
|
- "tensor_model_parallel_size= 4 \n",
|
|
|
|
- "pipeline_model_parallel_size= 2 \n",
|
|
|
|
|
|
+ "tensor_model_parallel_size= 2 \n",
|
|
|
|
+ "pipeline_model_parallel_size= 4 \n",
|
|
"input_size = 1024 # 1024 rows\n",
|
|
"input_size = 1024 # 1024 rows\n",
|
|
"output_size = 512 # 256 columns\n",
|
|
"output_size = 512 # 256 columns\n",
|
|
"which_model_parallel='col'\n",
|
|
"which_model_parallel='col'\n",
|
|
@@ -500,8 +499,8 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 9,
|
|
|
|
- "id": "ethical-secondary",
|
|
|
|
|
|
+ "execution_count": 7,
|
|
|
|
+ "id": "selective-snake",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
@@ -511,8 +510,8 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 10,
|
|
|
|
- "id": "republican-saint",
|
|
|
|
|
|
+ "execution_count": 8,
|
|
|
|
+ "id": "inside-france",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [
|
|
"outputs": [
|
|
{
|
|
{
|
|
@@ -521,7 +520,7 @@
|
|
"__main__.myColumnParallelLinear"
|
|
"__main__.myColumnParallelLinear"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
- "execution_count": 10,
|
|
|
|
|
|
+ "execution_count": 8,
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
"output_type": "execute_result"
|
|
}
|
|
}
|
|
@@ -532,8 +531,8 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 11,
|
|
|
|
- "id": "sealed-eclipse",
|
|
|
|
|
|
+ "execution_count": 9,
|
|
|
|
+ "id": "agricultural-marine",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [
|
|
"outputs": [
|
|
{
|
|
{
|
|
@@ -542,7 +541,7 @@
|
|
"(1024, 512)"
|
|
"(1024, 512)"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
- "execution_count": 11,
|
|
|
|
|
|
+ "execution_count": 9,
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
"output_type": "execute_result"
|
|
}
|
|
}
|
|
@@ -553,7 +552,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "played-romantic",
|
|
|
|
|
|
+ "id": "experienced-profit",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"----------------------------------------------------------------------\n",
|
|
"----------------------------------------------------------------------\n",
|
|
@@ -564,8 +563,8 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 53,
|
|
|
|
- "id": "logical-union",
|
|
|
|
|
|
+ "execution_count": 10,
|
|
|
|
+ "id": "mineral-adapter",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
@@ -672,8 +671,8 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 74,
|
|
|
|
- "id": "annual-commonwealth",
|
|
|
|
|
|
+ "execution_count": 11,
|
|
|
|
+ "id": "nearby-latino",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [
|
|
"outputs": [
|
|
{
|
|
{
|
|
@@ -707,8 +706,8 @@
|
|
}
|
|
}
|
|
],
|
|
],
|
|
"source": [
|
|
"source": [
|
|
- "tensor_model_parallel_size= 4 \n",
|
|
|
|
- "pipeline_model_parallel_size= 2 \n",
|
|
|
|
|
|
+ "tensor_model_parallel_size= 2 \n",
|
|
|
|
+ "pipeline_model_parallel_size= 4 \n",
|
|
"input_size = 1024 # first dimension of the matrix\n",
|
|
"input_size = 1024 # first dimension of the matrix\n",
|
|
"output_size = 512 # 2nd dimension of the matrix\n",
|
|
"output_size = 512 # 2nd dimension of the matrix\n",
|
|
"print(\"this is how A is sliced Row-wised ...\\n\")\n",
|
|
"print(\"this is how A is sliced Row-wised ...\\n\")\n",
|
|
@@ -723,7 +722,7 @@
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
"execution_count": 72,
|
|
"execution_count": 72,
|
|
- "id": "complex-ultimate",
|
|
|
|
|
|
+ "id": "economic-istanbul",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"outputs": [],
|
|
"source": [
|
|
"source": [
|
|
@@ -734,7 +733,7 @@
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
"execution_count": 58,
|
|
"execution_count": 58,
|
|
- "id": "neither-johnson",
|
|
|
|
|
|
+ "id": "pursuant-denial",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"outputs": [
|
|
"outputs": [
|
|
{
|
|
{
|
|
@@ -754,7 +753,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "wired-contents",
|
|
|
|
|
|
+ "id": "stretch-creature",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"--- \n",
|
|
"--- \n",
|
|
@@ -768,7 +767,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "patent-caution",
|
|
|
|
|
|
+ "id": "stopped-software",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"---\n",
|
|
"---\n",
|
|
@@ -780,7 +779,7 @@
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
- "id": "together-paragraph",
|
|
|
|
|
|
+ "id": "dated-garbage",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
"-----\n",
|
|
"-----\n",
|