From 8d05112a76bd5ec4173cf86723c86149daa2765c Mon Sep 17 00:00:00 2001
From: Wen Sun <syl1887415157@126.com>
Date: Wed, 31 Aug 2022 17:33:03 +0800
Subject: [PATCH 01/12] docs(distributed/collective): update collective ops
 docs

---
 docs/api/paddle/distributed/all_gather_cn.rst     |  2 +-
 docs/api/paddle/distributed/all_reduce_cn.rst     |  6 +++---
 docs/api/paddle/distributed/alltoall_cn.rst       |  2 +-
 docs/api/paddle/distributed/broadcast_cn.rst      |  4 ++--
 docs/api/paddle/distributed/recv_cn.rst           |  2 +-
 docs/api/paddle/distributed/reduce_cn.rst         |  6 +++---
 docs/api/paddle/distributed/reduce_scatter_cn.rst | 10 +++++-----
 docs/api/paddle/distributed/scatter_cn.rst        |  6 +++---
 docs/api/paddle/distributed/send_cn.rst           |  2 +-
 9 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/docs/api/paddle/distributed/all_gather_cn.rst b/docs/api/paddle/distributed/all_gather_cn.rst
index e084cbb72b3..52a78a92b09 100644
--- a/docs/api/paddle/distributed/all_gather_cn.rst
+++ b/docs/api/paddle/distributed/all_gather_cn.rst
@@ -7,7 +7,7 @@ all_gather
 .. py:function:: paddle.distributed.all_gather(tensor_list, tensor, group=0)
 
 进程组内所有进程的指定 tensor 进行聚合操作，并返回给所有进程聚合的结果。
-如下图所示，4 个 GPU 分别开启 4 个进程，每张卡上的数据用卡号代表，
+如下图所示，4 个 GPU 分别开启 1 个进程，每张卡上的数据用卡号代表，
 经过 all_gather 算子后，每张卡都会拥有所有卡的数据。
 
 .. image:: ./img/allgather.png
diff --git a/docs/api/paddle/distributed/all_reduce_cn.rst b/docs/api/paddle/distributed/all_reduce_cn.rst
index 62f876bc026..d5a7b98f179 100644
--- a/docs/api/paddle/distributed/all_reduce_cn.rst
+++ b/docs/api/paddle/distributed/all_reduce_cn.rst
@@ -7,7 +7,7 @@ all_reduce
 .. py:function:: paddle.distributed.all_reduce(tensor, op=ReduceOp.SUM, group=0)
 
 进程组内所有进程的指定 tensor 进行归约操作，并返回给所有进程归约的结果。
-如下图所示，4 个 GPU 分别开启 4 个进程，每张卡上的数据用卡号代表，规约操作为求和，
+如下图所示，4 个 GPU 分别开启 1 个进程，每张卡上的数据用卡号代表，规约操作为求和，
 经过 all_reduce 算子后，每张卡都会拥有所有卡数据的总和。
 
 .. image:: ./img/allreduce.png
@@ -17,8 +17,8 @@ all_reduce
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 操作的输入 Tensor，同时也会将归约结果返回至此 Tensor 中。Tensor 的数据类型为：float16、float32、float64、int32、int64。
-    - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD，可选) - 归约的具体操作，比如求和，取最大值，取最小值和求乘积，默认为求和归约。
+    - **tensor** (Tensor) - 操作的输入 Tensor，同时也会将归约结果返回至此 Tensor 中。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool。
+    - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD，可选) - 归约的具体操作，比如求和，取最大值，取最小值和求乘积，默认为求和归约。
     - **group** (int，可选) - 工作的进程组编号，默认为 0。
 
 返回
diff --git a/docs/api/paddle/distributed/alltoall_cn.rst b/docs/api/paddle/distributed/alltoall_cn.rst
index 4ce0b1f44c1..b58f99a9714 100644
--- a/docs/api/paddle/distributed/alltoall_cn.rst
+++ b/docs/api/paddle/distributed/alltoall_cn.rst
@@ -18,7 +18,7 @@ GPU1 卡的 out_tensor_list 包含 0_1 和 1_1。
 
 参数
 :::::::::
-    - **in_tensor_list** (list) - 包含所有输入 Tensors 的一个列表。在列表里面的所有元素都必须是一个 Tensor，Tensor 的数据类型必须是 float16、float32、 float64、int32、int64。
+    - **in_tensor_list** (list) - 包含所有输入 Tensors 的一个列表。在列表里面的所有元素都必须是一个 Tensor，Tensor 的数据类型必须是 float16、float32、float64、int32、int64、int8、uint8、bool。
     - **out_tensor_list** (Tensor) - 包含所有输出 Tensors 的一个列表。在列表里面的所有元素数据类型要和输入的 Tensors 数据类型一致。
     - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认地全局组。默认值：None。
     - **use_calc_stream** (bool，可选) - 标识使用计算流还是通信流。默认值：True。
diff --git a/docs/api/paddle/distributed/broadcast_cn.rst b/docs/api/paddle/distributed/broadcast_cn.rst
index f1436e67cad..79b666f502c 100644
--- a/docs/api/paddle/distributed/broadcast_cn.rst
+++ b/docs/api/paddle/distributed/broadcast_cn.rst
@@ -7,7 +7,7 @@ broadcast
 .. py:function:: paddle.distributed.broadcast(tensor, src, group=0)
 
 广播一个 Tensor 给其他所有进程。
-如下图所示，4 个 GPU 分别开启 4 个进程，GPU0 卡拥有数据，经过 broadcast 算子后，会将这个数据传播到所有卡上。
+如下图所示，4 个 GPU 分别开启 1 个进程，GPU0 卡拥有数据，经过 broadcast 算子后，会将这个数据传播到所有卡上。
 
 .. image:: ./img/broadcast.png
   :width: 800
@@ -16,7 +16,7 @@ broadcast
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 如果当前进程编号是源，那么这个 Tensor 变量将被发送给其他进程，否则这个 Tensor 将接收源发送过来的数据。Tensor 的数据类型为：float16、float32、float64、int32、int64。
+    - **tensor** (Tensor) - 如果当前进程编号是源，那么这个 Tensor 变量将被发送给其他进程，否则这个 Tensor 将接收源发送过来的数据。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool。
     - **src** (int) - 发送源的进程编号。
     - **group** (int，可选) - 工作的进程组编号，默认为 0。
 
diff --git a/docs/api/paddle/distributed/recv_cn.rst b/docs/api/paddle/distributed/recv_cn.rst
index 3d60f5e4156..bf5f8cef3b8 100644
--- a/docs/api/paddle/distributed/recv_cn.rst
+++ b/docs/api/paddle/distributed/recv_cn.rst
@@ -10,7 +10,7 @@ recv
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 接收数据的 Tensor。数据类型为：float16、float32、float64、int32、int64。
+    - **tensor** (Tensor) - 接收数据的 Tensor。数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool。
     - **src** (int) - 发送者的标识符。
     - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认地全局组。默认值：None。
     - **use_calc_stream** (bool，可选) - 标识使用计算流还是通信流。默认值：True。
diff --git a/docs/api/paddle/distributed/reduce_cn.rst b/docs/api/paddle/distributed/reduce_cn.rst
index 7d7ff16cfa4..b92438329b9 100644
--- a/docs/api/paddle/distributed/reduce_cn.rst
+++ b/docs/api/paddle/distributed/reduce_cn.rst
@@ -7,7 +7,7 @@ reduce
 .. py:function:: paddle.distributed.reduce(tensor, dst, op=ReduceOp.SUM, group=0)
 
 进程组内所有进程的指定 tensor 进行归约操作，并返回给所有进程归约的结果。
-如下图所示，4 个 GPU 分别开启 4 个进程，每张卡上的数据用卡号代表，reduce 的目标是第 0 张卡，
+如下图所示，4 个 GPU 分别开启 1 个进程，每张卡上的数据用卡号代表，reduce 的目标是第 0 张卡，
 规约操作是求和，经过 reduce 操作后，第 0 张卡会得到所有卡数据的总和。
 
 .. image:: ./img/reduce.png
@@ -17,9 +17,9 @@ reduce
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 操作的输入 Tensor，结果返回至目标进程号的 Tensor 中。Tensor 的数据类型为：float16、float32、float64、int32、int64。
+    - **tensor** (Tensor) - 操作的输入 Tensor，结果返回至目标进程号的 Tensor 中。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool。
     - **dst** (int) - 返回操作结果的目标进程编号。
-    - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD，可选) - 归约的具体操作，比如求和，取最大值，取最小值和求乘积，默认为求和归约。
+    - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD，可选) - 归约的具体操作，比如求和，取最大值，取最小值和求乘积，默认为求和归约。
     - **group** (int，可选) - 工作的进程组编号，默认为 0。
 
 返回
diff --git a/docs/api/paddle/distributed/reduce_scatter_cn.rst b/docs/api/paddle/distributed/reduce_scatter_cn.rst
index a1ece2d0446..d96d929ef3f 100644
--- a/docs/api/paddle/distributed/reduce_scatter_cn.rst
+++ b/docs/api/paddle/distributed/reduce_scatter_cn.rst
@@ -9,11 +9,11 @@ reduce_scatter
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 输出的张量。
-    - **tensor_list** (list[Tensor]) - 归约和切分的张量列表。
-    - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD) - 操作类型，默认 ReduceOp.SUM。
-    - **group** (Group，可选) - 通信组；如果是 None，则使用默认通信组。
-    - **use_calc_stream** (bool，可选) - 决定是在计算流还是通信流上做该通信操作；默认为 True，表示在计算流。
+    - **tensor** (Tensor) – 输出的张量。
+    - **tensor_list** (list(Tensor)) – 归约和切分的张量列表。
+    - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD) – 操作类型，默认 ReduceOp.SUM。
+    - **group**: (Group, 可选) – 通信组；如果是 None，则使用默认通信组。
+    - **use_calc_stream**: (bool, 可选) – 决定是在计算流还是通信流上做该通信操作；默认为 True，表示在计算流。
 
 
 返回
diff --git a/docs/api/paddle/distributed/scatter_cn.rst b/docs/api/paddle/distributed/scatter_cn.rst
index 3e7a8ee4ef2..4be6ef35476 100644
--- a/docs/api/paddle/distributed/scatter_cn.rst
+++ b/docs/api/paddle/distributed/scatter_cn.rst
@@ -7,7 +7,7 @@ scatter
 .. py:function:: paddle.distributed.scatter(tensor, tensor_list=None, src=0, group=0)
 
 进程组内指定进程源的 tensor 列表分发到其他所有进程中。
-如下图所示，4 个 GPU 分别开启 4 个进程，scatter 的源选择为第 0 张卡，
+如下图所示，4 个 GPU 分别开启 1 个进程，scatter 的源选择为第 0 张卡，
 经过 scatter 算子后，会将第 0 张卡的数据平均分到所有卡上。
 
 .. image:: ./img/scatter.png
@@ -17,8 +17,8 @@ scatter
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 操作的输出 Tensor。Tensor 的数据类型为：float16、float32、float64、int32、int64。
-    - **tensor_list** (list，可选) - 操作的输入 Tensor 列表，默认为 None。列表中的每个元素均为 Tensor，每个 Tensor 的数据类型为：float16、float32、float64、int32、int64。
+    - **tensor** (Tensor) - 操作的输出 Tensor。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool。
+    - **tensor_list** (list，可选) - 操作的输入 Tensor 列表，默认为 None。列表中的每个元素均为 Tensor，每个 Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool。
     - **src** (int，可选) - 操作的源进程号，该进程号的 Tensor 列表将分发到其他进程中。默认为 0。
     - **group** (int，可选) - 工作的进程组编号，默认为 0。
 
diff --git a/docs/api/paddle/distributed/send_cn.rst b/docs/api/paddle/distributed/send_cn.rst
index 6b9e855b8b2..9e5241817aa 100644
--- a/docs/api/paddle/distributed/send_cn.rst
+++ b/docs/api/paddle/distributed/send_cn.rst
@@ -10,7 +10,7 @@ send
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 需要发送的 Tensor。数据类型为：float16、float32、float64、int32、int64。
+    - **tensor** (Tensor) - 需要发送的 Tensor。数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool。
     - **dst** (int) - 接收者的标识符。
     - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认地全局组。默认值：None。
     - **use_calc_stream** (bool，可选) - 标识使用计算流还是通信流。默认值：True。

From a317f407621be147a31c5931c426601f305302f7 Mon Sep 17 00:00:00 2001
From: Wen Sun <syl1887415157@126.com>
Date: Thu, 1 Sep 2022 19:49:48 +0800
Subject: [PATCH 02/12] docs(distributed/collective): add missing collective
 ops

---
 docs/api/paddle/distributed/Overview_cn.rst   | 10 ++++++++-
 .../distributed/destroy_process_group_cn.rst  | 21 +++++++++++++++++++
 docs/api/paddle/distributed/irecv_cn.rst      |  2 +-
 docs/api/paddle/distributed/isend_cn.rst      |  2 +-
 docs/api/paddle/distributed/recv_cn.rst       |  2 +-
 5 files changed, 33 insertions(+), 4 deletions(-)
 create mode 100644 docs/api/paddle/distributed/destroy_process_group_cn.rst

diff --git a/docs/api/paddle/distributed/Overview_cn.rst b/docs/api/paddle/distributed/Overview_cn.rst
index af54f30d659..69de5de3ae5 100644
--- a/docs/api/paddle/distributed/Overview_cn.rst
+++ b/docs/api/paddle/distributed/Overview_cn.rst
@@ -53,6 +53,8 @@ paddle.distributed.fleet 是分布式训练的统一入口 API，用于配置分
     " :ref:`spawn <cn_api_distributed_spawn>` ", "启动分布式训练进程，仅支持集合通信架构"
     " :ref:`get_rank <cn_api_distributed_get_rank>` ", "获取当前进程的 rank 值"
     " :ref:`get_world_size <cn_api_distributed_get_world_size>` ", "获取当前进程数"
+    " :ref:`new_group <cn_api_distributed_new_group>` ", "创建分布式通信组"
+    " :ref:`destroy_process_group <cn_api_distributed_destroy_process_group>` ", "销毁分布式通信组"
 
 .. _03:
 
@@ -80,14 +82,20 @@ paddle.distributed.fleet 是分布式训练的统一入口 API，用于配置分
 
 
     " :ref:`reduce <cn_api_distributed_reduce>` ", "规约，规约进程组内的 tensor，返回结果至指定进程"
-    " :ref:`ReduceOP <cn_api_distributed_ReduceOp>` ", "规约，指定逐元素规约操作"
+    " :ref:`ReduceOp <cn_api_distributed_ReduceOp>` ", "规约，指定逐元素规约操作"
     " :ref:`all_reduce <cn_api_distributed_all_reduce>` ", "组规约，规约进程组内的 tensor，结果广播至每个进程"
     " :ref:`all_gather <cn_api_distributed_all_gather>` ", "组聚合，聚合进程组内的 tensor，结果广播至每个进程"
     " :ref:`all_gather_object <cn_api_distributed_all_gather_object>` ", "组聚合，聚合进程组内的 object，结果广播至每个进程"
+    " :ref:`alltoall <cn_api_distributed_alltoall>` ", "分发 tensor 列表到每个进程并进行聚合"
     " :ref:`broadcast <cn_api_distributed_broadcast>` ", "广播一个 tensor 到每个进程"
     " :ref:`scatter <cn_api_distributed_scatter>` ", "分发 tensor 到每个进程"
     " :ref:`split <cn_api_distributed_split>` ", "切分参数到多个设备"
     " :ref:`barrier <cn_api_distributed_barrier>` ", "同步路障，进行阻塞操作，实现组内所有进程的同步"
+    " :ref:`send <cn_api_distributed_send>` ", "发送一个 tensor 到指定的接收者"
+    " :ref:`recv <cn_api_distributed_recv>` ", "接收一个来自指定发送者的 tensor"
+    " :ref:`isend <cn_api_distributed_isend>` ", "异步发送一个 tensor 到指定的接收者"
+    " :ref:`irecv <cn_api_distributed_irecv>` ", "异步接收一个来自指定发送者的 tensor"
+    " :ref:`reduce_scatter <cn_api_paddle_distributed_reduce_scatter>` ", "规约，然后将 tensor 列表分散到组中的所有进程上"
 
 .. _05:
 
diff --git a/docs/api/paddle/distributed/destroy_process_group_cn.rst b/docs/api/paddle/distributed/destroy_process_group_cn.rst
new file mode 100644
index 00000000000..9ef54142a5b
--- /dev/null
+++ b/docs/api/paddle/distributed/destroy_process_group_cn.rst
@@ -0,0 +1,21 @@
+.. _cn_api_distributed_destroy_process_group:
+
+destroy_process_group
+-------------------------------
+
+
+.. py:function:: paddle.distributed.destroy_process_group(group=None)
+
+销毁一个指定的通信组。
+
+参数
+:::::::::
+    - group (ProcessGroup，可选): 待销毁的通信组。若为 None（默认），则所有通信组都会被销毁（包括默认的通信组），并且整个分布式环境也会回到未被初始化的状态。默认值：None。
+
+返回
+:::::::::
+无
+
+代码示例
+::::::::::::
+COPY-FROM: paddle.distributed.destroy_process_group
diff --git a/docs/api/paddle/distributed/irecv_cn.rst b/docs/api/paddle/distributed/irecv_cn.rst
index 8092dd2f27d..a7d2bca5662 100644
--- a/docs/api/paddle/distributed/irecv_cn.rst
+++ b/docs/api/paddle/distributed/irecv_cn.rst
@@ -9,7 +9,7 @@ irecv
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 要接受的张量。其数据类型应为 float16、float32、float64、int32 或 int64。
+    - **tensor** (Tensor) - 要接受的张量。其数据类型应为 float16、float32、float64、int32、int64、int8、uint8、bool。
     - **src** (int) - 接受节点的全局 rank 号。
     - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认的全局组。默认值：None。
 
diff --git a/docs/api/paddle/distributed/isend_cn.rst b/docs/api/paddle/distributed/isend_cn.rst
index 1081594381d..3b0af08e89f 100644
--- a/docs/api/paddle/distributed/isend_cn.rst
+++ b/docs/api/paddle/distributed/isend_cn.rst
@@ -9,7 +9,7 @@ isend
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 要发送的张量。其数据类型应为 float16、float32、float64、int32 或 int64。
+    - **tensor** (Tensor) - 要发送的张量。其数据类型应为 float16、float32、float64、int32、int64、int8、uint8、bool。
     - **dst** (int) - 目标节点的全局 rank 号。
     - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认的全局组。默认值：None。
 
diff --git a/docs/api/paddle/distributed/recv_cn.rst b/docs/api/paddle/distributed/recv_cn.rst
index bf5f8cef3b8..ebeb666888c 100644
--- a/docs/api/paddle/distributed/recv_cn.rst
+++ b/docs/api/paddle/distributed/recv_cn.rst
@@ -6,7 +6,7 @@ recv
 
 .. py:function:: paddle.distributed.recv(tensor, src=0, group=None, use_calc_stream=True)
 
-发送 tensor 到指定接收者。
+接收一个来自指定发送者的 tensor。
 
 参数
 :::::::::

From 6078404a085feb2f5cc2512d60e007349f37dcec Mon Sep 17 00:00:00 2001
From: Wen Sun <syl1887415157@126.com>
Date: Fri, 2 Sep 2022 19:37:25 +0800
Subject: [PATCH 03/12] docs(distributed/collective): add alltoall_single

---
 docs/api/paddle/distributed/Overview_cn.rst   |  9 ++++---
 .../paddle/distributed/alltoall_single_cn.rst | 25 +++++++++++++++++++
 2 files changed, 30 insertions(+), 4 deletions(-)
 create mode 100644 docs/api/paddle/distributed/alltoall_single_cn.rst

diff --git a/docs/api/paddle/distributed/Overview_cn.rst b/docs/api/paddle/distributed/Overview_cn.rst
index 69de5de3ae5..80aceaa562c 100644
--- a/docs/api/paddle/distributed/Overview_cn.rst
+++ b/docs/api/paddle/distributed/Overview_cn.rst
@@ -87,14 +87,15 @@ paddle.distributed.fleet 是分布式训练的统一入口 API，用于配置分
     " :ref:`all_gather <cn_api_distributed_all_gather>` ", "组聚合，聚合进程组内的 tensor，结果广播至每个进程"
     " :ref:`all_gather_object <cn_api_distributed_all_gather_object>` ", "组聚合，聚合进程组内的 object，结果广播至每个进程"
     " :ref:`alltoall <cn_api_distributed_alltoall>` ", "分发 tensor 列表到每个进程并进行聚合"
+    " :ref:`alltoall_single <cn_api_distributed_alltoall_single>` ", "分发单个 tensor 到每个进程并聚合至目标 tensor"
     " :ref:`broadcast <cn_api_distributed_broadcast>` ", "广播一个 tensor 到每个进程"
     " :ref:`scatter <cn_api_distributed_scatter>` ", "分发 tensor 到每个进程"
     " :ref:`split <cn_api_distributed_split>` ", "切分参数到多个设备"
     " :ref:`barrier <cn_api_distributed_barrier>` ", "同步路障，进行阻塞操作，实现组内所有进程的同步"
-    " :ref:`send <cn_api_distributed_send>` ", "发送一个 tensor 到指定的接收者"
-    " :ref:`recv <cn_api_distributed_recv>` ", "接收一个来自指定发送者的 tensor"
-    " :ref:`isend <cn_api_distributed_isend>` ", "异步发送一个 tensor 到指定的接收者"
-    " :ref:`irecv <cn_api_distributed_irecv>` ", "异步接收一个来自指定发送者的 tensor"
+    " :ref:`send <cn_api_distributed_send>` ", "发送一个 tensor 到指定的进程"
+    " :ref:`recv <cn_api_distributed_recv>` ", "接收一个来自指定进程的 tensor"
+    " :ref:`isend <cn_api_paddle_distributed_isend>` ", "异步发送一个 tensor 到指定的进程"
+    " :ref:`irecv <cn_api_paddle_distributed_irecv>` ", "异步接收一个来自指定进程的 tensor"
     " :ref:`reduce_scatter <cn_api_paddle_distributed_reduce_scatter>` ", "规约，然后将 tensor 列表分散到组中的所有进程上"
 
 .. _05:
diff --git a/docs/api/paddle/distributed/alltoall_single_cn.rst b/docs/api/paddle/distributed/alltoall_single_cn.rst
new file mode 100644
index 00000000000..af5f6babaf1
--- /dev/null
+++ b/docs/api/paddle/distributed/alltoall_single_cn.rst
@@ -0,0 +1,25 @@
+.. _cn_api_distributed_alltoall_single:
+
+alltoall_single
+-------------------------------
+
+
+.. py:function:: alltoall_single(in_tensor, out_tensor, in_split_sizes=None, out_split_sizes=None, group=None, use_calc_stream=True)
+
+将输入的 tensor 分发到所有进程，并将接收到的 tensor 聚合到 out_tensor 中。
+
+参数
+:::::::::
+    - in_tensor (Tensor): 输入的 tensor，其数据类型必须是 float16、float32、float64、int32、int64、int8、uint8、bool。
+    - out_tensor (Tensor): 输出的 tensor，其数据类型与输入的 tensor 一致。
+    - in_split_sizes (list[int]，可选): 对 in_tensor 的 dim[0] 进行切分的大小。若该参数未指定，in_tensor 将被均匀切分到各个进程中（需要确保 in_tensor 的大小能够被组中的进程数整除）。默认值：None。
+    - out_split_sizes (list[int]，可选): 对 out_tensor 的 dim[0] 进行切分的大小。若该参数未指定，out_tensor 将均匀地聚合来自各个进程的数据（需要确保 out_tensor 的大小能够被组中的进程数整除）。默认值：None。
+    - use_calc_stream (bool，可选) - 标识使用计算流（若为 True）还是通信流。默认值：True。
+
+返回
+:::::::::
+若 use_calc_stream=True，无返回值；若 use_calc_stream=False，返回一个 Task。
+
+代码示例
+:::::::::
+COPY-FROM: paddle.distributed.alltoall_single

From 6a373740729332c9061d8d50c2163b4ff27b1145 Mon Sep 17 00:00:00 2001
From: Wen Sun <syl1887415157@126.com>
Date: Wed, 7 Sep 2022 19:05:16 +0800
Subject: [PATCH 04/12] docs(distributed/collective): add bfloat16 for ops

---
 docs/api/paddle/distributed/all_gather_cn.rst      | 4 ++--
 docs/api/paddle/distributed/all_reduce_cn.rst      | 2 +-
 docs/api/paddle/distributed/alltoall_cn.rst        | 2 +-
 docs/api/paddle/distributed/alltoall_single_cn.rst | 2 +-
 docs/api/paddle/distributed/broadcast_cn.rst       | 2 +-
 docs/api/paddle/distributed/irecv_cn.rst           | 2 +-
 docs/api/paddle/distributed/isend_cn.rst           | 2 +-
 docs/api/paddle/distributed/recv_cn.rst            | 2 +-
 docs/api/paddle/distributed/reduce_cn.rst          | 2 +-
 docs/api/paddle/distributed/reduce_scatter_cn.rst  | 4 ++--
 docs/api/paddle/distributed/scatter_cn.rst         | 4 ++--
 docs/api/paddle/distributed/send_cn.rst            | 2 +-
 12 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/docs/api/paddle/distributed/all_gather_cn.rst b/docs/api/paddle/distributed/all_gather_cn.rst
index 52a78a92b09..14866b55f64 100644
--- a/docs/api/paddle/distributed/all_gather_cn.rst
+++ b/docs/api/paddle/distributed/all_gather_cn.rst
@@ -17,8 +17,8 @@ all_gather
 
 参数
 :::::::::
-    - **tensor_list** (list) - 操作的输出 Tensor 列表。列表中的每个元素均为 Tensor，每个 Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、complex64、complex128。
-    - **tensor** (Tensor) - 操作的输入 Tensor。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、complex64、complex128。
+    - **tensor_list** (list) - 操作的输出 Tensor 列表。列表中的每个元素均为 Tensor，每个 Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16、complex64、complex128。
+    - **tensor** (Tensor) - 操作的输入 Tensor。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16、complex64、complex128。
     - **group** (int，可选) - 工作的进程组编号，默认为 0。
 
 返回
diff --git a/docs/api/paddle/distributed/all_reduce_cn.rst b/docs/api/paddle/distributed/all_reduce_cn.rst
index d5a7b98f179..1358c0daa97 100644
--- a/docs/api/paddle/distributed/all_reduce_cn.rst
+++ b/docs/api/paddle/distributed/all_reduce_cn.rst
@@ -17,7 +17,7 @@ all_reduce
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 操作的输入 Tensor，同时也会将归约结果返回至此 Tensor 中。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool。
+    - **tensor** (Tensor) - 操作的输入 Tensor，同时也会将归约结果返回至此 Tensor 中。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD，可选) - 归约的具体操作，比如求和，取最大值，取最小值和求乘积，默认为求和归约。
     - **group** (int，可选) - 工作的进程组编号，默认为 0。
 
diff --git a/docs/api/paddle/distributed/alltoall_cn.rst b/docs/api/paddle/distributed/alltoall_cn.rst
index b58f99a9714..0b06a6518a5 100644
--- a/docs/api/paddle/distributed/alltoall_cn.rst
+++ b/docs/api/paddle/distributed/alltoall_cn.rst
@@ -18,7 +18,7 @@ GPU1 卡的 out_tensor_list 包含 0_1 和 1_1。
 
 参数
 :::::::::
-    - **in_tensor_list** (list) - 包含所有输入 Tensors 的一个列表。在列表里面的所有元素都必须是一个 Tensor，Tensor 的数据类型必须是 float16、float32、float64、int32、int64、int8、uint8、bool。
+    - **in_tensor_list** (list) - 包含所有输入 Tensors 的一个列表。在列表里面的所有元素都必须是一个 Tensor，Tensor 的数据类型必须是 float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **out_tensor_list** (Tensor) - 包含所有输出 Tensors 的一个列表。在列表里面的所有元素数据类型要和输入的 Tensors 数据类型一致。
     - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认地全局组。默认值：None。
     - **use_calc_stream** (bool，可选) - 标识使用计算流还是通信流。默认值：True。
diff --git a/docs/api/paddle/distributed/alltoall_single_cn.rst b/docs/api/paddle/distributed/alltoall_single_cn.rst
index af5f6babaf1..f5ef77a4227 100644
--- a/docs/api/paddle/distributed/alltoall_single_cn.rst
+++ b/docs/api/paddle/distributed/alltoall_single_cn.rst
@@ -10,7 +10,7 @@ alltoall_single
 
 参数
 :::::::::
-    - in_tensor (Tensor): 输入的 tensor，其数据类型必须是 float16、float32、float64、int32、int64、int8、uint8、bool。
+    - in_tensor (Tensor): 输入的 tensor，其数据类型必须是 float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - out_tensor (Tensor): 输出的 tensor，其数据类型与输入的 tensor 一致。
     - in_split_sizes (list[int]，可选): 对 in_tensor 的 dim[0] 进行切分的大小。若该参数未指定，in_tensor 将被均匀切分到各个进程中（需要确保 in_tensor 的大小能够被组中的进程数整除）。默认值：None。
     - out_split_sizes (list[int]，可选): 对 out_tensor 的 dim[0] 进行切分的大小。若该参数未指定，out_tensor 将均匀地聚合来自各个进程的数据（需要确保 out_tensor 的大小能够被组中的进程数整除）。默认值：None。
diff --git a/docs/api/paddle/distributed/broadcast_cn.rst b/docs/api/paddle/distributed/broadcast_cn.rst
index 79b666f502c..5149b1483cc 100644
--- a/docs/api/paddle/distributed/broadcast_cn.rst
+++ b/docs/api/paddle/distributed/broadcast_cn.rst
@@ -16,7 +16,7 @@ broadcast
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 如果当前进程编号是源，那么这个 Tensor 变量将被发送给其他进程，否则这个 Tensor 将接收源发送过来的数据。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool。
+    - **tensor** (Tensor) - 如果当前进程编号是源，那么这个 Tensor 变量将被发送给其他进程，否则这个 Tensor 将接收源发送过来的数据。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **src** (int) - 发送源的进程编号。
     - **group** (int，可选) - 工作的进程组编号，默认为 0。
 
diff --git a/docs/api/paddle/distributed/irecv_cn.rst b/docs/api/paddle/distributed/irecv_cn.rst
index a7d2bca5662..956e1cc9c5b 100644
--- a/docs/api/paddle/distributed/irecv_cn.rst
+++ b/docs/api/paddle/distributed/irecv_cn.rst
@@ -9,7 +9,7 @@ irecv
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 要接受的张量。其数据类型应为 float16、float32、float64、int32、int64、int8、uint8、bool。
+    - **tensor** (Tensor) - 要接受的张量。其数据类型应为 float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **src** (int) - 接受节点的全局 rank 号。
     - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认的全局组。默认值：None。
 
diff --git a/docs/api/paddle/distributed/isend_cn.rst b/docs/api/paddle/distributed/isend_cn.rst
index 3b0af08e89f..3e9d87fada5 100644
--- a/docs/api/paddle/distributed/isend_cn.rst
+++ b/docs/api/paddle/distributed/isend_cn.rst
@@ -9,7 +9,7 @@ isend
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 要发送的张量。其数据类型应为 float16、float32、float64、int32、int64、int8、uint8、bool。
+    - **tensor** (Tensor) - 要发送的张量。其数据类型应为 float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **dst** (int) - 目标节点的全局 rank 号。
     - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认的全局组。默认值：None。
 
diff --git a/docs/api/paddle/distributed/recv_cn.rst b/docs/api/paddle/distributed/recv_cn.rst
index ebeb666888c..0b1c7d4da93 100644
--- a/docs/api/paddle/distributed/recv_cn.rst
+++ b/docs/api/paddle/distributed/recv_cn.rst
@@ -10,7 +10,7 @@ recv
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 接收数据的 Tensor。数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool。
+    - **tensor** (Tensor) - 接收数据的 Tensor。数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **src** (int) - 发送者的标识符。
     - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认地全局组。默认值：None。
     - **use_calc_stream** (bool，可选) - 标识使用计算流还是通信流。默认值：True。
diff --git a/docs/api/paddle/distributed/reduce_cn.rst b/docs/api/paddle/distributed/reduce_cn.rst
index b92438329b9..3dea1e949bc 100644
--- a/docs/api/paddle/distributed/reduce_cn.rst
+++ b/docs/api/paddle/distributed/reduce_cn.rst
@@ -17,7 +17,7 @@ reduce
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 操作的输入 Tensor，结果返回至目标进程号的 Tensor 中。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool。
+    - **tensor** (Tensor) - 操作的输入 Tensor，结果返回至目标进程号的 Tensor 中。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **dst** (int) - 返回操作结果的目标进程编号。
     - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD，可选) - 归约的具体操作，比如求和，取最大值，取最小值和求乘积，默认为求和归约。
     - **group** (int，可选) - 工作的进程组编号，默认为 0。
diff --git a/docs/api/paddle/distributed/reduce_scatter_cn.rst b/docs/api/paddle/distributed/reduce_scatter_cn.rst
index d96d929ef3f..9b45edf3a06 100644
--- a/docs/api/paddle/distributed/reduce_scatter_cn.rst
+++ b/docs/api/paddle/distributed/reduce_scatter_cn.rst
@@ -9,8 +9,8 @@ reduce_scatter
 
 参数
 :::::::::
-    - **tensor** (Tensor) – 输出的张量。
-    - **tensor_list** (list(Tensor)) – 归约和切分的张量列表。
+    - **tensor** (Tensor) – 输出的张量。数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **tensor_list** (list(Tensor)) – 归约和切分的张量列表。数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD) – 操作类型，默认 ReduceOp.SUM。
     - **group**: (Group, 可选) – 通信组；如果是 None，则使用默认通信组。
     - **use_calc_stream**: (bool, 可选) – 决定是在计算流还是通信流上做该通信操作；默认为 True，表示在计算流。
diff --git a/docs/api/paddle/distributed/scatter_cn.rst b/docs/api/paddle/distributed/scatter_cn.rst
index 4be6ef35476..d68309b8bc0 100644
--- a/docs/api/paddle/distributed/scatter_cn.rst
+++ b/docs/api/paddle/distributed/scatter_cn.rst
@@ -17,8 +17,8 @@ scatter
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 操作的输出 Tensor。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool。
-    - **tensor_list** (list，可选) - 操作的输入 Tensor 列表，默认为 None。列表中的每个元素均为 Tensor，每个 Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool。
+    - **tensor** (Tensor) - 操作的输出 Tensor。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **tensor_list** (list，可选) - 操作的输入 Tensor 列表，默认为 None。列表中的每个元素均为 Tensor，每个 Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **src** (int，可选) - 操作的源进程号，该进程号的 Tensor 列表将分发到其他进程中。默认为 0。
     - **group** (int，可选) - 工作的进程组编号，默认为 0。
 
diff --git a/docs/api/paddle/distributed/send_cn.rst b/docs/api/paddle/distributed/send_cn.rst
index 9e5241817aa..75ac348a073 100644
--- a/docs/api/paddle/distributed/send_cn.rst
+++ b/docs/api/paddle/distributed/send_cn.rst
@@ -10,7 +10,7 @@ send
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 需要发送的 Tensor。数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool。
+    - **tensor** (Tensor) - 需要发送的 Tensor。数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **dst** (int) - 接收者的标识符。
     - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认地全局组。默认值：None。
     - **use_calc_stream** (bool，可选) - 标识使用计算流还是通信流。默认值：True。

From 57a805b490ffa0e6a5d6df59b17a3575e335d428 Mon Sep 17 00:00:00 2001
From: Wen Sun <syl1887415157@126.com>
Date: Tue, 11 Oct 2022 15:35:41 +0800
Subject: [PATCH 05/12] docs(distributed/collective): revision & add stream
 apis

---
 docs/api/paddle/distributed/Overview_cn.rst   | 60 +++++++++++++------
 docs/api/paddle/distributed/ReduceOp_cn.rst   |  2 +-
 docs/api/paddle/distributed/all_gather_cn.rst | 18 +++---
 .../distributed/all_gather_object_cn.rst      | 15 ++---
 docs/api/paddle/distributed/all_reduce_cn.rst | 18 +++---
 docs/api/paddle/distributed/alltoall_cn.rst   | 22 +++----
 .../paddle/distributed/alltoall_single_cn.rst | 20 ++++---
 docs/api/paddle/distributed/barrier_cn.rst    |  4 +-
 docs/api/paddle/distributed/broadcast_cn.rst  | 19 +++---
 docs/api/paddle/distributed/irecv_cn.rst      | 20 +++----
 docs/api/paddle/distributed/isend_cn.rst      | 21 ++++---
 docs/api/paddle/distributed/recv_cn.rst       | 16 ++---
 docs/api/paddle/distributed/reduce_cn.rst     | 22 ++++---
 .../paddle/distributed/reduce_scatter_cn.rst  | 26 ++++----
 docs/api/paddle/distributed/scatter_cn.rst    | 22 ++++---
 docs/api/paddle/distributed/send_cn.rst       | 16 ++---
 .../distributed/stream/all_gather_cn.rst      | 32 ++++++++++
 .../distributed/stream/all_reduce_cn.rst      | 30 ++++++++++
 .../paddle/distributed/stream/alltoall_cn.rst | 33 ++++++++++
 .../distributed/stream/alltoall_single_cn.rst | 32 ++++++++++
 .../distributed/stream/broadcast_cn.rst       | 30 ++++++++++
 .../api/paddle/distributed/stream/recv_cn.rst | 30 ++++++++++
 .../paddle/distributed/stream/reduce_cn.rst   | 31 ++++++++++
 .../distributed/stream/reduce_scatter_cn.rst  | 34 +++++++++++
 .../paddle/distributed/stream/scatter_cn.rst  | 34 +++++++++++
 .../api/paddle/distributed/stream/send_cn.rst | 30 ++++++++++
 26 files changed, 501 insertions(+), 136 deletions(-)
 create mode 100644 docs/api/paddle/distributed/stream/all_gather_cn.rst
 create mode 100644 docs/api/paddle/distributed/stream/all_reduce_cn.rst
 create mode 100644 docs/api/paddle/distributed/stream/alltoall_cn.rst
 create mode 100644 docs/api/paddle/distributed/stream/alltoall_single_cn.rst
 create mode 100644 docs/api/paddle/distributed/stream/broadcast_cn.rst
 create mode 100644 docs/api/paddle/distributed/stream/recv_cn.rst
 create mode 100644 docs/api/paddle/distributed/stream/reduce_cn.rst
 create mode 100644 docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
 create mode 100644 docs/api/paddle/distributed/stream/scatter_cn.rst
 create mode 100644 docs/api/paddle/distributed/stream/send_cn.rst

diff --git a/docs/api/paddle/distributed/Overview_cn.rst b/docs/api/paddle/distributed/Overview_cn.rst
index 80aceaa562c..9ff00bc0708 100644
--- a/docs/api/paddle/distributed/Overview_cn.rst
+++ b/docs/api/paddle/distributed/Overview_cn.rst
@@ -9,7 +9,8 @@ paddle.distributed 目录包含的 API 支撑飞桨框架大规模分布式训
 -  :ref:`环境配置和训练启动管理 <02>`
 -  :ref:`数据加载 <03>`
 -  :ref:`集合通信算法 API <04>`
--  :ref:`RPC API <05>`
+-  :ref:`Stream 集合通信高级 API <05>`
+-  :ref:`RPC API <06>`
 
 .. _01:
 
@@ -71,32 +72,53 @@ paddle.distributed.fleet 是分布式训练的统一入口 API，用于配置分
 
 .. _04:
 
-集合通信算法 API
+集合通信 API
 ::::::::::::::::::::::
 
-在集群上，对多设备的进程组的参数数据 tensor 或 object 进行计算处理。
+在集群上，对多设备的进程组的参数数据 tensor 或 object 进行计算处理，包括规约、聚合、广播、分发等。
 
 .. csv-table::
     :header: "API 名称", "API 功能"
     :widths: 20, 50
 
-
-    " :ref:`reduce <cn_api_distributed_reduce>` ", "规约，规约进程组内的 tensor，返回结果至指定进程"
-    " :ref:`ReduceOp <cn_api_distributed_ReduceOp>` ", "规约，指定逐元素规约操作"
-    " :ref:`all_reduce <cn_api_distributed_all_reduce>` ", "组规约，规约进程组内的 tensor，结果广播至每个进程"
-    " :ref:`all_gather <cn_api_distributed_all_gather>` ", "组聚合，聚合进程组内的 tensor，结果广播至每个进程"
-    " :ref:`all_gather_object <cn_api_distributed_all_gather_object>` ", "组聚合，聚合进程组内的 object，结果广播至每个进程"
-    " :ref:`alltoall <cn_api_distributed_alltoall>` ", "分发 tensor 列表到每个进程并进行聚合"
-    " :ref:`alltoall_single <cn_api_distributed_alltoall_single>` ", "分发单个 tensor 到每个进程并聚合至目标 tensor"
-    " :ref:`broadcast <cn_api_distributed_broadcast>` ", "广播一个 tensor 到每个进程"
-    " :ref:`scatter <cn_api_distributed_scatter>` ", "分发 tensor 到每个进程"
-    " :ref:`split <cn_api_distributed_split>` ", "切分参数到多个设备"
-    " :ref:`barrier <cn_api_distributed_barrier>` ", "同步路障，进行阻塞操作，实现组内所有进程的同步"
-    " :ref:`send <cn_api_distributed_send>` ", "发送一个 tensor 到指定的进程"
+    " :ref:`ReduceOp <cn_api_distributed_ReduceOp>` ", "规约操作的类型"
+    " :ref:`reduce <cn_api_distributed_reduce>` ", "规约，规约进程组内的一个 tensor，随后将结果发送到指定进程"
+    " :ref:`all_reduce <cn_api_distributed_all_reduce>` ", "组规约，规约进程组内的 tensor，随后将结果发送到每个进程"
+    " :ref:`all_gather <cn_api_distributed_all_gather>` ", "组聚合，聚合进程组内的 tensor，随后将结果发送到每个进程"
+    " :ref:`all_gather_object <cn_api_distributed_all_gather_object>` ", "组聚合，聚合进程组内的 object，随后将结果发送到每个进程"
+    " :ref:`alltoall <cn_api_distributed_alltoall>` ", "将一组 tensor 分发到每个进程并进行聚合"
+    " :ref:`alltoall_single <cn_api_distributed_alltoall_single>` ", "将一个 tensor 分发到每个进程并聚合到目标 tensor"
+    " :ref:`broadcast <cn_api_distributed_broadcast>` ", "将一个 tensor 发送到每个进程"
+    " :ref:`scatter <cn_api_distributed_scatter>` ", "将一组 tensor 分发到每个进程"
+    " :ref:`reduce_scatter <cn_api_distributed_reduce_scatter>` ", "规约一组 tensor，随后将规约结果分发到每个进程"
+    " :ref:`isend <cn_api_distributed_isend>` ", "异步发送一个 tensor 到指定进程"
+    " :ref:`irecv <cn_api_distributed_irecv>` ", "异步接收一个来自指定进程的 tensor"
+    " :ref:`send <cn_api_distributed_send>` ", "发送一个 tensor 到指定进程"
     " :ref:`recv <cn_api_distributed_recv>` ", "接收一个来自指定进程的 tensor"
-    " :ref:`isend <cn_api_paddle_distributed_isend>` ", "异步发送一个 tensor 到指定的进程"
-    " :ref:`irecv <cn_api_paddle_distributed_irecv>` ", "异步接收一个来自指定进程的 tensor"
-    " :ref:`reduce_scatter <cn_api_paddle_distributed_reduce_scatter>` ", "规约，然后将 tensor 列表分散到组中的所有进程上"
+    " :ref:`barrier <cn_api_distributed_barrier>` ", "同步路障，阻塞操作以实现组内进程同步"
+
+.. _05:
+
+Stream 集合通信高级 API
+::::::::::::::::::::::::
+
+paddle.distributed.stream 在集合通信 API 的基础上，提供更统一的语义和对计算流的更精细的控制能力，有助于在特定场景下提高性能。
+
+.. csv-table::
+    :header: "API 名称", "API 功能"
+    :widths: 30, 50
+
+
+    " :ref:`stream.reduce <cn_api_distributed_stream_reduce>` ", "规约，规约进程组内的 tensor，随后将结果发送到指定进程"
+    " :ref:`stream.all_reduce <cn_api_distributed_stream_all_reduce>` ", "组规约，规约进程组内的 tensor，随后将结果发送到每个进程"
+    " :ref:`stream.all_gather <cn_api_distributed_stream_all_gather>` ", "组聚合，聚合进程组内的 tensor，随后将结果发送到每个进程"
+    " :ref:`stream.alltoall <cn_api_distributed_stream_alltoall>` ", "分发一组 tensor 到每个进程并进行聚合"
+    " :ref:`stream.alltoall_single <cn_api_distributed_stream_alltoall_single>` ", "分发一个 tensor 到每个进程并聚合到目标 tensor"
+    " :ref:`stream.broadcast <cn_api_distributed_stream_broadcast>` ", "发送一个 tensor 到每个进程"
+    " :ref:`stream.scatter <cn_api_distributed_stream_scatter>` ", "分发一个 tensor 到每个进程"
+    " :ref:`stream.reduce_scatter <cn_api_distributed_stream_reduce_scatter>` ", "规约一组 tensor，随后将规约结果分发到每个进程"
+    " :ref:`stream.send <cn_api_distributed_stream_send>` ", "发送一个 tensor 到指定进程"
+    " :ref:`stream.recv <cn_api_distributed_stream_recv>` ", "接收一个来自指定进程的 tensor"
 
-.. _05:
+.. _06:
 
diff --git a/docs/api/paddle/distributed/ReduceOp_cn.rst b/docs/api/paddle/distributed/ReduceOp_cn.rst
index 19f5bb54ddc..95a145b50f7 100644
--- a/docs/api/paddle/distributed/ReduceOp_cn.rst
+++ b/docs/api/paddle/distributed/ReduceOp_cn.rst
@@ -5,7 +5,7 @@ ReduceOp
 
 .. py:class:: paddle.distributed.ReduceOp()
 
-指定规约类操作的逐元素操作类型，需要是下述值之一：
+指定规约操作的类型，必须是下述值之一：
 
     ReduceOp.SUM
 
diff --git a/docs/api/paddle/distributed/all_gather_cn.rst b/docs/api/paddle/distributed/all_gather_cn.rst
index 14866b55f64..c9766cdb948 100644
--- a/docs/api/paddle/distributed/all_gather_cn.rst
+++ b/docs/api/paddle/distributed/all_gather_cn.rst
@@ -4,11 +4,12 @@ all_gather
 -------------------------------
 
 
-.. py:function:: paddle.distributed.all_gather(tensor_list, tensor, group=0)
+.. py:function:: paddle.distributed.all_gather(tensor_list, tensor, group=None, sync_op=True)
 
-进程组内所有进程的指定 tensor 进行聚合操作，并返回给所有进程聚合的结果。
-如下图所示，4 个 GPU 分别开启 1 个进程，每张卡上的数据用卡号代表，
-经过 all_gather 算子后，每张卡都会拥有所有卡的数据。
+组聚合，聚合进程组内的指定 tensor，随后将聚合后的 tensor 列表发送到每个进程。
+
+如下图所示，4 个 GPU 分别开启 1 个进程，进程拥有的数据用其在组内的 rank 表示。
+聚合操作后，每个进程都会得到所有进程拥有的数据。
 
 .. image:: ./img/allgather.png
   :width: 800
@@ -17,13 +18,14 @@ all_gather
 
 参数
 :::::::::
-    - **tensor_list** (list) - 操作的输出 Tensor 列表。列表中的每个元素均为 Tensor，每个 Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16、complex64、complex128。
-    - **tensor** (Tensor) - 操作的输入 Tensor。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16、complex64、complex128。
-    - **group** (int，可选) - 工作的进程组编号，默认为 0。
+    - **tensor_list** (List[Tensor]) - 用于保存聚合结果的 tensor 列表。若不为空，其中每个 tensor 的数据类型必须与输入的 tensor 保持一致。
+    - **tensor** (Tensor) - 待聚合的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16、complex64、complex128。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
 
 返回
 :::::::::
-无
+无返回值。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/all_gather_object_cn.rst b/docs/api/paddle/distributed/all_gather_object_cn.rst
index 21d265ebad7..fcf946135a9 100644
--- a/docs/api/paddle/distributed/all_gather_object_cn.rst
+++ b/docs/api/paddle/distributed/all_gather_object_cn.rst
@@ -4,22 +4,23 @@ all_gather_object
 -------------------------------
 
 
-.. py:function:: paddle.distributed.all_gather_object(object_list, object, group=0)
-
-进程组内所有进程指定的 picklable 对象进行聚合操作，并返回给所有进程聚合的结果。和 all_gather 类似，但可以传入自定义的 python 对象。
+.. py:function:: paddle.distributed.all_gather_object(object_list, obj, group=None)
 
 .. warning::
   该 API 只支持动态图模式。
 
+组聚合，聚合进程组内指定的 picklable 对象，随后将聚合后的对象列表发送到每个进程。
+过程与 ``all_gather`` 类似，但可以传入自定义的 python 对象。
+
 参数
 :::::::::
-    - **object_list** (list) - 操作的输出 Object 列表。
-    - **object** (Any) - 操作的输入 Object，需要保证输入自定义的 Object 是 picklable 的。
-    - **group** (int，可选) - 工作的进程组编号，默认为 0。
+    - **object_list** (List[Any]) - 用于保存聚合结果的列表。
+    - **obj** (Any) - 待聚合的对象。需要保证该对象是 picklable 的。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
 
 返回
 :::::::::
-无
+无返回值。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/all_reduce_cn.rst b/docs/api/paddle/distributed/all_reduce_cn.rst
index 1358c0daa97..de20a7a2039 100644
--- a/docs/api/paddle/distributed/all_reduce_cn.rst
+++ b/docs/api/paddle/distributed/all_reduce_cn.rst
@@ -4,11 +4,12 @@ all_reduce
 -------------------------------
 
 
-.. py:function:: paddle.distributed.all_reduce(tensor, op=ReduceOp.SUM, group=0)
+.. py:function:: paddle.distributed.all_reduce(tensor, op=ReduceOp.SUM, group=None, sync_op=True)
 
-进程组内所有进程的指定 tensor 进行归约操作，并返回给所有进程归约的结果。
-如下图所示，4 个 GPU 分别开启 1 个进程，每张卡上的数据用卡号代表，规约操作为求和，
-经过 all_reduce 算子后，每张卡都会拥有所有卡数据的总和。
+规约进程组内的一个 tensor，随后将结果发送到每个进程。
+
+如下图所示，4 个 GPU 分别开启 1 个进程，进程拥有的数据用其在组内的 rank 表示，规约操作为求和。
+规约操作后，每个进程都会得到所有进程数据的总和。
 
 .. image:: ./img/allreduce.png
   :width: 800
@@ -17,13 +18,14 @@ all_reduce
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 操作的输入 Tensor，同时也会将归约结果返回至此 Tensor 中。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-    - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD，可选) - 归约的具体操作，比如求和，取最大值，取最小值和求乘积，默认为求和归约。
-    - **group** (int，可选) - 工作的进程组编号，默认为 0。
+    - **tensor** (Tensor) - 输入的 tensor。返回结果也将保存到该 tensor 中。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD，可选) - 归约的操作类型，包括求和、取最大值、取最小值和求乘积。默认为求和。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
 
 返回
 :::::::::
-无
+返回 Task 实例。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/alltoall_cn.rst b/docs/api/paddle/distributed/alltoall_cn.rst
index 0b06a6518a5..97354561fc4 100644
--- a/docs/api/paddle/distributed/alltoall_cn.rst
+++ b/docs/api/paddle/distributed/alltoall_cn.rst
@@ -4,12 +4,14 @@ alltoall
 -------------------------------
 
 
-.. py:function:: paddle.distributed.alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True)
+.. py:function:: paddle.distributed.alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True)
 
-将 in_tensor_list 里面的 tensors 按照卡数均分并按照卡的顺序分发到所有参与的卡并将结果 tensors 汇总到 out_tensor_list。
-如下图所示，GPU0 卡的 in_tensor_list 会按照两张卡拆分成 0_0 和 0_1， GPU1 卡的 in_tensor_list 同样拆分成 1_0 和 1_1，经过 alltoall 算子后，
-GPU0 卡的 0_0 会发送给 GPU0，GPU0 卡的 0_1 会发送给 GPU1，GPU1 卡的 1_0 会发送给 GPU0，GPU1 卡的 1_1 会发送给 GPU1，所以 GPU0 卡的 out_tensor_list 包含 0_0 和 1_0，
-GPU1 卡的 out_tensor_list 包含 0_1 和 1_1。
+将 in_tensor_list 中的一组 tensor 分发到每个进程，随后在每个进程上将分发结果聚合到 out_tensor_list。
+
+如下图所示，2 个 GPU 分别开启 1 个进程，rank=0 的进程的 in_tensor_list 包含 0_0 和 0_1 两个 tensor，rank=1 的进程的 in_tensor_list 包含 1_0 和 1_1 两个 tensor。
+操作后，rank=0 的进程的 out_tensor_list 会包含 0_0 和 1_0 两个 tensor，rank=1 的进程的 out_tensor_list 会包含 0_1 和 1_1 两个 tensor。
+
+简单来说，该操作类似于 scatter + gather。更直观地，如果将全部进程上的数据看作一个矩阵，该操作类似于对矩阵进行转置。
 
 .. image:: ./img/alltoall.png
   :width: 800
@@ -18,14 +20,14 @@ GPU1 卡的 out_tensor_list 包含 0_1 和 1_1。
 
 参数
 :::::::::
-    - **in_tensor_list** (list) - 包含所有输入 Tensors 的一个列表。在列表里面的所有元素都必须是一个 Tensor，Tensor 的数据类型必须是 float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-    - **out_tensor_list** (Tensor) - 包含所有输出 Tensors 的一个列表。在列表里面的所有元素数据类型要和输入的 Tensors 数据类型一致。
-    - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认地全局组。默认值：None。
-    - **use_calc_stream** (bool，可选) - 标识使用计算流还是通信流。默认值：True。
+    - **in_tensor_list** (List[Tensor]) - 输入的 tensor 列表。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **out_tensor_list** (List[Tensor]) - 用于保存操作结果的 tensor 列表。其中每个 tensor 的数据类型必须与输入的 tensor 保持一致。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
 
 返回
 :::::::::
-无
+无返回值。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/alltoall_single_cn.rst b/docs/api/paddle/distributed/alltoall_single_cn.rst
index f5ef77a4227..b83894428cb 100644
--- a/docs/api/paddle/distributed/alltoall_single_cn.rst
+++ b/docs/api/paddle/distributed/alltoall_single_cn.rst
@@ -4,21 +4,25 @@ alltoall_single
 -------------------------------
 
 
-.. py:function:: alltoall_single(in_tensor, out_tensor, in_split_sizes=None, out_split_sizes=None, group=None, use_calc_stream=True)
+.. py:function:: paddle.distributed.alltoall_single(in_tensor, out_tensor, in_split_sizes=None, out_split_sizes=None, group=None, sync_op=True)
 
-将输入的 tensor 分发到所有进程，并将接收到的 tensor 聚合到 out_tensor 中。
+.. warning::
+  该 API 只支持动态图模式。
+
+将输入的 tensor 分发到每个进程，随后在每个进程上将分发结果聚合到 out_tensor 中。
 
 参数
 :::::::::
-    - in_tensor (Tensor): 输入的 tensor，其数据类型必须是 float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-    - out_tensor (Tensor): 输出的 tensor，其数据类型与输入的 tensor 一致。
-    - in_split_sizes (list[int]，可选): 对 in_tensor 的 dim[0] 进行切分的大小。若该参数未指定，in_tensor 将被均匀切分到各个进程中（需要确保 in_tensor 的大小能够被组中的进程数整除）。默认值：None。
-    - out_split_sizes (list[int]，可选): 对 out_tensor 的 dim[0] 进行切分的大小。若该参数未指定，out_tensor 将均匀地聚合来自各个进程的数据（需要确保 out_tensor 的大小能够被组中的进程数整除）。默认值：None。
-    - use_calc_stream (bool，可选) - 标识使用计算流（若为 True）还是通信流。默认值：True。
+    - **in_tensor** (Tensor) - 输入的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **out_tensor** (Tensor) - 用于保存操作结果的 tensor，数据类型必须与输入的 tensor 保持一致。
+    - **in_split_sizes** (List[int]，可选) - 对 in_tensor 的 dim[0] 进行切分的大小。默认为 None，即将 in_tensor 均匀地分发到各个进程中（需要确保 in_tensor 的大小能够被组中的进程数整除）。
+    - **out_split_sizes** (List[int]，可选) - 对 out_tensor 的 dim[0] 进行切分的大小。默认为 None，即 out_tensor 将均匀地聚合来自各个进程的数据（需要确保 out_tensor 的大小能够被组中的进程数整除）。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
 
 返回
 :::::::::
-若 use_calc_stream=True，无返回值；若 use_calc_stream=False，返回一个 Task。
+若为同步操作，无返回值；若为异步操作，返回 Task 实例。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/barrier_cn.rst b/docs/api/paddle/distributed/barrier_cn.rst
index b61554060e2..d2ea046feeb 100644
--- a/docs/api/paddle/distributed/barrier_cn.rst
+++ b/docs/api/paddle/distributed/barrier_cn.rst
@@ -4,13 +4,13 @@ barrier
 -------------------------------
 
 
-.. py:function:: paddle.distributed.barrier(group=0)
+.. py:function:: paddle.distributed.barrier(group=None)
 
 同步进程组内的所有进程。
 
 参数
 :::::::::
-    - **group** (int，可选) - 工作的进程组编号，默认为 0。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
 
 返回
 :::::::::
diff --git a/docs/api/paddle/distributed/broadcast_cn.rst b/docs/api/paddle/distributed/broadcast_cn.rst
index 5149b1483cc..a70050e8e6b 100644
--- a/docs/api/paddle/distributed/broadcast_cn.rst
+++ b/docs/api/paddle/distributed/broadcast_cn.rst
@@ -4,10 +4,12 @@ broadcast
 -------------------------------
 
 
-.. py:function:: paddle.distributed.broadcast(tensor, src, group=0)
+.. py:function:: paddle.distributed.broadcast(tensor, src, group=None, sync_op=True)
 
-广播一个 Tensor 给其他所有进程。
-如下图所示，4 个 GPU 分别开启 1 个进程，GPU0 卡拥有数据，经过 broadcast 算子后，会将这个数据传播到所有卡上。
+将一个 tensor 发送到每个进程。
+
+如下图所示，4 个 GPU 分别开启 1 个进程，rank=0 的进程拥有数据 0。
+广播操作后，数据 0 会被发送到所有进程上。
 
 .. image:: ./img/broadcast.png
   :width: 800
@@ -16,13 +18,16 @@ broadcast
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 如果当前进程编号是源，那么这个 Tensor 变量将被发送给其他进程，否则这个 Tensor 将接收源发送过来的数据。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-    - **src** (int) - 发送源的进程编号。
-    - **group** (int，可选) - 工作的进程组编号，默认为 0。
+    - **tensor** (Tensor) - 在目标进程上为待广播的 tensor，在其他进程上为用于接收广播结果的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **src** (int) - 目标进程的 rank，该进程传入的 tensor 将被发送到其他进程上。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
 
 返回
 :::::::::
-无
+动态图模式下，若为同步操作，返回 None；若为异步操作，返回 Task 实例。
+
+静态图模式下，返回 None。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/irecv_cn.rst b/docs/api/paddle/distributed/irecv_cn.rst
index 956e1cc9c5b..696cabc8c49 100644
--- a/docs/api/paddle/distributed/irecv_cn.rst
+++ b/docs/api/paddle/distributed/irecv_cn.rst
@@ -1,26 +1,26 @@
-.. _cn_api_paddle_distributed_irecv:
+.. _cn_api_distributed_irecv:
 
 irecv
 -------------------------------
 
 
 .. py:function:: paddle.distributed.irecv(tensor, src=None, group=None)
-异步接受发送来的 tensor。
+
+.. warning::
+  该 API 只支持动态图模式。
+
+异步接收一个来自指定进程的 tensor。
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 要接受的张量。其数据类型应为 float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-    - **src** (int) - 接受节点的全局 rank 号。
-    - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认的全局组。默认值：None。
+    - **tensor** (Tensor) - 用于接收数据的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **src** (int) - 目标进程的 rank，将接收来自该进程的 tensor。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
 
 
 返回
 :::::::::
-返回 Task。
-
-注意
-:::::::::
-当前只支持动态图
+返回 Task 实例。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/isend_cn.rst b/docs/api/paddle/distributed/isend_cn.rst
index 3e9d87fada5..73645fda513 100644
--- a/docs/api/paddle/distributed/isend_cn.rst
+++ b/docs/api/paddle/distributed/isend_cn.rst
@@ -1,27 +1,26 @@
-.. _cn_api_paddle_distributed_isend:
+.. _cn_api_distributed_isend:
 
 isend
 -------------------------------
 
 
 .. py:function:: paddle.distributed.isend(tensor, dst, group=None)
-异步的将 ``tensor`` 发送到指定的 rank 进程上。
 
-参数
-:::::::::
-    - **tensor** (Tensor) - 要发送的张量。其数据类型应为 float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-    - **dst** (int) - 目标节点的全局 rank 号。
-    - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认的全局组。默认值：None。
+.. warning::
+  该 API 只支持动态图模式。
 
+异步发送一个 tensor 到指定进程。
 
-返回
+参数
 :::::::::
-返回 Task。
+    - **tensor** (Tensor) - 待发送的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **dst** (int) - 目标进程的 rank，传入的 tensor 将发送到该进程。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
 
 
-注意
+返回
 :::::::::
-当前只支持动态图
+返回 Task 实例。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/recv_cn.rst b/docs/api/paddle/distributed/recv_cn.rst
index 0b1c7d4da93..c9d4c0906ff 100644
--- a/docs/api/paddle/distributed/recv_cn.rst
+++ b/docs/api/paddle/distributed/recv_cn.rst
@@ -4,20 +4,22 @@ recv
 -------------------------------
 
 
-.. py:function:: paddle.distributed.recv(tensor, src=0, group=None, use_calc_stream=True)
+.. py:function:: paddle.distributed.recv(tensor, src=0, group=None, sync_op=True)
 
-接收一个来自指定发送者的 tensor。
+接收一个来自指定进程的 tensor。
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 接收数据的 Tensor。数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-    - **src** (int) - 发送者的标识符。
-    - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认地全局组。默认值：None。
-    - **use_calc_stream** (bool，可选) - 标识使用计算流还是通信流。默认值：True。
+    - **tensor** (Tensor) - 用于接收数据的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **src** (int，可选) - 目标进程的 rank，将接收来自该进程的 tensor。默认为 0，即接收来自 rank=0 的进程的 tensor。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
 
 返回
 :::::::::
-无
+动态图模式下，若为同步操作，无返回值；若为异步操作，返回 Task 实例。
+
+静态图模式下，无返回值。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/reduce_cn.rst b/docs/api/paddle/distributed/reduce_cn.rst
index 3dea1e949bc..5df20ebb1d7 100644
--- a/docs/api/paddle/distributed/reduce_cn.rst
+++ b/docs/api/paddle/distributed/reduce_cn.rst
@@ -4,11 +4,12 @@ reduce
 -------------------------------
 
 
-.. py:function:: paddle.distributed.reduce(tensor, dst, op=ReduceOp.SUM, group=0)
+.. py:function:: paddle.distributed.reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True)
 
-进程组内所有进程的指定 tensor 进行归约操作，并返回给所有进程归约的结果。
-如下图所示，4 个 GPU 分别开启 1 个进程，每张卡上的数据用卡号代表，reduce 的目标是第 0 张卡，
-规约操作是求和，经过 reduce 操作后，第 0 张卡会得到所有卡数据的总和。
+规约进程组内的一个 tensor，随后将结果发送到指定进程。
+
+如下图所示，4 个 GPU 分别开启 1 个进程，进程拥有的数据用其在组内的 rank 表示，规约的目标是 rank=0 的进程，规约操作为求和。
+规约操作后，rank=0 的进程会得到所有进程数据的总和。
 
 .. image:: ./img/reduce.png
   :width: 800
@@ -17,14 +18,17 @@ reduce
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 操作的输入 Tensor，结果返回至目标进程号的 Tensor 中。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-    - **dst** (int) - 返回操作结果的目标进程编号。
-    - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD，可选) - 归约的具体操作，比如求和，取最大值，取最小值和求乘积，默认为求和归约。
-    - **group** (int，可选) - 工作的进程组编号，默认为 0。
+    - **tensor** (Tensor) - 输入的 tensor。在目标进程上，返回结果将保存到该 tensor 中。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **dst** (int) - 目标进程的 rank，规约结果将发送到该进程。
+    - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD，可选) - 归约的操作类型，包括求和、取最大值、取最小值和求乘积。默认为求和。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
 
 返回
 :::::::::
-无
+动态图模式下，若为同步操作，无返回值；若为异步操作，返回 Task 实例。
+
+静态图模式下，无返回值。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/reduce_scatter_cn.rst b/docs/api/paddle/distributed/reduce_scatter_cn.rst
index 9b45edf3a06..87f5bb3a3d9 100644
--- a/docs/api/paddle/distributed/reduce_scatter_cn.rst
+++ b/docs/api/paddle/distributed/reduce_scatter_cn.rst
@@ -1,28 +1,28 @@
-.. _cn_api_paddle_distributed_reduce_scatter:
+.. _cn_api_distributed_reduce_scatter:
 
 reduce_scatter
 -------------------------------
 
 
-.. py:function:: paddle.distributed.reduce_scatter(tensor, tensor_list, op=ReduceOp.SUM, group=None, use_calc_stream=True)
-规约，然后将张量列表分散到组中的所有进程上
+.. py:function:: paddle.distributed.reduce_scatter(tensor, tensor_list, op=ReduceOp.SUM, group=None, sync_op=True)
+
+.. warning::
+  该 API 只支持动态图模式。
+
+规约一组 tensor，随后将规约结果分发到每个进程。
 
 参数
 :::::::::
-    - **tensor** (Tensor) – 输出的张量。数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-    - **tensor_list** (list(Tensor)) – 归约和切分的张量列表。数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-    - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD) – 操作类型，默认 ReduceOp.SUM。
-    - **group**: (Group, 可选) – 通信组；如果是 None，则使用默认通信组。
-    - **use_calc_stream**: (bool, 可选) – 决定是在计算流还是通信流上做该通信操作；默认为 True，表示在计算流。
+    - **tensor** (Tensor) - 用于接收数据的 tensor，数据类型必须与输入的 tensor 列表保持一致。
+    - **tensor_list** (List[Tensor]) - 将被规约和分发的 tensor 列表。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD，可选) - 归约的操作类型，包括求和、取最大值、取最小值和求乘积。默认为求和。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
 
 
 返回
 :::::::::
-返回 Task。
-
-注意
-:::::::::
-当前只支持动态图
+若为同步操作，无返回值；若为异步操作，返回 Task 实例。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/scatter_cn.rst b/docs/api/paddle/distributed/scatter_cn.rst
index d68309b8bc0..8bcfd461fc8 100644
--- a/docs/api/paddle/distributed/scatter_cn.rst
+++ b/docs/api/paddle/distributed/scatter_cn.rst
@@ -4,11 +4,12 @@ scatter
 -------------------------------
 
 
-.. py:function:: paddle.distributed.scatter(tensor, tensor_list=None, src=0, group=0)
+.. py:function:: paddle.distributed.scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True)
 
-进程组内指定进程源的 tensor 列表分发到其他所有进程中。
-如下图所示，4 个 GPU 分别开启 1 个进程，scatter 的源选择为第 0 张卡，
-经过 scatter 算子后，会将第 0 张卡的数据平均分到所有卡上。
+将一组来自指定进程的 tensor 分发到每个进程。
+
+如下图所示，4 个 GPU 分别开启 1 个进程，将分发 rank=0 的进程拥有的数据。
+分发操作后，rank=0 的进程拥有的数据被平均分配到每个进程上。
 
 .. image:: ./img/scatter.png
   :width: 800
@@ -17,14 +18,17 @@ scatter
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 操作的输出 Tensor。Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-    - **tensor_list** (list，可选) - 操作的输入 Tensor 列表，默认为 None。列表中的每个元素均为 Tensor，每个 Tensor 的数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-    - **src** (int，可选) - 操作的源进程号，该进程号的 Tensor 列表将分发到其他进程中。默认为 0。
-    - **group** (int，可选) - 工作的进程组编号，默认为 0。
+    - **tensor** (Tensor) - 用于接收数据的 tensor，数据类型必须与输入的 tensor 列表保持一致。
+    - **tensor_list** (List[Tensor]，可选) - 将被分发的 tensor 列表。默认为 None。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **src** (int，可选) - 目标进程的 rank，该进程的 tensor 列表将被分发到其他进程中。默认为 0，即分发 rank=0 的进程上的 tensor 列表。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
 
 返回
 :::::::::
-无
+动态图模式下，若为同步操作，无返回值；若为异步操作，返回 Task 实例。
+
+静态图模式下，无返回值。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/send_cn.rst b/docs/api/paddle/distributed/send_cn.rst
index 75ac348a073..3dfb79fdf35 100644
--- a/docs/api/paddle/distributed/send_cn.rst
+++ b/docs/api/paddle/distributed/send_cn.rst
@@ -4,20 +4,22 @@ send
 -------------------------------
 
 
-.. py:function:: paddle.distributed.send(tensor, dst=0, group=None, use_calc_stream=True)
+.. py:function:: paddle.distributed.send(tensor, dst=0, group=None, sync_op=True)
 
-发送 tensor 到指定接收者。
+发送一个 tensor 到指定进程。
 
 参数
 :::::::::
-    - **tensor** (Tensor) - 需要发送的 Tensor。数据类型为：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-    - **dst** (int) - 接收者的标识符。
-    - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认地全局组。默认值：None。
-    - **use_calc_stream** (bool，可选) - 标识使用计算流还是通信流。默认值：True。
+    - **tensor** (Tensor) - 待发送的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **dst** (int，可选) - 目标进程的 rank，传入的 tensor 将发送到该进程。默认为 0，即发送到 rank=0 的进程。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
 
 返回
 :::::::::
-无
+动态图模式下，若为同步操作，无返回值；若为异步操作，返回 Task 实例。
+
+静态图模式下，无返回值。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/all_gather_cn.rst b/docs/api/paddle/distributed/stream/all_gather_cn.rst
new file mode 100644
index 00000000000..975dd7b7cba
--- /dev/null
+++ b/docs/api/paddle/distributed/stream/all_gather_cn.rst
@@ -0,0 +1,32 @@
+.. _cn_api_distributed_stream_all_gather:
+
+all_gather
+-------------------------------
+
+
+.. py:function:: paddle.distributed.stream.all_gather(tensor_or_tensor_list, tensor, group=None, sync_op=True, use_calc_stream=False)
+
+.. warning::
+  该 API 只支持动态图模式。
+
+组聚合，聚合进程组内的指定 tensor，随后将聚合结果发送到每个进程。
+
+参见 :ref:`paddle.distributed.all_gather <cn_api_distributed_all_gather>`。
+
+参数
+:::::::::
+    - **tensor_or_tensor_list** (Tensor|List[Tensor]) - 用于保存聚合结果。
+      若为 tensor，该 tensor 的大小必须与所有待聚合的 tensor 沿 dim[0] 拼接后的大小相同。
+      若为 tensor 列表，其中每个 tensor 的数据类型必须与输入的 tensor 保持一致。
+    - **tensor** (Tensor) - 待聚合的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
+    - **use_calc_stream** (bool，可选) - 该操作是否在计算流上进行。默认为 False，即不在计算流上进行。该参数旨在提高同步操作的性能，请确保在充分了解其含义的情况下调整该参数的值。
+
+返回
+:::::::::
+返回 Task 实例。
+
+代码示例
+:::::::::
+COPY-FROM: paddle.distributed.stream.all_gather
diff --git a/docs/api/paddle/distributed/stream/all_reduce_cn.rst b/docs/api/paddle/distributed/stream/all_reduce_cn.rst
new file mode 100644
index 00000000000..0c7954cc546
--- /dev/null
+++ b/docs/api/paddle/distributed/stream/all_reduce_cn.rst
@@ -0,0 +1,30 @@
+.. _cn_api_distributed_stream_all_reduce:
+
+all_reduce
+-------------------------------
+
+
+.. py:function:: paddle.distributed.stream.all_reduce(tensor, op=ReduceOp.SUM, group=None, sync_op=True, use_calc_stream=False)
+
+.. warning::
+  该 API 只支持动态图模式。
+
+组规约，规约进程组内的一个 tensor，随后将结果发送到每个进程。
+
+参见 :ref:`paddle.distributed.all_reduce <cn_api_distributed_all_reduce>`。
+
+参数
+:::::::::
+    - **tensor** (Tensor) - 输入的 tensor。返回结果也将保存到该 tensor 中。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD，可选) - 归约的操作类型，包括求和、取最大值、取最小值和求乘积。默认为求和。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
+    - **use_calc_stream** (bool，可选) - 该操作是否在计算流上进行。默认为 False，即不在计算流上进行。该参数旨在提高同步操作的性能，请确保在充分了解其含义的情况下调整该参数的值。
+
+返回
+:::::::::
+返回 Task 实例。
+
+代码示例
+:::::::::
+COPY-FROM: paddle.distributed.stream.all_reduce
diff --git a/docs/api/paddle/distributed/stream/alltoall_cn.rst b/docs/api/paddle/distributed/stream/alltoall_cn.rst
new file mode 100644
index 00000000000..a5854870ee8
--- /dev/null
+++ b/docs/api/paddle/distributed/stream/alltoall_cn.rst
@@ -0,0 +1,33 @@
+.. _cn_api_distributed_stream_alltoall:
+
+alltoall
+-------------------------------
+
+
+.. py:function:: paddle.distributed.stream.alltoall(out_tensor_or_tensor_list, in_tensor_or_tensor_list, group=None, sync_op=True, use_calc_stream=False)
+
+.. warning::
+  该 API 只支持动态图模式。
+
+将一个或一组 tensor 分发到每个进程，随后在每个进程上聚合分发结果。
+
+参见 :ref:`paddle.distributed.alltoall <cn_api_distributed_alltoall>`。
+
+参数
+:::::::::
+    - **out_tensor_or_tensor_list** (Tensor|List[Tensor]) - 用于保存操作结果。
+      若输入数据为 tensor，该参数必须为 tensor，且大小与所有输入的 tensor 沿 dim[0] 拼接后的大小相同。
+      若输入数据为 tensor 列表，该参数必须为 tensor 列表，其中每个 tensor 的数据类型必须与输入的 tensor 保持一致。
+    - **in_tensor_or_tensor_list** (Tensor|List[Tensor]) - 输入的数据，可以是一个 tensor 或 tensor 列表。
+      支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
+    - **use_calc_stream** (bool，可选) - 该操作是否在计算流上进行。默认为 False，即不在计算流上进行。该参数旨在提高同步操作的性能，请确保在充分了解其含义的情况下调整该参数的值。
+
+返回
+:::::::::
+返回 Task 实例。
+
+代码示例
+:::::::::
+COPY-FROM: paddle.distributed.stream.alltoall
diff --git a/docs/api/paddle/distributed/stream/alltoall_single_cn.rst b/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
new file mode 100644
index 00000000000..f199c1ae03a
--- /dev/null
+++ b/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
@@ -0,0 +1,32 @@
+.. _cn_api_distributed_stream_alltoall_single:
+
+alltoall_single
+-------------------------------
+
+
+.. py:function:: paddle.distributed.stream.alltoall_single(out_tensor, in_tensor, out_split_sizes=None, in_split_sizes=None, group=None, sync_op=True, use_calc_stream=False)
+
+.. warning::
+  该 API 只支持动态图模式。
+
+将一个 tensor 分发到每个进程，随后在每个进程上聚合分发结果。与 ``alltoall`` 相比，可以更精细地控制分发过程。
+
+参见 :ref:`paddle.distributed.alltoall_single <cn_api_distributed_alltoall_single>`。
+
+参数
+:::::::::
+    - **out_tensor** (Tensor) - 用于保存操作结果的 tensor，数据类型必须与输入的 tensor 保持一致。
+    - **in_tensor** (Tensor) - 输入的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **out_split_sizes** (List[int]，可选) - 对 out_tensor 的 dim[0] 进行切分的大小。默认为 None，即 out_tensor 将均匀地聚合来自各个进程的数据（需要确保 out_tensor 的大小能够被组中的进程数整除）。
+    - **in_split_sizes** (List[int]，可选) - 对 in_tensor 的 dim[0] 进行切分的大小。默认为 None，即将 in_tensor 均匀地分发到各个进程中（需要确保 in_tensor 的大小能够被组中的进程数整除）。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
+    - **use_calc_stream** (bool，可选) - 该操作是否在计算流上进行。默认为 False，即不在计算流上进行。该参数旨在提高同步操作的性能，请确保在充分了解其含义的情况下调整该参数的值。
+
+返回
+:::::::::
+返回 Task 实例。
+
+代码示例
+:::::::::
+COPY-FROM: paddle.distributed.stream.alltoall_single
diff --git a/docs/api/paddle/distributed/stream/broadcast_cn.rst b/docs/api/paddle/distributed/stream/broadcast_cn.rst
new file mode 100644
index 00000000000..216faccc878
--- /dev/null
+++ b/docs/api/paddle/distributed/stream/broadcast_cn.rst
@@ -0,0 +1,30 @@
+.. _cn_api_distributed_stream_broadcast:
+
+broadcast
+-------------------------------
+
+
+.. py:function:: paddle.distributed.stream.broadcast(tensor, src=0, group=None, sync_op=True, use_calc_stream=False)
+
+.. warning::
+  该 API 只支持动态图模式。
+
+将一个 tensor 发送到每个进程。
+
+参见 :ref:`paddle.distributed.broadcast <cn_api_distributed_broadcast>`。
+
+参数
+:::::::::
+    - **tensor** (Tensor) - 在目标进程上为待广播的 tensor，在其他进程上为用于接收广播结果的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **src** (int，可选) - 目标进程的 rank，该进程传入的 tensor 将被发送到其他进程上。默认为 0，即发送 rank=0 的进程传入的 tensor。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
+    - **use_calc_stream** (bool，可选) - 该操作是否在计算流上进行。默认为 False，即不在计算流上进行。该参数旨在提高同步操作的性能，请确保在充分了解其含义的情况下调整该参数的值。
+
+返回
+:::::::::
+返回 Task 实例。
+
+代码示例
+:::::::::
+COPY-FROM: paddle.distributed.stream.broadcast
diff --git a/docs/api/paddle/distributed/stream/recv_cn.rst b/docs/api/paddle/distributed/stream/recv_cn.rst
new file mode 100644
index 00000000000..0bc6dc43a8b
--- /dev/null
+++ b/docs/api/paddle/distributed/stream/recv_cn.rst
@@ -0,0 +1,30 @@
+.. _cn_api_distributed_stream_recv:
+
+recv
+-------------------------------
+
+
+.. py:function:: paddle.distributed.stream.recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False)
+
+.. warning::
+  该 API 只支持动态图模式。
+
+接收一个来自指定进程的 tensor。
+
+参见 :ref:`paddle.distributed.recv <cn_api_distributed_recv>`。
+
+参数
+:::::::::
+    - **tensor** (Tensor) - 用于接收数据的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **src** (int，可选) - 目标进程的 rank，将接收来自该进程的 tensor。默认为 0，即接收来自 rank=0 的进程的 tensor。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
+    - **use_calc_stream** (bool，可选) - 该操作是否在计算流上进行。默认为 False，即不在计算流上进行。该参数旨在提高同步操作的性能，请确保在充分了解其含义的情况下调整该参数的值。
+
+返回
+:::::::::
+返回 Task 实例。
+
+代码示例
+:::::::::
+COPY-FROM: paddle.distributed.stream.recv
diff --git a/docs/api/paddle/distributed/stream/reduce_cn.rst b/docs/api/paddle/distributed/stream/reduce_cn.rst
new file mode 100644
index 00000000000..fd9e1676791
--- /dev/null
+++ b/docs/api/paddle/distributed/stream/reduce_cn.rst
@@ -0,0 +1,31 @@
+.. _cn_api_distributed_stream_reduce:
+
+reduce
+-------------------------------
+
+
+.. py:function:: paddle.distributed.stream.reduce(tensor, dst=0, op=ReduceOp.SUM, group=None, sync_op=True, use_calc_stream=False)
+
+.. warning::
+  该 API 只支持动态图模式。
+
+规约进程组内的一个 tensor，随后将结果发送到指定进程。
+
+参见 `paddle.distributed.reduce`。
+
+参数
+:::::::::
+    - **tensor** (Tensor) - 输入的 tensor。在目标进程上，返回结果将保存到该 tensor 中。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **dst** (int，可选) - 目标进程的 rank，规约结果将发送到该进程。默认为 0，即结果将发送到 rank=0 的进程。
+    - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD，可选) - 归约的操作类型，包括求和、取最大值、取最小值和求乘积。默认为求和。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
+    - **use_calc_stream** (bool，可选) - 该操作是否在计算流上进行。默认为 False，即不在计算流上进行。该参数旨在提高同步操作的性能，请确保在充分了解其含义的情况下调整该参数的值。
+
+返回
+:::::::::
+返回 Task 实例。
+
+代码示例
+:::::::::
+COPY-FROM: paddle.distributed.stream.reduce
diff --git a/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst b/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
new file mode 100644
index 00000000000..1e61f06e589
--- /dev/null
+++ b/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
@@ -0,0 +1,34 @@
+.. _cn_api_distributed_stream_reduce_scatter:
+
+reduce_scatter
+-------------------------------
+
+
+.. py:function:: paddle.distributed.stream.reduce_scatter(tensor, tensor_or_tensor_list, op=ReduceOp.SUM, group=None, sync_op=True, use_calc_stream=False)
+
+.. warning::
+  该 API 只支持动态图模式。
+
+规约一组 tensor，随后将规约结果分发到每个进程。
+
+参见 `paddle.distributed.reduce_scatter`。
+
+参数
+:::::::::
+    - **tensor** (Tensor) – 用于接收数据的 tensor，数据类型必须与输入保持一致。
+    - **tensor_or_tensor_list** (Tensor|List[Tensor]) - 输入的数据，可以是一个 tensor 或 tensor 列表。
+    若为 tensor，该 tensor 的大小必须与所有用于接收数据的 tensor 沿 dim[0] 拼接后的大小相同。
+    支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD，可选) - 归约的操作类型，包括求和、取最大值、取最小值和求乘积。默认为求和。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
+    - **use_calc_stream** (bool，可选) - 该操作是否在计算流上进行。默认为 False，即不在计算流上进行。该参数旨在提高同步操作的性能，请确保在充分了解其含义的情况下调整该参数的值。
+
+
+返回
+:::::::::
+返回 Task 实例。
+
+代码示例
+:::::::::
+COPY-FROM: paddle.distributed.stream.reduce_scatter
diff --git a/docs/api/paddle/distributed/stream/scatter_cn.rst b/docs/api/paddle/distributed/stream/scatter_cn.rst
new file mode 100644
index 00000000000..d51fde74779
--- /dev/null
+++ b/docs/api/paddle/distributed/stream/scatter_cn.rst
@@ -0,0 +1,34 @@
+.. _cn_api_distributed_stream_scatter:
+
+scatter
+-------------------------------
+
+
+.. py:function:: paddle.distributed.stream.scatter(tensor, tensor_or_tensor_list=None, src=0, group=None, sync_op=True, use_calc_stream=False)
+
+.. warning::
+  该 API 只支持动态图模式。
+
+将一组来自指定进程的 tensor 分发到每个进程。
+
+参见 `paddle.distributed.scatter`。
+
+参数
+:::::::::
+    - **tensor** (Tensor) - 用于接收数据的 tensor，数据类型必须与输入保持一致。
+    - **tensor_or_tensor_list** (Tensor|List[Tensor]，可选) - 待分发的数据，可以是一个 tensor 或 tensor 列表。
+    若为 tensor，该 tensor 的大小必须与所有用于接收数据的 tensor 沿 dim[0] 拼接后的大小相同。
+    支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    默认为 None，因为非目标进程上的该参数将被忽略。
+    - **src** (int，可选) - 目标进程的 rank，该进程的 tensor 列表将被分发到其他进程中。默认为 0，即分发 rank=0 的进程上的 tensor 列表。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
+    - **use_calc_stream** (bool，可选) - 该操作是否在计算流上进行。默认为 False，即不在计算流上进行。该参数旨在提高同步操作的性能，请确保在充分了解其含义的情况下调整该参数的值。
+
+返回
+:::::::::
+返回 Task 实例。
+
+代码示例
+:::::::::
+COPY-FROM: paddle.distributed.stream.scatter
diff --git a/docs/api/paddle/distributed/stream/send_cn.rst b/docs/api/paddle/distributed/stream/send_cn.rst
new file mode 100644
index 00000000000..b5219185b84
--- /dev/null
+++ b/docs/api/paddle/distributed/stream/send_cn.rst
@@ -0,0 +1,30 @@
+.. _cn_api_distributed_stream_send:
+
+send
+-------------------------------
+
+
+.. py:function:: paddle.distributed.stream.send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False)
+
+.. warning::
+  该 API 只支持动态图模式。
+
+发送一个 tensor 到指定进程。
+
+参见 `paddle.distributed.send`。
+
+参数
+:::::::::
+    - **tensor** (Tensor) - 待发送的 Tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **dst** (int，可选) - 目标进程的 rank，传入的 tensor 将发送到该进程。默认为 0，即发送到 rank=0 的进程。
+    - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
+    - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
+    - **use_calc_stream** (bool，可选) - 该操作是否在计算流上进行。默认为 False，即不在计算流上进行。该参数旨在提高同步操作的性能，请确保在充分了解其含义的情况下调整该参数的值。
+
+返回
+:::::::::
+返回 Task 实例。
+
+代码示例
+:::::::::
+COPY-FROM: paddle.distributed.stream.send

From a9b1d0ab6213286323700696f443ae81b62ca477 Mon Sep 17 00:00:00 2001
From: Wen Sun <syl1887415157@126.com>
Date: Tue, 11 Oct 2022 16:31:12 +0800
Subject: [PATCH 06/12] docs(distributed/collective): revision & update styles

---
 docs/api/paddle/distributed/Overview_cn.rst   | 20 +++++++++----------
 .../distributed/all_gather_object_cn.rst      |  6 +++---
 .../paddle/distributed/reduce_scatter_cn.rst  |  4 ++--
 .../distributed/stream/all_gather_cn.rst      | 12 +++++------
 .../distributed/stream/all_reduce_cn.rst      |  8 ++++----
 .../paddle/distributed/stream/alltoall_cn.rst | 13 ++++++------
 .../distributed/stream/alltoall_single_cn.rst |  8 ++++----
 .../distributed/stream/broadcast_cn.rst       |  6 +++---
 .../api/paddle/distributed/stream/recv_cn.rst |  6 +++---
 .../paddle/distributed/stream/reduce_cn.rst   |  6 +++---
 .../distributed/stream/reduce_scatter_cn.rst  | 10 +++++-----
 .../paddle/distributed/stream/scatter_cn.rst  | 12 +++++------
 .../api/paddle/distributed/stream/send_cn.rst |  6 +++---
 13 files changed, 58 insertions(+), 59 deletions(-)

diff --git a/docs/api/paddle/distributed/Overview_cn.rst b/docs/api/paddle/distributed/Overview_cn.rst
index 9ff00bc0708..221207315ab 100644
--- a/docs/api/paddle/distributed/Overview_cn.rst
+++ b/docs/api/paddle/distributed/Overview_cn.rst
@@ -82,12 +82,12 @@ paddle.distributed.fleet 是分布式训练的统一入口 API，用于配置分
     :widths: 20, 50
 
     " :ref:`ReduceOp <cn_api_distributed_ReduceOp>` ", "规约操作的类型"
-    " :ref:`reduce <cn_api_distributed_reduce>` ", "规约，规约进程组内的一个 tensor，随后将结果发送到指定进程"
-    " :ref:`all_reduce <cn_api_distributed_all_reduce>` ", "组规约，规约进程组内的 tensor，随后将结果发送到每个进程"
-    " :ref:`all_gather <cn_api_distributed_all_gather>` ", "组聚合，聚合进程组内的 tensor，随后将结果发送到每个进程"
-    " :ref:`all_gather_object <cn_api_distributed_all_gather_object>` ", "组聚合，聚合进程组内的 object，随后将结果发送到每个进程"
+    " :ref:`reduce <cn_api_distributed_reduce>` ", "规约进程组内的 tensor，随后将结果发送到指定进程"
+    " :ref:`all_reduce <cn_api_distributed_all_reduce>` ", "规约进程组内的 tensor，随后将结果发送到每个进程"
+    " :ref:`all_gather <cn_api_distributed_all_gather>` ", "聚合进程组内的 tensor，随后将结果发送到每个进程"
+    " :ref:`all_gather_object <cn_api_distributed_all_gather_object>` ", "聚合进程组内的 object，随后将结果发送到每个进程"
     " :ref:`alltoall <cn_api_distributed_alltoall>` ", "将一组 tensor 分发到每个进程并进行聚合"
-    " :ref:`alltoall_single <cn_api_distributed_alltoall_single>` ", "将一个 tensor 分发到每个进程并聚合到目标 tensor"
+    " :ref:`alltoall_single <cn_api_distributed_alltoall_single>` ", "将一个 tensor 分发到每个进程并进行聚合"
     " :ref:`broadcast <cn_api_distributed_broadcast>` ", "将一个 tensor 发送到每个进程"
     " :ref:`scatter <cn_api_distributed_scatter>` ", "将一组 tensor 分发到每个进程"
     " :ref:`reduce_scatter <cn_api_distributed_reduce_scatter>` ", "规约一组 tensor，随后将规约结果分发到每个进程"
@@ -106,14 +106,14 @@ paddle.distributed.stream 在集合通信 API 的基础上，提供更统一的
 
 .. csv-table::
     :header: "API 名称", "API 功能"
-    :widths: 30, 50
+    :widths: 25, 50
 
 
-    " :ref:`stream.reduce <cn_api_distributed_stream_reduce>` ", "规约，规约进程组内的 tensor，随后将结果发送到指定进程"
-    " :ref:`stream.all_reduce <cn_api_distributed_stream_all_reduce>` ", "组规约，规约进程组内的 tensor，随后将结果发送到每个进程"
-    " :ref:`stream.all_gather <cn_api_distributed_stream_all_gather>` ", "组聚合，聚合进程组内的 tensor，随后将结果发送到每个进程"
+    " :ref:`stream.reduce <cn_api_distributed_stream_reduce>` ", "规约进程组内的 tensor，随后将结果发送到指定进程"
+    " :ref:`stream.all_reduce <cn_api_distributed_stream_all_reduce>` ", "规约进程组内的 tensor，随后将结果发送到每个进程"
+    " :ref:`stream.all_gather <cn_api_distributed_stream_all_gather>` ", "聚合进程组内的 tensor，随后将结果发送到每个进程"
     " :ref:`stream.alltoall <cn_api_distributed_stream_alltoall>` ", "分发一组 tensor 到每个进程并进行聚合"
-    " :ref:`stream.alltoall_single <cn_api_distributed_stream_alltoall_single>` ", "分发一个 tensor 到每个进程并聚合到目标 tensor"
+    " :ref:`stream.alltoall_single <cn_api_distributed_stream_alltoall_single>` ", "分发一个 tensor 到每个进程并进行聚合"
     " :ref:`stream.broadcast <cn_api_distributed_stream_broadcast>` ", "发送一个 tensor 到每个进程"
     " :ref:`stream.scatter <cn_api_distributed_stream_scatter>` ", "分发一个 tensor 到每个进程"
     " :ref:`stream.reduce_scatter <cn_api_distributed_stream_reduce_scatter>` ", "规约一组 tensor，随后将规约结果分发到每个进程"
diff --git a/docs/api/paddle/distributed/all_gather_object_cn.rst b/docs/api/paddle/distributed/all_gather_object_cn.rst
index fcf946135a9..156d61555f7 100644
--- a/docs/api/paddle/distributed/all_gather_object_cn.rst
+++ b/docs/api/paddle/distributed/all_gather_object_cn.rst
@@ -6,12 +6,12 @@ all_gather_object
 
 .. py:function:: paddle.distributed.all_gather_object(object_list, obj, group=None)
 
-.. warning::
-  该 API 只支持动态图模式。
-
 组聚合，聚合进程组内指定的 picklable 对象，随后将聚合后的对象列表发送到每个进程。
 过程与 ``all_gather`` 类似，但可以传入自定义的 python 对象。
 
+.. warning::
+  该 API 只支持动态图模式。
+
 参数
 :::::::::
     - **object_list** (List[Any]) - 用于保存聚合结果的列表。
diff --git a/docs/api/paddle/distributed/reduce_scatter_cn.rst b/docs/api/paddle/distributed/reduce_scatter_cn.rst
index 87f5bb3a3d9..0e67eecf670 100644
--- a/docs/api/paddle/distributed/reduce_scatter_cn.rst
+++ b/docs/api/paddle/distributed/reduce_scatter_cn.rst
@@ -6,11 +6,11 @@ reduce_scatter
 
 .. py:function:: paddle.distributed.reduce_scatter(tensor, tensor_list, op=ReduceOp.SUM, group=None, sync_op=True)
 
+规约一组 tensor，随后将规约结果分发到每个进程。
+
 .. warning::
   该 API 只支持动态图模式。
 
-规约一组 tensor，随后将规约结果分发到每个进程。
-
 参数
 :::::::::
     - **tensor** (Tensor) – 用于接收数据的 tensor，数据类型必须与输入的 tensor 列表保持一致。
diff --git a/docs/api/paddle/distributed/stream/all_gather_cn.rst b/docs/api/paddle/distributed/stream/all_gather_cn.rst
index 975dd7b7cba..13171d318d6 100644
--- a/docs/api/paddle/distributed/stream/all_gather_cn.rst
+++ b/docs/api/paddle/distributed/stream/all_gather_cn.rst
@@ -6,18 +6,18 @@ all_gather
 
 .. py:function:: paddle.distributed.stream.all_gather(tensor_or_tensor_list, tensor, group=None, sync_op=True, use_calc_stream=False)
 
-.. warning::
-  该 API 只支持动态图模式。
-
-组聚合，聚合进程组内的指定 tensor，随后将聚合结果发送到每个进程。
+聚合进程组内的指定 tensor，随后将聚合结果发送到每个进程。
 
 参见 `paddle.distributed.all_gather`。
 
+.. warning::
+  该 API 只支持动态图模式。
+
 参数
 :::::::::
     - **tensor_or_tensor_list** (Tensor|List[Tensor]) - 用于保存聚合结果。
-    若为 tensor，该 tensor 的大小必须与所有待聚合的 tensor 沿 dim[0] 拼接后的大小相同。
-    若为 tensor 列表，其中每个 tensor 的数据类型必须与输入的 tensor 保持一致。
+      - 若为 tensor，该 tensor 的大小必须与所有待聚合的 tensor 沿 dim[0] 拼接后的大小相同。
+      - 若为 tensor 列表，其中每个 tensor 的数据类型必须与输入的 tensor 保持一致。
     - **tensor** (Tensor) - 待聚合的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
     - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
diff --git a/docs/api/paddle/distributed/stream/all_reduce_cn.rst b/docs/api/paddle/distributed/stream/all_reduce_cn.rst
index 0c7954cc546..21533429b4f 100644
--- a/docs/api/paddle/distributed/stream/all_reduce_cn.rst
+++ b/docs/api/paddle/distributed/stream/all_reduce_cn.rst
@@ -6,13 +6,13 @@ all_reduce
 
 .. py:function:: paddle.distributed.stream.all_reduce(tensor, op=ReduceOp.SUM, group=None, sync_op=True, use_calc_stream=False)
 
-.. warning::
-  该 API 只支持动态图模式。
-
-组规约，规约进程组内的一个 tensor，随后将结果发送到每个进程。
+规约进程组内的一个 tensor，随后将结果发送到每个进程。
 
 参见 `paddle.distributed.all_reduce`。
 
+.. warning::
+  该 API 只支持动态图模式。
+
 参数
 :::::::::
     - **tensor** (Tensor) - 输入的 tensor。返回结果也将保存到该 tensor 中。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
diff --git a/docs/api/paddle/distributed/stream/alltoall_cn.rst b/docs/api/paddle/distributed/stream/alltoall_cn.rst
index a5854870ee8..ee33f7ab1d2 100644
--- a/docs/api/paddle/distributed/stream/alltoall_cn.rst
+++ b/docs/api/paddle/distributed/stream/alltoall_cn.rst
@@ -6,20 +6,19 @@ alltoall
 
 .. py:function:: paddle.distributed.stream.alltoall(out_tensor_or_tensor_list, in_tensor_or_tensor_list, group=None, sync_op=True, use_calc_stream=False)
 
-.. warning::
-  该 API 只支持动态图模式。
-
 将一个或一组 tensor 分发到每个进程，随后在每个进程上聚合分发结果。
 
 参见 `paddle.distributed.alltoall`。
 
+.. warning::
+  该 API 只支持动态图模式。
+
 参数
 :::::::::
     - **out_tensor_or_tensor_list** (Tensor|List[Tensor]) - 用于保存操作结果。
-    若输入数据为 tensor，该参数必须为 tensor，且大小与所有输入的 tensor 沿 dim[0] 拼接后的大小相同。
-    若输入数据为 tensor 列表，该参数必须为 tensor 列表，其中每个 tensor 的数据类型必须与输入的 tensor 保持一致。
-    - **in_tensor_or_tensor_list** (Tensor|List[Tensor]) - 输入的数据，可以是一个 tensor 或 tensor 列表。
-    支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+      - 若输入数据为 tensor，该参数必须为 tensor，且大小与所有输入的 tensor 沿 dim[0] 拼接后的大小相同。
+      - 若输入数据为 tensor 列表，该参数必须为 tensor 列表，其中每个 tensor 的数据类型必须与输入的 tensor 保持一致。
+    - **in_tensor_or_tensor_list** (Tensor|List[Tensor]) - 输入的数据，可以是一个 tensor 或 tensor 列表。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
     - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
     - **use_calc_stream** (bool，可选) - 该操作是否在计算流上进行。默认为 False，即不在计算流上进行。该参数旨在提高同步操作的性能，请确保在充分了解其含义的情况下调整该参数的值。
diff --git a/docs/api/paddle/distributed/stream/alltoall_single_cn.rst b/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
index f199c1ae03a..6e745a7f65d 100644
--- a/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
+++ b/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
@@ -6,18 +6,18 @@ alltoall_single
 
 .. py:function:: paddle.distributed.stream.alltoall_single(out_tensor, in_tensor, out_split_sizes=None, in_split_sizes=None, group=None, sync_op=True, use_calc_stream=False)
 
-.. warning::
-  该 API 只支持动态图模式。
-
 将一个 tensor 分发到每个进程，随后在每个进程上聚合分发结果。与 ``alltoall`` 相比，可以更精细地控制分发过程。
 
 参见 `paddle.distributed.alltoall_single`。
 
+.. warning::
+  该 API 只支持动态图模式。
+
 参数
 :::::::::
     - **out_tensor** (Tensor): 用于保存操作结果的 tensor，数据类型必须与输入的 tensor 保持一致。
     - **in_tensor** (Tensor): 输入的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-    - **out_split_sizes** (List[int]，可选): 对 out_tensor 的 dim[0] 进行切分的大小。默认为 None，即 out_tensor 将均匀地聚合来自各个进程的数据（需要确保 out_tensor 的大小能够被组中的进程数整除）。
+    - **out_split_sizes** (List[int]，可选): 对 out_tensor 的 dim[0] 进行切分的大小。默认为 None，即 out_tensor 将均匀地聚合各个进程的数据（需要确保 out_tensor 的大小能够被组中的进程数整除）。
     - **in_split_sizes** (List[int]，可选): 对 in_tensor 的 dim[0] 进行切分的大小。默认为 None，即将 in_tensor 均匀地分发到各个进程中（需要确保 in_tensor 的大小能够被组中的进程数整除）。
     - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
     - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
diff --git a/docs/api/paddle/distributed/stream/broadcast_cn.rst b/docs/api/paddle/distributed/stream/broadcast_cn.rst
index 216faccc878..ddda102939f 100644
--- a/docs/api/paddle/distributed/stream/broadcast_cn.rst
+++ b/docs/api/paddle/distributed/stream/broadcast_cn.rst
@@ -6,13 +6,13 @@ broadcast
 
 .. py:function:: paddle.distributed.stream.broadcast(tensor, src=0, group=None, sync_op=True, use_calc_stream=False)
 
-.. warning::
-  该 API 只支持动态图模式。
-
 将一个 tensor 发送到每个进程。
 
 参见 `paddle.distributed.broadcast`。
 
+.. warning::
+  该 API 只支持动态图模式。
+
 参数
 :::::::::
     - **tensor** (Tensor) - 在目标进程上为待广播的 tensor，在其他进程上为用于接收广播结果的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
diff --git a/docs/api/paddle/distributed/stream/recv_cn.rst b/docs/api/paddle/distributed/stream/recv_cn.rst
index 0bc6dc43a8b..a92b75af795 100644
--- a/docs/api/paddle/distributed/stream/recv_cn.rst
+++ b/docs/api/paddle/distributed/stream/recv_cn.rst
@@ -6,13 +6,13 @@ recv
 
 .. py:function:: paddle.distributed.stream.recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False)
 
-.. warning::
-  该 API 只支持动态图模式。
-
 接收一个来自指定进程的 tensor。
 
 参见 `paddle.distributed.recv`。
 
+.. warning::
+  该 API 只支持动态图模式。
+
 参数
 :::::::::
     - **tensor** (Tensor) - 用于接收数据的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
diff --git a/docs/api/paddle/distributed/stream/reduce_cn.rst b/docs/api/paddle/distributed/stream/reduce_cn.rst
index fd9e1676791..f24322bf94e 100644
--- a/docs/api/paddle/distributed/stream/reduce_cn.rst
+++ b/docs/api/paddle/distributed/stream/reduce_cn.rst
@@ -6,13 +6,13 @@ reduce
 
 .. py:function:: paddle.distributed.stream.reduce(tensor, dst=0, op=ReduceOp.SUM, group=None, sync_op=True, use_calc_stream=False)
 
-.. warning::
-  该 API 只支持动态图模式。
-
 规约进程组内的一个 tensor，随后将结果发送到指定进程。
 
 参见 `paddle.distributed.reduce`。
 
+.. warning::
+  该 API 只支持动态图模式。
+
 参数
 :::::::::
     - **tensor** (Tensor) - 输入的 tensor。在目标进程上，返回结果将保存到该 tensor 中。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
diff --git a/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst b/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
index 1e61f06e589..139f787b535 100644
--- a/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
+++ b/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
@@ -6,19 +6,19 @@ reduce_scatter
 
 .. py:function:: paddle.distributed.stream.reduce_scatter(tensor, tensor_or_tensor_list, op=ReduceOp.SUM, group=None, sync_op=True, use_calc_stream=False)
 
-.. warning::
-  该 API 只支持动态图模式。
-
 规约一组 tensor，随后将规约结果分发到每个进程。
 
 参见 `paddle.distributed.reduce_scatter`。
 
+.. warning::
+  该 API 只支持动态图模式。
+
 参数
 :::::::::
     - **tensor** (Tensor) – 用于接收数据的 tensor，数据类型必须与输入保持一致。
     - **tensor_or_tensor_list** (Tensor|List[Tensor]) - 输入的数据，可以是一个 tensor 或 tensor 列表。
-    若为 tensor，该 tensor 的大小必须与所有用于接收数据的 tensor 沿 dim[0] 拼接后的大小相同。
-    支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+      - 若为 tensor，该 tensor 的大小必须与所有用于接收数据的 tensor 沿 dim[0] 拼接后的大小相同。
+      - 支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD，可选) - 归约的操作类型，包括求和、取最大值、取最小值和求乘积。默认为求和。
     - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
     - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
diff --git a/docs/api/paddle/distributed/stream/scatter_cn.rst b/docs/api/paddle/distributed/stream/scatter_cn.rst
index d51fde74779..38ad6ffa343 100644
--- a/docs/api/paddle/distributed/stream/scatter_cn.rst
+++ b/docs/api/paddle/distributed/stream/scatter_cn.rst
@@ -6,20 +6,20 @@ scatter
 
 .. py:function:: paddle.distributed.stream.scatter(tensor, tensor_or_tensor_list=None, src=0, group=None, sync_op=True, use_calc_stream=False)
 
-.. warning::
-  该 API 只支持动态图模式。
-
 将一组来自指定进程的 tensor 分发到每个进程。
 
 参见 `paddle.distributed.scatter`。
 
+.. warning::
+  该 API 只支持动态图模式。
+
 参数
 :::::::::
     - **tensor** (Tensor) - 用于接收数据的 tensor，数据类型必须与输入保持一致。
     - **tensor_or_tensor_list** (Tensor|List[Tensor]，可选) - 待分发的数据，可以是一个 tensor 或 tensor 列表。
-    若为 tensor，该 tensor 的大小必须与所有用于接收数据的 tensor 沿 dim[0] 拼接后的大小相同。
-    支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-    默认为 None，因为非目标进程上的该参数将被忽略。
+      - 若为 tensor，该 tensor 的大小必须与所有用于接收数据的 tensor 沿 dim[0] 拼接后的大小相同。
+      - 支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+      - 默认为 None，因为非目标进程上的该参数将被忽略。
     - **src** (int，可选) - 目标进程的 rank，该进程的 tensor 列表将被分发到其他进程中。默认为 0，即分发 rank=0 的进程上的 tensor 列表。
     - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
     - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
diff --git a/docs/api/paddle/distributed/stream/send_cn.rst b/docs/api/paddle/distributed/stream/send_cn.rst
index b5219185b84..a5f6239360a 100644
--- a/docs/api/paddle/distributed/stream/send_cn.rst
+++ b/docs/api/paddle/distributed/stream/send_cn.rst
@@ -6,13 +6,13 @@ send
 
 .. py:function:: paddle.distributed.stream.send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False)
 
-.. warning::
-  该 API 只支持动态图模式。
-
 发送一个 tensor 到指定进程。
 
 参见 `paddle.distributed.send`。
 
+.. warning::
+  该 API 只支持动态图模式。
+
 参数
 :::::::::
     - **tensor** (Tensor) - 待发送的 Tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。

From 9114536fad74dc34341c7c60133e57533fedfe22 Mon Sep 17 00:00:00 2001
From: Wen Sun <syl1887415157@126.com>
Date: Tue, 11 Oct 2022 17:51:52 +0800
Subject: [PATCH 07/12] docs(distributed/collective): update return vals

---
 docs/api/paddle/distributed/all_gather_object_cn.rst      | 2 +-
 docs/api/paddle/distributed/all_reduce_cn.rst             | 2 +-
 docs/api/paddle/distributed/alltoall_single_cn.rst        | 8 ++++----
 docs/api/paddle/distributed/barrier_cn.rst                | 2 +-
 docs/api/paddle/distributed/broadcast_cn.rst              | 4 ++--
 docs/api/paddle/distributed/destroy_process_group_cn.rst  | 2 +-
 docs/api/paddle/distributed/irecv_cn.rst                  | 8 ++++----
 docs/api/paddle/distributed/isend_cn.rst                  | 8 ++++----
 docs/api/paddle/distributed/recv_cn.rst                   | 2 +-
 docs/api/paddle/distributed/reduce_cn.rst                 | 2 +-
 docs/api/paddle/distributed/reduce_scatter_cn.rst         | 4 ++--
 docs/api/paddle/distributed/scatter_cn.rst                | 2 +-
 docs/api/paddle/distributed/send_cn.rst                   | 2 +-
 docs/api/paddle/distributed/stream/all_gather_cn.rst      | 4 ++--
 docs/api/paddle/distributed/stream/all_reduce_cn.rst      | 4 ++--
 docs/api/paddle/distributed/stream/alltoall_cn.rst        | 4 ++--
 docs/api/paddle/distributed/stream/alltoall_single_cn.rst | 4 ++--
 docs/api/paddle/distributed/stream/broadcast_cn.rst       | 4 ++--
 docs/api/paddle/distributed/stream/recv_cn.rst            | 4 ++--
 docs/api/paddle/distributed/stream/reduce_cn.rst          | 4 ++--
 docs/api/paddle/distributed/stream/reduce_scatter_cn.rst  | 4 ++--
 docs/api/paddle/distributed/stream/scatter_cn.rst         | 4 ++--
 docs/api/paddle/distributed/stream/send_cn.rst            | 4 ++--
 23 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/docs/api/paddle/distributed/all_gather_object_cn.rst b/docs/api/paddle/distributed/all_gather_object_cn.rst
index 156d61555f7..438a0e13f51 100644
--- a/docs/api/paddle/distributed/all_gather_object_cn.rst
+++ b/docs/api/paddle/distributed/all_gather_object_cn.rst
@@ -9,7 +9,7 @@ all_gather_object
 组聚合，聚合进程组内指定的 picklable 对象，随后将聚合后的对象列表发送到每个进程。
 过程与 ``all_gather`` 类似，但可以传入自定义的 python 对象。
 
-.. warning::
+.. note::
   该 API 只支持动态图模式。
 
 参数
diff --git a/docs/api/paddle/distributed/all_reduce_cn.rst b/docs/api/paddle/distributed/all_reduce_cn.rst
index de20a7a2039..81ecf20ad71 100644
--- a/docs/api/paddle/distributed/all_reduce_cn.rst
+++ b/docs/api/paddle/distributed/all_reduce_cn.rst
@@ -25,7 +25,7 @@ all_reduce
 
 返回
 :::::::::
-返回 Task 实例。
+``Task``，显示执行状态，并可以用于调度异步操作。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/alltoall_single_cn.rst b/docs/api/paddle/distributed/alltoall_single_cn.rst
index b83894428cb..ba3ff094ed8 100644
--- a/docs/api/paddle/distributed/alltoall_single_cn.rst
+++ b/docs/api/paddle/distributed/alltoall_single_cn.rst
@@ -6,11 +6,11 @@ alltoall_single
 
 .. py:function:: paddle.distributed.alltoall_single(in_tensor, out_tensor, in_split_sizes=None, out_split_sizes=None, group=None, sync_op=True)
 
-.. warning::
-  该 API 只支持动态图模式。
-
 将输入的 tensor 分发到每个进程，随后在每个进程上将分发结果聚合到 out_tensor 中。
 
+.. note::
+  该 API 只支持动态图模式。
+
 参数
 :::::::::
     - **in_tensor** (Tensor): 输入的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
@@ -22,7 +22,7 @@ alltoall_single
 
 返回
 :::::::::
-若为同步操作，无返回值；若为异步操作，返回 Task 实例。
+若为同步操作，无返回值；若为异步操作，返回 ``Task``，显示执行状态，并可以用于调度异步操作。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/barrier_cn.rst b/docs/api/paddle/distributed/barrier_cn.rst
index d2ea046feeb..ea734c5a6c5 100644
--- a/docs/api/paddle/distributed/barrier_cn.rst
+++ b/docs/api/paddle/distributed/barrier_cn.rst
@@ -14,7 +14,7 @@ barrier
 
 返回
 :::::::::
-无
+无返回值。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/broadcast_cn.rst b/docs/api/paddle/distributed/broadcast_cn.rst
index a70050e8e6b..296670fa74c 100644
--- a/docs/api/paddle/distributed/broadcast_cn.rst
+++ b/docs/api/paddle/distributed/broadcast_cn.rst
@@ -25,9 +25,9 @@ broadcast
 
 返回
 :::::::::
-动态图模式下，若为同步操作，返回 None；若为异步操作，返回 Task 实例。
+动态图模式下，若为同步操作，无返回值；若为异步操作，返回 ``Task``，显示执行状态，并可以用于调度异步操作。
 
-静态图模式下，返回 None。
+静态图模式下，无返回值。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/destroy_process_group_cn.rst b/docs/api/paddle/distributed/destroy_process_group_cn.rst
index 9ef54142a5b..77f08c1ef7a 100644
--- a/docs/api/paddle/distributed/destroy_process_group_cn.rst
+++ b/docs/api/paddle/distributed/destroy_process_group_cn.rst
@@ -14,7 +14,7 @@ destroy_process_group
 
 返回
 :::::::::
-无
+无返回值。
 
 代码示例
 ::::::::::::
diff --git a/docs/api/paddle/distributed/irecv_cn.rst b/docs/api/paddle/distributed/irecv_cn.rst
index 696cabc8c49..4c559c39c24 100644
--- a/docs/api/paddle/distributed/irecv_cn.rst
+++ b/docs/api/paddle/distributed/irecv_cn.rst
@@ -6,11 +6,11 @@ irecv
 
 .. py:function:: paddle.distributed.irecv(tensor, src=None, group=None)
 
-.. warning::
-  该 API 只支持动态图模式。
-
 异步接收一个来自指定进程的 tensor。
 
+.. note::
+  该 API 只支持动态图模式。
+
 参数
 :::::::::
     - **tensor** (Tensor) - 用于接收数据的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
@@ -20,7 +20,7 @@ irecv
 
 返回
 :::::::::
-返回 Task 实例。
+``Task``，显示执行状态，并可以用于调度异步操作。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/isend_cn.rst b/docs/api/paddle/distributed/isend_cn.rst
index 73645fda513..2a1205d6ff4 100644
--- a/docs/api/paddle/distributed/isend_cn.rst
+++ b/docs/api/paddle/distributed/isend_cn.rst
@@ -6,11 +6,11 @@ isend
 
 .. py:function:: paddle.distributed.isend(tensor, dst, group=None)
 
-.. warning::
-  该 API 只支持动态图模式。
-
 异步发送一个 tensor 到指定进程。
 
+.. note::
+  该 API 只支持动态图模式。
+
 参数
 :::::::::
     - **tensor** (Tensor) - 待发送的 Tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
@@ -20,7 +20,7 @@ isend
 
 返回
 :::::::::
-返回 Task 实例。
+``Task``，显示执行状态，并可以用于调度异步操作。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/recv_cn.rst b/docs/api/paddle/distributed/recv_cn.rst
index c9d4c0906ff..aa54629186a 100644
--- a/docs/api/paddle/distributed/recv_cn.rst
+++ b/docs/api/paddle/distributed/recv_cn.rst
@@ -17,7 +17,7 @@ recv
 
 返回
 :::::::::
-动态图模式下，若为同步操作，无返回值；若为异步操作，返回 Task 实例。
+动态图模式下，若为同步操作，无返回值；若为异步操作，返回 ``Task``，显示执行状态，并可以用于调度异步操作。
 
 静态图模式下，无返回值。
 
diff --git a/docs/api/paddle/distributed/reduce_cn.rst b/docs/api/paddle/distributed/reduce_cn.rst
index 5df20ebb1d7..42371bff08b 100644
--- a/docs/api/paddle/distributed/reduce_cn.rst
+++ b/docs/api/paddle/distributed/reduce_cn.rst
@@ -26,7 +26,7 @@ reduce
 
 返回
 :::::::::
-动态图模式下，若为同步操作，无返回值；若为异步操作，返回 Task 实例。
+动态图模式下，若为同步操作，无返回值；若为异步操作，返回 ``Task``，显示执行状态，并可以用于调度异步操作。
 
 静态图模式下，无返回值。
 
diff --git a/docs/api/paddle/distributed/reduce_scatter_cn.rst b/docs/api/paddle/distributed/reduce_scatter_cn.rst
index 0e67eecf670..ecb49db04bd 100644
--- a/docs/api/paddle/distributed/reduce_scatter_cn.rst
+++ b/docs/api/paddle/distributed/reduce_scatter_cn.rst
@@ -8,7 +8,7 @@ reduce_scatter
 
 规约一组 tensor，随后将规约结果分发到每个进程。
 
-.. warning::
+.. note::
   该 API 只支持动态图模式。
 
 参数
@@ -22,7 +22,7 @@ reduce_scatter
 
 返回
 :::::::::
-若为同步操作，无返回值；若为异步操作，返回 Task 实例。
+若为同步操作，无返回值；若为异步操作，返回 ``Task``，显示执行状态，并可以用于调度异步操作。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/scatter_cn.rst b/docs/api/paddle/distributed/scatter_cn.rst
index 8bcfd461fc8..e2d1013de12 100644
--- a/docs/api/paddle/distributed/scatter_cn.rst
+++ b/docs/api/paddle/distributed/scatter_cn.rst
@@ -26,7 +26,7 @@ scatter
 
 返回
 :::::::::
-动态图模式下，若为同步操作，无返回值；若为异步操作，返回 Task 实例。
+动态图模式下，若为同步操作，无返回值；若为异步操作，返回 ``Task``，显示执行状态，并可以用于调度异步操作。
 
 静态图模式下，无返回值。
 
diff --git a/docs/api/paddle/distributed/send_cn.rst b/docs/api/paddle/distributed/send_cn.rst
index 3dfb79fdf35..455351c3312 100644
--- a/docs/api/paddle/distributed/send_cn.rst
+++ b/docs/api/paddle/distributed/send_cn.rst
@@ -17,7 +17,7 @@ send
 
 返回
 :::::::::
-动态图模式下，若为同步操作，无返回值；若为异步操作，返回 Task 实例。
+动态图模式下，若为同步操作，无返回值；若为异步操作，返回 ``Task``，显示执行状态，并可以用于调度异步操作。
 
 静态图模式下，无返回值。
 
diff --git a/docs/api/paddle/distributed/stream/all_gather_cn.rst b/docs/api/paddle/distributed/stream/all_gather_cn.rst
index 13171d318d6..7085d5cff6f 100644
--- a/docs/api/paddle/distributed/stream/all_gather_cn.rst
+++ b/docs/api/paddle/distributed/stream/all_gather_cn.rst
@@ -10,7 +10,7 @@ all_gather
 
 参见 `paddle.distributed.all_gather`。
 
-.. warning::
+.. note::
   该 API 只支持动态图模式。
 
 参数
@@ -25,7 +25,7 @@ all_gather
 
 返回
 :::::::::
-返回 Task 实例。
+``Task``，显示执行状态，并可以用于调度异步操作。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/all_reduce_cn.rst b/docs/api/paddle/distributed/stream/all_reduce_cn.rst
index 21533429b4f..71aa2a79722 100644
--- a/docs/api/paddle/distributed/stream/all_reduce_cn.rst
+++ b/docs/api/paddle/distributed/stream/all_reduce_cn.rst
@@ -10,7 +10,7 @@ all_reduce
 
 参见 `paddle.distributed.all_reduce`。
 
-.. warning::
+.. note::
   该 API 只支持动态图模式。
 
 参数
@@ -23,7 +23,7 @@ all_reduce
 
 返回
 :::::::::
-返回 Task 实例。
+``Task``，显示执行状态，并可以用于调度异步操作。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/alltoall_cn.rst b/docs/api/paddle/distributed/stream/alltoall_cn.rst
index ee33f7ab1d2..efe0958c39e 100644
--- a/docs/api/paddle/distributed/stream/alltoall_cn.rst
+++ b/docs/api/paddle/distributed/stream/alltoall_cn.rst
@@ -10,7 +10,7 @@ alltoall
 
 参见 `paddle.distributed.alltoall`。
 
-.. warning::
+.. note::
   该 API 只支持动态图模式。
 
 参数
@@ -25,7 +25,7 @@ alltoall
 
 返回
 :::::::::
-返回 Task 实例。
+``Task``，显示执行状态，并可以用于调度异步操作。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/alltoall_single_cn.rst b/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
index 6e745a7f65d..dc0dbfd68e3 100644
--- a/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
+++ b/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
@@ -10,7 +10,7 @@ alltoall_single
 
 参见 `paddle.distributed.alltoall_single`。
 
-.. warning::
+.. note::
   该 API 只支持动态图模式。
 
 参数
@@ -25,7 +25,7 @@ alltoall_single
 
 返回
 :::::::::
-返回 Task 实例。
+``Task``，显示执行状态，并可以用于调度异步操作。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/broadcast_cn.rst b/docs/api/paddle/distributed/stream/broadcast_cn.rst
index ddda102939f..f7c65c514dd 100644
--- a/docs/api/paddle/distributed/stream/broadcast_cn.rst
+++ b/docs/api/paddle/distributed/stream/broadcast_cn.rst
@@ -10,7 +10,7 @@ broadcast
 
 参见 `paddle.distributed.broadcast`。
 
-.. warning::
+.. note::
   该 API 只支持动态图模式。
 
 参数
@@ -23,7 +23,7 @@ broadcast
 
 返回
 :::::::::
-返回 Task 实例。
+``Task``，显示执行状态，并可以用于调度异步操作。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/recv_cn.rst b/docs/api/paddle/distributed/stream/recv_cn.rst
index a92b75af795..176165645ad 100644
--- a/docs/api/paddle/distributed/stream/recv_cn.rst
+++ b/docs/api/paddle/distributed/stream/recv_cn.rst
@@ -10,7 +10,7 @@ recv
 
 参见 `paddle.distributed.recv`。
 
-.. warning::
+.. note::
   该 API 只支持动态图模式。
 
 参数
@@ -23,7 +23,7 @@ recv
 
 返回
 :::::::::
-返回 Task 实例。
+``Task``，显示执行状态，并可以用于调度异步操作。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/reduce_cn.rst b/docs/api/paddle/distributed/stream/reduce_cn.rst
index f24322bf94e..817c04185e8 100644
--- a/docs/api/paddle/distributed/stream/reduce_cn.rst
+++ b/docs/api/paddle/distributed/stream/reduce_cn.rst
@@ -10,7 +10,7 @@ reduce
 
 参见 `paddle.distributed.reduce`。
 
-.. warning::
+.. note::
   该 API 只支持动态图模式。
 
 参数
@@ -24,7 +24,7 @@ reduce
 
 返回
 :::::::::
-返回 Task 实例。
+``Task``，显示执行状态，并可以用于调度异步操作。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst b/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
index 139f787b535..b6fa7112b0a 100644
--- a/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
+++ b/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
@@ -10,7 +10,7 @@ reduce_scatter
 
 参见 `paddle.distributed.reduce`。
 
-.. warning::
+.. note::
   该 API 只支持动态图模式。
 
 参数
@@ -27,7 +27,7 @@ reduce_scatter
 
 返回
 :::::::::
-返回 Task 实例。
+``Task``，显示执行状态，并可以用于调度异步操作。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/scatter_cn.rst b/docs/api/paddle/distributed/stream/scatter_cn.rst
index 38ad6ffa343..a7e048412ab 100644
--- a/docs/api/paddle/distributed/stream/scatter_cn.rst
+++ b/docs/api/paddle/distributed/stream/scatter_cn.rst
@@ -10,7 +10,7 @@ scatter
 
 参见 `paddle.distributed.scatter`。
 
-.. warning::
+.. note::
   该 API 只支持动态图模式。
 
 参数
@@ -27,7 +27,7 @@ scatter
 
 返回
 :::::::::
-返回 Task 实例。
+``Task``，显示执行状态，并可以用于调度异步操作。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/send_cn.rst b/docs/api/paddle/distributed/stream/send_cn.rst
index a5f6239360a..46b0d9ef614 100644
--- a/docs/api/paddle/distributed/stream/send_cn.rst
+++ b/docs/api/paddle/distributed/stream/send_cn.rst
@@ -10,7 +10,7 @@ send
 
 参见 `paddle.distributed.send`。
 
-.. warning::
+.. note::
   该 API 只支持动态图模式。
 
 参数
@@ -23,7 +23,7 @@ send
 
 返回
 :::::::::
-返回 Task 实例。
+``Task``，显示执行状态，并可以用于调度异步操作。
 
 代码示例
 :::::::::

From aea163239056c5899eb582eeda1b9bc2d69848c3 Mon Sep 17 00:00:00 2001
From: Wen Sun <syl1887415157@126.com>
Date: Tue, 11 Oct 2022 19:20:25 +0800
Subject: [PATCH 08/12] docs(distributed/collective): add ref links

---
 docs/api/paddle/distributed/stream/all_gather_cn.rst      | 2 +-
 docs/api/paddle/distributed/stream/all_reduce_cn.rst      | 2 +-
 docs/api/paddle/distributed/stream/alltoall_cn.rst        | 2 +-
 docs/api/paddle/distributed/stream/alltoall_single_cn.rst | 2 +-
 docs/api/paddle/distributed/stream/broadcast_cn.rst       | 2 +-
 docs/api/paddle/distributed/stream/recv_cn.rst            | 2 +-
 docs/api/paddle/distributed/stream/reduce_cn.rst          | 2 +-
 docs/api/paddle/distributed/stream/reduce_scatter_cn.rst  | 2 +-
 docs/api/paddle/distributed/stream/scatter_cn.rst         | 2 +-
 docs/api/paddle/distributed/stream/send_cn.rst            | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/api/paddle/distributed/stream/all_gather_cn.rst b/docs/api/paddle/distributed/stream/all_gather_cn.rst
index 7085d5cff6f..65b816bb4a5 100644
--- a/docs/api/paddle/distributed/stream/all_gather_cn.rst
+++ b/docs/api/paddle/distributed/stream/all_gather_cn.rst
@@ -8,7 +8,7 @@ all_gather
 
 聚合进程组内的指定 tensor，随后将聚合结果发送到每个进程。
 
-参见 `paddle.distributed.all_gather`。
+参见 :ref:`paddle.distributed.all_gather <cn_api_distributed_all_gather>`。
 
 .. note::
   该 API 只支持动态图模式。
diff --git a/docs/api/paddle/distributed/stream/all_reduce_cn.rst b/docs/api/paddle/distributed/stream/all_reduce_cn.rst
index 71aa2a79722..ff7adeaf981 100644
--- a/docs/api/paddle/distributed/stream/all_reduce_cn.rst
+++ b/docs/api/paddle/distributed/stream/all_reduce_cn.rst
@@ -8,7 +8,7 @@ all_reduce
 
 规约进程组内的一个 tensor，随后将结果发送到每个进程。
 
-参见 `paddle.distributed.all_reduce`。
+参见 :ref:`paddle.distributed.all_reduce <cn_api_distributed_all_reduce>`。
 
 .. note::
   该 API 只支持动态图模式。
diff --git a/docs/api/paddle/distributed/stream/alltoall_cn.rst b/docs/api/paddle/distributed/stream/alltoall_cn.rst
index efe0958c39e..212f1818c75 100644
--- a/docs/api/paddle/distributed/stream/alltoall_cn.rst
+++ b/docs/api/paddle/distributed/stream/alltoall_cn.rst
@@ -8,7 +8,7 @@ alltoall
 
 将一个或一组 tensor 分发到每个进程，随后在每个进程上聚合分发结果。
 
-参见 `paddle.distributed.alltoall`。
+参见 :ref:`paddle.distributed.alltoall <cn_api_distributed_alltoall>`。
 
 .. note::
   该 API 只支持动态图模式。
diff --git a/docs/api/paddle/distributed/stream/alltoall_single_cn.rst b/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
index dc0dbfd68e3..1f70eb50235 100644
--- a/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
+++ b/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
@@ -8,7 +8,7 @@ alltoall_single
 
 将一个 tensor 分发到每个进程，随后在每个进程上聚合分发结果。与 ``alltoall`` 相比，可以更精细地控制分发过程。
 
-参见 `paddle.distributed.alltoall_single`。
+参见 :ref:`paddle.distributed.alltoall_single <cn_api_distributed_alltoall_single>`。
 
 .. note::
   该 API 只支持动态图模式。
diff --git a/docs/api/paddle/distributed/stream/broadcast_cn.rst b/docs/api/paddle/distributed/stream/broadcast_cn.rst
index f7c65c514dd..b043ada6b8e 100644
--- a/docs/api/paddle/distributed/stream/broadcast_cn.rst
+++ b/docs/api/paddle/distributed/stream/broadcast_cn.rst
@@ -8,7 +8,7 @@ broadcast
 
 将一个 tensor 发送到每个进程。
 
-参见 `paddle.distributed.broadcast`。
+参见 :ref:`paddle.distributed.broadcast <cn_api_distributed_broadcast>`。
 
 .. note::
   该 API 只支持动态图模式。
diff --git a/docs/api/paddle/distributed/stream/recv_cn.rst b/docs/api/paddle/distributed/stream/recv_cn.rst
index 176165645ad..f053d58fab8 100644
--- a/docs/api/paddle/distributed/stream/recv_cn.rst
+++ b/docs/api/paddle/distributed/stream/recv_cn.rst
@@ -8,7 +8,7 @@ recv
 
 接收一个来自指定进程的 tensor。
 
-参见 `paddle.distributed.recv`。
+参见 :ref:`paddle.distributed.recv <cn_api_distributed_recv>`。
 
 .. note::
   该 API 只支持动态图模式。
diff --git a/docs/api/paddle/distributed/stream/reduce_cn.rst b/docs/api/paddle/distributed/stream/reduce_cn.rst
index 817c04185e8..18da3eda5d4 100644
--- a/docs/api/paddle/distributed/stream/reduce_cn.rst
+++ b/docs/api/paddle/distributed/stream/reduce_cn.rst
@@ -8,7 +8,7 @@ reduce
 
 规约进程组内的一个 tensor，随后将结果发送到指定进程。
 
-参见 `paddle.distributed.reduce`。
+参见 :ref:`paddle.distributed.reduce <cn_api_distributed_reduce>`。
 
 .. note::
   该 API 只支持动态图模式。
diff --git a/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst b/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
index b6fa7112b0a..d225ddb8124 100644
--- a/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
+++ b/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
@@ -8,7 +8,7 @@ reduce_scatter
 
 规约一组 tensor，随后将规约结果分发到每个进程。
 
-参见 `paddle.distributed.reduce`。
+参见 :ref:`paddle.distributed.reduce_scatter <cn_api_distributed_reduce_scatter>`。
 
 .. note::
   该 API 只支持动态图模式。
diff --git a/docs/api/paddle/distributed/stream/scatter_cn.rst b/docs/api/paddle/distributed/stream/scatter_cn.rst
index a7e048412ab..7f295932427 100644
--- a/docs/api/paddle/distributed/stream/scatter_cn.rst
+++ b/docs/api/paddle/distributed/stream/scatter_cn.rst
@@ -8,7 +8,7 @@ scatter
 
 将一组来自指定进程的 tensor 分发到每个进程。
 
-参见 `paddle.distributed.scatter`。
+参见 :ref:`paddle.distributed.scatter <cn_api_distributed_scatter>`。
 
 .. note::
   该 API 只支持动态图模式。
diff --git a/docs/api/paddle/distributed/stream/send_cn.rst b/docs/api/paddle/distributed/stream/send_cn.rst
index 46b0d9ef614..43a02c5b296 100644
--- a/docs/api/paddle/distributed/stream/send_cn.rst
+++ b/docs/api/paddle/distributed/stream/send_cn.rst
@@ -8,7 +8,7 @@ send
 
 发送一个 tensor 到指定进程。
 
-参见 `paddle.distributed.send`。
+参见 :ref:`paddle.distributed.send <cn_api_distributed_send>`。
 
 .. note::
   该 API 只支持动态图模式。

From c9c6d961c87783857be1a7af30169464c020fc68 Mon Sep 17 00:00:00 2001
From: Wen Sun <syl1887415157@126.com>
Date: Tue, 11 Oct 2022 20:07:50 +0800
Subject: [PATCH 09/12] docs(distributed/collective): fix styles

---
 docs/api/paddle/distributed/stream/all_gather_cn.rst     | 4 +---
 docs/api/paddle/distributed/stream/alltoall_cn.rst       | 4 +---
 docs/api/paddle/distributed/stream/reduce_scatter_cn.rst | 4 +---
 docs/api/paddle/distributed/stream/scatter_cn.rst        | 5 +----
 4 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/docs/api/paddle/distributed/stream/all_gather_cn.rst b/docs/api/paddle/distributed/stream/all_gather_cn.rst
index 65b816bb4a5..af64315d989 100644
--- a/docs/api/paddle/distributed/stream/all_gather_cn.rst
+++ b/docs/api/paddle/distributed/stream/all_gather_cn.rst
@@ -15,9 +15,7 @@ all_gather
 
 参数
 :::::::::
-    - **tensor_or_tensor_list** (Tensor|List[Tensor]) - 用于保存聚合结果。
-      - 若为 tensor，该 tensor 的大小必须与所有待聚合的 tensor 沿 dim[0] 拼接后的大小相同。
-      - 若为 tensor 列表，其中每个 tensor 的数据类型必须与输入的 tensor 保持一致。
+    - **tensor_or_tensor_list** (Tensor|List[Tensor]) - 用于保存聚合结果。若为 tensor，该 tensor 的大小必须与所有待聚合的 tensor 沿 dim[0] 拼接后的大小相同。若为 tensor 列表，其中每个 tensor 的数据类型必须与输入的 tensor 保持一致。
     - **tensor** (Tensor) - 待聚合的 tensor。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
     - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
diff --git a/docs/api/paddle/distributed/stream/alltoall_cn.rst b/docs/api/paddle/distributed/stream/alltoall_cn.rst
index 212f1818c75..0b7dd78bc96 100644
--- a/docs/api/paddle/distributed/stream/alltoall_cn.rst
+++ b/docs/api/paddle/distributed/stream/alltoall_cn.rst
@@ -15,9 +15,7 @@ alltoall
 
 参数
 :::::::::
-    - **out_tensor_or_tensor_list** (Tensor|List[Tensor]) - 用于保存操作结果。
-      - 若输入数据为 tensor，该参数必须为 tensor，且大小与所有输入的 tensor 沿 dim[0] 拼接后的大小相同。
-      - 若输入数据为 tensor 列表，该参数必须为 tensor 列表，其中每个 tensor 的数据类型必须与输入的 tensor 保持一致。
+    - **out_tensor_or_tensor_list** (Tensor|List[Tensor]) - 用于保存操作结果。若输入数据为 tensor，该参数必须为 tensor，且大小与所有输入的 tensor 沿 dim[0] 拼接后的大小相同。若输入数据为 tensor 列表，该参数必须为 tensor 列表，其中每个 tensor 的数据类型必须与输入的 tensor 保持一致。
     - **in_tensor_or_tensor_list** (Tensor|List[Tensor]) - 输入的数据，可以是一个 tensor 或 tensor 列表。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
     - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
diff --git a/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst b/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
index d225ddb8124..02eedbd1114 100644
--- a/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
+++ b/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
@@ -16,9 +16,7 @@ reduce_scatter
 参数
 :::::::::
     - **tensor** (Tensor) – 用于接收数据的 tensor，数据类型必须与输入保持一致。
-    - **tensor_or_tensor_list** (Tensor|List[Tensor]) - 输入的数据，可以是一个 tensor 或 tensor 列表。
-      - 若为 tensor，该 tensor 的大小必须与所有用于接收数据的 tensor 沿 dim[0] 拼接后的大小相同。
-      - 支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
+    - **tensor_or_tensor_list** (Tensor|List[Tensor]) - 输入的数据，可以是一个 tensor 或 tensor 列表。若为 tensor，该 tensor 的大小必须与所有用于接收数据的 tensor 沿 dim[0] 拼接后的大小相同。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
     - **op** (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD，可选) - 归约的操作类型，包括求和、取最大值、取最小值和求乘积。默认为求和。
     - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
     - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。
diff --git a/docs/api/paddle/distributed/stream/scatter_cn.rst b/docs/api/paddle/distributed/stream/scatter_cn.rst
index 7f295932427..75a6ae4be6b 100644
--- a/docs/api/paddle/distributed/stream/scatter_cn.rst
+++ b/docs/api/paddle/distributed/stream/scatter_cn.rst
@@ -16,10 +16,7 @@ scatter
 参数
 :::::::::
     - **tensor** (Tensor) - 用于接收数据的 tensor，数据类型必须与输入保持一致。
-    - **tensor_or_tensor_list** (Tensor|List[Tensor]，可选) - 待分发的数据，可以是一个 tensor 或 tensor 列表。
-      - 若为 tensor，该 tensor 的大小必须与所有用于接收数据的 tensor 沿 dim[0] 拼接后的大小相同。
-      - 支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。
-      - 默认为 None，因为非目标进程上的该参数将被忽略。
+    - **tensor_or_tensor_list** (Tensor|List[Tensor]，可选) - 待分发的数据，可以是一个 tensor 或 tensor 列表。若为 tensor，该 tensor 的大小必须与所有用于接收数据的 tensor 沿 dim[0] 拼接后的大小相同。支持的数据类型包括：float16、float32、float64、int32、int64、int8、uint8、bool、bfloat16。默认为 None，因为非目标进程上的该参数将被忽略。
     - **src** (int，可选) - 目标进程的 rank，该进程的 tensor 列表将被分发到其他进程中。默认为 0，即分发 rank=0 的进程上的 tensor 列表。
     - **group** (Group，可选) - 执行该操作的进程组实例（通过 ``new_group`` 创建）。默认为 None，即使用全局默认进程组。
     - **sync_op** (bool，可选) - 该操作是否为同步操作。默认为 True，即同步操作。

From 877a1ddd4372c94092ff15aa2f7672c9ab92d6e3 Mon Sep 17 00:00:00 2001
From: Wen Sun <syl1887415157@126.com>
Date: Wed, 12 Oct 2022 15:42:23 +0800
Subject: [PATCH 10/12] docs(distributed/collective): revise task desc

---
 docs/api/paddle/distributed/all_reduce_cn.rst             | 2 +-
 docs/api/paddle/distributed/alltoall_single_cn.rst        | 2 +-
 docs/api/paddle/distributed/broadcast_cn.rst              | 2 +-
 docs/api/paddle/distributed/irecv_cn.rst                  | 2 +-
 docs/api/paddle/distributed/isend_cn.rst                  | 2 +-
 docs/api/paddle/distributed/recv_cn.rst                   | 2 +-
 docs/api/paddle/distributed/reduce_cn.rst                 | 2 +-
 docs/api/paddle/distributed/reduce_scatter_cn.rst         | 2 +-
 docs/api/paddle/distributed/scatter_cn.rst                | 2 +-
 docs/api/paddle/distributed/send_cn.rst                   | 2 +-
 docs/api/paddle/distributed/stream/all_gather_cn.rst      | 2 +-
 docs/api/paddle/distributed/stream/all_reduce_cn.rst      | 2 +-
 docs/api/paddle/distributed/stream/alltoall_cn.rst        | 2 +-
 docs/api/paddle/distributed/stream/alltoall_single_cn.rst | 2 +-
 docs/api/paddle/distributed/stream/broadcast_cn.rst       | 2 +-
 docs/api/paddle/distributed/stream/recv_cn.rst            | 2 +-
 docs/api/paddle/distributed/stream/reduce_cn.rst          | 2 +-
 docs/api/paddle/distributed/stream/reduce_scatter_cn.rst  | 2 +-
 docs/api/paddle/distributed/stream/scatter_cn.rst         | 2 +-
 docs/api/paddle/distributed/stream/send_cn.rst            | 2 +-
 20 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/docs/api/paddle/distributed/all_reduce_cn.rst b/docs/api/paddle/distributed/all_reduce_cn.rst
index 81ecf20ad71..49b668b8159 100644
--- a/docs/api/paddle/distributed/all_reduce_cn.rst
+++ b/docs/api/paddle/distributed/all_reduce_cn.rst
@@ -25,7 +25,7 @@ all_reduce
 
 返回
 :::::::::
-``Task``，显示执行状态，并可以用于调度异步操作。
+``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/alltoall_single_cn.rst b/docs/api/paddle/distributed/alltoall_single_cn.rst
index ba3ff094ed8..306a24c0c28 100644
--- a/docs/api/paddle/distributed/alltoall_single_cn.rst
+++ b/docs/api/paddle/distributed/alltoall_single_cn.rst
@@ -22,7 +22,7 @@ alltoall_single
 
 返回
 :::::::::
-若为同步操作，无返回值；若为异步操作，返回``Task``，显示执行状态，并可以用于调度异步操作。
+若为同步操作，无返回值；若为异步操作，返回 ``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/broadcast_cn.rst b/docs/api/paddle/distributed/broadcast_cn.rst
index 296670fa74c..0c9a590d3d6 100644
--- a/docs/api/paddle/distributed/broadcast_cn.rst
+++ b/docs/api/paddle/distributed/broadcast_cn.rst
@@ -25,7 +25,7 @@ broadcast
 
 返回
 :::::::::
-动态图模式下，若为同步操作，无返回值；若为异步操作，返回``Task``，显示执行状态，并可以用于调度异步操作。
+动态图模式下，若为同步操作，无返回值；若为异步操作，返回 ``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 静态图模式下，无返回值。
 
diff --git a/docs/api/paddle/distributed/irecv_cn.rst b/docs/api/paddle/distributed/irecv_cn.rst
index 4c559c39c24..58af638bb09 100644
--- a/docs/api/paddle/distributed/irecv_cn.rst
+++ b/docs/api/paddle/distributed/irecv_cn.rst
@@ -20,7 +20,7 @@ irecv
 
 返回
 :::::::::
-``Task``，显示执行状态，并可以用于调度异步操作。
+``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/isend_cn.rst b/docs/api/paddle/distributed/isend_cn.rst
index 2a1205d6ff4..b1f77e83584 100644
--- a/docs/api/paddle/distributed/isend_cn.rst
+++ b/docs/api/paddle/distributed/isend_cn.rst
@@ -20,7 +20,7 @@ isend
 
 返回
 :::::::::
-``Task``，显示执行状态，并可以用于调度异步操作。
+``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/recv_cn.rst b/docs/api/paddle/distributed/recv_cn.rst
index aa54629186a..1ae582f6973 100644
--- a/docs/api/paddle/distributed/recv_cn.rst
+++ b/docs/api/paddle/distributed/recv_cn.rst
@@ -17,7 +17,7 @@ recv
 
 返回
 :::::::::
-动态图模式下，若为同步操作，无返回值；若为异步操作，返回``Task``，显示执行状态，并可以用于调度异步操作。
+动态图模式下，若为同步操作，无返回值；若为异步操作，返回 ``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 静态图模式下，无返回值。
 
diff --git a/docs/api/paddle/distributed/reduce_cn.rst b/docs/api/paddle/distributed/reduce_cn.rst
index 42371bff08b..f1ceaf69e3e 100644
--- a/docs/api/paddle/distributed/reduce_cn.rst
+++ b/docs/api/paddle/distributed/reduce_cn.rst
@@ -26,7 +26,7 @@ reduce
 
 返回
 :::::::::
-动态图模式下，若为同步操作，无返回值；若为异步操作，返回``Task``，显示执行状态，并可以用于调度异步操作。
+动态图模式下，若为同步操作，无返回值；若为异步操作，返回 ``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 静态图模式下，无返回值。
 
diff --git a/docs/api/paddle/distributed/reduce_scatter_cn.rst b/docs/api/paddle/distributed/reduce_scatter_cn.rst
index ecb49db04bd..fe58a6b0a4d 100644
--- a/docs/api/paddle/distributed/reduce_scatter_cn.rst
+++ b/docs/api/paddle/distributed/reduce_scatter_cn.rst
@@ -22,7 +22,7 @@ reduce_scatter
 
 返回
 :::::::::
-若为同步操作，无返回值；若为异步操作，返回``Task``，显示执行状态，并可以用于调度异步操作。
+若为同步操作，无返回值；若为异步操作，返回 ``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/scatter_cn.rst b/docs/api/paddle/distributed/scatter_cn.rst
index e2d1013de12..191aaa739be 100644
--- a/docs/api/paddle/distributed/scatter_cn.rst
+++ b/docs/api/paddle/distributed/scatter_cn.rst
@@ -26,7 +26,7 @@ scatter
 
 返回
 :::::::::
-动态图模式下，若为同步操作，无返回值；若为异步操作，返回``Task``，显示执行状态，并可以用于调度异步操作。
+动态图模式下，若为同步操作，无返回值；若为异步操作，返回 ``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 静态图模式下，无返回值。
 
diff --git a/docs/api/paddle/distributed/send_cn.rst b/docs/api/paddle/distributed/send_cn.rst
index 455351c3312..697539ee2d6 100644
--- a/docs/api/paddle/distributed/send_cn.rst
+++ b/docs/api/paddle/distributed/send_cn.rst
@@ -17,7 +17,7 @@ send
 
 返回
 :::::::::
-动态图模式下，若为同步操作，无返回值；若为异步操作，返回``Task``，显示执行状态，并可以用于调度异步操作。
+动态图模式下，若为同步操作，无返回值；若为异步操作，返回 ``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 静态图模式下，无返回值。
 
diff --git a/docs/api/paddle/distributed/stream/all_gather_cn.rst b/docs/api/paddle/distributed/stream/all_gather_cn.rst
index af64315d989..f1bf5f12441 100644
--- a/docs/api/paddle/distributed/stream/all_gather_cn.rst
+++ b/docs/api/paddle/distributed/stream/all_gather_cn.rst
@@ -23,7 +23,7 @@ all_gather
 
 返回
 :::::::::
-``Task``，显示执行状态，并可以用于调度异步操作。
+``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/all_reduce_cn.rst b/docs/api/paddle/distributed/stream/all_reduce_cn.rst
index ff7adeaf981..951d600e3af 100644
--- a/docs/api/paddle/distributed/stream/all_reduce_cn.rst
+++ b/docs/api/paddle/distributed/stream/all_reduce_cn.rst
@@ -23,7 +23,7 @@ all_reduce
 
 返回
 :::::::::
-``Task``，显示执行状态，并可以用于调度异步操作。
+``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/alltoall_cn.rst b/docs/api/paddle/distributed/stream/alltoall_cn.rst
index 0b7dd78bc96..6524be169a5 100644
--- a/docs/api/paddle/distributed/stream/alltoall_cn.rst
+++ b/docs/api/paddle/distributed/stream/alltoall_cn.rst
@@ -23,7 +23,7 @@ alltoall
 
 返回
 :::::::::
-``Task``，显示执行状态，并可以用于调度异步操作。
+``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/alltoall_single_cn.rst b/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
index 1f70eb50235..444ba390407 100644
--- a/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
+++ b/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
@@ -25,7 +25,7 @@ alltoall_single
 
 返回
 :::::::::
-``Task``，显示执行状态，并可以用于调度异步操作。
+``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/broadcast_cn.rst b/docs/api/paddle/distributed/stream/broadcast_cn.rst
index b043ada6b8e..8afc3698776 100644
--- a/docs/api/paddle/distributed/stream/broadcast_cn.rst
+++ b/docs/api/paddle/distributed/stream/broadcast_cn.rst
@@ -23,7 +23,7 @@ broadcast
 
 返回
 :::::::::
-``Task``，显示执行状态，并可以用于调度异步操作。
+``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/recv_cn.rst b/docs/api/paddle/distributed/stream/recv_cn.rst
index f053d58fab8..e80263b6914 100644
--- a/docs/api/paddle/distributed/stream/recv_cn.rst
+++ b/docs/api/paddle/distributed/stream/recv_cn.rst
@@ -23,7 +23,7 @@ recv
 
 返回
 :::::::::
-``Task``，显示执行状态，并可以用于调度异步操作。
+``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/reduce_cn.rst b/docs/api/paddle/distributed/stream/reduce_cn.rst
index 18da3eda5d4..82adb74c3bb 100644
--- a/docs/api/paddle/distributed/stream/reduce_cn.rst
+++ b/docs/api/paddle/distributed/stream/reduce_cn.rst
@@ -24,7 +24,7 @@ reduce
 
 返回
 :::::::::
-``Task``，显示执行状态，并可以用于调度异步操作。
+``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst b/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
index 02eedbd1114..d53f2696c35 100644
--- a/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
+++ b/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
@@ -25,7 +25,7 @@ reduce_scatter
 
 返回
 :::::::::
-``Task``，显示执行状态，并可以用于调度异步操作。
+``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/scatter_cn.rst b/docs/api/paddle/distributed/stream/scatter_cn.rst
index 75a6ae4be6b..c8496a1e940 100644
--- a/docs/api/paddle/distributed/stream/scatter_cn.rst
+++ b/docs/api/paddle/distributed/stream/scatter_cn.rst
@@ -24,7 +24,7 @@ scatter
 
 返回
 :::::::::
-``Task``，显示执行状态，并可以用于调度异步操作。
+``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 代码示例
 :::::::::
diff --git a/docs/api/paddle/distributed/stream/send_cn.rst b/docs/api/paddle/distributed/stream/send_cn.rst
index 43a02c5b296..c574e5dcd8b 100644
--- a/docs/api/paddle/distributed/stream/send_cn.rst
+++ b/docs/api/paddle/distributed/stream/send_cn.rst
@@ -23,7 +23,7 @@ send
 
 返回
 :::::::::
-``Task``，显示执行状态，并可以用于调度异步操作。
+``Task``。通过 ``Task``，可以查看异步操作的执行状态以及等待异步操作的结果。
 
 代码示例
 :::::::::

From 97789d5402157a5d712ca6a4402227bc1975d4bf Mon Sep 17 00:00:00 2001
From: Wen Sun <syl1887415157@126.com>
Date: Wed, 12 Oct 2022 16:19:43 +0800
Subject: [PATCH 11/12] docs(distributed/collective): update code example paths

---
 docs/api/paddle/distributed/stream/all_gather_cn.rst      | 2 +-
 docs/api/paddle/distributed/stream/all_reduce_cn.rst      | 2 +-
 docs/api/paddle/distributed/stream/alltoall_cn.rst        | 2 +-
 docs/api/paddle/distributed/stream/alltoall_single_cn.rst | 2 +-
 docs/api/paddle/distributed/stream/broadcast_cn.rst       | 2 +-
 docs/api/paddle/distributed/stream/recv_cn.rst            | 2 +-
 docs/api/paddle/distributed/stream/reduce_cn.rst          | 2 +-
 docs/api/paddle/distributed/stream/reduce_scatter_cn.rst  | 2 +-
 docs/api/paddle/distributed/stream/scatter_cn.rst         | 2 +-
 docs/api/paddle/distributed/stream/send_cn.rst            | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/api/paddle/distributed/stream/all_gather_cn.rst b/docs/api/paddle/distributed/stream/all_gather_cn.rst
index f1bf5f12441..8037c9c4fed 100644
--- a/docs/api/paddle/distributed/stream/all_gather_cn.rst
+++ b/docs/api/paddle/distributed/stream/all_gather_cn.rst
@@ -27,4 +27,4 @@ all_gather
 
 代码示例
 :::::::::
-COPY-FROM: paddle.distributed.stream.all_gather
+COPY-FROM: paddle.distributed.communication.stream.all_gather
diff --git a/docs/api/paddle/distributed/stream/all_reduce_cn.rst b/docs/api/paddle/distributed/stream/all_reduce_cn.rst
index 951d600e3af..1ef77e2674a 100644
--- a/docs/api/paddle/distributed/stream/all_reduce_cn.rst
+++ b/docs/api/paddle/distributed/stream/all_reduce_cn.rst
@@ -27,4 +27,4 @@ all_reduce
 
 代码示例
 :::::::::
-COPY-FROM: paddle.distributed.stream.all_reduce
+COPY-FROM: paddle.distributed.communication.stream.all_reduce
diff --git a/docs/api/paddle/distributed/stream/alltoall_cn.rst b/docs/api/paddle/distributed/stream/alltoall_cn.rst
index 6524be169a5..ceaea200f69 100644
--- a/docs/api/paddle/distributed/stream/alltoall_cn.rst
+++ b/docs/api/paddle/distributed/stream/alltoall_cn.rst
@@ -27,4 +27,4 @@ alltoall
 
 代码示例
 :::::::::
-COPY-FROM: paddle.distributed.stream.alltoall
+COPY-FROM: paddle.distributed.communication.stream.alltoall
diff --git a/docs/api/paddle/distributed/stream/alltoall_single_cn.rst b/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
index 444ba390407..9b72bf70e6b 100644
--- a/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
+++ b/docs/api/paddle/distributed/stream/alltoall_single_cn.rst
@@ -29,4 +29,4 @@ alltoall_single
 
 代码示例
 :::::::::
-COPY-FROM: paddle.distributed.stream.alltoall_single
+COPY-FROM: paddle.distributed.communication.stream.alltoall_single
diff --git a/docs/api/paddle/distributed/stream/broadcast_cn.rst b/docs/api/paddle/distributed/stream/broadcast_cn.rst
index 8afc3698776..e4cb3fb842b 100644
--- a/docs/api/paddle/distributed/stream/broadcast_cn.rst
+++ b/docs/api/paddle/distributed/stream/broadcast_cn.rst
@@ -27,4 +27,4 @@ broadcast
 
 代码示例
 :::::::::
-COPY-FROM: paddle.distributed.stream.broadcast
+COPY-FROM: paddle.distributed.communication.stream.broadcast
diff --git a/docs/api/paddle/distributed/stream/recv_cn.rst b/docs/api/paddle/distributed/stream/recv_cn.rst
index e80263b6914..3c25283bdb2 100644
--- a/docs/api/paddle/distributed/stream/recv_cn.rst
+++ b/docs/api/paddle/distributed/stream/recv_cn.rst
@@ -27,4 +27,4 @@ recv
 
 代码示例
 :::::::::
-COPY-FROM: paddle.distributed.stream.recv
+COPY-FROM: paddle.distributed.communication.stream.recv
diff --git a/docs/api/paddle/distributed/stream/reduce_cn.rst b/docs/api/paddle/distributed/stream/reduce_cn.rst
index 82adb74c3bb..ca86f0dcbed 100644
--- a/docs/api/paddle/distributed/stream/reduce_cn.rst
+++ b/docs/api/paddle/distributed/stream/reduce_cn.rst
@@ -28,4 +28,4 @@ reduce
 
 代码示例
 :::::::::
-COPY-FROM: paddle.distributed.stream.reduce
+COPY-FROM: paddle.distributed.communication.stream.reduce
diff --git a/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst b/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
index d53f2696c35..e33c201058d 100644
--- a/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
+++ b/docs/api/paddle/distributed/stream/reduce_scatter_cn.rst
@@ -29,4 +29,4 @@ reduce_scatter
 
 代码示例
 :::::::::
-COPY-FROM: paddle.distributed.stream.reduce_scatter
+COPY-FROM: paddle.distributed.communication.stream.reduce_scatter
diff --git a/docs/api/paddle/distributed/stream/scatter_cn.rst b/docs/api/paddle/distributed/stream/scatter_cn.rst
index c8496a1e940..222d1502654 100644
--- a/docs/api/paddle/distributed/stream/scatter_cn.rst
+++ b/docs/api/paddle/distributed/stream/scatter_cn.rst
@@ -28,4 +28,4 @@ scatter
 
 代码示例
 :::::::::
-COPY-FROM: paddle.distributed.stream.scatter
+COPY-FROM: paddle.distributed.communication.stream.scatter
diff --git a/docs/api/paddle/distributed/stream/send_cn.rst b/docs/api/paddle/distributed/stream/send_cn.rst
index c574e5dcd8b..ab7235dfc3e 100644
--- a/docs/api/paddle/distributed/stream/send_cn.rst
+++ b/docs/api/paddle/distributed/stream/send_cn.rst
@@ -27,4 +27,4 @@ send
 
 代码示例
 :::::::::
-COPY-FROM: paddle.distributed.stream.send
+COPY-FROM: paddle.distributed.communication.stream.send

From 5219f75a24cdc28d76c2cc54342d8b6666abe7f2 Mon Sep 17 00:00:00 2001
From: Wen Sun <syl1887415157@126.com>
Date: Thu, 10 Nov 2022 23:25:12 +0800
Subject: [PATCH 12/12] fix: incorrect link & some style

---
 docs/api/paddle/distributed/Overview_cn.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/api/paddle/distributed/Overview_cn.rst b/docs/api/paddle/distributed/Overview_cn.rst
index 221207315ab..2785f419f45 100644
--- a/docs/api/paddle/distributed/Overview_cn.rst
+++ b/docs/api/paddle/distributed/Overview_cn.rst
@@ -17,7 +17,7 @@ paddle.distributed 目录包含的 API 支撑飞桨框架大规模分布式训
 Fleet 分布式高层 API
 ::::::::::::::::::::::::::
 
-paddle.distributed.fleet 是分布式训练的统一入口 API，用于配置分布式训练。
+``paddle.distributed.fleet`` 是分布式训练的统一入口 API，用于配置分布式训练。
 
 .. csv-table::
     :header: "API 名称", "API 功能"
@@ -102,7 +102,7 @@ paddle.distributed.fleet 是分布式训练的统一入口 API，用于配置分
 Stream 集合通信高级 API
 ::::::::::::::::::::::
 
-paddle.distributed.stream 在集合通信 API 的基础上，提供更统一的语义和对计算流的更精细的控制能力，有助于在特定场景下提高性能。
+``paddle.distributed.stream`` 在集合通信 API 的基础上，提供更统一的语义和对计算流的更精细的控制能力，有助于在特定场景下提高性能。
 
 .. csv-table::
     :header: "API 名称", "API 功能"
@@ -120,7 +120,7 @@ paddle.distributed.stream 在集合通信 API 的基础上，提供更统一的
     " :ref:`stream.send <cn_api_distributed_stream_send>` ", "发送一个 tensor 到指定进程"
     " :ref:`stream.recv <cn_api_distributed_stream_recv>` ", "接收一个来自指定进程的 tensor"
 
-.. _05:
+.. _06:
 
 RPC API
 ::::::::::::::::::::::::::