@@ -38,8 +38,35 @@ For example: +
3838`async_work_group_strided_copy(dst, src, num_gentypes, src_stride, event)` is equal to
3939`async_work_group_copy_2D2D(dst, 0, src, 0, sizeof(gentype), 1, num_gentypes, src_stride, 1, event)`
4040
41- These new built-in functions support arbitrary `gentype`-based buffers by
42- casting pointers to `void *`.
41+ The async copy built-in functions described in this section support arbitrary
42+ `gentype`-based buffers by casting pointers to `void*`.
43+
44+ These async copy built-in functions do not perform any implicit synchronization
45+ of source data, such as using a *barrier*, before performing the copy.
46+
47+ The async copies are performed by all work-items in a work-group, and these
48+ built-in functions must therefore be encountered by all work-items in a
49+ work-group executing the kernel with the same argument values; otherwise the
50+ results are undefined.
51+
52+ The _src_offset_, _dst_offset_, _src_total_line_length_,
53+ _dst_total_line_length_, _src_total_plane_area_ and _dst_total_plane_area_
54+ function arguments are expressed in elements.
55+
56+ Both _src_total_line_length_ and _dst_total_line_length_ describe the number of
57+ elements between the beginning of the current line and the beginning of the next
58+ line.
59+
60+ Both _src_total_plane_area_ and _dst_total_plane_area_ describe the number of
61+ elements between the beginning of the current plane and the beginning of the
62+ next plane.
63+
64+ These async copy built-in functions return an event object that can be used by
65+ *wait_group_events* to wait for the async copy to finish. The _event_ argument
66+ can also be used to associate the async copy with a previous async copy, allowing
67+ an event to be shared by multiple async copies; otherwise _event_ should be
68+ zero. If the _event_ argument is non-zero, the event object supplied as the
69+ _event_ argument will be returned.
4370
4471[cols="1a,1",options="header",]
4572|=======================================================================
@@ -77,32 +104,13 @@ of size _num_bytes_per_element_ from
77104is performed with implicit casting to `char*` by the implementation.
78105Each line contains _num_elements_per_line_ elements of size
79106_num_bytes_per_element_.
80- After each line of transfer, _src_ address is incremented by
107+ After each line of transfer, the _src_ address is incremented by
81108_src_total_line_length_ elements
82109(i.e. _src_total_line_length_ * _num_bytes_per_element_ bytes),
83- _dst_ address is incremented by _dst_total_line_length_ elements
110+ and the _dst_ address is incremented by _dst_total_line_length_ elements
84111(i.e. _dst_total_line_length_ * _num_bytes_per_element_ bytes),
85112for the next line of transfer.
86113
87- All _src_offset_, _dst_offset_, _src_total_line_length_
88- and _dst_total_line_length_ values are expressed in elements.
89-
90- Both _src_total_line_length_ and _dst_total_line_length_ describe
91- the number of elements between the beginning of the current line
92- and the beginning of the next line.
93-
94- Returns an event object that can be used by *wait_group_events* to wait
95- for the async copy to finish. The _event_ argument can also be used to
96- associate the *async_work_group_copy_2D2D* with a previous async copy
97- allowing an event to be shared by multiple async copies;
98- otherwise _event_ should be zero.
99-
100- If _event_ argument is non-zero, the event object supplied in _event_
101- argument will be returned.
102-
103- This function does not perform any implicit synchronization of source
104- data such as using a *barrier* before performing the copy.
105-
106114The behavior of *async_work_group_copy_2D2D* is undefined if the
107115source or destination addresses exceed the upper bounds of the address space
108116during the copy.
@@ -111,11 +119,6 @@ The behavior of *async_work_group_copy_2D2D* is also undefined if the
111119_src_total_line_length_ or _dst_total_line_length_ values are smaller
112120than _num_elements_per_line_, i.e. overlapping of lines is undefined.
113121
114- The async copy is performed by all work-items in a work-group and this
115- built-in function must therefore be encountered by all work-items in a
116- work-group executing the kernel with the same argument values;
117- otherwise the results are undefined.
118-
119122|[source,opencl_c]
120123----
121124event_t async_work_group_copy_3D3D(
@@ -148,46 +151,21 @@ event_t async_work_group_copy_3D3D(
148151 size_t dst_total_plane_area,
149152 event_t event)
150153----
151- | Perform an async copy of
152- ((_num_elements_per_line_ * _num_lines_) * _num_planes_) elements
154+ | Perform an async copy of \((_num_elements_per_line_ * _num_lines_) * _num_planes_) elements
153155of size _num_bytes_per_element_ from
154156(_src_ + (_src_offset_ * _num_bytes_per_element_)) to
155157(_dst_ + (_dst_offset_ * _num_bytes_per_element_)),
156158arranged in _num_planes_ planes. All pointer arithmetic
157159is performed with implicit casting to `char*` by the implementation.
158160Each plane contains _num_lines_ lines.
159161Each line contains _num_elements_per_line_ elements.
160- After each line of transfer, _src_ address is incremented by
162+ After each line of transfer, the _src_ address is incremented by
161163_src_total_line_length_ elements
162164(i.e. _src_total_line_length_ * _num_bytes_per_element_ bytes),
163- _dst_ address is incremented by _dst_total_line_length_ elements
165+ and the _dst_ address is incremented by _dst_total_line_length_ elements
164166(i.e. _dst_total_line_length_ * _num_bytes_per_element_ bytes),
165167for the next line of transfer.
166168
167- All _src_offset_, _dst_offset_, _src_total_line_length_,
168- _dst_total_line_length_, _src_total_plane_area_ and
169- _dst_total_plane_area_ values are expressed in elements.
170-
171- Both _src_total_line_length_ and _dst_total_line_length_ describe
172- the number of elements between the beginning of the current line
173- and the beginning of the next line.
174-
175- Both _src_total_plane_area_ and _dst_total_plane_area_ describe
176- the number of elements between the beginning of the current plane
177- and the beginning of the next plane.
178-
179- Returns an event object that can be used by *wait_group_events* to wait
180- for the async copy to finish. The _event_ argument can also be used to
181- associate the *async_work_group_copy_3D3D* with a previous async copy
182- allowing an event to be shared by multiple async copies;
183- otherwise _event_ should be zero.
184-
185- If _event_ argument is non-zero, the event object supplied in _event_
186- argument will be returned.
187-
188- This function does not perform any implicit synchronization of source
189- data such as using a *barrier* before performing the copy.
190-
191169The behavior of *async_work_group_copy_3D3D* is undefined if the
192170source or destination addresses exceed the upper bounds of the address space
193171during the copy.
@@ -201,9 +179,4 @@ _src_total_plane_area_ is smaller than (_num_lines_ * _src_total_line_length_),
201179or _dst_total_plane_area_ is smaller than (_num_lines_ * _dst_total_line_length_),
202180i.e. overlapping of planes is undefined.
203181
204- The async copy is performed by all work-items in a work-group and this
205- built-in function must therefore be encountered by all work-items in a
206- work-group executing the kernel with the same argument values;
207- otherwise the results are undefined.
208-
209182|=======================================================================
0 commit comments