File: dev/pci/drm/amd/amdgpu/amdgpu_xgmi.c
Warning: line 479, column 2: Value stored to 'init_low' is never read
1 | /* |
2 | * Copyright 2018 Advanced Micro Devices, Inc. |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), |
6 | * to deal in the Software without restriction, including without limitation |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
8 | * and/or sell copies of the Software, and to permit persons to whom the |
9 | * Software is furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice shall be included in |
12 | * all copies or substantial portions of the Software. |
13 | * |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
20 | * OTHER DEALINGS IN THE SOFTWARE. |
21 | * |
22 | * |
23 | */ |
24 | #include <linux/list.h> |
25 | #include "amdgpu.h" |
26 | #include "amdgpu_xgmi.h" |
27 | #include "amdgpu_ras.h" |
28 | #include "soc15.h" |
29 | #include "df/df_3_6_offset.h" |
30 | #include "xgmi/xgmi_4_0_0_smn.h" |
31 | #include "xgmi/xgmi_4_0_0_sh_mask.h" |
32 | #include "wafl/wafl2_4_0_0_smn.h" |
33 | #include "wafl/wafl2_4_0_0_sh_mask.h" |
34 | |
35 | #include "amdgpu_reset.h" |
36 | |
37 | #define smnPCS_XGMI3X16_PCS_ERROR_STATUS0x11a0020c 0x11a0020c |
38 | #define smnPCS_GOPX1_PCS_ERROR_STATUS0x12200210 0x12200210 |
39 | |
40 | static DEFINE_MUTEX(xgmi_mutex)struct rwlock xgmi_mutex = { 0, "xgmi_mutex" }; |
41 | |
42 | #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE4 4 |
43 | |
44 | static DRM_LIST_HEAD(xgmi_hive_list)struct list_head xgmi_hive_list = { &(xgmi_hive_list), & (xgmi_hive_list) }; |
45 | |
46 | static const int xgmi_pcs_err_status_reg_vg20[] = { |
47 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS0x11af0210, |
48 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS0x11af0210 + 0x100000, |
49 | }; |
50 | |
51 | static const int wafl_pcs_err_status_reg_vg20[] = { |
52 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS0x11cf0210, |
53 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS0x11cf0210 + 0x100000, |
54 | }; |
55 | |
56 | static const int xgmi_pcs_err_status_reg_arct[] = { |
57 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS0x11af0210, |
58 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS0x11af0210 + 0x100000, |
59 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS0x11af0210 + 0x500000, |
60 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS0x11af0210 + 0x600000, |
61 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS0x11af0210 + 0x700000, |
62 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS0x11af0210 + 0x800000, |
63 | }; |
64 | |
65 | /* same as vg20 */ |
66 | static const int wafl_pcs_err_status_reg_arct[] = { |
67 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS0x11cf0210, |
68 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS0x11cf0210 + 0x100000, |
69 | }; |
70 | |
71 | static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = { |
72 | smnPCS_XGMI3X16_PCS_ERROR_STATUS0x11a0020c, |
73 | smnPCS_XGMI3X16_PCS_ERROR_STATUS0x11a0020c + 0x100000, |
74 | smnPCS_XGMI3X16_PCS_ERROR_STATUS0x11a0020c + 0x200000, |
75 | smnPCS_XGMI3X16_PCS_ERROR_STATUS0x11a0020c + 0x300000, |
76 | smnPCS_XGMI3X16_PCS_ERROR_STATUS0x11a0020c + 0x400000, |
77 | smnPCS_XGMI3X16_PCS_ERROR_STATUS0x11a0020c + 0x500000, |
78 | smnPCS_XGMI3X16_PCS_ERROR_STATUS0x11a0020c + 0x600000, |
79 | smnPCS_XGMI3X16_PCS_ERROR_STATUS0x11a0020c + 0x700000 |
80 | }; |
81 | |
82 | static const int walf_pcs_err_status_reg_aldebaran[] = { |
83 | smnPCS_GOPX1_PCS_ERROR_STATUS0x12200210, |
84 | smnPCS_GOPX1_PCS_ERROR_STATUS0x12200210 + 0x100000 |
85 | }; |
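Each entry in these tables is the same PCS_ERROR_STATUS register repeated once per PCS instance, with instances spaced 0x100000 apart in SMN space (the Arcturus table additionally skips a few instances). A minimal sketch of how the uniform aldebaran xgmi3x16 addresses could be computed instead of tabulated; this helper is hypothetical and not part of the driver:

/* Hypothetical helper: error-status address of PCS instance 'instance',
 * assuming the uniform 0x100000 stride seen in
 * xgmi3x16_pcs_err_status_reg_aldebaran[] above. */
static inline uint32_t xgmi3x16_pcs_err_status_addr(unsigned int instance)
{
	return smnPCS_XGMI3X16_PCS_ERROR_STATUS + instance * 0x100000;
}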
86 | |
87 | static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = { |
88 | {"XGMI PCS DataLossErr", |
89 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)0x00000001L, 0x0}, |
90 | {"XGMI PCS TrainingErr", |
91 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)0x00000002L, 0x1}, |
92 | {"XGMI PCS CRCErr", |
93 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)0x00000020L, 0x5}, |
94 | {"XGMI PCS BERExceededErr", |
95 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)0x00000040L, 0x6}, |
96 | {"XGMI PCS TxMetaDataErr", |
97 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)0x00000080L, 0x7}, |
98 | {"XGMI PCS ReplayBufParityErr", |
99 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)0x00000100L, 0x8}, |
100 | {"XGMI PCS DataParityErr", |
101 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)0x00000200L, 0x9}, |
102 | {"XGMI PCS ReplayFifoOverflowErr", |
103 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)0x00000400L, 0xa}, |
104 | {"XGMI PCS ReplayFifoUnderflowErr", |
105 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)0x00000800L, 0xb}, |
106 | {"XGMI PCS ElasticFifoOverflowErr", |
107 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)0x00001000L, 0xc}, |
108 | {"XGMI PCS DeskewErr", |
109 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)0x00002000L, 0xd}, |
110 | {"XGMI PCS DataStartupLimitErr", |
111 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)0x00008000L, 0xf}, |
112 | {"XGMI PCS FCInitTimeoutErr", |
113 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)0x00010000L, 0x10}, |
114 | {"XGMI PCS RecoveryTimeoutErr", |
115 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)0x00020000L, 0x11}, |
116 | {"XGMI PCS ReadySerialTimeoutErr", |
117 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)0x00040000L, 0x12}, |
118 | {"XGMI PCS ReadySerialAttemptErr", |
119 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)0x00080000L, 0x13}, |
120 | {"XGMI PCS RecoveryAttemptErr", |
121 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)0x00100000L, 0x14}, |
122 | {"XGMI PCS RecoveryRelockAttemptErr", |
123 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)0x00200000L, 0x15}, |
124 | }; |
125 | |
126 | static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = { |
127 | {"WAFL PCS DataLossErr", |
128 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)0x00000001L, 0x0}, |
129 | {"WAFL PCS TrainingErr", |
130 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)0x00000002L, 0x1}, |
131 | {"WAFL PCS CRCErr", |
132 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)0x00000020L, 0x5}, |
133 | {"WAFL PCS BERExceededErr", |
134 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)0x00000040L, 0x6}, |
135 | {"WAFL PCS TxMetaDataErr", |
136 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)0x00000080L, 0x7}, |
137 | {"WAFL PCS ReplayBufParityErr", |
138 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)0x00000100L, 0x8}, |
139 | {"WAFL PCS DataParityErr", |
140 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)0x00000200L, 0x9}, |
141 | {"WAFL PCS ReplayFifoOverflowErr", |
142 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)0x00000400L, 0xa}, |
143 | {"WAFL PCS ReplayFifoUnderflowErr", |
144 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)0x00000800L, 0xb}, |
145 | {"WAFL PCS ElasticFifoOverflowErr", |
146 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)0x00001000L, 0xc}, |
147 | {"WAFL PCS DeskewErr", |
148 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)0x00002000L, 0xd}, |
149 | {"WAFL PCS DataStartupLimitErr", |
150 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)0x00008000L, 0xf}, |
151 | {"WAFL PCS FCInitTimeoutErr", |
152 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)0x00010000L, 0x10}, |
153 | {"WAFL PCS RecoveryTimeoutErr", |
154 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)0x00020000L, 0x11}, |
155 | {"WAFL PCS ReadySerialTimeoutErr", |
156 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)0x00040000L, 0x12}, |
157 | {"WAFL PCS ReadySerialAttemptErr", |
158 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)0x00080000L, 0x13}, |
159 | {"WAFL PCS RecoveryAttemptErr", |
160 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)0x00100000L, 0x14}, |
161 | {"WAFL PCS RecoveryRelockAttemptErr", |
162 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)0x00200000L, 0x15}, |
163 | }; |
164 | |
165 | /** |
166 | * DOC: AMDGPU XGMI Support |
167 | * |
168 | * XGMI is a high speed interconnect that joins multiple GPU cards |
169 | * into a homogeneous memory space that is organized by a collective |
170 | * hive ID and individual node IDs, both of which are 64-bit numbers. |
171 | * |
172 | * The file xgmi_device_id contains the unique per GPU device ID and |
173 | * is stored in the /sys/class/drm/card${cardno}/device/ directory. |
174 | * |
175 | * Inside the device directory a sub-directory 'xgmi_hive_info' is |
176 | * created which contains the hive ID and the list of nodes. |
177 | * |
178 | * The hive ID is stored in: |
179 | * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id |
180 | * |
181 | * The node information is stored in numbered directories: |
182 | * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id |
183 | * |
184 | * Each device has its own xgmi_hive_info directory with a mirror |
185 | * set of node sub-directories. |
186 | * |
187 | * The XGMI memory space is built by contiguously appending the |
188 | * power-of-two padded VRAM space of each node. |
189 | * |
190 | */ |
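For illustration, the attributes described above can be read from userspace like any other sysfs file. A minimal sketch, assuming card0 is an XGMI-capable GPU and using the path documented in this comment:

#include <stdio.h>

int main(void)
{
	char buf[32];
	FILE *f = fopen("/sys/class/drm/card0/device/xgmi_hive_info/xgmi_hive_id", "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("hive id: %s", buf);	/* decimal, as written by the show handler */
	fclose(f);
	return 0;
}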
191 | |
192 | static struct attribute amdgpu_xgmi_hive_id = { |
193 | .name = "xgmi_hive_id", |
194 | #ifdef notyet |
195 | .mode = S_IRUGO |
196 | #endif |
197 | }; |
198 | |
199 | static struct attribute *amdgpu_xgmi_hive_attrs[] = { |
200 | &amdgpu_xgmi_hive_id, |
201 | NULL((void *)0) |
202 | }; |
203 | ATTRIBUTE_GROUPS(amdgpu_xgmi_hive); |
204 | |
205 | static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj, |
206 | struct attribute *attr, char *buf) |
207 | { |
208 | struct amdgpu_hive_info *hive = container_of(({ const __typeof( ((struct amdgpu_hive_info *)0)->kobj ) * __mptr = (kobj); (struct amdgpu_hive_info *)( (char *)__mptr - __builtin_offsetof(struct amdgpu_hive_info, kobj) );}) |
209 | kobj, struct amdgpu_hive_info, kobj)({ const __typeof( ((struct amdgpu_hive_info *)0)->kobj ) * __mptr = (kobj); (struct amdgpu_hive_info *)( (char *)__mptr - __builtin_offsetof(struct amdgpu_hive_info, kobj) );}); |
210 | |
211 | if (attr == &amdgpu_xgmi_hive_id) |
212 | return snprintf(buf, PAGE_SIZE(1 << 12), "%llu\n", hive->hive_id); |
213 | |
214 | return 0; |
215 | } |
216 | |
217 | static void amdgpu_xgmi_hive_release(struct kobject *kobj) |
218 | { |
219 | struct amdgpu_hive_info *hive = container_of(({ const __typeof( ((struct amdgpu_hive_info *)0)->kobj ) * __mptr = (kobj); (struct amdgpu_hive_info *)( (char *)__mptr - __builtin_offsetof(struct amdgpu_hive_info, kobj) );}) |
220 | kobj, struct amdgpu_hive_info, kobj)({ const __typeof( ((struct amdgpu_hive_info *)0)->kobj ) * __mptr = (kobj); (struct amdgpu_hive_info *)( (char *)__mptr - __builtin_offsetof(struct amdgpu_hive_info, kobj) );}); |
221 | |
222 | amdgpu_reset_put_reset_domain(hive->reset_domain); |
223 | hive->reset_domain = NULL((void *)0); |
224 | |
225 | mutex_destroy(&hive->hive_lock); |
226 | kfree(hive); |
227 | } |
228 | |
229 | #ifdef notyet |
230 | static const struct sysfs_ops amdgpu_xgmi_hive_ops = { |
231 | .show = amdgpu_xgmi_show_attrs, |
232 | }; |
233 | #endif |
234 | |
235 | struct kobj_type amdgpu_xgmi_hive_type = { |
236 | .release = amdgpu_xgmi_hive_release, |
237 | #ifdef notyet |
238 | .sysfs_ops = &amdgpu_xgmi_hive_ops, |
239 | .default_groups = amdgpu_xgmi_hive_groups, |
240 | #endif |
241 | }; |
242 | |
243 | static ssize_t amdgpu_xgmi_show_device_id(struct device *dev, |
244 | struct device_attribute *attr, |
245 | char *buf) |
246 | { |
247 | struct drm_device *ddev = dev_get_drvdata(dev); |
248 | struct amdgpu_device *adev = drm_to_adev(ddev); |
249 | |
250 | return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id); |
251 | |
252 | } |
253 | |
254 | #define AMDGPU_XGMI_SET_FICAA(o)((o) | 0x456801) ((o) | 0x456801) |
255 | static ssize_t amdgpu_xgmi_show_error(struct device *dev, |
256 | struct device_attribute *attr, |
257 | char *buf) |
258 | { |
259 | struct drm_device *ddev = dev_get_drvdata(dev); |
260 | struct amdgpu_device *adev = drm_to_adev(ddev); |
261 | uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in; |
262 | uint64_t fica_out; |
263 | unsigned int error_count = 0; |
264 | |
265 | ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200)((0x200) | 0x456801); |
266 | ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208)((0x208) | 0x456801); |
267 | |
268 | if ((!adev->df.funcs) || |
269 | (!adev->df.funcs->get_fica) || |
270 | (!adev->df.funcs->set_fica)) |
271 | return -EINVAL22; |
272 | |
273 | fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in); |
274 | if (fica_out != 0x1f) |
275 | pr_err("xGMI error counters not enabled!\n")printk("\0013" "amdgpu: " "xGMI error counters not enabled!\n" ); |
276 | |
277 | fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in); |
278 | |
279 | if ((fica_out & 0xffff) == 2) |
280 | error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63); |
281 | |
282 | adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0); |
283 | |
284 | return sysfs_emit(buf, "%u\n", error_count); |
285 | } |
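A worked example of the status decode above, with the bit interpretation taken directly from the code (not from hardware documentation): the low 16 bits must read back as 2 before bits 62 and 63 are each counted as one error.

	uint64_t fica_out = 0xC000000000000002ULL;	/* bits 63 and 62 set, low word == 2 */
	unsigned int error_count = 0;

	if ((fica_out & 0xffff) == 2)
		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);
	/* error_count == 2 here; 0x8000000000000002ULL would give 1,
	 * and 0x0000000000000002ULL would give 0. */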
286 | |
287 | |
288 | static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL)struct device_attribute dev_attr_xgmi_device_id; |
289 | static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL)struct device_attribute dev_attr_xgmi_error; |
290 | |
291 | static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, |
292 | struct amdgpu_hive_info *hive) |
293 | { |
294 | STUB()do { printf("%s: stub\n", __func__); } while(0); |
295 | return -ENOSYS78; |
296 | #ifdef notyet |
297 | int ret = 0; |
298 | char node[10] = { 0 }; |
299 | |
300 | /* Create xgmi device id file */ |
301 | ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id)0; |
302 | if (ret) { |
303 | dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n")printf("drm:pid%d:%s *ERROR* " "XGMI: Failed to create device file xgmi_device_id\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__); |
304 | return ret; |
305 | } |
306 | |
307 | /* Create xgmi error file */ |
308 | ret = device_create_file(adev->dev, &dev_attr_xgmi_error)0; |
309 | if (ret) |
310 | pr_err("failed to create xgmi_error\n")printk("\0013" "amdgpu: " "failed to create xgmi_error\n"); |
311 | |
312 | |
313 | /* Create sysfs link to hive info folder on the first device */ |
314 | if (hive->kobj.parent != (&adev->dev->kobj)) { |
315 | ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj,0 |
316 | "xgmi_hive_info")0; |
317 | if (ret) { |
318 | dev_err(adev->dev, "XGMI: Failed to create link to hive info")printf("drm:pid%d:%s *ERROR* " "XGMI: Failed to create link to hive info" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__); |
319 | goto remove_file; |
320 | } |
321 | } |
322 | |
323 | snprintf(node, sizeof(node), "node%d", atomic_read(&hive->number_devices)({ typeof(*(&hive->number_devices)) __tmp = *(volatile typeof(*(&hive->number_devices)) *)&(*(&hive-> number_devices)); membar_datadep_consumer(); __tmp; })); |
324 | /* Create sysfs link from the hive folder to yourself */ |
325 | ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node)0; |
326 | if (ret) { |
327 | dev_err(adev->dev, "XGMI: Failed to create link from hive info")printf("drm:pid%d:%s *ERROR* " "XGMI: Failed to create link from hive info" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__); |
328 | goto remove_link; |
329 | } |
330 | |
331 | goto success; |
332 | |
333 | |
334 | remove_link: |
335 | sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique); |
336 | |
337 | remove_file: |
338 | device_remove_file(adev->dev, &dev_attr_xgmi_device_id); |
339 | |
340 | success: |
341 | return ret; |
342 | #endif |
343 | } |
344 | |
345 | static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev, |
346 | struct amdgpu_hive_info *hive) |
347 | { |
348 | #ifdef __linux__ |
349 | char node[10]; |
350 | memset(node, 0, sizeof(node))__builtin_memset((node), (0), (sizeof(node))); |
351 | |
352 | device_remove_file(adev->dev, &dev_attr_xgmi_device_id); |
353 | device_remove_file(adev->dev, &dev_attr_xgmi_error); |
354 | |
355 | if (hive->kobj.parent != (&adev->dev->kobj)) |
356 | sysfs_remove_link(&adev->dev->kobj,"xgmi_hive_info"); |
357 | |
358 | sprintf(node, "node%d", atomic_read(&hive->number_devices)({ typeof(*(&hive->number_devices)) __tmp = *(volatile typeof(*(&hive->number_devices)) *)&(*(&hive-> number_devices)); membar_datadep_consumer(); __tmp; })); |
359 | sysfs_remove_link(&hive->kobj, node); |
360 | #endif |
361 | } |
362 | |
363 | |
364 | |
365 | struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) |
366 | { |
367 | struct amdgpu_hive_info *hive = NULL((void *)0); |
368 | int ret; |
369 | |
370 | if (!adev->gmc.xgmi.hive_id) |
371 | return NULL((void *)0); |
372 | |
373 | STUB()do { printf("%s: stub\n", __func__); } while(0); |
374 | return NULL((void *)0); |
375 | #ifdef notyet |
376 | |
377 | if (adev->hive) { |
378 | kobject_get(&adev->hive->kobj); |
379 | return adev->hive; |
380 | } |
381 | |
382 | mutex_lock(&xgmi_mutex)rw_enter_write(&xgmi_mutex); |
383 | |
384 | list_for_each_entry(hive, &xgmi_hive_list, node)for (hive = ({ const __typeof( ((__typeof(*hive) *)0)->node ) *__mptr = ((&xgmi_hive_list)->next); (__typeof(*hive ) *)( (char *)__mptr - __builtin_offsetof(__typeof(*hive), node ) );}); &hive->node != (&xgmi_hive_list); hive = ( { const __typeof( ((__typeof(*hive) *)0)->node ) *__mptr = (hive->node.next); (__typeof(*hive) *)( (char *)__mptr - __builtin_offsetof (__typeof(*hive), node) );})) { |
385 | if (hive->hive_id == adev->gmc.xgmi.hive_id) |
386 | goto pro_end; |
387 | } |
388 | |
389 | hive = kzalloc(sizeof(*hive), GFP_KERNEL(0x0001 | 0x0004)); |
390 | if (!hive) { |
391 | dev_err(adev->dev, "XGMI: allocation failed\n")printf("drm:pid%d:%s *ERROR* " "XGMI: allocation failed\n", ( {struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__); |
392 | hive = NULL((void *)0); |
393 | goto pro_end; |
394 | } |
395 | |
396 | /* initialize a new hive if one does not exist */ |
397 | ret = kobject_init_and_add(&hive->kobj, |
398 | &amdgpu_xgmi_hive_type, |
399 | &adev->dev->kobj, |
400 | "%s", "xgmi_hive_info"); |
401 | if (ret) { |
402 | dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n")printf("drm:pid%d:%s *ERROR* " "XGMI: failed initializing kobject for xgmi hive\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__); |
403 | kobject_put(&hive->kobj); |
404 | hive = NULL((void *)0); |
405 | goto pro_end; |
406 | } |
407 | |
408 | /** |
409 | * Only init hive->reset_domain for a non-SRIOV configuration. For SRIOV, |
410 | * the host driver decides how to reset the GPU, either through FLR or chain reset. |
411 | * Guest side will get individual notifications from the host for the FLR |
412 | * if necessary. |
413 | */ |
414 | if (!amdgpu_sriov_vf(adev)((adev)->virt.caps & (1 << 2))) { |
415 | /** |
416 | * Avoid recreating reset domain when hive is reconstructed for the case |
417 | * of reset the devices in the XGMI hive during probe for passthrough GPU |
418 | * See https://www.spinics.net/lists/amd-gfx/msg58836.html |
419 | */ |
420 | if (adev->reset_domain->type != XGMI_HIVE) { |
421 | hive->reset_domain = |
422 | amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive"); |
423 | if (!hive->reset_domain) { |
424 | dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n")printf("drm:pid%d:%s *ERROR* " "XGMI: failed initializing reset domain for xgmi hive\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__); |
425 | ret = -ENOMEM12; |
426 | kobject_put(&hive->kobj); |
427 | hive = NULL((void *)0); |
428 | goto pro_end; |
429 | } |
430 | } else { |
431 | amdgpu_reset_get_reset_domain(adev->reset_domain); |
432 | hive->reset_domain = adev->reset_domain; |
433 | } |
434 | } |
435 | |
436 | hive->hive_id = adev->gmc.xgmi.hive_id; |
437 | INIT_LIST_HEAD(&hive->device_list); |
438 | INIT_LIST_HEAD(&hive->node); |
439 | rw_init(&hive->hive_lock, "aghive")_rw_init_flags(&hive->hive_lock, "aghive", 0, ((void * )0)); |
440 | atomic_set(&hive->number_devices, 0)({ typeof(*(&hive->number_devices)) __tmp = ((0)); *(volatile typeof(*(&hive->number_devices)) *)&(*(&hive-> number_devices)) = __tmp; __tmp; }); |
441 | task_barrier_init(&hive->tb); |
442 | hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; |
443 | hive->hi_req_gpu = NULL((void *)0); |
444 | |
445 | /* |
446 | * The hive pstate is high on boot for vega20, so we have to go to the |
447 | * low pstate after boot. |
448 | */ |
449 | hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE4; |
450 | list_add_tail(&hive->node, &xgmi_hive_list); |
451 | |
452 | pro_end: |
453 | if (hive) |
454 | kobject_get(&hive->kobj); |
455 | mutex_unlock(&xgmi_mutex)rw_exit_write(&xgmi_mutex); |
456 | return hive; |
457 | #endif |
458 | } |
459 | |
460 | void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive) |
461 | { |
462 | if (hive) |
463 | kobject_put(&hive->kobj); |
464 | } |
465 | |
466 | int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) |
467 | { |
468 | int ret = 0; |
469 | struct amdgpu_hive_info *hive; |
470 | struct amdgpu_device *request_adev; |
471 | bool_Bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20; |
472 | bool_Bool init_low; |
473 | |
474 | hive = amdgpu_get_xgmi_hive(adev); |
475 | if (!hive) |
476 | return 0; |
477 | |
478 | request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev; |
479 | init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN; |
Value stored to 'init_low' is never read | |
480 | amdgpu_put_xgmi_hive(hive); |
481 | /* fw bug so temporarily disable pstate switching */ |
482 | return 0; |
483 | |
484 | if (!hive || adev->asic_type != CHIP_VEGA20) |
485 | return 0; |
486 | |
487 | mutex_lock(&hive->hive_lock)rw_enter_write(&hive->hive_lock); |
488 | |
489 | if (is_hi_req) |
490 | hive->hi_req_count++; |
491 | else |
492 | hive->hi_req_count--; |
493 | |
494 | /* |
495 | * Vega20 only needs single peer to request pstate high for the hive to |
496 | * go high but all peers must request pstate low for the hive to go low |
497 | */ |
498 | if (hive->pstate == pstate || |
499 | (!is_hi_req && hive->hi_req_count && !init_low)) |
500 | goto out; |
501 | |
502 | dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate)do { } while(0); |
503 | |
504 | ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate); |
505 | if (ret) { |
506 | dev_err(request_adev->dev,printf("drm:pid%d:%s *ERROR* " "XGMI: Set pstate failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , request_adev ->gmc.xgmi.node_id, request_adev->gmc.xgmi.hive_id, ret ) |
507 | "XGMI: Set pstate failure on device %llx, hive %llx, ret %d",printf("drm:pid%d:%s *ERROR* " "XGMI: Set pstate failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , request_adev ->gmc.xgmi.node_id, request_adev->gmc.xgmi.hive_id, ret ) |
508 | request_adev->gmc.xgmi.node_id,printf("drm:pid%d:%s *ERROR* " "XGMI: Set pstate failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , request_adev ->gmc.xgmi.node_id, request_adev->gmc.xgmi.hive_id, ret ) |
509 | request_adev->gmc.xgmi.hive_id, ret)printf("drm:pid%d:%s *ERROR* " "XGMI: Set pstate failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , request_adev ->gmc.xgmi.node_id, request_adev->gmc.xgmi.hive_id, ret ); |
510 | goto out; |
511 | } |
512 | |
513 | if (init_low) |
514 | hive->pstate = hive->hi_req_count ? |
515 | hive->pstate : AMDGPU_XGMI_PSTATE_MIN; |
516 | else { |
517 | hive->pstate = pstate; |
518 | hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ? |
519 | adev : NULL((void *)0); |
520 | } |
521 | out: |
522 | mutex_unlock(&hive->hive_lock)rw_exit_write(&hive->hive_lock); |
523 | return ret; |
524 | } |
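To make the vega20 voting rule concrete, a simplified sketch of how hi_req_count evolves (locking, the firmware-bug early return, and the init_low handling from the function above are omitted):

	int hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;	/* 4: hive boots in the high pstate */

	hi_req_count--;	/* first device requests AMDGPU_XGMI_PSTATE_MIN */
	hi_req_count--;	/* second device requests AMDGPU_XGMI_PSTATE_MIN */
	/* hi_req_count is still 2, so the hive stays high; only after the two
	 * remaining peers also request low does it reach 0 and the hive may
	 * actually be switched to AMDGPU_XGMI_PSTATE_MIN. A single request for
	 * AMDGPU_XGMI_PSTATE_MAX_VEGA20, by contrast, raises the hive at once. */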
525 | |
526 | int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev) |
527 | { |
528 | int ret; |
529 | |
530 | if (amdgpu_sriov_vf(adev)((adev)->virt.caps & (1 << 2))) |
531 | return 0; |
532 | |
533 | /* Each psp needs to set the latest topology */ |
534 | ret = psp_xgmi_set_topology_info(&adev->psp, |
535 | atomic_read(&hive->number_devices)({ typeof(*(&hive->number_devices)) __tmp = *(volatile typeof(*(&hive->number_devices)) *)&(*(&hive-> number_devices)); membar_datadep_consumer(); __tmp; }), |
536 | &adev->psp.xgmi_context.top_info); |
537 | if (ret) |
538 | dev_err(adev->dev,printf("drm:pid%d:%s *ERROR* " "XGMI: Set topology failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , adev-> gmc.xgmi.node_id, adev->gmc.xgmi.hive_id, ret) |
539 | "XGMI: Set topology failure on device %llx, hive %llx, ret %d",printf("drm:pid%d:%s *ERROR* " "XGMI: Set topology failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , adev-> gmc.xgmi.node_id, adev->gmc.xgmi.hive_id, ret) |
540 | adev->gmc.xgmi.node_id,printf("drm:pid%d:%s *ERROR* " "XGMI: Set topology failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , adev-> gmc.xgmi.node_id, adev->gmc.xgmi.hive_id, ret) |
541 | adev->gmc.xgmi.hive_id, ret)printf("drm:pid%d:%s *ERROR* " "XGMI: Set topology failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , adev-> gmc.xgmi.node_id, adev->gmc.xgmi.hive_id, ret); |
542 | |
543 | return ret; |
544 | } |
545 | |
546 | |
547 | /* |
548 | * NOTE psp_xgmi_node_info.num_hops layout is as follows: |
549 | * num_hops[7:6] = link type (0 = xGMI2, 1 = xGMI3, 2/3 = reserved) |
550 | * num_hops[5:3] = reserved |
551 | * num_hops[2:0] = number of hops |
552 | */ |
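As a small sketch of the layout documented above; only the hop count is consumed by the function below, the link-type field is decoded here purely for illustration:

	uint8_t num_hops  = 0x43;			/* example raw value */
	uint8_t hops      = num_hops & 0x7;		/* bits [2:0] -> 3 hops */
	uint8_t link_type = (num_hops >> 6) & 0x3;	/* bits [7:6] -> 1 = xGMI3 */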
553 | int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev, |
554 | struct amdgpu_device *peer_adev) |
555 | { |
556 | struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; |
557 | uint8_t num_hops_mask = 0x7; |
558 | int i; |
559 | |
560 | for (i = 0 ; i < top->num_nodes; ++i) |
561 | if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) |
562 | return top->nodes[i].num_hops & num_hops_mask; |
563 | return -EINVAL22; |
564 | } |
565 | |
566 | int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev, |
567 | struct amdgpu_device *peer_adev) |
568 | { |
569 | struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; |
570 | int i; |
571 | |
572 | for (i = 0 ; i < top->num_nodes; ++i) |
573 | if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) |
574 | return top->nodes[i].num_links; |
575 | return -EINVAL22; |
576 | } |
577 | |
578 | /* |
579 | * Devices that support extended data require the entire hive to initialize with |
580 | * the shared memory buffer flag set. |
581 | * |
582 | * Hive locks and conditions apply - see amdgpu_xgmi_add_device |
583 | */ |
584 | static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive, |
585 | bool_Bool set_extended_data) |
586 | { |
587 | struct amdgpu_device *tmp_adev; |
588 | int ret; |
589 | |
590 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)for (tmp_adev = ({ const __typeof( ((__typeof(*tmp_adev) *)0) ->gmc.xgmi.head ) *__mptr = ((&hive->device_list)-> next); (__typeof(*tmp_adev) *)( (char *)__mptr - __builtin_offsetof (__typeof(*tmp_adev), gmc.xgmi.head) );}); &tmp_adev-> gmc.xgmi.head != (&hive->device_list); tmp_adev = ({ const __typeof( ((__typeof(*tmp_adev) *)0)->gmc.xgmi.head ) *__mptr = (tmp_adev->gmc.xgmi.head.next); (__typeof(*tmp_adev) *) ( (char *)__mptr - __builtin_offsetof(__typeof(*tmp_adev), gmc .xgmi.head) );})) { |
591 | ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false0); |
592 | if (ret) { |
593 | dev_err(tmp_adev->dev,printf("drm:pid%d:%s *ERROR* " "XGMI: Failed to initialize xgmi session for data partition %i\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , set_extended_data ) |
594 | "XGMI: Failed to initialize xgmi session for data partition %i\n",printf("drm:pid%d:%s *ERROR* " "XGMI: Failed to initialize xgmi session for data partition %i\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , set_extended_data ) |
595 | set_extended_data)printf("drm:pid%d:%s *ERROR* " "XGMI: Failed to initialize xgmi session for data partition %i\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , set_extended_data ); |
596 | return ret; |
597 | } |
598 | |
599 | } |
600 | |
601 | return 0; |
602 | } |
603 | |
604 | int amdgpu_xgmi_add_device(struct amdgpu_device *adev) |
605 | { |
606 | struct psp_xgmi_topology_info *top_info; |
607 | struct amdgpu_hive_info *hive; |
608 | struct amdgpu_xgmi *entry; |
609 | struct amdgpu_device *tmp_adev = NULL((void *)0); |
610 | |
611 | int count = 0, ret = 0; |
612 | |
613 | if (!adev->gmc.xgmi.supported) |
614 | return 0; |
615 | |
616 | if (!adev->gmc.xgmi.pending_reset && |
617 | amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { |
618 | ret = psp_xgmi_initialize(&adev->psp, false0, true1); |
619 | if (ret) { |
620 | dev_err(adev->dev,printf("drm:pid%d:%s *ERROR* " "XGMI: Failed to initialize xgmi session\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__) |
621 | "XGMI: Failed to initialize xgmi session\n")printf("drm:pid%d:%s *ERROR* " "XGMI: Failed to initialize xgmi session\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__); |
622 | return ret; |
623 | } |
624 | |
625 | ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id); |
626 | if (ret) { |
627 | dev_err(adev->dev,printf("drm:pid%d:%s *ERROR* " "XGMI: Failed to get hive id\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__) |
628 | "XGMI: Failed to get hive id\n")printf("drm:pid%d:%s *ERROR* " "XGMI: Failed to get hive id\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__); |
629 | return ret; |
630 | } |
631 | |
632 | ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id); |
633 | if (ret) { |
634 | dev_err(adev->dev,printf("drm:pid%d:%s *ERROR* " "XGMI: Failed to get node id\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__) |
635 | "XGMI: Failed to get node id\n")printf("drm:pid%d:%s *ERROR* " "XGMI: Failed to get node id\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__); |
636 | return ret; |
637 | } |
638 | } else { |
639 | adev->gmc.xgmi.hive_id = 16; |
640 | adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16; |
641 | } |
642 | |
643 | hive = amdgpu_get_xgmi_hive(adev); |
644 | if (!hive) { |
645 | ret = -EINVAL22; |
646 | dev_err(adev->dev,printf("drm:pid%d:%s *ERROR* " "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , adev-> gmc.xgmi.node_id, adev->gmc.xgmi.hive_id) |
647 | "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",printf("drm:pid%d:%s *ERROR* " "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , adev-> gmc.xgmi.node_id, adev->gmc.xgmi.hive_id) |
648 | adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id)printf("drm:pid%d:%s *ERROR* " "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , adev-> gmc.xgmi.node_id, adev->gmc.xgmi.hive_id); |
649 | goto exit; |
650 | } |
651 | mutex_lock(&hive->hive_lock)rw_enter_write(&hive->hive_lock); |
652 | |
653 | top_info = &adev->psp.xgmi_context.top_info; |
654 | |
655 | list_add_tail(&adev->gmc.xgmi.head, &hive->device_list); |
656 | list_for_each_entry(entry, &hive->device_list, head)for (entry = ({ const __typeof( ((__typeof(*entry) *)0)->head ) *__mptr = ((&hive->device_list)->next); (__typeof (*entry) *)( (char *)__mptr - __builtin_offsetof(__typeof(*entry ), head) );}); &entry->head != (&hive->device_list ); entry = ({ const __typeof( ((__typeof(*entry) *)0)->head ) *__mptr = (entry->head.next); (__typeof(*entry) *)( (char *)__mptr - __builtin_offsetof(__typeof(*entry), head) );})) |
657 | top_info->nodes[count++].node_id = entry->node_id; |
658 | top_info->num_nodes = count; |
659 | atomic_set(&hive->number_devices, count)({ typeof(*(&hive->number_devices)) __tmp = ((count)); *(volatile typeof(*(&hive->number_devices)) *)&(* (&hive->number_devices)) = __tmp; __tmp; }); |
660 | |
661 | task_barrier_add_task(&hive->tb); |
662 | |
663 | if (!adev->gmc.xgmi.pending_reset && |
664 | amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { |
665 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)for (tmp_adev = ({ const __typeof( ((__typeof(*tmp_adev) *)0) ->gmc.xgmi.head ) *__mptr = ((&hive->device_list)-> next); (__typeof(*tmp_adev) *)( (char *)__mptr - __builtin_offsetof (__typeof(*tmp_adev), gmc.xgmi.head) );}); &tmp_adev-> gmc.xgmi.head != (&hive->device_list); tmp_adev = ({ const __typeof( ((__typeof(*tmp_adev) *)0)->gmc.xgmi.head ) *__mptr = (tmp_adev->gmc.xgmi.head.next); (__typeof(*tmp_adev) *) ( (char *)__mptr - __builtin_offsetof(__typeof(*tmp_adev), gmc .xgmi.head) );})) { |
666 | /* update node list for other device in the hive */ |
667 | if (tmp_adev != adev) { |
668 | top_info = &tmp_adev->psp.xgmi_context.top_info; |
669 | top_info->nodes[count - 1].node_id = |
670 | adev->gmc.xgmi.node_id; |
671 | top_info->num_nodes = count; |
672 | } |
673 | ret = amdgpu_xgmi_update_topology(hive, tmp_adev); |
674 | if (ret) |
675 | goto exit_unlock; |
676 | } |
677 | |
678 | /* get latest topology info for each device from psp */ |
679 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)for (tmp_adev = ({ const __typeof( ((__typeof(*tmp_adev) *)0) ->gmc.xgmi.head ) *__mptr = ((&hive->device_list)-> next); (__typeof(*tmp_adev) *)( (char *)__mptr - __builtin_offsetof (__typeof(*tmp_adev), gmc.xgmi.head) );}); &tmp_adev-> gmc.xgmi.head != (&hive->device_list); tmp_adev = ({ const __typeof( ((__typeof(*tmp_adev) *)0)->gmc.xgmi.head ) *__mptr = (tmp_adev->gmc.xgmi.head.next); (__typeof(*tmp_adev) *) ( (char *)__mptr - __builtin_offsetof(__typeof(*tmp_adev), gmc .xgmi.head) );})) { |
680 | ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, |
681 | &tmp_adev->psp.xgmi_context.top_info, false0); |
682 | if (ret) { |
683 | dev_err(tmp_adev->dev,printf("drm:pid%d:%s *ERROR* " "XGMI: Get topology failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , tmp_adev ->gmc.xgmi.node_id, tmp_adev->gmc.xgmi.hive_id, ret) |
684 | "XGMI: Get topology failure on device %llx, hive %llx, ret %d",printf("drm:pid%d:%s *ERROR* " "XGMI: Get topology failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , tmp_adev ->gmc.xgmi.node_id, tmp_adev->gmc.xgmi.hive_id, ret) |
685 | tmp_adev->gmc.xgmi.node_id,printf("drm:pid%d:%s *ERROR* " "XGMI: Get topology failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , tmp_adev ->gmc.xgmi.node_id, tmp_adev->gmc.xgmi.hive_id, ret) |
686 | tmp_adev->gmc.xgmi.hive_id, ret)printf("drm:pid%d:%s *ERROR* " "XGMI: Get topology failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , tmp_adev ->gmc.xgmi.node_id, tmp_adev->gmc.xgmi.hive_id, ret); |
687 | /* TODO: continue with some nodes failed or disable the whole hive */ |
688 | goto exit_unlock; |
689 | } |
690 | } |
691 | |
692 | /* get topology again for hives that support extended data */ |
693 | if (adev->psp.xgmi_context.supports_extended_data) { |
694 | |
695 | /* initialize the hive to get extended data. */ |
696 | ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true1); |
697 | if (ret) |
698 | goto exit_unlock; |
699 | |
700 | /* get the extended data. */ |
701 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)for (tmp_adev = ({ const __typeof( ((__typeof(*tmp_adev) *)0) ->gmc.xgmi.head ) *__mptr = ((&hive->device_list)-> next); (__typeof(*tmp_adev) *)( (char *)__mptr - __builtin_offsetof (__typeof(*tmp_adev), gmc.xgmi.head) );}); &tmp_adev-> gmc.xgmi.head != (&hive->device_list); tmp_adev = ({ const __typeof( ((__typeof(*tmp_adev) *)0)->gmc.xgmi.head ) *__mptr = (tmp_adev->gmc.xgmi.head.next); (__typeof(*tmp_adev) *) ( (char *)__mptr - __builtin_offsetof(__typeof(*tmp_adev), gmc .xgmi.head) );})) { |
702 | ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, |
703 | &tmp_adev->psp.xgmi_context.top_info, true1); |
704 | if (ret) { |
705 | dev_err(tmp_adev->dev,printf("drm:pid%d:%s *ERROR* " "XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , tmp_adev ->gmc.xgmi.node_id, tmp_adev->gmc.xgmi.hive_id, ret) |
706 | "XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",printf("drm:pid%d:%s *ERROR* " "XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , tmp_adev ->gmc.xgmi.node_id, tmp_adev->gmc.xgmi.hive_id, ret) |
707 | tmp_adev->gmc.xgmi.node_id,printf("drm:pid%d:%s *ERROR* " "XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , tmp_adev ->gmc.xgmi.node_id, tmp_adev->gmc.xgmi.hive_id, ret) |
708 | tmp_adev->gmc.xgmi.hive_id, ret)printf("drm:pid%d:%s *ERROR* " "XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , tmp_adev ->gmc.xgmi.node_id, tmp_adev->gmc.xgmi.hive_id, ret); |
709 | goto exit_unlock; |
710 | } |
711 | } |
712 | |
713 | /* initialize the hive to get non-extended data for the next round. */ |
714 | ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false0); |
715 | if (ret) |
716 | goto exit_unlock; |
717 | |
718 | } |
719 | } |
720 | |
721 | if (!ret && !adev->gmc.xgmi.pending_reset) |
722 | ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); |
723 | |
724 | exit_unlock: |
725 | mutex_unlock(&hive->hive_lock)rw_exit_write(&hive->hive_lock); |
726 | exit: |
727 | if (!ret) { |
728 | adev->hive = hive; |
729 | dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",do { } while(0) |
730 | adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id)do { } while(0); |
731 | } else { |
732 | amdgpu_put_xgmi_hive(hive); |
733 | dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",printf("drm:pid%d:%s *ERROR* " "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , adev-> gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id, ret) |
734 | adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,printf("drm:pid%d:%s *ERROR* " "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , adev-> gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id, ret) |
735 | ret)printf("drm:pid%d:%s *ERROR* " "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__ , adev-> gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id, ret); |
736 | } |
737 | |
738 | return ret; |
739 | } |
740 | |
741 | int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) |
742 | { |
743 | struct amdgpu_hive_info *hive = adev->hive; |
744 | |
745 | if (!adev->gmc.xgmi.supported) |
746 | return -EINVAL22; |
747 | |
748 | if (!hive) |
749 | return -EINVAL22; |
750 | |
751 | mutex_lock(&hive->hive_lock)rw_enter_write(&hive->hive_lock); |
752 | task_barrier_rem_task(&hive->tb); |
753 | amdgpu_xgmi_sysfs_rem_dev_info(adev, hive); |
754 | if (hive->hi_req_gpu == adev) |
755 | hive->hi_req_gpu = NULL((void *)0); |
756 | list_del(&adev->gmc.xgmi.head); |
757 | mutex_unlock(&hive->hive_lock)rw_exit_write(&hive->hive_lock); |
758 | |
759 | amdgpu_put_xgmi_hive(hive); |
760 | adev->hive = NULL((void *)0); |
761 | |
762 | if (atomic_dec_return(&hive->number_devices)__sync_sub_and_fetch((&hive->number_devices), 1) == 0) { |
763 | /* Remove the hive from global hive list */ |
764 | mutex_lock(&xgmi_mutex)rw_enter_write(&xgmi_mutex); |
765 | list_del(&hive->node); |
766 | mutex_unlock(&xgmi_mutex)rw_exit_write(&xgmi_mutex); |
767 | |
768 | amdgpu_put_xgmi_hive(hive); |
769 | } |
770 | |
771 | return 0; |
772 | } |
773 | |
774 | static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) |
775 | { |
776 | if (!adev->gmc.xgmi.supported || |
777 | adev->gmc.xgmi.num_physical_nodes == 0) |
778 | return 0; |
779 | |
780 | adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); |
781 | |
782 | return amdgpu_ras_block_late_init(adev, ras_block); |
783 | } |
784 | |
785 | uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, |
786 | uint64_t addr) |
787 | { |
788 | struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi; |
789 | return (addr + xgmi->physical_node_id * xgmi->node_segment_size); |
790 | } |
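A worked example of the relative address calculation above, using an illustrative segment size rather than one from any specific ASIC:

	/* physical_node_id = 2, node_segment_size = 0x1000000000 (64 GiB), addr = 0x1000:
	 *   0x1000 + 2 * 0x1000000000 = 0x2000001000 */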
791 | |
792 | static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg) |
793 | { |
794 | WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF)adev->pcie_wreg(adev, (pcs_status_reg), (0xFFFFFFFF)); |
795 | WREG32_PCIE(pcs_status_reg, 0)adev->pcie_wreg(adev, (pcs_status_reg), (0)); |
796 | } |
797 | |
798 | static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev) |
799 | { |
800 | uint32_t i; |
801 | |
802 | switch (adev->asic_type) { |
803 | case CHIP_ARCTURUS: |
804 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct)(sizeof((xgmi_pcs_err_status_reg_arct)) / sizeof((xgmi_pcs_err_status_reg_arct )[0])); i++) |
805 | pcs_clear_status(adev, |
806 | xgmi_pcs_err_status_reg_arct[i]); |
807 | break; |
808 | case CHIP_VEGA20: |
809 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20)(sizeof((xgmi_pcs_err_status_reg_vg20)) / sizeof((xgmi_pcs_err_status_reg_vg20 )[0])); i++) |
810 | pcs_clear_status(adev, |
811 | xgmi_pcs_err_status_reg_vg20[i]); |
812 | break; |
813 | case CHIP_ALDEBARAN: |
814 | for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran)(sizeof((xgmi3x16_pcs_err_status_reg_aldebaran)) / sizeof((xgmi3x16_pcs_err_status_reg_aldebaran )[0])); i++) |
815 | pcs_clear_status(adev, |
816 | xgmi3x16_pcs_err_status_reg_aldebaran[i]); |
817 | for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran)(sizeof((walf_pcs_err_status_reg_aldebaran)) / sizeof((walf_pcs_err_status_reg_aldebaran )[0])); i++) |
818 | pcs_clear_status(adev, |
819 | walf_pcs_err_status_reg_aldebaran[i]); |
820 | break; |
821 | default: |
822 | break; |
823 | } |
824 | } |
825 | |
826 | static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, |
827 | uint32_t value, |
828 | uint32_t *ue_count, |
829 | uint32_t *ce_count, |
830 | bool_Bool is_xgmi_pcs) |
831 | { |
832 | int i; |
833 | int ue_cnt; |
834 | |
835 | if (is_xgmi_pcs) { |
836 | /* query xgmi pcs error status, |
837 | * only ue is supported */ |
838 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields)(sizeof((xgmi_pcs_ras_fields)) / sizeof((xgmi_pcs_ras_fields) [0])); i ++) { |
839 | ue_cnt = (value & |
840 | xgmi_pcs_ras_fields[i].pcs_err_mask) >> |
841 | xgmi_pcs_ras_fields[i].pcs_err_shift; |
842 | if (ue_cnt) { |
843 | dev_info(adev->dev, "%s detected\n",do { } while(0) |
844 | xgmi_pcs_ras_fields[i].err_name)do { } while(0); |
845 | *ue_count += ue_cnt; |
846 | } |
847 | } |
848 | } else { |
849 | /* query wafl pcs error status, |
850 | * only ue is supported */ |
851 | for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields)(sizeof((wafl_pcs_ras_fields)) / sizeof((wafl_pcs_ras_fields) [0])); i++) { |
852 | ue_cnt = (value & |
853 | wafl_pcs_ras_fields[i].pcs_err_mask) >> |
854 | wafl_pcs_ras_fields[i].pcs_err_shift; |
855 | if (ue_cnt) { |
856 | dev_info(adev->dev, "%s detected\n",do { } while(0) |
857 | wafl_pcs_ras_fields[i].err_name)do { } while(0); |
858 | *ue_count += ue_cnt; |
859 | } |
860 | } |
861 | } |
862 | |
863 | return 0; |
864 | } |
865 | |
866 | static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, |
867 | void *ras_error_status) |
868 | { |
869 | struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; |
870 | int i; |
871 | uint32_t data; |
872 | uint32_t ue_cnt = 0, ce_cnt = 0; |
873 | |
874 | if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) |
875 | return ; |
876 | |
877 | err_data->ue_count = 0; |
878 | err_data->ce_count = 0; |
879 | |
880 | switch (adev->asic_type) { |
881 | case CHIP_ARCTURUS: |
882 | /* check xgmi pcs error */ |
883 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct)(sizeof((xgmi_pcs_err_status_reg_arct)) / sizeof((xgmi_pcs_err_status_reg_arct )[0])); i++) { |
884 | data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i])adev->pcie_rreg(adev, (xgmi_pcs_err_status_reg_arct[i])); |
885 | if (data) |
886 | amdgpu_xgmi_query_pcs_error_status(adev, |
887 | data, &ue_cnt, &ce_cnt, true1); |
888 | } |
889 | /* check wafl pcs error */ |
890 | for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct)(sizeof((wafl_pcs_err_status_reg_arct)) / sizeof((wafl_pcs_err_status_reg_arct )[0])); i++) { |
891 | data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i])adev->pcie_rreg(adev, (wafl_pcs_err_status_reg_arct[i])); |
892 | if (data) |
893 | amdgpu_xgmi_query_pcs_error_status(adev, |
894 | data, &ue_cnt, &ce_cnt, false0); |
895 | } |
896 | break; |
897 | case CHIP_VEGA20: |
898 | /* check xgmi pcs error */ |
899 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20)(sizeof((xgmi_pcs_err_status_reg_vg20)) / sizeof((xgmi_pcs_err_status_reg_vg20 )[0])); i++) { |
900 | data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i])adev->pcie_rreg(adev, (xgmi_pcs_err_status_reg_vg20[i])); |
901 | if (data) |
902 | amdgpu_xgmi_query_pcs_error_status(adev, |
903 | data, &ue_cnt, &ce_cnt, true1); |
904 | } |
905 | /* check wafl pcs error */ |
906 | for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20)(sizeof((wafl_pcs_err_status_reg_vg20)) / sizeof((wafl_pcs_err_status_reg_vg20 )[0])); i++) { |
907 | data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i])adev->pcie_rreg(adev, (wafl_pcs_err_status_reg_vg20[i])); |
908 | if (data) |
909 | amdgpu_xgmi_query_pcs_error_status(adev, |
910 | data, &ue_cnt, &ce_cnt, false0); |
911 | } |
912 | break; |
913 | case CHIP_ALDEBARAN: |
914 | /* check xgmi3x16 pcs error */ |
915 | for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran)(sizeof((xgmi3x16_pcs_err_status_reg_aldebaran)) / sizeof((xgmi3x16_pcs_err_status_reg_aldebaran )[0])); i++) { |
916 | data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i])adev->pcie_rreg(adev, (xgmi3x16_pcs_err_status_reg_aldebaran [i])); |
917 | if (data) |
918 | amdgpu_xgmi_query_pcs_error_status(adev, |
919 | data, &ue_cnt, &ce_cnt, true1); |
920 | } |
921 | /* check wafl pcs error */ |
922 | for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran)(sizeof((walf_pcs_err_status_reg_aldebaran)) / sizeof((walf_pcs_err_status_reg_aldebaran )[0])); i++) { |
923 | data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i])adev->pcie_rreg(adev, (walf_pcs_err_status_reg_aldebaran[i ])); |
924 | if (data) |
925 | amdgpu_xgmi_query_pcs_error_status(adev, |
926 | data, &ue_cnt, &ce_cnt, false0); |
927 | } |
928 | break; |
929 | default: |
930 | dev_warn(adev->dev, "XGMI RAS error query not supported")printf("drm:pid%d:%s *WARNING* " "XGMI RAS error query not supported" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__); |
931 | break; |
932 | } |
933 | |
934 | adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); |
935 | |
936 | err_data->ue_count += ue_cnt; |
937 | err_data->ce_count += ce_cnt; |
938 | } |
939 | |
940 | /* Trigger XGMI/WAFL error */ |
941 | static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *inject_if) |
942 | { |
943 | int ret = 0; |
944 | struct ta_ras_trigger_error_input *block_info = |
945 | (struct ta_ras_trigger_error_input *)inject_if; |
946 | |
947 | if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) |
948 | dev_warn(adev->dev, "Failed to disallow df cstate")printf("drm:pid%d:%s *WARNING* " "Failed to disallow df cstate" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__); |
949 | |
950 | if (amdgpu_dpm_allow_xgmi_power_down(adev, false0)) |
951 | dev_warn(adev->dev, "Failed to disallow XGMI power down")printf("drm:pid%d:%s *WARNING* " "Failed to disallow XGMI power down" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__); |
952 | |
953 | ret = psp_ras_trigger_error(&adev->psp, block_info); |
954 | |
955 | if (amdgpu_ras_intr_triggered()) |
956 | return ret; |
957 | |
958 | if (amdgpu_dpm_allow_xgmi_power_down(adev, true1)) |
959 | dev_warn(adev->dev, "Failed to allow XGMI power down")printf("drm:pid%d:%s *WARNING* " "Failed to allow XGMI power down" , ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__); |
960 | |
961 | if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) |
962 | dev_warn(adev->dev, "Failed to allow df cstate")printf("drm:pid%d:%s *WARNING* " "Failed to allow df cstate", ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc->p_p->ps_pid, __func__); |
963 | |
964 | return ret; |
965 | } |
966 | |
967 | struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = { |
968 | .query_ras_error_count = amdgpu_xgmi_query_ras_error_count, |
969 | .reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count, |
970 | .ras_error_inject = amdgpu_ras_error_inject_xgmi, |
971 | }; |
972 | |
973 | struct amdgpu_xgmi_ras xgmi_ras = { |
974 | .ras_block = { |
975 | .ras_comm = { |
976 | .name = "xgmi_wafl", |
977 | .block = AMDGPU_RAS_BLOCK__XGMI_WAFL, |
978 | .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, |
979 | }, |
980 | .hw_ops = &xgmi_ras_hw_ops, |
981 | .ras_late_init = amdgpu_xgmi_ras_late_init, |
982 | }, |
983 | }; |